aboutsummaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
authorLinus Torvalds2022-08-03 16:29:08 -0700
committerLinus Torvalds2022-08-03 16:29:08 -0700
commitf86d1fbbe7858884d6754534a0afbb74fc30bc26 (patch)
treef61796870edefbe77d495e9d719c68af1d14275b /kernel
parent526942b8134cc34d25d27f95dfff98b8ce2f6fcd (diff)
parent7c6327c77d509e78bff76f2a4551fcfee851682e (diff)
Merge tag 'net-next-6.0' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next
Pull networking changes from Paolo Abeni: "Core: - Refactor the forward memory allocation to better cope with memory pressure with many open sockets, moving from a per socket cache to a per-CPU one - Replace rwlocks with RCU for better fairness in ping, raw sockets and IP multicast router. - Network-side support for IO uring zero-copy send. - A few skb drop reason improvements, including codegen the source file with string mapping instead of using macro magic. - Rename reference tracking helpers to a more consistent netdev_* schema. - Adapt u64_stats_t type to address load/store tearing issues. - Refine debug helper usage to reduce the log noise caused by bots. BPF: - Improve socket map performance, avoiding skb cloning on read operation. - Add support for 64 bits enum, to match types exposed by kernel. - Introduce support for sleepable uprobes program. - Introduce support for enum textual representation in libbpf. - New helpers to implement synproxy with eBPF/XDP. - Improve loop performances, inlining indirect calls when possible. - Removed all the deprecated libbpf APIs. - Implement new eBPF-based LSM flavor. - Add type match support, which allow accurate queries to the eBPF used types. - A few TCP congetsion control framework usability improvements. - Add new infrastructure to manipulate CT entries via eBPF programs. - Allow for livepatch (KLP) and BPF trampolines to attach to the same kernel function. Protocols: - Introduce per network namespace lookup tables for unix sockets, increasing scalability and reducing contention. - Preparation work for Wi-Fi 7 Multi-Link Operation (MLO) support. - Add support to forciby close TIME_WAIT TCP sockets via user-space tools. - Significant performance improvement for the TLS 1.3 receive path, both for zero-copy and not-zero-copy. - Support for changing the initial MTPCP subflow priority/backup status - Introduce virtually contingus buffers for sockets over RDMA, to cope better with memory pressure. - Extend CAN ethtool support with timestamping capabilities - Refactor CAN build infrastructure to allow building only the needed features. Driver API: - Remove devlink mutex to allow parallel commands on multiple links. - Add support for pause stats in distributed switch. - Implement devlink helpers to query and flash line cards. - New helper for phy mode to register conversion. New hardware / drivers: - Ethernet DSA driver for the rockchip mt7531 on BPI-R2 Pro. - Ethernet DSA driver for the Renesas RZ/N1 A5PSW switch. - Ethernet DSA driver for the Microchip LAN937x switch. - Ethernet PHY driver for the Aquantia AQR113C EPHY. - CAN driver for the OBD-II ELM327 interface. - CAN driver for RZ/N1 SJA1000 CAN controller. - Bluetooth: Infineon CYW55572 Wi-Fi plus Bluetooth combo device. Drivers: - Intel Ethernet NICs: - i40e: add support for vlan pruning - i40e: add support for XDP framented packets - ice: improved vlan offload support - ice: add support for PPPoE offload - Mellanox Ethernet (mlx5) - refactor packet steering offload for performance and scalability - extend support for TC offload - refactor devlink code to clean-up the locking schema - support stacked vlans for bridge offloads - use TLS objects pool to improve connection rate - Netronome Ethernet NICs (nfp): - extend support for IPv6 fields mangling offload - add support for vepa mode in HW bridge - better support for virtio data path acceleration (VDPA) - enable TSO by default - Microsoft vNIC driver (mana) - add support for XDP redirect - Others Ethernet drivers: - bonding: add per-port priority support - microchip lan743x: extend phy support - Fungible funeth: support UDP segmentation offload and XDP xmit - Solarflare EF100: add support for virtual function representors - MediaTek SoC: add XDP support - Mellanox Ethernet/IB switch (mlxsw): - dropped support for unreleased H/W (XM router). - improved stats accuracy - unified bridge model coversion improving scalability (parts 1-6) - support for PTP in Spectrum-2 asics - Broadcom PHYs - add PTP support for BCM54210E - add support for the BCM53128 internal PHY - Marvell Ethernet switches (prestera): - implement support for multicast forwarding offload - Embedded Ethernet switches: - refactor OcteonTx MAC filter for better scalability - improve TC H/W offload for the Felix driver - refactor the Microchip ksz8 and ksz9477 drivers to share the probe code (parts 1, 2), add support for phylink mac configuration - Other WiFi: - Microchip wilc1000: diable WEP support and enable WPA3 - Atheros ath10k: encapsulation offload support Old code removal: - Neterion vxge ethernet driver: this is untouched since more than 10 years" * tag 'net-next-6.0' of git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next: (1890 commits) doc: sfp-phylink: Fix a broken reference wireguard: selftests: support UML wireguard: allowedips: don't corrupt stack when detecting overflow wireguard: selftests: update config fragments wireguard: ratelimiter: use hrtimer in selftest net/mlx5e: xsk: Discard unaligned XSK frames on striding RQ net: usb: ax88179_178a: Bind only to vendor-specific interface selftests: net: fix IOAM test skip return code net: usb: make USB_RTL8153_ECM non user configurable net: marvell: prestera: remove reduntant code octeontx2-pf: Reduce minimum mtu size to 60 net: devlink: Fix missing mutex_unlock() call net/tls: Remove redundant workqueue flush before destroy net: txgbe: Fix an error handling path in txgbe_probe() net: dsa: Fix spelling mistakes and cleanup code Documentation: devlink: add add devlink-selftests to the table of contents dccp: put dccp_qpolicy_full() and dccp_qpolicy_push() in the same lock net: ionic: fix error check for vlan flags in ionic_set_nic_features() net: ice: fix error NETIF_F_HW_VLAN_CTAG_FILTER check in ice_vsi_sync_fltr() nfp: flower: add support for tunnel offload without key ID ...
Diffstat (limited to 'kernel')
-rw-r--r--kernel/bpf/arraymap.c40
-rw-r--r--kernel/bpf/bpf_iter.c9
-rw-r--r--kernel/bpf/bpf_lsm.c85
-rw-r--r--kernel/bpf/bpf_struct_ops.c10
-rw-r--r--kernel/bpf/btf.c362
-rw-r--r--kernel/bpf/cgroup.c416
-rw-r--r--kernel/bpf/core.c132
-rw-r--r--kernel/bpf/devmap.c6
-rw-r--r--kernel/bpf/hashtab.c6
-rw-r--r--kernel/bpf/helpers.c12
-rw-r--r--kernel/bpf/local_storage.c2
-rw-r--r--kernel/bpf/lpm_trie.c2
-rw-r--r--kernel/bpf/percpu_freelist.c20
-rw-r--r--kernel/bpf/preload/iterators/Makefile10
-rw-r--r--kernel/bpf/syscall.c61
-rw-r--r--kernel/bpf/trampoline.c426
-rw-r--r--kernel/bpf/verifier.c374
-rw-r--r--kernel/events/core.c16
-rw-r--r--kernel/kallsyms.c91
-rw-r--r--kernel/sysctl.c41
-rw-r--r--kernel/time/hrtimer.c1
-rw-r--r--kernel/trace/bpf_trace.c4
-rw-r--r--kernel/trace/ftrace.c328
-rw-r--r--kernel/trace/trace_uprobe.c7
24 files changed, 1911 insertions, 550 deletions
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index fe40d3b9458f..d3e734bf8056 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -70,10 +70,8 @@ int array_map_alloc_check(union bpf_attr *attr)
attr->map_flags & BPF_F_PRESERVE_ELEMS)
return -EINVAL;
- if (attr->value_size > KMALLOC_MAX_SIZE)
- /* if value_size is bigger, the user space won't be able to
- * access the elements.
- */
+ /* avoid overflow on round_up(map->value_size) */
+ if (attr->value_size > INT_MAX)
return -E2BIG;
return 0;
@@ -156,6 +154,11 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
return &array->map;
}
+static void *array_map_elem_ptr(struct bpf_array* array, u32 index)
+{
+ return array->value + (u64)array->elem_size * index;
+}
+
/* Called from syscall or from eBPF program */
static void *array_map_lookup_elem(struct bpf_map *map, void *key)
{
@@ -165,7 +168,7 @@ static void *array_map_lookup_elem(struct bpf_map *map, void *key)
if (unlikely(index >= array->map.max_entries))
return NULL;
- return array->value + array->elem_size * (index & array->index_mask);
+ return array->value + (u64)array->elem_size * (index & array->index_mask);
}
static int array_map_direct_value_addr(const struct bpf_map *map, u64 *imm,
@@ -203,7 +206,7 @@ static int array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
struct bpf_insn *insn = insn_buf;
- u32 elem_size = round_up(map->value_size, 8);
+ u32 elem_size = array->elem_size;
const int ret = BPF_REG_0;
const int map_ptr = BPF_REG_1;
const int index = BPF_REG_2;
@@ -272,7 +275,7 @@ int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value)
* access 'value_size' of them, so copying rounded areas
* will not leak any kernel data
*/
- size = round_up(map->value_size, 8);
+ size = array->elem_size;
rcu_read_lock();
pptr = array->pptrs[index & array->index_mask];
for_each_possible_cpu(cpu) {
@@ -339,7 +342,7 @@ static int array_map_update_elem(struct bpf_map *map, void *key, void *value,
value, map->value_size);
} else {
val = array->value +
- array->elem_size * (index & array->index_mask);
+ (u64)array->elem_size * (index & array->index_mask);
if (map_flags & BPF_F_LOCK)
copy_map_value_locked(map, val, value, false);
else
@@ -376,7 +379,7 @@ int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value,
* returned or zeros which were zero-filled by percpu_alloc,
* so no kernel data leaks possible
*/
- size = round_up(map->value_size, 8);
+ size = array->elem_size;
rcu_read_lock();
pptr = array->pptrs[index & array->index_mask];
for_each_possible_cpu(cpu) {
@@ -408,8 +411,7 @@ static void array_map_free_timers(struct bpf_map *map)
return;
for (i = 0; i < array->map.max_entries; i++)
- bpf_timer_cancel_and_free(array->value + array->elem_size * i +
- map->timer_off);
+ bpf_timer_cancel_and_free(array_map_elem_ptr(array, i) + map->timer_off);
}
/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
@@ -420,7 +422,7 @@ static void array_map_free(struct bpf_map *map)
if (map_value_has_kptrs(map)) {
for (i = 0; i < array->map.max_entries; i++)
- bpf_map_free_kptrs(map, array->value + array->elem_size * i);
+ bpf_map_free_kptrs(map, array_map_elem_ptr(array, i));
bpf_map_free_kptr_off_tab(map);
}
@@ -556,7 +558,7 @@ static void *bpf_array_map_seq_start(struct seq_file *seq, loff_t *pos)
index = info->index & array->index_mask;
if (info->percpu_value_buf)
return array->pptrs[index];
- return array->value + array->elem_size * index;
+ return array_map_elem_ptr(array, index);
}
static void *bpf_array_map_seq_next(struct seq_file *seq, void *v, loff_t *pos)
@@ -575,7 +577,7 @@ static void *bpf_array_map_seq_next(struct seq_file *seq, void *v, loff_t *pos)
index = info->index & array->index_mask;
if (info->percpu_value_buf)
return array->pptrs[index];
- return array->value + array->elem_size * index;
+ return array_map_elem_ptr(array, index);
}
static int __bpf_array_map_seq_show(struct seq_file *seq, void *v)
@@ -583,6 +585,7 @@ static int __bpf_array_map_seq_show(struct seq_file *seq, void *v)
struct bpf_iter_seq_array_map_info *info = seq->private;
struct bpf_iter__bpf_map_elem ctx = {};
struct bpf_map *map = info->map;
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
struct bpf_iter_meta meta;
struct bpf_prog *prog;
int off = 0, cpu = 0;
@@ -603,7 +606,7 @@ static int __bpf_array_map_seq_show(struct seq_file *seq, void *v)
ctx.value = v;
} else {
pptr = v;
- size = round_up(map->value_size, 8);
+ size = array->elem_size;
for_each_possible_cpu(cpu) {
bpf_long_memcpy(info->percpu_value_buf + off,
per_cpu_ptr(pptr, cpu),
@@ -633,11 +636,12 @@ static int bpf_iter_init_array_map(void *priv_data,
{
struct bpf_iter_seq_array_map_info *seq_info = priv_data;
struct bpf_map *map = aux->map;
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
void *value_buf;
u32 buf_size;
if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
- buf_size = round_up(map->value_size, 8) * num_possible_cpus();
+ buf_size = array->elem_size * num_possible_cpus();
value_buf = kmalloc(buf_size, GFP_USER | __GFP_NOWARN);
if (!value_buf)
return -ENOMEM;
@@ -690,7 +694,7 @@ static int bpf_for_each_array_elem(struct bpf_map *map, bpf_callback_t callback_
if (is_percpu)
val = this_cpu_ptr(array->pptrs[i]);
else
- val = array->value + array->elem_size * i;
+ val = array_map_elem_ptr(array, i);
num_elems++;
key = i;
ret = callback_fn((u64)(long)map, (u64)(long)&key,
@@ -1322,7 +1326,7 @@ static int array_of_map_gen_lookup(struct bpf_map *map,
struct bpf_insn *insn_buf)
{
struct bpf_array *array = container_of(map, struct bpf_array, map);
- u32 elem_size = round_up(map->value_size, 8);
+ u32 elem_size = array->elem_size;
struct bpf_insn *insn = insn_buf;
const int ret = BPF_REG_0;
const int map_ptr = BPF_REG_1;
diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c
index 8af0cbf9c0cd..2726a5950cfa 100644
--- a/kernel/bpf/bpf_iter.c
+++ b/kernel/bpf/bpf_iter.c
@@ -722,9 +722,6 @@ const struct bpf_func_proto bpf_for_each_map_elem_proto = {
.arg4_type = ARG_ANYTHING,
};
-/* maximum number of loops */
-#define MAX_LOOPS BIT(23)
-
BPF_CALL_4(bpf_loop, u32, nr_loops, void *, callback_fn, void *, callback_ctx,
u64, flags)
{
@@ -732,9 +729,13 @@ BPF_CALL_4(bpf_loop, u32, nr_loops, void *, callback_fn, void *, callback_ctx,
u64 ret;
u32 i;
+ /* Note: these safety checks are also verified when bpf_loop
+ * is inlined, be careful to modify this code in sync. See
+ * function verifier.c:inline_bpf_loop.
+ */
if (flags)
return -EINVAL;
- if (nr_loops > MAX_LOOPS)
+ if (nr_loops > BPF_MAX_LOOPS)
return -E2BIG;
for (i = 0; i < nr_loops; i++) {
diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c
index c1351df9f7ee..fa71d58b7ded 100644
--- a/kernel/bpf/bpf_lsm.c
+++ b/kernel/bpf/bpf_lsm.c
@@ -16,6 +16,7 @@
#include <linux/bpf_local_storage.h>
#include <linux/btf_ids.h>
#include <linux/ima.h>
+#include <linux/bpf-cgroup.h>
/* For every LSM hook that allows attachment of BPF programs, declare a nop
* function where a BPF program can be attached.
@@ -35,6 +36,59 @@ BTF_SET_START(bpf_lsm_hooks)
#undef LSM_HOOK
BTF_SET_END(bpf_lsm_hooks)
+/* List of LSM hooks that should operate on 'current' cgroup regardless
+ * of function signature.
+ */
+BTF_SET_START(bpf_lsm_current_hooks)
+/* operate on freshly allocated sk without any cgroup association */
+BTF_ID(func, bpf_lsm_sk_alloc_security)
+BTF_ID(func, bpf_lsm_sk_free_security)
+BTF_SET_END(bpf_lsm_current_hooks)
+
+/* List of LSM hooks that trigger while the socket is properly locked.
+ */
+BTF_SET_START(bpf_lsm_locked_sockopt_hooks)
+BTF_ID(func, bpf_lsm_socket_sock_rcv_skb)
+BTF_ID(func, bpf_lsm_sock_graft)
+BTF_ID(func, bpf_lsm_inet_csk_clone)
+BTF_ID(func, bpf_lsm_inet_conn_established)
+BTF_SET_END(bpf_lsm_locked_sockopt_hooks)
+
+/* List of LSM hooks that trigger while the socket is _not_ locked,
+ * but it's ok to call bpf_{g,s}etsockopt because the socket is still
+ * in the early init phase.
+ */
+BTF_SET_START(bpf_lsm_unlocked_sockopt_hooks)
+BTF_ID(func, bpf_lsm_socket_post_create)
+BTF_ID(func, bpf_lsm_socket_socketpair)
+BTF_SET_END(bpf_lsm_unlocked_sockopt_hooks)
+
+#ifdef CONFIG_CGROUP_BPF
+void bpf_lsm_find_cgroup_shim(const struct bpf_prog *prog,
+ bpf_func_t *bpf_func)
+{
+ const struct btf_param *args __maybe_unused;
+
+ if (btf_type_vlen(prog->aux->attach_func_proto) < 1 ||
+ btf_id_set_contains(&bpf_lsm_current_hooks,
+ prog->aux->attach_btf_id)) {
+ *bpf_func = __cgroup_bpf_run_lsm_current;
+ return;
+ }
+
+#ifdef CONFIG_NET
+ args = btf_params(prog->aux->attach_func_proto);
+
+ if (args[0].type == btf_sock_ids[BTF_SOCK_TYPE_SOCKET])
+ *bpf_func = __cgroup_bpf_run_lsm_socket;
+ else if (args[0].type == btf_sock_ids[BTF_SOCK_TYPE_SOCK])
+ *bpf_func = __cgroup_bpf_run_lsm_sock;
+ else
+#endif
+ *bpf_func = __cgroup_bpf_run_lsm_current;
+}
+#endif
+
int bpf_lsm_verify_prog(struct bpf_verifier_log *vlog,
const struct bpf_prog *prog)
{
@@ -158,6 +212,37 @@ bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return prog->aux->sleepable ? &bpf_ima_file_hash_proto : NULL;
case BPF_FUNC_get_attach_cookie:
return bpf_prog_has_trampoline(prog) ? &bpf_get_attach_cookie_proto : NULL;
+ case BPF_FUNC_get_local_storage:
+ return prog->expected_attach_type == BPF_LSM_CGROUP ?
+ &bpf_get_local_storage_proto : NULL;
+ case BPF_FUNC_set_retval:
+ return prog->expected_attach_type == BPF_LSM_CGROUP ?
+ &bpf_set_retval_proto : NULL;
+ case BPF_FUNC_get_retval:
+ return prog->expected_attach_type == BPF_LSM_CGROUP ?
+ &bpf_get_retval_proto : NULL;
+#ifdef CONFIG_NET
+ case BPF_FUNC_setsockopt:
+ if (prog->expected_attach_type != BPF_LSM_CGROUP)
+ return NULL;
+ if (btf_id_set_contains(&bpf_lsm_locked_sockopt_hooks,
+ prog->aux->attach_btf_id))
+ return &bpf_sk_setsockopt_proto;
+ if (btf_id_set_contains(&bpf_lsm_unlocked_sockopt_hooks,
+ prog->aux->attach_btf_id))
+ return &bpf_unlocked_sk_setsockopt_proto;
+ return NULL;
+ case BPF_FUNC_getsockopt:
+ if (prog->expected_attach_type != BPF_LSM_CGROUP)
+ return NULL;
+ if (btf_id_set_contains(&bpf_lsm_locked_sockopt_hooks,
+ prog->aux->attach_btf_id))
+ return &bpf_sk_getsockopt_proto;
+ if (btf_id_set_contains(&bpf_lsm_unlocked_sockopt_hooks,
+ prog->aux->attach_btf_id))
+ return &bpf_unlocked_sk_getsockopt_proto;
+ return NULL;
+#endif
default:
return tracing_prog_func_proto(func_id, prog);
}
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
index d9a3c9207240..84b2d9dba79a 100644
--- a/kernel/bpf/bpf_struct_ops.c
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -341,6 +341,9 @@ int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_links *tlinks,
tlinks[BPF_TRAMP_FENTRY].links[0] = link;
tlinks[BPF_TRAMP_FENTRY].nr_links = 1;
+ /* BPF_TRAMP_F_RET_FENTRY_RET is only used by bpf_struct_ops,
+ * and it must be used alone.
+ */
flags = model->ret_size > 0 ? BPF_TRAMP_F_RET_FENTRY_RET : 0;
return arch_prepare_bpf_trampoline(NULL, image, image_end,
model, flags, tlinks, NULL);
@@ -503,10 +506,9 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
goto unlock;
}
- /* Error during st_ops->reg(). It is very unlikely since
- * the above init_member() should have caught it earlier
- * before reg(). The only possibility is if there was a race
- * in registering the struct_ops (under the same name) to
+ /* Error during st_ops->reg(). Can happen if this struct_ops needs to be
+ * verified as a whole, after all init_member() calls. Can also happen if
+ * there was a race in registering the struct_ops (under the same name) to
* a sub-system through different struct_ops's maps.
*/
set_memory_nx((long)st_map->image, 1);
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index eb12d4f705cc..7e64447659f3 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -213,7 +213,7 @@ enum {
};
struct btf_kfunc_set_tab {
- struct btf_id_set *sets[BTF_KFUNC_HOOK_MAX][BTF_KFUNC_TYPE_MAX];
+ struct btf_id_set8 *sets[BTF_KFUNC_HOOK_MAX];
};
struct btf_id_dtor_kfunc_tab {
@@ -309,6 +309,7 @@ static const char * const btf_kind_str[NR_BTF_KINDS] = {
[BTF_KIND_FLOAT] = "FLOAT",
[BTF_KIND_DECL_TAG] = "DECL_TAG",
[BTF_KIND_TYPE_TAG] = "TYPE_TAG",
+ [BTF_KIND_ENUM64] = "ENUM64",
};
const char *btf_type_str(const struct btf_type *t)
@@ -666,6 +667,7 @@ static bool btf_type_has_size(const struct btf_type *t)
case BTF_KIND_ENUM:
case BTF_KIND_DATASEC:
case BTF_KIND_FLOAT:
+ case BTF_KIND_ENUM64:
return true;
}
@@ -711,6 +713,11 @@ static const struct btf_decl_tag *btf_type_decl_tag(const struct btf_type *t)
return (const struct btf_decl_tag *)(t + 1);
}
+static const struct btf_enum64 *btf_type_enum64(const struct btf_type *t)
+{
+ return (const struct btf_enum64 *)(t + 1);
+}
+
static const struct btf_kind_operations *btf_type_ops(const struct btf_type *t)
{
return kind_ops[BTF_INFO_KIND(t->info)];
@@ -1019,6 +1026,7 @@ static const char *btf_show_name(struct btf_show *show)
parens = "{";
break;
case BTF_KIND_ENUM:
+ case BTF_KIND_ENUM64:
prefix = "enum";
break;
default:
@@ -1108,7 +1116,8 @@ __printf(2, 3) static void btf_show(struct btf_show *show, const char *fmt, ...)
*/
#define btf_show_type_value(show, fmt, value) \
do { \
- if ((value) != 0 || (show->flags & BTF_SHOW_ZERO) || \
+ if ((value) != (__typeof__(value))0 || \
+ (show->flags & BTF_SHOW_ZERO) || \
show->state.depth == 0) { \
btf_show(show, "%s%s" fmt "%s%s", \
btf_show_indent(show), \
@@ -1607,7 +1616,7 @@ static void btf_free_id(struct btf *btf)
static void btf_free_kfunc_set_tab(struct btf *btf)
{
struct btf_kfunc_set_tab *tab = btf->kfunc_set_tab;
- int hook, type;
+ int hook;
if (!tab)
return;
@@ -1616,10 +1625,8 @@ static void btf_free_kfunc_set_tab(struct btf *btf)
*/
if (btf_is_module(btf))
goto free_tab;
- for (hook = 0; hook < ARRAY_SIZE(tab->sets); hook++) {
- for (type = 0; type < ARRAY_SIZE(tab->sets[0]); type++)
- kfree(tab->sets[hook][type]);
- }
+ for (hook = 0; hook < ARRAY_SIZE(tab->sets); hook++)
+ kfree(tab->sets[hook]);
free_tab:
kfree(tab);
btf->kfunc_set_tab = NULL;
@@ -1834,6 +1841,7 @@ __btf_resolve_size(const struct btf *btf, const struct btf_type *type,
case BTF_KIND_UNION:
case BTF_KIND_ENUM:
case BTF_KIND_FLOAT:
+ case BTF_KIND_ENUM64:
size = type->size;
goto resolved;
@@ -3670,6 +3678,7 @@ static s32 btf_enum_check_meta(struct btf_verifier_env *env,
{
const struct btf_enum *enums = btf_type_enum(t);
struct btf *btf = env->btf;
+ const char *fmt_str;
u16 i, nr_enums;
u32 meta_needed;
@@ -3683,11 +3692,6 @@ static s32 btf_enum_check_meta(struct btf_verifier_env *env,
return -EINVAL;
}
- if (btf_type_kflag(t)) {
- btf_verifier_log_type(env, t, "Invalid btf_info kind_flag");
- return -EINVAL;
- }
-
if (t->size > 8 || !is_power_of_2(t->size)) {
btf_verifier_log_type(env, t, "Unexpected size");
return -EINVAL;
@@ -3718,7 +3722,8 @@ static s32 btf_enum_check_meta(struct btf_verifier_env *env,
if (env->log.level == BPF_LOG_KERNEL)
continue;
- btf_verifier_log(env, "\t%s val=%d\n",
+ fmt_str = btf_type_kflag(t) ? "\t%s val=%d\n" : "\t%s val=%u\n";
+ btf_verifier_log(env, fmt_str,
__btf_name_by_offset(btf, enums[i].name_off),
enums[i].val);
}
@@ -3759,7 +3764,10 @@ static void btf_enum_show(const struct btf *btf, const struct btf_type *t,
return;
}
- btf_show_type_value(show, "%d", v);
+ if (btf_type_kflag(t))
+ btf_show_type_value(show, "%d", v);
+ else
+ btf_show_type_value(show, "%u", v);
btf_show_end_type(show);
}
@@ -3772,6 +3780,109 @@ static struct btf_kind_operations enum_ops = {
.show = btf_enum_show,
};
+static s32 btf_enum64_check_meta(struct btf_verifier_env *env,
+ const struct btf_type *t,
+ u32 meta_left)
+{
+ const struct btf_enum64 *enums = btf_type_enum64(t);
+ struct btf *btf = env->btf;
+ const char *fmt_str;
+ u16 i, nr_enums;
+ u32 meta_needed;
+
+ nr_enums = btf_type_vlen(t);
+ meta_needed = nr_enums * sizeof(*enums);
+
+ if (meta_left < meta_needed) {
+ btf_verifier_log_basic(env, t,
+ "meta_left:%u meta_needed:%u",
+ meta_left, meta_needed);
+ return -EINVAL;
+ }
+
+ if (t->size > 8 || !is_power_of_2(t->size)) {
+ btf_verifier_log_type(env, t, "Unexpected size");
+ return -EINVAL;
+ }
+
+ /* enum type either no name or a valid one */
+ if (t->name_off &&
+ !btf_name_valid_identifier(env->btf, t->name_off)) {
+ btf_verifier_log_type(env, t, "Invalid name");
+ return -EINVAL;
+ }
+
+ btf_verifier_log_type(env, t, NULL);
+
+ for (i = 0; i < nr_enums; i++) {
+ if (!btf_name_offset_valid(btf, enums[i].name_off)) {
+ btf_verifier_log(env, "\tInvalid name_offset:%u",
+ enums[i].name_off);
+ return -EINVAL;
+ }
+
+ /* enum member must have a valid name */
+ if (!enums[i].name_off ||
+ !btf_name_valid_identifier(btf, enums[i].name_off)) {
+ btf_verifier_log_type(env, t, "Invalid name");
+ return -EINVAL;
+ }
+
+ if (env->log.level == BPF_LOG_KERNEL)
+ continue;
+
+ fmt_str = btf_type_kflag(t) ? "\t%s val=%lld\n" : "\t%s val=%llu\n";
+ btf_verifier_log(env, fmt_str,
+ __btf_name_by_offset(btf, enums[i].name_off),
+ btf_enum64_value(enums + i));
+ }
+
+ return meta_needed;
+}
+
+static void btf_enum64_show(const struct btf *btf, const struct btf_type *t,
+ u32 type_id, void *data, u8 bits_offset,
+ struct btf_show *show)
+{
+ const struct btf_enum64 *enums = btf_type_enum64(t);
+ u32 i, nr_enums = btf_type_vlen(t);
+ void *safe_data;
+ s64 v;
+
+ safe_data = btf_show_start_type(show, t, type_id, data);
+ if (!safe_data)
+ return;
+
+ v = *(u64 *)safe_data;
+
+ for (i = 0; i < nr_enums; i++) {
+ if (v != btf_enum64_value(enums + i))
+ continue;
+
+ btf_show_type_value(show, "%s",
+ __btf_name_by_offset(btf,
+ enums[i].name_off));
+
+ btf_show_end_type(show);
+ return;
+ }
+
+ if (btf_type_kflag(t))
+ btf_show_type_value(show, "%lld", v);
+ else
+ btf_show_type_value(show, "%llu", v);
+ btf_show_end_type(show);
+}
+
+static struct btf_kind_operations enum64_ops = {
+ .check_meta = btf_enum64_check_meta,
+ .resolve = btf_df_resolve,
+ .check_member = btf_enum_check_member,
+ .check_kflag_member = btf_enum_check_kflag_member,
+ .log_details = btf_enum_log,
+ .show = btf_enum64_show,
+};
+
static s32 btf_func_proto_check_meta(struct btf_verifier_env *env,
const struct btf_type *t,
u32 meta_left)
@@ -4438,6 +4549,7 @@ static const struct btf_kind_operations * const kind_ops[NR_BTF_KINDS] = {
[BTF_KIND_FLOAT] = &float_ops,
[BTF_KIND_DECL_TAG] = &decl_tag_ops,
[BTF_KIND_TYPE_TAG] = &modifier_ops,
+ [BTF_KIND_ENUM64] = &enum64_ops,
};
static s32 btf_check_meta(struct btf_verifier_env *env,
@@ -5255,6 +5367,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
if (arg == nr_args) {
switch (prog->expected_attach_type) {
+ case BPF_LSM_CGROUP:
case BPF_LSM_MAC:
case BPF_TRACE_FEXIT:
/* When LSM programs are attached to void LSM hooks
@@ -5304,7 +5417,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
/* skip modifiers */
while (btf_type_is_modifier(t))
t = btf_type_by_id(btf, t->type);
- if (btf_type_is_small_int(t) || btf_type_is_enum(t))
+ if (btf_type_is_small_int(t) || btf_is_any_enum(t))
/* accessing a scalar */
return true;
if (!btf_type_is_ptr(t)) {
@@ -5768,7 +5881,7 @@ static int __get_type_size(struct btf *btf, u32 btf_id,
if (btf_type_is_ptr(t))
/* kernel size of pointer. Not BPF's size of pointer*/
return sizeof(void *);
- if (btf_type_is_int(t) || btf_type_is_enum(t))
+ if (btf_type_is_int(t) || btf_is_any_enum(t))
return t->size;
*bad_type = t;
return -EINVAL;
@@ -5916,7 +6029,7 @@ static int btf_check_func_type_match(struct bpf_verifier_log *log,
* to context only. And only global functions can be replaced.
* Hence type check only those types.
*/
- if (btf_type_is_int(t1) || btf_type_is_enum(t1))
+ if (btf_type_is_int(t1) || btf_is_any_enum(t1))
continue;
if (!btf_type_is_ptr(t1)) {
bpf_log(log,
@@ -6057,13 +6170,14 @@ static bool is_kfunc_arg_mem_size(const struct btf *btf,
static int btf_check_func_arg_match(struct bpf_verifier_env *env,
const struct btf *btf, u32 func_id,
struct bpf_reg_state *regs,
- bool ptr_to_mem_ok)
+ bool ptr_to_mem_ok,
+ u32 kfunc_flags)
{
enum bpf_prog_type prog_type = resolve_prog_type(env->prog);
+ bool rel = false, kptr_get = false, trusted_arg = false;
struct bpf_verifier_log *log = &env->log;
u32 i, nargs, ref_id, ref_obj_id = 0;
bool is_kfunc = btf_is_kernel(btf);
- bool rel = false, kptr_get = false;
const char *func_name, *ref_tname;
const struct btf_type *t, *ref_t;
const struct btf_param *args;
@@ -6095,10 +6209,9 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
if (is_kfunc) {
/* Only kfunc can be release func */
- rel = btf_kfunc_id_set_contains(btf, resolve_prog_type(env->prog),
- BTF_KFUNC_TYPE_RELEASE, func_id);
- kptr_get = btf_kfunc_id_set_contains(btf, resolve_prog_type(env->prog),
- BTF_KFUNC_TYPE_KPTR_ACQUIRE, func_id);
+ rel = kfunc_flags & KF_RELEASE;
+ kptr_get = kfunc_flags & KF_KPTR_GET;
+ trusted_arg = kfunc_flags & KF_TRUSTED_ARGS;
}
/* check that BTF function arguments match actual types that the
@@ -6123,10 +6236,19 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
return -EINVAL;
}
+ /* Check if argument must be a referenced pointer, args + i has
+ * been verified to be a pointer (after skipping modifiers).
+ */
+ if (is_kfunc && trusted_arg && !reg->ref_obj_id) {
+ bpf_log(log, "R%d must be referenced\n", regno);
+ return -EINVAL;
+ }
+
ref_t = btf_type_skip_modifiers(btf, t->type, &ref_id);
ref_tname = btf_name_by_offset(btf, ref_t->name_off);
- if (rel && reg->ref_obj_id)
+ /* Trusted args have the same offset checks as release arguments */
+ if (trusted_arg || (rel && reg->ref_obj_id))
arg_type |= OBJ_RELEASE;
ret = check_func_arg_reg_off(env, reg, regno, arg_type);
if (ret < 0)
@@ -6224,7 +6346,8 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
reg_ref_tname = btf_name_by_offset(reg_btf,
reg_ref_t->name_off);
if (!btf_struct_ids_match(log, reg_btf, reg_ref_id,
- reg->off, btf, ref_id, rel && reg->ref_obj_id)) {
+ reg->off, btf, ref_id,
+ trusted_arg || (rel && reg->ref_obj_id))) {
bpf_log(log, "kernel function %s args#%d expected pointer to %s %s but R%d has a pointer to %s %s\n",
func_name, i,
btf_type_str(ref_t), ref_tname,
@@ -6327,7 +6450,7 @@ int btf_check_subprog_arg_match(struct bpf_verifier_env *env, int subprog,
return -EINVAL;
is_global = prog->aux->func_info_aux[subprog].linkage == BTF_FUNC_GLOBAL;
- err = btf_check_func_arg_match(env, btf, btf_id, regs, is_global);
+ err = btf_check_func_arg_match(env, btf, btf_id, regs, is_global, 0);
/* Compiler optimizations can remove arguments from static functions
* or mismatched type can be passed into a global function.
@@ -6340,9 +6463,10 @@ int btf_check_subprog_arg_match(struct bpf_verifier_env *env, int subprog,
int btf_check_kfunc_arg_match(struct bpf_verifier_env *env,
const struct btf *btf, u32 func_id,
- struct bpf_reg_state *regs)
+ struct bpf_reg_state *regs,
+ u32 kfunc_flags)
{
- return btf_check_func_arg_match(env, btf, func_id, regs, true);
+ return btf_check_func_arg_match(env, btf, func_id, regs, true, kfunc_flags);
}
/* Convert BTF of a function into bpf_reg_state if possible
@@ -6414,7 +6538,7 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog,
t = btf_type_by_id(btf, t->type);
while (btf_type_is_modifier(t))
t = btf_type_by_id(btf, t->type);
- if (!btf_type_is_int(t) && !btf_type_is_enum(t)) {
+ if (!btf_type_is_int(t) && !btf_is_any_enum(t)) {
bpf_log(log,
"Global function %s() doesn't return scalar. Only those are supported.\n",
tname);
@@ -6429,7 +6553,7 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog,
t = btf_type_by_id(btf, args[i].type);
while (btf_type_is_modifier(t))
t = btf_type_by_id(btf, t->type);
- if (btf_type_is_int(t) || btf_type_is_enum(t)) {
+ if (btf_type_is_int(t) || btf_is_any_enum(t)) {
reg->type = SCALAR_VALUE;
continue;
}
@@ -6519,7 +6643,7 @@ static void btf_snprintf_show(struct btf_show *show, const char *fmt,
if (len < 0) {
ssnprintf->len_left = 0;
ssnprintf->len = len;
- } else if (len > ssnprintf->len_left) {
+ } else if (len >= ssnprintf->len_left) {
/* no space, drive on to get length we would have written */
ssnprintf->len_left = 0;
ssnprintf->len += len;
@@ -6739,6 +6863,11 @@ bool btf_id_set_contains(const struct btf_id_set *set, u32 id)
return bsearch(&id, set->ids, set->cnt, sizeof(u32), btf_id_cmp_func) != NULL;
}
+static void *btf_id_set8_contains(const struct btf_id_set8 *set, u32 id)
+{
+ return bsearch(&id, set->pairs, set->cnt, sizeof(set->pairs[0]), btf_id_cmp_func);
+}
+
enum {
BTF_MODULE_F_LIVE = (1 << 0),
};
@@ -6987,16 +7116,16 @@ BTF_TRACING_TYPE_xxx
/* Kernel Function (kfunc) BTF ID set registration API */
-static int __btf_populate_kfunc_set(struct btf *btf, enum btf_kfunc_hook hook,
- enum btf_kfunc_type type,
- struct btf_id_set *add_set, bool vmlinux_set)
+static int btf_populate_kfunc_set(struct btf *btf, enum btf_kfunc_hook hook,
+ struct btf_id_set8 *add_set)
{
+ bool vmlinux_set = !btf_is_module(btf);
struct btf_kfunc_set_tab *tab;
- struct btf_id_set *set;
+ struct btf_id_set8 *set;
u32 set_cnt;
int ret;
- if (hook >= BTF_KFUNC_HOOK_MAX || type >= BTF_KFUNC_TYPE_MAX) {
+ if (hook >= BTF_KFUNC_HOOK_MAX) {
ret = -EINVAL;
goto end;
}
@@ -7012,7 +7141,7 @@ static int __btf_populate_kfunc_set(struct btf *btf, enum btf_kfunc_hook hook,
btf->kfunc_set_tab = tab;
}
- set = tab->sets[hook][type];
+ set = tab->sets[hook];
/* Warn when register_btf_kfunc_id_set is called twice for the same hook
* for module sets.
*/
@@ -7026,7 +7155,7 @@ static int __btf_populate_kfunc_set(struct btf *btf, enum btf_kfunc_hook hook,
* pointer and return.
*/
if (!vmlinux_set) {
- tab->sets[hook][type] = add_set;
+ tab->sets[hook] = add_set;
return 0;
}
@@ -7035,7 +7164,7 @@ static int __btf_populate_kfunc_set(struct btf *btf, enum btf_kfunc_hook hook,
* and concatenate all individual sets being registered. While each set
* is individually sorted, they may become unsorted when concatenated,
* hence re-sorting the final set again is required to make binary
- * searching the set using btf_id_set_contains function work.
+ * searching the set using btf_id_set8_contains function work.
*/
set_cnt = set ? set->cnt : 0;
@@ -7050,8 +7179,8 @@ static int __btf_populate_kfunc_set(struct btf *btf, enum btf_kfunc_hook hook,
}
/* Grow set */
- set = krealloc(tab->sets[hook][type],
- offsetof(struct btf_id_set, ids[set_cnt + add_set->cnt]),
+ set = krealloc(tab->sets[hook],
+ offsetof(struct btf_id_set8, pairs[set_cnt + add_set->cnt]),
GFP_KERNEL | __GFP_NOWARN);
if (!set) {
ret = -ENOMEM;
@@ -7059,15 +7188,15 @@ static int __btf_populate_kfunc_set(struct btf *btf, enum btf_kfunc_hook hook,
}
/* For newly allocated set, initialize set->cnt to 0 */
- if (!tab->sets[hook][type])
+ if (!tab->sets[hook])
set->cnt = 0;
- tab->sets[hook][type] = set;
+ tab->sets[hook] = set;
/* Concatenate the two sets */
- memcpy(set->ids + set->cnt, add_set->ids, add_set->cnt * sizeof(set->ids[0]));
+ memcpy(set->pairs + set->cnt, add_set->pairs, add_set->cnt * sizeof(set->pairs[0]));
set->cnt += add_set->cnt;
- sort(set->ids, set->cnt, sizeof(set->ids[0]), btf_id_cmp_func, NULL);
+ sort(set->pairs, set->cnt, sizeof(set->pairs[0]), btf_id_cmp_func, NULL);
return 0;
end:
@@ -7075,38 +7204,25 @@ end:
return ret;
}
-static int btf_populate_kfunc_set(struct btf *btf, enum btf_kfunc_hook hook,
- const struct btf_kfunc_id_set *kset)
-{
- bool vmlinux_set = !btf_is_module(btf);
- int type, ret = 0;
-
- for (type = 0; type < ARRAY_SIZE(kset->sets); type++) {
- if (!kset->sets[type])
- continue;
-
- ret = __btf_populate_kfunc_set(btf, hook, type, kset->sets[type], vmlinux_set);
- if (ret)
- break;
- }
- return ret;
-}
-
-static bool __btf_kfunc_id_set_contains(const struct btf *btf,
+static u32 *__btf_kfunc_id_set_contains(const struct btf *btf,
enum btf_kfunc_hook hook,
- enum btf_kfunc_type type,
u32 kfunc_btf_id)
{
- struct btf_id_set *set;
+ struct btf_id_set8 *set;
+ u32 *id;
- if (hook >= BTF_KFUNC_HOOK_MAX || type >= BTF_KFUNC_TYPE_MAX)
- return false;
+ if (hook >= BTF_KFUNC_HOOK_MAX)
+ return NULL;
if (!btf->kfunc_set_tab)
- return false;
- set = btf->kfunc_set_tab->sets[hook][type];
+ return NULL;
+ set = btf->kfunc_set_tab->sets[hook];
if (!set)
- return false;
- return btf_id_set_contains(set, kfunc_btf_id);
+ return NULL;
+ id = btf_id_set8_contains(set, kfunc_btf_id);
+ if (!id)
+ return NULL;
+ /* The flags for BTF ID are located next to it */
+ return id + 1;
}
static int bpf_prog_type_to_kfunc_hook(enum bpf_prog_type prog_type)
@@ -7134,14 +7250,14 @@ static int bpf_prog_type_to_kfunc_hook(enum bpf_prog_type prog_type)
* keeping the reference for the duration of the call provides the necessary
* protection for looking up a well-formed btf->kfunc_set_tab.
*/
-bool btf_kfunc_id_set_contains(const struct btf *btf,
+u32 *btf_kfunc_id_set_contains(const struct btf *btf,
enum bpf_prog_type prog_type,
- enum btf_kfunc_type type, u32 kfunc_btf_id)
+ u32 kfunc_btf_id)
{
enum btf_kfunc_hook hook;
hook = bpf_prog_type_to_kfunc_hook(prog_type);
- return __btf_kfunc_id_set_contains(btf, hook, type, kfunc_btf_id);
+ return __btf_kfunc_id_set_contains(btf, hook, kfunc_btf_id);
}
/* This function must be invoked only from initcalls/module init functions */
@@ -7168,7 +7284,7 @@ int register_btf_kfunc_id_set(enum bpf_prog_type prog_type,
return PTR_ERR(btf);
hook = bpf_prog_type_to_kfunc_hook(prog_type);
- ret = btf_populate_kfunc_set(btf, hook, kset);
+ ret = btf_populate_kfunc_set(btf, hook, kset->set);
btf_put(btf);
return ret;
}
@@ -7308,95 +7424,15 @@ EXPORT_SYMBOL_GPL(register_btf_id_dtor_kfuncs);
#define MAX_TYPES_ARE_COMPAT_DEPTH 2
-static
-int __bpf_core_types_are_compat(const struct btf *local_btf, __u32 local_id,
- const struct btf *targ_btf, __u32 targ_id,
- int level)
-{
- const struct btf_type *local_type, *targ_type;
- int depth = 32; /* max recursion depth */
-
- /* caller made sure that names match (ignoring flavor suffix) */
- local_type = btf_type_by_id(local_btf, local_id);
- targ_type = btf_type_by_id(targ_btf, targ_id);
- if (btf_kind(local_type) != btf_kind(targ_type))
- return 0;
-
-recur:
- depth--;
- if (depth < 0)
- return -EINVAL;
-
- local_type = btf_type_skip_modifiers(local_btf, local_id, &local_id);
- targ_type = btf_type_skip_modifiers(targ_btf, targ_id, &targ_id);
- if (!local_type || !targ_type)
- return -EINVAL;
-
- if (btf_kind(local_type) != btf_kind(targ_type))
- return 0;
-
- switch (btf_kind(local_type)) {
- case BTF_KIND_UNKN:
- case BTF_KIND_STRUCT:
- case BTF_KIND_UNION:
- case BTF_KIND_ENUM:
- case BTF_KIND_FWD:
- return 1;
- case BTF_KIND_INT:
- /* just reject deprecated bitfield-like integers; all other
- * integers are by default compatible between each other
- */
- return btf_int_offset(local_type) == 0 && btf_int_offset(targ_type) == 0;
- case BTF_KIND_PTR:
- local_id = local_type->type;
- targ_id = targ_type->type;
- goto recur;
- case BTF_KIND_ARRAY:
- local_id = btf_array(local_type)->type;
- targ_id = btf_array(targ_type)->type;
- goto recur;
- case BTF_KIND_FUNC_PROTO: {
- struct btf_param *local_p = btf_params(local_type);
- struct btf_param *targ_p = btf_params(targ_type);
- __u16 local_vlen = btf_vlen(local_type);
- __u16 targ_vlen = btf_vlen(targ_type);
- int i, err;
-
- if (local_vlen != targ_vlen)
- return 0;
-
- for (i = 0; i < local_vlen; i++, local_p++, targ_p++) {
- if (level <= 0)
- return -EINVAL;
-
- btf_type_skip_modifiers(local_btf, local_p->type, &local_id);
- btf_type_skip_modifiers(targ_btf, targ_p->type, &targ_id);
- err = __bpf_core_types_are_compat(local_btf, local_id,
- targ_btf, targ_id,
- level - 1);
- if (err <= 0)
- return err;
- }
-
- /* tail recurse for return type check */
- btf_type_skip_modifiers(local_btf, local_type->type, &local_id);
- btf_type_skip_modifiers(targ_btf, targ_type->type, &targ_id);
- goto recur;
- }
- default:
- return 0;
- }
-}
-
/* Check local and target types for compatibility. This check is used for
* type-based CO-RE relocations and follow slightly different rules than
* field-based relocations. This function assumes that root types were already
* checked for name match. Beyond that initial root-level name check, names
* are completely ignored. Compatibility rules are as follows:
- * - any two STRUCTs/UNIONs/FWDs/ENUMs/INTs are considered compatible, but
+ * - any two STRUCTs/UNIONs/FWDs/ENUMs/INTs/ENUM64s are considered compatible, but
* kind should match for local and target types (i.e., STRUCT is not
* compatible with UNION);
- * - for ENUMs, the size is ignored;
+ * - for ENUMs/ENUM64s, the size is ignored;
* - for INT, size and signedness are ignored;
* - for ARRAY, dimensionality is ignored, element types are checked for
* compatibility recursively;
@@ -7410,11 +7446,19 @@ recur:
int bpf_core_types_are_compat(const struct btf *local_btf, __u32 local_id,
const struct btf *targ_btf, __u32 targ_id)
{
- return __bpf_core_types_are_compat(local_btf, local_id,
- targ_btf, targ_id,
+ return __bpf_core_types_are_compat(local_btf, local_id, targ_btf, targ_id,
MAX_TYPES_ARE_COMPAT_DEPTH);
}
+#define MAX_TYPES_MATCH_DEPTH 2
+
+int bpf_core_types_match(const struct btf *local_btf, u32 local_id,
+ const struct btf *targ_btf, u32 targ_id)
+{
+ return __bpf_core_types_match(local_btf, local_id, targ_btf, targ_id, false,
+ MAX_TYPES_MATCH_DEPTH);
+}
+
static bool bpf_core_is_flavor_sep(const char *s)
{
/* check X___Y name pattern, where X and Y are not underscores */
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index afb414b26d01..59b7eb60d5b4 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -14,6 +14,8 @@
#include <linux/string.h>
#include <linux/bpf.h>
#include <linux/bpf-cgroup.h>
+#include <linux/bpf_lsm.h>
+#include <linux/bpf_verifier.h>
#include <net/sock.h>
#include <net/bpf_sk_storage.h>
@@ -61,6 +63,132 @@ bpf_prog_run_array_cg(const struct cgroup_bpf *cgrp,
return run_ctx.retval;
}
+unsigned int __cgroup_bpf_run_lsm_sock(const void *ctx,
+ const struct bpf_insn *insn)
+{
+ const struct bpf_prog *shim_prog;
+ struct sock *sk;
+ struct cgroup *cgrp;
+ int ret = 0;
+ u64 *args;
+
+ args = (u64 *)ctx;
+ sk = (void *)(unsigned long)args[0];
+ /*shim_prog = container_of(insn, struct bpf_prog, insnsi);*/
+ shim_prog = (const struct bpf_prog *)((void *)insn - offsetof(struct bpf_prog, insnsi));
+
+ cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
+ if (likely(cgrp))
+ ret = bpf_prog_run_array_cg(&cgrp->bpf,
+ shim_prog->aux->cgroup_atype,
+ ctx, bpf_prog_run, 0, NULL);
+ return ret;
+}
+
+unsigned int __cgroup_bpf_run_lsm_socket(const void *ctx,
+ const struct bpf_insn *insn)
+{
+ const struct bpf_prog *shim_prog;
+ struct socket *sock;
+ struct cgroup *cgrp;
+ int ret = 0;
+ u64 *args;
+
+ args = (u64 *)ctx;
+ sock = (void *)(unsigned long)args[0];
+ /*shim_prog = container_of(insn, struct bpf_prog, insnsi);*/
+ shim_prog = (const struct bpf_prog *)((void *)insn - offsetof(struct bpf_prog, insnsi));
+
+ cgrp = sock_cgroup_ptr(&sock->sk->sk_cgrp_data);
+ if (likely(cgrp))
+ ret = bpf_prog_run_array_cg(&cgrp->bpf,
+ shim_prog->aux->cgroup_atype,
+ ctx, bpf_prog_run, 0, NULL);
+ return ret;
+}
+
+unsigned int __cgroup_bpf_run_lsm_current(const void *ctx,
+ const struct bpf_insn *insn)
+{
+ const struct bpf_prog *shim_prog;
+ struct cgroup *cgrp;
+ int ret = 0;
+
+ /*shim_prog = container_of(insn, struct bpf_prog, insnsi);*/
+ shim_prog = (const struct bpf_prog *)((void *)insn - offsetof(struct bpf_prog, insnsi));
+
+ /* We rely on trampoline's __bpf_prog_enter_lsm_cgroup to grab RCU read lock. */
+ cgrp = task_dfl_cgroup(current);
+ if (likely(cgrp))
+ ret = bpf_prog_run_array_cg(&cgrp->bpf,
+ shim_prog->aux->cgroup_atype,
+ ctx, bpf_prog_run, 0, NULL);
+ return ret;
+}
+
+#ifdef CONFIG_BPF_LSM
+struct cgroup_lsm_atype {
+ u32 attach_btf_id;
+ int refcnt;
+};
+
+static struct cgroup_lsm_atype cgroup_lsm_atype[CGROUP_LSM_NUM];
+
+static enum cgroup_bpf_attach_type
+bpf_cgroup_atype_find(enum bpf_attach_type attach_type, u32 attach_btf_id)
+{
+ int i;
+
+ lockdep_assert_held(&cgroup_mutex);
+
+ if (attach_type != BPF_LSM_CGROUP)
+ return to_cgroup_bpf_attach_type(attach_type);
+
+ for (i = 0; i < ARRAY_SIZE(cgroup_lsm_atype); i++)
+ if (cgroup_lsm_atype[i].attach_btf_id == attach_btf_id)
+ return CGROUP_LSM_START + i;
+
+ for (i = 0; i < ARRAY_SIZE(cgroup_lsm_atype); i++)
+ if (cgroup_lsm_atype[i].attach_btf_id == 0)
+ return CGROUP_LSM_START + i;
+
+ return -E2BIG;
+
+}
+
+void bpf_cgroup_atype_get(u32 attach_btf_id, int cgroup_atype)
+{
+ int i = cgroup_atype - CGROUP_LSM_START;
+
+ lockdep_assert_held(&cgroup_mutex);
+
+ WARN_ON_ONCE(cgroup_lsm_atype[i].attach_btf_id &&
+ cgroup_lsm_atype[i].attach_btf_id != attach_btf_id);
+
+ cgroup_lsm_atype[i].attach_btf_id = attach_btf_id;
+ cgroup_lsm_atype[i].refcnt++;
+}
+
+void bpf_cgroup_atype_put(int cgroup_atype)
+{
+ int i = cgroup_atype - CGROUP_LSM_START;
+
+ mutex_lock(&cgroup_mutex);
+ if (--cgroup_lsm_atype[i].refcnt <= 0)
+ cgroup_lsm_atype[i].attach_btf_id = 0;
+ WARN_ON_ONCE(cgroup_lsm_atype[i].refcnt < 0);
+ mutex_unlock(&cgroup_mutex);
+}
+#else
+static enum cgroup_bpf_attach_type
+bpf_cgroup_atype_find(enum bpf_attach_type attach_type, u32 attach_btf_id)
+{
+ if (attach_type != BPF_LSM_CGROUP)
+ return to_cgroup_bpf_attach_type(attach_type);
+ return -EOPNOTSUPP;
+}
+#endif /* CONFIG_BPF_LSM */
+
void cgroup_bpf_offline(struct cgroup *cgrp)
{
cgroup_get(cgrp);
@@ -157,15 +285,22 @@ static void cgroup_bpf_release(struct work_struct *work)
mutex_lock(&cgroup_mutex);
for (atype = 0; atype < ARRAY_SIZE(cgrp->bpf.progs); atype++) {
- struct list_head *progs = &cgrp->bpf.progs[atype];
- struct bpf_prog_list *pl, *pltmp;
+ struct hlist_head *progs = &cgrp->bpf.progs[atype];
+ struct bpf_prog_list *pl;
+ struct hlist_node *pltmp;
- list_for_each_entry_safe(pl, pltmp, progs, node) {
- list_del(&pl->node);
- if (pl->prog)
+ hlist_for_each_entry_safe(pl, pltmp, progs, node) {
+ hlist_del(&pl->node);
+ if (pl->prog) {
+ if (pl->prog->expected_attach_type == BPF_LSM_CGROUP)
+ bpf_trampoline_unlink_cgroup_shim(pl->prog);
bpf_prog_put(pl->prog);
- if (pl->link)
+ }
+ if (pl->link) {
+ if (pl->link->link.prog->expected_attach_type == BPF_LSM_CGROUP)
+ bpf_trampoline_unlink_cgroup_shim(pl->link->link.prog);
bpf_cgroup_link_auto_detach(pl->link);
+ }
kfree(pl);
static_branch_dec(&cgroup_bpf_enabled_key[atype]);
}
@@ -217,12 +352,12 @@ static struct bpf_prog *prog_list_prog(struct bpf_prog_list *pl)
/* count number of elements in the list.
* it's slow but the list cannot be long
*/
-static u32 prog_list_length(struct list_head *head)
+static u32 prog_list_length(struct hlist_head *head)
{
struct bpf_prog_list *pl;
u32 cnt = 0;
- list_for_each_entry(pl, head, node) {
+ hlist_for_each_entry(pl, head, node) {
if (!prog_list_prog(pl))
continue;
cnt++;
@@ -291,7 +426,7 @@ static int compute_effective_progs(struct cgroup *cgrp,
if (cnt > 0 && !(p->bpf.flags[atype] & BPF_F_ALLOW_MULTI))
continue;
- list_for_each_entry(pl, &p->bpf.progs[atype], node) {
+ hlist_for_each_entry(pl, &p->bpf.progs[atype], node) {
if (!prog_list_prog(pl))
continue;
@@ -342,7 +477,7 @@ int cgroup_bpf_inherit(struct cgroup *cgrp)
cgroup_bpf_get(p);
for (i = 0; i < NR; i++)
- INIT_LIST_HEAD(&cgrp->bpf.progs[i]);
+ INIT_HLIST_HEAD(&cgrp->bpf.progs[i]);
INIT_LIST_HEAD(&cgrp->bpf.storages);
@@ -418,7 +553,7 @@ cleanup:
#define BPF_CGROUP_MAX_PROGS 64
-static struct bpf_prog_list *find_attach_entry(struct list_head *progs,
+static struct bpf_prog_list *find_attach_entry(struct hlist_head *progs,
struct bpf_prog *prog,
struct bpf_cgroup_link *link,
struct bpf_prog *replace_prog,
@@ -428,12 +563,12 @@ static struct bpf_prog_list *find_attach_entry(struct list_head *progs,
/* single-attach case */
if (!allow_multi) {
- if (list_empty(progs))
+ if (hlist_empty(progs))
return NULL;
- return list_first_entry(progs, typeof(*pl), node);
+ return hlist_entry(progs->first, typeof(*pl), node);
}
- list_for_each_entry(pl, progs, node) {
+ hlist_for_each_entry(pl, progs, node) {
if (prog && pl->prog == prog && prog != replace_prog)
/* disallow attaching the same prog twice */
return ERR_PTR(-EINVAL);
@@ -444,7 +579,7 @@ static struct bpf_prog_list *find_attach_entry(struct list_head *progs,
/* direct prog multi-attach w/ replacement case */
if (replace_prog) {
- list_for_each_entry(pl, progs, node) {
+ hlist_for_each_entry(pl, progs, node) {
if (pl->prog == replace_prog)
/* a match found */
return pl;
@@ -478,9 +613,10 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp,
struct bpf_prog *old_prog = NULL;
struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {};
struct bpf_cgroup_storage *new_storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {};
+ struct bpf_prog *new_prog = prog ? : link->link.prog;
enum cgroup_bpf_attach_type atype;
struct bpf_prog_list *pl;
- struct list_head *progs;
+ struct hlist_head *progs;
int err;
if (((flags & BPF_F_ALLOW_OVERRIDE) && (flags & BPF_F_ALLOW_MULTI)) ||
@@ -494,7 +630,7 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp,
/* replace_prog implies BPF_F_REPLACE, and vice versa */
return -EINVAL;
- atype = to_cgroup_bpf_attach_type(type);
+ atype = bpf_cgroup_atype_find(type, new_prog->aux->attach_btf_id);
if (atype < 0)
return -EINVAL;
@@ -503,7 +639,7 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp,
if (!hierarchy_allows_attach(cgrp, atype))
return -EPERM;
- if (!list_empty(progs) && cgrp->bpf.flags[atype] != saved_flags)
+ if (!hlist_empty(progs) && cgrp->bpf.flags[atype] != saved_flags)
/* Disallow attaching non-overridable on top
* of existing overridable in this cgroup.
* Disallow attaching multi-prog if overridable or none
@@ -525,12 +661,22 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp,
if (pl) {
old_prog = pl->prog;
} else {
+ struct hlist_node *last = NULL;
+
pl = kmalloc(sizeof(*pl), GFP_KERNEL);
if (!pl) {
bpf_cgroup_storages_free(new_storage);
return -ENOMEM;
}
- list_add_tail(&pl->node, progs);
+ if (hlist_empty(progs))
+ hlist_add_head(&pl->node, progs);
+ else
+ hlist_for_each(last, progs) {
+ if (last->next)
+ continue;
+ hlist_add_behind(&pl->node, last);
+ break;
+ }
}
pl->prog = prog;
@@ -538,17 +684,30 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp,
bpf_cgroup_storages_assign(pl->storage, storage);
cgrp->bpf.flags[atype] = saved_flags;
+ if (type == BPF_LSM_CGROUP) {
+ err = bpf_trampoline_link_cgroup_shim(new_prog, atype);
+ if (err)
+ goto cleanup;
+ }
+
err = update_effective_progs(cgrp, atype);
if (err)
- goto cleanup;
+ goto cleanup_trampoline;
- if (old_prog)
+ if (old_prog) {
+ if (type == BPF_LSM_CGROUP)
+ bpf_trampoline_unlink_cgroup_shim(old_prog);
bpf_prog_put(old_prog);
- else
+ } else {
static_branch_inc(&cgroup_bpf_enabled_key[atype]);
+ }
bpf_cgroup_storages_link(new_storage, cgrp, type);
return 0;
+cleanup_trampoline:
+ if (type == BPF_LSM_CGROUP)
+ bpf_trampoline_unlink_cgroup_shim(new_prog);
+
cleanup:
if (old_prog) {
pl->prog = old_prog;
@@ -556,7 +715,7 @@ cleanup:
}
bpf_cgroup_storages_free(new_storage);
if (!old_prog) {
- list_del(&pl->node);
+ hlist_del(&pl->node);
kfree(pl);
}
return err;
@@ -587,7 +746,7 @@ static void replace_effective_prog(struct cgroup *cgrp,
struct cgroup_subsys_state *css;
struct bpf_prog_array *progs;
struct bpf_prog_list *pl;
- struct list_head *head;
+ struct hlist_head *head;
struct cgroup *cg;
int pos;
@@ -603,7 +762,7 @@ static void replace_effective_prog(struct cgroup *cgrp,
continue;
head = &cg->bpf.progs[atype];
- list_for_each_entry(pl, head, node) {
+ hlist_for_each_entry(pl, head, node) {
if (!prog_list_prog(pl))
continue;
if (pl->link == link)
@@ -637,10 +796,10 @@ static int __cgroup_bpf_replace(struct cgroup *cgrp,
enum cgroup_bpf_attach_type atype;
struct bpf_prog *old_prog;
struct bpf_prog_list *pl;
- struct list_head *progs;
+ struct hlist_head *progs;
bool found = false;
- atype = to_cgroup_bpf_attach_type(link->type);
+ atype = bpf_cgroup_atype_find(link->type, new_prog->aux->attach_btf_id);
if (atype < 0)
return -EINVAL;
@@ -649,7 +808,7 @@ static int __cgroup_bpf_replace(struct cgroup *cgrp,
if (link->link.prog->type != new_prog->type)
return -EINVAL;
- list_for_each_entry(pl, progs, node) {
+ hlist_for_each_entry(pl, progs, node) {
if (pl->link == link) {
found = true;
break;
@@ -688,7 +847,7 @@ out_unlock:
return ret;
}
-static struct bpf_prog_list *find_detach_entry(struct list_head *progs,
+static struct bpf_prog_list *find_detach_entry(struct hlist_head *progs,
struct bpf_prog *prog,
struct bpf_cgroup_link *link,
bool allow_multi)
@@ -696,14 +855,14 @@ static struct bpf_prog_list *find_detach_entry(struct list_head *progs,
struct bpf_prog_list *pl;
if (!allow_multi) {
- if (list_empty(progs))
+ if (hlist_empty(progs))
/* report error when trying to detach and nothing is attached */
return ERR_PTR(-ENOENT);
/* to maintain backward compatibility NONE and OVERRIDE cgroups
* allow detaching with invalid FD (prog==NULL) in legacy mode
*/
- return list_first_entry(progs, typeof(*pl), node);
+ return hlist_entry(progs->first, typeof(*pl), node);
}
if (!prog && !link)
@@ -713,7 +872,7 @@ static struct bpf_prog_list *find_detach_entry(struct list_head *progs,
return ERR_PTR(-EINVAL);
/* find the prog or link and detach it */
- list_for_each_entry(pl, progs, node) {
+ hlist_for_each_entry(pl, progs, node) {
if (pl->prog == prog && pl->link == link)
return pl;
}
@@ -721,6 +880,60 @@ static struct bpf_prog_list *find_detach_entry(struct list_head *progs,
}
/**
+ * purge_effective_progs() - After compute_effective_progs fails to alloc new
+ * cgrp->bpf.inactive table we can recover by
+ * recomputing the array in place.
+ *
+ * @cgrp: The cgroup which descendants to travers
+ * @prog: A program to detach or NULL
+ * @link: A link to detach or NULL
+ * @atype: Type of detach operation
+ */
+static void purge_effective_progs(struct cgroup *cgrp, struct bpf_prog *prog,
+ struct bpf_cgroup_link *link,
+ enum cgroup_bpf_attach_type atype)
+{
+ struct cgroup_subsys_state *css;
+ struct bpf_prog_array *progs;
+ struct bpf_prog_list *pl;
+ struct hlist_head *head;
+ struct cgroup *cg;
+ int pos;
+
+ /* recompute effective prog array in place */
+ css_for_each_descendant_pre(css, &cgrp->self) {
+ struct cgroup *desc = container_of(css, struct cgroup, self);
+
+ if (percpu_ref_is_zero(&desc->bpf.refcnt))
+ continue;
+
+ /* find position of link or prog in effective progs array */
+ for (pos = 0, cg = desc; cg; cg = cgroup_parent(cg)) {
+ if (pos && !(cg->bpf.flags[atype] & BPF_F_ALLOW_MULTI))
+ continue;
+
+ head = &cg->bpf.progs[atype];
+ hlist_for_each_entry(pl, head, node) {
+ if (!prog_list_prog(pl))
+ continue;
+ if (pl->prog == prog && pl->link == link)
+ goto found;
+ pos++;
+ }
+ }
+found:
+ BUG_ON(!cg);
+ progs = rcu_dereference_protected(
+ desc->bpf.effective[atype],
+ lockdep_is_held(&cgroup_mutex));
+
+ /* Remove the program from the array */
+ WARN_ONCE(bpf_prog_array_delete_safe_at(progs, pos),
+ "Failed to purge a prog from array at index %d", pos);
+ }
+}
+
+/**
* __cgroup_bpf_detach() - Detach the program or link from a cgroup, and
* propagate the change to descendants
* @cgrp: The cgroup which descendants to traverse
@@ -737,11 +950,16 @@ static int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
enum cgroup_bpf_attach_type atype;
struct bpf_prog *old_prog;
struct bpf_prog_list *pl;
- struct list_head *progs;
+ struct hlist_head *progs;
+ u32 attach_btf_id = 0;
u32 flags;
- int err;
- atype = to_cgroup_bpf_attach_type(type);
+ if (prog)
+ attach_btf_id = prog->aux->attach_btf_id;
+ if (link)
+ attach_btf_id = link->link.prog->aux->attach_btf_id;
+
+ atype = bpf_cgroup_atype_find(type, attach_btf_id);
if (atype < 0)
return -EINVAL;
@@ -761,26 +979,27 @@ static int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
pl->prog = NULL;
pl->link = NULL;
- err = update_effective_progs(cgrp, atype);
- if (err)
- goto cleanup;
+ if (update_effective_progs(cgrp, atype)) {
+ /* if update effective array failed replace the prog with a dummy prog*/
+ pl->prog = old_prog;
+ pl->link = link;
+ purge_effective_progs(cgrp, old_prog, link, atype);
+ }
/* now can actually delete it from this cgroup list */
- list_del(&pl->node);
+ hlist_del(&pl->node);
+
kfree(pl);
- if (list_empty(progs))
+ if (hlist_empty(progs))
/* last program was detached, reset flags to zero */
cgrp->bpf.flags[atype] = 0;
- if (old_prog)
+ if (old_prog) {
+ if (type == BPF_LSM_CGROUP)
+ bpf_trampoline_unlink_cgroup_shim(old_prog);
bpf_prog_put(old_prog);
+ }
static_branch_dec(&cgroup_bpf_enabled_key[atype]);
return 0;
-
-cleanup:
- /* restore back prog or link */
- pl->prog = old_prog;
- pl->link = link;
- return err;
}
static int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
@@ -798,57 +1017,90 @@ static int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
union bpf_attr __user *uattr)
{
+ __u32 __user *prog_attach_flags = u64_to_user_ptr(attr->query.prog_attach_flags);
__u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids);
enum bpf_attach_type type = attr->query.attach_type;
+ enum cgroup_bpf_attach_type from_atype, to_atype;
enum cgroup_bpf_attach_type atype;
struct bpf_prog_array *effective;
- struct list_head *progs;
- struct bpf_prog *prog;
int cnt, ret = 0, i;
+ int total_cnt = 0;
u32 flags;
- atype = to_cgroup_bpf_attach_type(type);
- if (atype < 0)
- return -EINVAL;
-
- progs = &cgrp->bpf.progs[atype];
- flags = cgrp->bpf.flags[atype];
+ if (type == BPF_LSM_CGROUP) {
+ if (attr->query.prog_cnt && prog_ids && !prog_attach_flags)
+ return -EINVAL;
- effective = rcu_dereference_protected(cgrp->bpf.effective[atype],
- lockdep_is_held(&cgroup_mutex));
+ from_atype = CGROUP_LSM_START;
+ to_atype = CGROUP_LSM_END;
+ flags = 0;
+ } else {
+ from_atype = to_cgroup_bpf_attach_type(type);
+ if (from_atype < 0)
+ return -EINVAL;
+ to_atype = from_atype;
+ flags = cgrp->bpf.flags[from_atype];
+ }
- if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE)
- cnt = bpf_prog_array_length(effective);
- else
- cnt = prog_list_length(progs);
+ for (atype = from_atype; atype <= to_atype; atype++) {
+ if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) {
+ effective = rcu_dereference_protected(cgrp->bpf.effective[atype],
+ lockdep_is_held(&cgroup_mutex));
+ total_cnt += bpf_prog_array_length(effective);
+ } else {
+ total_cnt += prog_list_length(&cgrp->bpf.progs[atype]);
+ }
+ }
if (copy_to_user(&uattr->query.attach_flags, &flags, sizeof(flags)))
return -EFAULT;
- if (copy_to_user(&uattr->query.prog_cnt, &cnt, sizeof(cnt)))
+ if (copy_to_user(&uattr->query.prog_cnt, &total_cnt, sizeof(total_cnt)))
return -EFAULT;
- if (attr->query.prog_cnt == 0 || !prog_ids || !cnt)
+ if (attr->query.prog_cnt == 0 || !prog_ids || !total_cnt)
/* return early if user requested only program count + flags */
return 0;
- if (attr->query.prog_cnt < cnt) {
- cnt = attr->query.prog_cnt;
+
+ if (attr->query.prog_cnt < total_cnt) {
+ total_cnt = attr->query.prog_cnt;
ret = -ENOSPC;
}
- if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) {
- return bpf_prog_array_copy_to_user(effective, prog_ids, cnt);
- } else {
- struct bpf_prog_list *pl;
- u32 id;
-
- i = 0;
- list_for_each_entry(pl, progs, node) {
- prog = prog_list_prog(pl);
- id = prog->aux->id;
- if (copy_to_user(prog_ids + i, &id, sizeof(id)))
- return -EFAULT;
- if (++i == cnt)
- break;
+ for (atype = from_atype; atype <= to_atype && total_cnt; atype++) {
+ if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) {
+ effective = rcu_dereference_protected(cgrp->bpf.effective[atype],
+ lockdep_is_held(&cgroup_mutex));
+ cnt = min_t(int, bpf_prog_array_length(effective), total_cnt);
+ ret = bpf_prog_array_copy_to_user(effective, prog_ids, cnt);
+ } else {
+ struct hlist_head *progs;
+ struct bpf_prog_list *pl;
+ struct bpf_prog *prog;
+ u32 id;
+
+ progs = &cgrp->bpf.progs[atype];
+ cnt = min_t(int, prog_list_length(progs), total_cnt);
+ i = 0;
+ hlist_for_each_entry(pl, progs, node) {
+ prog = prog_list_prog(pl);
+ id = prog->aux->id;
+ if (copy_to_user(prog_ids + i, &id, sizeof(id)))
+ return -EFAULT;
+ if (++i == cnt)
+ break;
+ }
}
+
+ if (prog_attach_flags) {
+ flags = cgrp->bpf.flags[atype];
+
+ for (i = 0; i < cnt; i++)
+ if (copy_to_user(prog_attach_flags + i, &flags, sizeof(flags)))
+ return -EFAULT;
+ prog_attach_flags += cnt;
+ }
+
+ prog_ids += cnt;
+ total_cnt -= cnt;
}
return ret;
}
@@ -937,6 +1189,8 @@ static void bpf_cgroup_link_release(struct bpf_link *link)
WARN_ON(__cgroup_bpf_detach(cg_link->cgroup, NULL, cg_link,
cg_link->type));
+ if (cg_link->type == BPF_LSM_CGROUP)
+ bpf_trampoline_unlink_cgroup_shim(cg_link->link.prog);
cg = cg_link->cgroup;
cg_link->cgroup = NULL;
@@ -1281,7 +1535,7 @@ BPF_CALL_0(bpf_get_retval)
return ctx->retval;
}
-static const struct bpf_func_proto bpf_get_retval_proto = {
+const struct bpf_func_proto bpf_get_retval_proto = {
.func = bpf_get_retval,
.gpl_only = false,
.ret_type = RET_INTEGER,
@@ -1296,7 +1550,7 @@ BPF_CALL_1(bpf_set_retval, int, retval)
return 0;
}
-static const struct bpf_func_proto bpf_set_retval_proto = {
+const struct bpf_func_proto bpf_set_retval_proto = {
.func = bpf_set_retval,
.gpl_only = false,
.ret_type = RET_INTEGER,
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index e7961508a47d..c1e10d088dbb 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -109,6 +109,9 @@ struct bpf_prog *bpf_prog_alloc_no_stats(unsigned int size, gfp_t gfp_extra_flag
fp->aux->prog = fp;
fp->jit_requested = ebpf_jit_enabled();
fp->blinding_requested = bpf_jit_blinding_enabled(fp);
+#ifdef CONFIG_CGROUP_BPF
+ aux->cgroup_atype = CGROUP_BPF_ATTACH_TYPE_INVALID;
+#endif
INIT_LIST_HEAD_RCU(&fp->aux->ksym.lnode);
mutex_init(&fp->aux->used_maps_mutex);
@@ -178,7 +181,7 @@ void bpf_prog_jit_attempt_done(struct bpf_prog *prog)
* here is relative to the prog itself instead of the main prog.
* This array has one entry for each xlated bpf insn.
*
- * jited_off is the byte off to the last byte of the jited insn.
+ * jited_off is the byte off to the end of the jited insn.
*
* Hence, with
* insn_start:
@@ -649,12 +652,6 @@ static bool bpf_prog_kallsyms_candidate(const struct bpf_prog *fp)
return fp->jited && !bpf_prog_was_classic(fp);
}
-static bool bpf_prog_kallsyms_verify_off(const struct bpf_prog *fp)
-{
- return list_empty(&fp->aux->ksym.lnode) ||
- fp->aux->ksym.lnode.prev == LIST_POISON2;
-}
-
void bpf_prog_kallsyms_add(struct bpf_prog *fp)
{
if (!bpf_prog_kallsyms_candidate(fp) ||
@@ -830,15 +827,6 @@ struct bpf_prog_pack {
#define BPF_PROG_SIZE_TO_NBITS(size) (round_up(size, BPF_PROG_CHUNK_SIZE) / BPF_PROG_CHUNK_SIZE)
-static size_t bpf_prog_pack_size = -1;
-static size_t bpf_prog_pack_mask = -1;
-
-static int bpf_prog_chunk_count(void)
-{
- WARN_ON_ONCE(bpf_prog_pack_size == -1);
- return bpf_prog_pack_size / BPF_PROG_CHUNK_SIZE;
-}
-
static DEFINE_MUTEX(pack_mutex);
static LIST_HEAD(pack_list);
@@ -846,55 +834,33 @@ static LIST_HEAD(pack_list);
* CONFIG_MMU=n. Use PAGE_SIZE in these cases.
*/
#ifdef PMD_SIZE
-#define BPF_HPAGE_SIZE PMD_SIZE
-#define BPF_HPAGE_MASK PMD_MASK
+#define BPF_PROG_PACK_SIZE (PMD_SIZE * num_possible_nodes())
#else
-#define BPF_HPAGE_SIZE PAGE_SIZE
-#define BPF_HPAGE_MASK PAGE_MASK
+#define BPF_PROG_PACK_SIZE PAGE_SIZE
#endif
-static size_t select_bpf_prog_pack_size(void)
-{
- size_t size;
- void *ptr;
-
- size = BPF_HPAGE_SIZE * num_online_nodes();
- ptr = module_alloc(size);
-
- /* Test whether we can get huge pages. If not just use PAGE_SIZE
- * packs.
- */
- if (!ptr || !is_vm_area_hugepages(ptr)) {
- size = PAGE_SIZE;
- bpf_prog_pack_mask = PAGE_MASK;
- } else {
- bpf_prog_pack_mask = BPF_HPAGE_MASK;
- }
-
- vfree(ptr);
- return size;
-}
+#define BPF_PROG_CHUNK_COUNT (BPF_PROG_PACK_SIZE / BPF_PROG_CHUNK_SIZE)
static struct bpf_prog_pack *alloc_new_pack(bpf_jit_fill_hole_t bpf_fill_ill_insns)
{
struct bpf_prog_pack *pack;
- pack = kzalloc(struct_size(pack, bitmap, BITS_TO_LONGS(bpf_prog_chunk_count())),
+ pack = kzalloc(struct_size(pack, bitmap, BITS_TO_LONGS(BPF_PROG_CHUNK_COUNT)),
GFP_KERNEL);
if (!pack)
return NULL;
- pack->ptr = module_alloc(bpf_prog_pack_size);
+ pack->ptr = module_alloc(BPF_PROG_PACK_SIZE);
if (!pack->ptr) {
kfree(pack);
return NULL;
}
- bpf_fill_ill_insns(pack->ptr, bpf_prog_pack_size);
- bitmap_zero(pack->bitmap, bpf_prog_pack_size / BPF_PROG_CHUNK_SIZE);
+ bpf_fill_ill_insns(pack->ptr, BPF_PROG_PACK_SIZE);
+ bitmap_zero(pack->bitmap, BPF_PROG_PACK_SIZE / BPF_PROG_CHUNK_SIZE);
list_add_tail(&pack->list, &pack_list);
set_vm_flush_reset_perms(pack->ptr);
- set_memory_ro((unsigned long)pack->ptr, bpf_prog_pack_size / PAGE_SIZE);
- set_memory_x((unsigned long)pack->ptr, bpf_prog_pack_size / PAGE_SIZE);
+ set_memory_ro((unsigned long)pack->ptr, BPF_PROG_PACK_SIZE / PAGE_SIZE);
+ set_memory_x((unsigned long)pack->ptr, BPF_PROG_PACK_SIZE / PAGE_SIZE);
return pack;
}
@@ -906,10 +872,7 @@ static void *bpf_prog_pack_alloc(u32 size, bpf_jit_fill_hole_t bpf_fill_ill_insn
void *ptr = NULL;
mutex_lock(&pack_mutex);
- if (bpf_prog_pack_size == -1)
- bpf_prog_pack_size = select_bpf_prog_pack_size();
-
- if (size > bpf_prog_pack_size) {
+ if (size > BPF_PROG_PACK_SIZE) {
size = round_up(size, PAGE_SIZE);
ptr = module_alloc(size);
if (ptr) {
@@ -921,9 +884,9 @@ static void *bpf_prog_pack_alloc(u32 size, bpf_jit_fill_hole_t bpf_fill_ill_insn
goto out;
}
list_for_each_entry(pack, &pack_list, list) {
- pos = bitmap_find_next_zero_area(pack->bitmap, bpf_prog_chunk_count(), 0,
+ pos = bitmap_find_next_zero_area(pack->bitmap, BPF_PROG_CHUNK_COUNT, 0,
nbits, 0);
- if (pos < bpf_prog_chunk_count())
+ if (pos < BPF_PROG_CHUNK_COUNT)
goto found_free_area;
}
@@ -947,18 +910,15 @@ static void bpf_prog_pack_free(struct bpf_binary_header *hdr)
struct bpf_prog_pack *pack = NULL, *tmp;
unsigned int nbits;
unsigned long pos;
- void *pack_ptr;
mutex_lock(&pack_mutex);
- if (hdr->size > bpf_prog_pack_size) {
+ if (hdr->size > BPF_PROG_PACK_SIZE) {
module_memfree(hdr);
goto out;
}
- pack_ptr = (void *)((unsigned long)hdr & bpf_prog_pack_mask);
-
list_for_each_entry(tmp, &pack_list, list) {
- if (tmp->ptr == pack_ptr) {
+ if ((void *)hdr >= tmp->ptr && (tmp->ptr + BPF_PROG_PACK_SIZE) > (void *)hdr) {
pack = tmp;
break;
}
@@ -968,14 +928,14 @@ static void bpf_prog_pack_free(struct bpf_binary_header *hdr)
goto out;
nbits = BPF_PROG_SIZE_TO_NBITS(hdr->size);
- pos = ((unsigned long)hdr - (unsigned long)pack_ptr) >> BPF_PROG_CHUNK_SHIFT;
+ pos = ((unsigned long)hdr - (unsigned long)pack->ptr) >> BPF_PROG_CHUNK_SHIFT;
WARN_ONCE(bpf_arch_text_invalidate(hdr, hdr->size),
"bpf_prog_pack bug: missing bpf_arch_text_invalidate?\n");
bitmap_clear(pack->bitmap, pos, nbits);
- if (bitmap_find_next_zero_area(pack->bitmap, bpf_prog_chunk_count(), 0,
- bpf_prog_chunk_count(), 0) == 0) {
+ if (bitmap_find_next_zero_area(pack->bitmap, BPF_PROG_CHUNK_COUNT, 0,
+ BPF_PROG_CHUNK_COUNT, 0) == 0) {
list_del(&pack->list);
module_memfree(pack->ptr);
kfree(pack);
@@ -1152,7 +1112,6 @@ int bpf_jit_binary_pack_finalize(struct bpf_prog *prog,
bpf_prog_pack_free(ro_header);
return PTR_ERR(ptr);
}
- prog->aux->use_bpf_prog_pack = true;
return 0;
}
@@ -1176,17 +1135,23 @@ void bpf_jit_binary_pack_free(struct bpf_binary_header *ro_header,
bpf_jit_uncharge_modmem(size);
}
+struct bpf_binary_header *
+bpf_jit_binary_pack_hdr(const struct bpf_prog *fp)
+{
+ unsigned long real_start = (unsigned long)fp->bpf_func;
+ unsigned long addr;
+
+ addr = real_start & BPF_PROG_CHUNK_MASK;
+ return (void *)addr;
+}
+
static inline struct bpf_binary_header *
bpf_jit_binary_hdr(const struct bpf_prog *fp)
{
unsigned long real_start = (unsigned long)fp->bpf_func;
unsigned long addr;
- if (fp->aux->use_bpf_prog_pack)
- addr = real_start & BPF_PROG_CHUNK_MASK;
- else
- addr = real_start & PAGE_MASK;
-
+ addr = real_start & PAGE_MASK;
return (void *)addr;
}
@@ -1199,11 +1164,7 @@ void __weak bpf_jit_free(struct bpf_prog *fp)
if (fp->jited) {
struct bpf_binary_header *hdr = bpf_jit_binary_hdr(fp);
- if (fp->aux->use_bpf_prog_pack)
- bpf_jit_binary_pack_free(hdr, NULL /* rw_buffer */);
- else
- bpf_jit_binary_free(hdr);
-
+ bpf_jit_binary_free(hdr);
WARN_ON_ONCE(!bpf_prog_kallsyms_verify_off(fp));
}
@@ -2281,6 +2242,21 @@ void bpf_prog_array_free(struct bpf_prog_array *progs)
kfree_rcu(progs, rcu);
}
+static void __bpf_prog_array_free_sleepable_cb(struct rcu_head *rcu)
+{
+ struct bpf_prog_array *progs;
+
+ progs = container_of(rcu, struct bpf_prog_array, rcu);
+ kfree_rcu(progs, rcu);
+}
+
+void bpf_prog_array_free_sleepable(struct bpf_prog_array *progs)
+{
+ if (!progs || progs == &bpf_empty_prog_array.hdr)
+ return;
+ call_rcu_tasks_trace(&progs->rcu, __bpf_prog_array_free_sleepable_cb);
+}
+
int bpf_prog_array_length(struct bpf_prog_array *array)
{
struct bpf_prog_array_item *item;
@@ -2557,6 +2533,10 @@ static void bpf_prog_free_deferred(struct work_struct *work)
#ifdef CONFIG_BPF_SYSCALL
bpf_free_kfunc_btf_tab(aux->kfunc_btf_tab);
#endif
+#ifdef CONFIG_CGROUP_BPF
+ if (aux->cgroup_atype != CGROUP_BPF_ATTACH_TYPE_INVALID)
+ bpf_cgroup_atype_put(aux->cgroup_atype);
+#endif
bpf_free_used_maps(aux);
bpf_free_used_btfs(aux);
if (bpf_prog_is_dev_bound(aux))
@@ -2653,6 +2633,8 @@ const struct bpf_func_proto bpf_get_local_storage_proto __weak;
const struct bpf_func_proto bpf_get_ns_current_pid_tgid_proto __weak;
const struct bpf_func_proto bpf_snprintf_btf_proto __weak;
const struct bpf_func_proto bpf_seq_printf_btf_proto __weak;
+const struct bpf_func_proto bpf_set_retval_proto __weak;
+const struct bpf_func_proto bpf_get_retval_proto __weak;
const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void)
{
@@ -2716,6 +2698,12 @@ bool __weak bpf_jit_needs_zext(void)
return false;
}
+/* Return TRUE if the JIT backend supports mixing bpf2bpf and tailcalls. */
+bool __weak bpf_jit_supports_subprog_tailcalls(void)
+{
+ return false;
+}
+
bool __weak bpf_jit_supports_kfunc_call(void)
{
return false;
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index c2867068e5bd..a0e02b009487 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -477,7 +477,7 @@ static inline int __xdp_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
if (!dev->netdev_ops->ndo_xdp_xmit)
return -EOPNOTSUPP;
- err = xdp_ok_fwd_dev(dev, xdpf->len);
+ err = xdp_ok_fwd_dev(dev, xdp_get_frame_len(xdpf));
if (unlikely(err))
return err;
@@ -536,7 +536,7 @@ static bool is_valid_dst(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf)
!obj->dev->netdev_ops->ndo_xdp_xmit)
return false;
- if (xdp_ok_fwd_dev(obj->dev, xdpf->len))
+ if (xdp_ok_fwd_dev(obj->dev, xdp_get_frame_len(xdpf)))
return false;
return true;
@@ -845,7 +845,7 @@ static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net,
struct bpf_dtab_netdev *dev;
dev = bpf_map_kmalloc_node(&dtab->map, sizeof(*dev),
- GFP_ATOMIC | __GFP_NOWARN,
+ GFP_NOWAIT | __GFP_NOWARN,
dtab->map.numa_node);
if (!dev)
return ERR_PTR(-ENOMEM);
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 17fb69c0e0dc..da7578426a46 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -61,7 +61,7 @@
*
* As regular device interrupt handlers and soft interrupts are forced into
* thread context, the existing code which does
- * spin_lock*(); alloc(GPF_ATOMIC); spin_unlock*();
+ * spin_lock*(); alloc(GFP_ATOMIC); spin_unlock*();
* just works.
*
* In theory the BPF locks could be converted to regular spinlocks as well,
@@ -978,7 +978,7 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
goto dec_count;
}
l_new = bpf_map_kmalloc_node(&htab->map, htab->elem_size,
- GFP_ATOMIC | __GFP_NOWARN,
+ GFP_NOWAIT | __GFP_NOWARN,
htab->map.numa_node);
if (!l_new) {
l_new = ERR_PTR(-ENOMEM);
@@ -996,7 +996,7 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
} else {
/* alloc_percpu zero-fills */
pptr = bpf_map_alloc_percpu(&htab->map, size, 8,
- GFP_ATOMIC | __GFP_NOWARN);
+ GFP_NOWAIT | __GFP_NOWARN);
if (!pptr) {
kfree(l_new);
l_new = ERR_PTR(-ENOMEM);
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index bb1254f07667..1f961f9982d2 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -584,7 +584,7 @@ BPF_CALL_3(bpf_strncmp, const char *, s1, u32, s1_sz, const char *, s2)
return strncmp(s1, s2, s1_sz);
}
-const struct bpf_func_proto bpf_strncmp_proto = {
+static const struct bpf_func_proto bpf_strncmp_proto = {
.func = bpf_strncmp,
.gpl_only = false,
.ret_type = RET_INTEGER,
@@ -1402,7 +1402,7 @@ BPF_CALL_2(bpf_kptr_xchg, void *, map_value, void *, ptr)
*/
#define BPF_PTR_POISON ((void *)((0xeB9FUL << 2) + POISON_POINTER_DELTA))
-const struct bpf_func_proto bpf_kptr_xchg_proto = {
+static const struct bpf_func_proto bpf_kptr_xchg_proto = {
.func = bpf_kptr_xchg,
.gpl_only = false,
.ret_type = RET_PTR_TO_BTF_ID_OR_NULL,
@@ -1487,7 +1487,7 @@ error:
return err;
}
-const struct bpf_func_proto bpf_dynptr_from_mem_proto = {
+static const struct bpf_func_proto bpf_dynptr_from_mem_proto = {
.func = bpf_dynptr_from_mem,
.gpl_only = false,
.ret_type = RET_INTEGER,
@@ -1514,7 +1514,7 @@ BPF_CALL_5(bpf_dynptr_read, void *, dst, u32, len, struct bpf_dynptr_kern *, src
return 0;
}
-const struct bpf_func_proto bpf_dynptr_read_proto = {
+static const struct bpf_func_proto bpf_dynptr_read_proto = {
.func = bpf_dynptr_read,
.gpl_only = false,
.ret_type = RET_INTEGER,
@@ -1542,7 +1542,7 @@ BPF_CALL_5(bpf_dynptr_write, struct bpf_dynptr_kern *, dst, u32, offset, void *,
return 0;
}
-const struct bpf_func_proto bpf_dynptr_write_proto = {
+static const struct bpf_func_proto bpf_dynptr_write_proto = {
.func = bpf_dynptr_write,
.gpl_only = false,
.ret_type = RET_INTEGER,
@@ -1570,7 +1570,7 @@ BPF_CALL_3(bpf_dynptr_data, struct bpf_dynptr_kern *, ptr, u32, offset, u32, len
return (unsigned long)(ptr->data + ptr->offset + offset);
}
-const struct bpf_func_proto bpf_dynptr_data_proto = {
+static const struct bpf_func_proto bpf_dynptr_data_proto = {
.func = bpf_dynptr_data,
.gpl_only = false,
.ret_type = RET_PTR_TO_DYNPTR_MEM_OR_NULL,
diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c
index 8654fc97f5fe..49ef0ce040c7 100644
--- a/kernel/bpf/local_storage.c
+++ b/kernel/bpf/local_storage.c
@@ -165,7 +165,7 @@ static int cgroup_storage_update_elem(struct bpf_map *map, void *key,
}
new = bpf_map_kmalloc_node(map, struct_size(new, data, map->value_size),
- __GFP_ZERO | GFP_ATOMIC | __GFP_NOWARN,
+ __GFP_ZERO | GFP_NOWAIT | __GFP_NOWARN,
map->numa_node);
if (!new)
return -ENOMEM;
diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c
index f0d05a3cc4b9..d789e3b831ad 100644
--- a/kernel/bpf/lpm_trie.c
+++ b/kernel/bpf/lpm_trie.c
@@ -285,7 +285,7 @@ static struct lpm_trie_node *lpm_trie_node_alloc(const struct lpm_trie *trie,
if (value)
size += trie->map.value_size;
- node = bpf_map_kmalloc_node(&trie->map, size, GFP_ATOMIC | __GFP_NOWARN,
+ node = bpf_map_kmalloc_node(&trie->map, size, GFP_NOWAIT | __GFP_NOWARN,
trie->map.numa_node);
if (!node)
return NULL;
diff --git a/kernel/bpf/percpu_freelist.c b/kernel/bpf/percpu_freelist.c
index 3d897de89061..00b874c8e889 100644
--- a/kernel/bpf/percpu_freelist.c
+++ b/kernel/bpf/percpu_freelist.c
@@ -31,7 +31,7 @@ static inline void pcpu_freelist_push_node(struct pcpu_freelist_head *head,
struct pcpu_freelist_node *node)
{
node->next = head->first;
- head->first = node;
+ WRITE_ONCE(head->first, node);
}
static inline void ___pcpu_freelist_push(struct pcpu_freelist_head *head,
@@ -130,14 +130,17 @@ static struct pcpu_freelist_node *___pcpu_freelist_pop(struct pcpu_freelist *s)
orig_cpu = cpu = raw_smp_processor_id();
while (1) {
head = per_cpu_ptr(s->freelist, cpu);
+ if (!READ_ONCE(head->first))
+ goto next_cpu;
raw_spin_lock(&head->lock);
node = head->first;
if (node) {
- head->first = node->next;
+ WRITE_ONCE(head->first, node->next);
raw_spin_unlock(&head->lock);
return node;
}
raw_spin_unlock(&head->lock);
+next_cpu:
cpu = cpumask_next(cpu, cpu_possible_mask);
if (cpu >= nr_cpu_ids)
cpu = 0;
@@ -146,10 +149,12 @@ static struct pcpu_freelist_node *___pcpu_freelist_pop(struct pcpu_freelist *s)
}
/* per cpu lists are all empty, try extralist */
+ if (!READ_ONCE(s->extralist.first))
+ return NULL;
raw_spin_lock(&s->extralist.lock);
node = s->extralist.first;
if (node)
- s->extralist.first = node->next;
+ WRITE_ONCE(s->extralist.first, node->next);
raw_spin_unlock(&s->extralist.lock);
return node;
}
@@ -164,15 +169,18 @@ ___pcpu_freelist_pop_nmi(struct pcpu_freelist *s)
orig_cpu = cpu = raw_smp_processor_id();
while (1) {
head = per_cpu_ptr(s->freelist, cpu);
+ if (!READ_ONCE(head->first))
+ goto next_cpu;
if (raw_spin_trylock(&head->lock)) {
node = head->first;
if (node) {
- head->first = node->next;
+ WRITE_ONCE(head->first, node->next);
raw_spin_unlock(&head->lock);
return node;
}
raw_spin_unlock(&head->lock);
}
+next_cpu:
cpu = cpumask_next(cpu, cpu_possible_mask);
if (cpu >= nr_cpu_ids)
cpu = 0;
@@ -181,11 +189,11 @@ ___pcpu_freelist_pop_nmi(struct pcpu_freelist *s)
}
/* cannot pop from per cpu lists, try extralist */
- if (!raw_spin_trylock(&s->extralist.lock))
+ if (!READ_ONCE(s->extralist.first) || !raw_spin_trylock(&s->extralist.lock))
return NULL;
node = s->extralist.first;
if (node)
- s->extralist.first = node->next;
+ WRITE_ONCE(s->extralist.first, node->next);
raw_spin_unlock(&s->extralist.lock);
return node;
}
diff --git a/kernel/bpf/preload/iterators/Makefile b/kernel/bpf/preload/iterators/Makefile
index bfe24f8c5a20..6762b1260f2f 100644
--- a/kernel/bpf/preload/iterators/Makefile
+++ b/kernel/bpf/preload/iterators/Makefile
@@ -9,7 +9,7 @@ LLVM_STRIP ?= llvm-strip
TOOLS_PATH := $(abspath ../../../../tools)
BPFTOOL_SRC := $(TOOLS_PATH)/bpf/bpftool
BPFTOOL_OUTPUT := $(abs_out)/bpftool
-DEFAULT_BPFTOOL := $(OUTPUT)/sbin/bpftool
+DEFAULT_BPFTOOL := $(BPFTOOL_OUTPUT)/bootstrap/bpftool
BPFTOOL ?= $(DEFAULT_BPFTOOL)
LIBBPF_SRC := $(TOOLS_PATH)/lib/bpf
@@ -61,9 +61,5 @@ $(BPFOBJ): $(wildcard $(LIBBPF_SRC)/*.[ch] $(LIBBPF_SRC)/Makefile) | $(LIBBPF_OU
OUTPUT=$(abspath $(dir $@))/ prefix= \
DESTDIR=$(LIBBPF_DESTDIR) $(abspath $@) install_headers
-$(DEFAULT_BPFTOOL): $(BPFOBJ) | $(BPFTOOL_OUTPUT)
- $(Q)$(MAKE) $(submake_extras) -C $(BPFTOOL_SRC) \
- OUTPUT=$(BPFTOOL_OUTPUT)/ \
- LIBBPF_OUTPUT=$(LIBBPF_OUTPUT)/ \
- LIBBPF_DESTDIR=$(LIBBPF_DESTDIR)/ \
- prefix= DESTDIR=$(abs_out)/ install-bin
+$(DEFAULT_BPFTOOL): | $(BPFTOOL_OUTPUT)
+ $(Q)$(MAKE) $(submake_extras) -C $(BPFTOOL_SRC) OUTPUT=$(BPFTOOL_OUTPUT)/ bootstrap
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 2b69306d3c6e..83c7136c5788 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -419,35 +419,53 @@ void bpf_map_free_id(struct bpf_map *map, bool do_idr_lock)
#ifdef CONFIG_MEMCG_KMEM
static void bpf_map_save_memcg(struct bpf_map *map)
{
- map->memcg = get_mem_cgroup_from_mm(current->mm);
+ /* Currently if a map is created by a process belonging to the root
+ * memory cgroup, get_obj_cgroup_from_current() will return NULL.
+ * So we have to check map->objcg for being NULL each time it's
+ * being used.
+ */
+ map->objcg = get_obj_cgroup_from_current();
}
static void bpf_map_release_memcg(struct bpf_map *map)
{
- mem_cgroup_put(map->memcg);
+ if (map->objcg)
+ obj_cgroup_put(map->objcg);
+}
+
+static struct mem_cgroup *bpf_map_get_memcg(const struct bpf_map *map)
+{
+ if (map->objcg)
+ return get_mem_cgroup_from_objcg(map->objcg);
+
+ return root_mem_cgroup;
}
void *bpf_map_kmalloc_node(const struct bpf_map *map, size_t size, gfp_t flags,
int node)
{
- struct mem_cgroup *old_memcg;
+ struct mem_cgroup *memcg, *old_memcg;
void *ptr;
- old_memcg = set_active_memcg(map->memcg);
+ memcg = bpf_map_get_memcg(map);
+ old_memcg = set_active_memcg(memcg);
ptr = kmalloc_node(size, flags | __GFP_ACCOUNT, node);
set_active_memcg(old_memcg);
+ mem_cgroup_put(memcg);
return ptr;
}
void *bpf_map_kzalloc(const struct bpf_map *map, size_t size, gfp_t flags)
{
- struct mem_cgroup *old_memcg;
+ struct mem_cgroup *memcg, *old_memcg;
void *ptr;
- old_memcg = set_active_memcg(map->memcg);
+ memcg = bpf_map_get_memcg(map);
+ old_memcg = set_active_memcg(memcg);
ptr = kzalloc(size, flags | __GFP_ACCOUNT);
set_active_memcg(old_memcg);
+ mem_cgroup_put(memcg);
return ptr;
}
@@ -455,12 +473,14 @@ void *bpf_map_kzalloc(const struct bpf_map *map, size_t size, gfp_t flags)
void __percpu *bpf_map_alloc_percpu(const struct bpf_map *map, size_t size,
size_t align, gfp_t flags)
{
- struct mem_cgroup *old_memcg;
+ struct mem_cgroup *memcg, *old_memcg;
void __percpu *ptr;
- old_memcg = set_active_memcg(map->memcg);
+ memcg = bpf_map_get_memcg(map);
+ old_memcg = set_active_memcg(memcg);
ptr = __alloc_percpu_gfp(size, align, flags | __GFP_ACCOUNT);
set_active_memcg(old_memcg);
+ mem_cgroup_put(memcg);
return ptr;
}
@@ -3416,6 +3436,8 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type)
return BPF_PROG_TYPE_SK_LOOKUP;
case BPF_XDP:
return BPF_PROG_TYPE_XDP;
+ case BPF_LSM_CGROUP:
+ return BPF_PROG_TYPE_LSM;
default:
return BPF_PROG_TYPE_UNSPEC;
}
@@ -3469,6 +3491,11 @@ static int bpf_prog_attach(const union bpf_attr *attr)
case BPF_PROG_TYPE_CGROUP_SOCKOPT:
case BPF_PROG_TYPE_CGROUP_SYSCTL:
case BPF_PROG_TYPE_SOCK_OPS:
+ case BPF_PROG_TYPE_LSM:
+ if (ptype == BPF_PROG_TYPE_LSM &&
+ prog->expected_attach_type != BPF_LSM_CGROUP)
+ return -EINVAL;
+
ret = cgroup_bpf_prog_attach(attr, ptype, prog);
break;
default:
@@ -3506,13 +3533,14 @@ static int bpf_prog_detach(const union bpf_attr *attr)
case BPF_PROG_TYPE_CGROUP_SOCKOPT:
case BPF_PROG_TYPE_CGROUP_SYSCTL:
case BPF_PROG_TYPE_SOCK_OPS:
+ case BPF_PROG_TYPE_LSM:
return cgroup_bpf_prog_detach(attr, ptype);
default:
return -EINVAL;
}
}
-#define BPF_PROG_QUERY_LAST_FIELD query.prog_cnt
+#define BPF_PROG_QUERY_LAST_FIELD query.prog_attach_flags
static int bpf_prog_query(const union bpf_attr *attr,
union bpf_attr __user *uattr)
@@ -3548,6 +3576,7 @@ static int bpf_prog_query(const union bpf_attr *attr,
case BPF_CGROUP_SYSCTL:
case BPF_CGROUP_GETSOCKOPT:
case BPF_CGROUP_SETSOCKOPT:
+ case BPF_LSM_CGROUP:
return cgroup_bpf_prog_query(attr, uattr);
case BPF_LIRC_MODE2:
return lirc_prog_query(attr, uattr);
@@ -4058,6 +4087,11 @@ static int bpf_prog_get_info_by_fd(struct file *file,
if (prog->aux->btf)
info.btf_id = btf_obj_id(prog->aux->btf);
+ info.attach_btf_id = prog->aux->attach_btf_id;
+ if (prog->aux->attach_btf)
+ info.attach_btf_obj_id = btf_obj_id(prog->aux->attach_btf);
+ else if (prog->aux->dst_prog)
+ info.attach_btf_obj_id = btf_obj_id(prog->aux->dst_prog->aux->attach_btf);
ulen = info.nr_func_info;
info.nr_func_info = prog->aux->func_info_cnt;
@@ -4090,14 +4124,15 @@ static int bpf_prog_get_info_by_fd(struct file *file,
info.nr_jited_line_info = 0;
if (info.nr_jited_line_info && ulen) {
if (bpf_dump_raw_ok(file->f_cred)) {
+ unsigned long line_addr;
__u64 __user *user_linfo;
u32 i;
user_linfo = u64_to_user_ptr(info.jited_line_info);
ulen = min_t(u32, info.nr_jited_line_info, ulen);
for (i = 0; i < ulen; i++) {
- if (put_user((__u64)(long)prog->aux->jited_linfo[i],
- &user_linfo[i]))
+ line_addr = (unsigned long)prog->aux->jited_linfo[i];
+ if (put_user((__u64)line_addr, &user_linfo[i]))
return -EFAULT;
}
} else {
@@ -4539,6 +4574,8 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr)
ret = bpf_raw_tp_link_attach(prog, NULL);
else if (prog->expected_attach_type == BPF_TRACE_ITER)
ret = bpf_iter_link_attach(attr, uattr, prog);
+ else if (prog->expected_attach_type == BPF_LSM_CGROUP)
+ ret = cgroup_bpf_link_attach(attr, prog);
else
ret = bpf_tracing_prog_attach(prog,
attr->link_create.target_fd,
@@ -5130,7 +5167,7 @@ BPF_CALL_4(bpf_kallsyms_lookup_name, const char *, name, int, name_sz, int, flag
return *res ? 0 : -ENOENT;
}
-const struct bpf_func_proto bpf_kallsyms_lookup_name_proto = {
+static const struct bpf_func_proto bpf_kallsyms_lookup_name_proto = {
.func = bpf_kallsyms_lookup_name,
.gpl_only = false,
.ret_type = RET_INTEGER,
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index 93c7675f0c9e..0f532e6a717f 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -11,6 +11,9 @@
#include <linux/rcupdate_wait.h>
#include <linux/module.h>
#include <linux/static_call.h>
+#include <linux/bpf_verifier.h>
+#include <linux/bpf_lsm.h>
+#include <linux/delay.h>
/* dummy _ops. The verifier will operate on target program's ops. */
const struct bpf_verifier_ops bpf_extension_verifier_ops = {
@@ -27,6 +30,81 @@ static struct hlist_head trampoline_table[TRAMPOLINE_TABLE_SIZE];
/* serializes access to trampoline_table */
static DEFINE_MUTEX(trampoline_mutex);
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
+static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mutex);
+
+static int bpf_tramp_ftrace_ops_func(struct ftrace_ops *ops, enum ftrace_ops_cmd cmd)
+{
+ struct bpf_trampoline *tr = ops->private;
+ int ret = 0;
+
+ if (cmd == FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY_SELF) {
+ /* This is called inside register_ftrace_direct_multi(), so
+ * tr->mutex is already locked.
+ */
+ lockdep_assert_held_once(&tr->mutex);
+
+ /* Instead of updating the trampoline here, we propagate
+ * -EAGAIN to register_ftrace_direct_multi(). Then we can
+ * retry register_ftrace_direct_multi() after updating the
+ * trampoline.
+ */
+ if ((tr->flags & BPF_TRAMP_F_CALL_ORIG) &&
+ !(tr->flags & BPF_TRAMP_F_ORIG_STACK)) {
+ if (WARN_ON_ONCE(tr->flags & BPF_TRAMP_F_SHARE_IPMODIFY))
+ return -EBUSY;
+
+ tr->flags |= BPF_TRAMP_F_SHARE_IPMODIFY;
+ return -EAGAIN;
+ }
+
+ return 0;
+ }
+
+ /* The normal locking order is
+ * tr->mutex => direct_mutex (ftrace.c) => ftrace_lock (ftrace.c)
+ *
+ * The following two commands are called from
+ *
+ * prepare_direct_functions_for_ipmodify
+ * cleanup_direct_functions_after_ipmodify
+ *
+ * In both cases, direct_mutex is already locked. Use
+ * mutex_trylock(&tr->mutex) to avoid deadlock in race condition
+ * (something else is making changes to this same trampoline).
+ */
+ if (!mutex_trylock(&tr->mutex)) {
+ /* sleep 1 ms to make sure whatever holding tr->mutex makes
+ * some progress.
+ */
+ msleep(1);
+ return -EAGAIN;
+ }
+
+ switch (cmd) {
+ case FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY_PEER:
+ tr->flags |= BPF_TRAMP_F_SHARE_IPMODIFY;
+
+ if ((tr->flags & BPF_TRAMP_F_CALL_ORIG) &&
+ !(tr->flags & BPF_TRAMP_F_ORIG_STACK))
+ ret = bpf_trampoline_update(tr, false /* lock_direct_mutex */);
+ break;
+ case FTRACE_OPS_CMD_DISABLE_SHARE_IPMODIFY_PEER:
+ tr->flags &= ~BPF_TRAMP_F_SHARE_IPMODIFY;
+
+ if (tr->flags & BPF_TRAMP_F_ORIG_STACK)
+ ret = bpf_trampoline_update(tr, false /* lock_direct_mutex */);
+ break;
+ default:
+ ret = -EINVAL;
+ break;
+ }
+
+ mutex_unlock(&tr->mutex);
+ return ret;
+}
+#endif
+
bool bpf_prog_has_trampoline(const struct bpf_prog *prog)
{
enum bpf_attach_type eatype = prog->expected_attach_type;
@@ -87,6 +165,16 @@ static struct bpf_trampoline *bpf_trampoline_lookup(u64 key)
tr = kzalloc(sizeof(*tr), GFP_KERNEL);
if (!tr)
goto out;
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
+ tr->fops = kzalloc(sizeof(struct ftrace_ops), GFP_KERNEL);
+ if (!tr->fops) {
+ kfree(tr);
+ tr = NULL;
+ goto out;
+ }
+ tr->fops->private = tr;
+ tr->fops->ops_func = bpf_tramp_ftrace_ops_func;
+#endif
tr->key = key;
INIT_HLIST_NODE(&tr->hlist);
@@ -126,7 +214,7 @@ static int unregister_fentry(struct bpf_trampoline *tr, void *old_addr)
int ret;
if (tr->func.ftrace_managed)
- ret = unregister_ftrace_direct((long)ip, (long)old_addr);
+ ret = unregister_ftrace_direct_multi(tr->fops, (long)old_addr);
else
ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, old_addr, NULL);
@@ -135,15 +223,20 @@ static int unregister_fentry(struct bpf_trampoline *tr, void *old_addr)
return ret;
}
-static int modify_fentry(struct bpf_trampoline *tr, void *old_addr, void *new_addr)
+static int modify_fentry(struct bpf_trampoline *tr, void *old_addr, void *new_addr,
+ bool lock_direct_mutex)
{
void *ip = tr->func.addr;
int ret;
- if (tr->func.ftrace_managed)
- ret = modify_ftrace_direct((long)ip, (long)old_addr, (long)new_addr);
- else
+ if (tr->func.ftrace_managed) {
+ if (lock_direct_mutex)
+ ret = modify_ftrace_direct_multi(tr->fops, (long)new_addr);
+ else
+ ret = modify_ftrace_direct_multi_nolock(tr->fops, (long)new_addr);
+ } else {
ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, old_addr, new_addr);
+ }
return ret;
}
@@ -155,16 +248,21 @@ static int register_fentry(struct bpf_trampoline *tr, void *new_addr)
int ret;
faddr = ftrace_location((unsigned long)ip);
- if (faddr)
+ if (faddr) {
+ if (!tr->fops)
+ return -ENOTSUPP;
tr->func.ftrace_managed = true;
+ }
if (bpf_trampoline_module_get(tr))
return -ENOENT;
- if (tr->func.ftrace_managed)
- ret = register_ftrace_direct((long)ip, (long)new_addr);
- else
+ if (tr->func.ftrace_managed) {
+ ftrace_set_filter_ip(tr->fops, (unsigned long)ip, 0, 1);
+ ret = register_ftrace_direct_multi(tr->fops, (long)new_addr);
+ } else {
ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, NULL, new_addr);
+ }
if (ret)
bpf_trampoline_module_put(tr);
@@ -330,11 +428,11 @@ out:
return ERR_PTR(err);
}
-static int bpf_trampoline_update(struct bpf_trampoline *tr)
+static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mutex)
{
struct bpf_tramp_image *im;
struct bpf_tramp_links *tlinks;
- u32 flags = BPF_TRAMP_F_RESTORE_REGS;
+ u32 orig_flags = tr->flags;
bool ip_arg = false;
int err, total;
@@ -356,15 +454,31 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr)
goto out;
}
+ /* clear all bits except SHARE_IPMODIFY */
+ tr->flags &= BPF_TRAMP_F_SHARE_IPMODIFY;
+
if (tlinks[BPF_TRAMP_FEXIT].nr_links ||
- tlinks[BPF_TRAMP_MODIFY_RETURN].nr_links)
- flags = BPF_TRAMP_F_CALL_ORIG | BPF_TRAMP_F_SKIP_FRAME;
+ tlinks[BPF_TRAMP_MODIFY_RETURN].nr_links) {
+ /* NOTE: BPF_TRAMP_F_RESTORE_REGS and BPF_TRAMP_F_SKIP_FRAME
+ * should not be set together.
+ */
+ tr->flags |= BPF_TRAMP_F_CALL_ORIG | BPF_TRAMP_F_SKIP_FRAME;
+ } else {
+ tr->flags |= BPF_TRAMP_F_RESTORE_REGS;
+ }
if (ip_arg)
- flags |= BPF_TRAMP_F_IP_ARG;
+ tr->flags |= BPF_TRAMP_F_IP_ARG;
+
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
+again:
+ if ((tr->flags & BPF_TRAMP_F_SHARE_IPMODIFY) &&
+ (tr->flags & BPF_TRAMP_F_CALL_ORIG))
+ tr->flags |= BPF_TRAMP_F_ORIG_STACK;
+#endif
err = arch_prepare_bpf_trampoline(im, im->image, im->image + PAGE_SIZE,
- &tr->func.model, flags, tlinks,
+ &tr->func.model, tr->flags, tlinks,
tr->func.addr);
if (err < 0)
goto out;
@@ -373,17 +487,34 @@ static int bpf_trampoline_update(struct bpf_trampoline *tr)
WARN_ON(!tr->cur_image && tr->selector);
if (tr->cur_image)
/* progs already running at this address */
- err = modify_fentry(tr, tr->cur_image->image, im->image);
+ err = modify_fentry(tr, tr->cur_image->image, im->image, lock_direct_mutex);
else
/* first time registering */
err = register_fentry(tr, im->image);
+
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
+ if (err == -EAGAIN) {
+ /* -EAGAIN from bpf_tramp_ftrace_ops_func. Now
+ * BPF_TRAMP_F_SHARE_IPMODIFY is set, we can generate the
+ * trampoline again, and retry register.
+ */
+ /* reset fops->func and fops->trampoline for re-register */
+ tr->fops->func = NULL;
+ tr->fops->trampoline = 0;
+ goto again;
+ }
+#endif
if (err)
goto out;
+
if (tr->cur_image)
bpf_tramp_image_put(tr->cur_image);
tr->cur_image = im;
tr->selector++;
out:
+ /* If any error happens, restore previous flags */
+ if (err)
+ tr->flags = orig_flags;
kfree(tlinks);
return err;
}
@@ -410,7 +541,7 @@ static enum bpf_tramp_prog_type bpf_attach_type_to_tramp(struct bpf_prog *prog)
}
}
-int bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr)
+static int __bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr)
{
enum bpf_tramp_prog_type kind;
struct bpf_tramp_link *link_exiting;
@@ -418,81 +549,256 @@ int bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_trampoline
int cnt = 0, i;
kind = bpf_attach_type_to_tramp(link->link.prog);
- mutex_lock(&tr->mutex);
- if (tr->extension_prog) {
+ if (tr->extension_prog)
/* cannot attach fentry/fexit if extension prog is attached.
* cannot overwrite extension prog either.
*/
- err = -EBUSY;
- goto out;
- }
+ return -EBUSY;
for (i = 0; i < BPF_TRAMP_MAX; i++)
cnt += tr->progs_cnt[i];
if (kind == BPF_TRAMP_REPLACE) {
/* Cannot attach extension if fentry/fexit are in use. */
- if (cnt) {
- err = -EBUSY;
- goto out;
- }
+ if (cnt)
+ return -EBUSY;
tr->extension_prog = link->link.prog;
- err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_JUMP, NULL,
- link->link.prog->bpf_func);
- goto out;
- }
- if (cnt >= BPF_MAX_TRAMP_LINKS) {
- err = -E2BIG;
- goto out;
+ return bpf_arch_text_poke(tr->func.addr, BPF_MOD_JUMP, NULL,
+ link->link.prog->bpf_func);
}
- if (!hlist_unhashed(&link->tramp_hlist)) {
+ if (cnt >= BPF_MAX_TRAMP_LINKS)
+ return -E2BIG;
+ if (!hlist_unhashed(&link->tramp_hlist))
/* prog already linked */
- err = -EBUSY;
- goto out;
- }
+ return -EBUSY;
hlist_for_each_entry(link_exiting, &tr->progs_hlist[kind], tramp_hlist) {
if (link_exiting->link.prog != link->link.prog)
continue;
/* prog already linked */
- err = -EBUSY;
- goto out;
+ return -EBUSY;
}
hlist_add_head(&link->tramp_hlist, &tr->progs_hlist[kind]);
tr->progs_cnt[kind]++;
- err = bpf_trampoline_update(tr);
+ err = bpf_trampoline_update(tr, true /* lock_direct_mutex */);
if (err) {
hlist_del_init(&link->tramp_hlist);
tr->progs_cnt[kind]--;
}
-out:
+ return err;
+}
+
+int bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr)
+{
+ int err;
+
+ mutex_lock(&tr->mutex);
+ err = __bpf_trampoline_link_prog(link, tr);
mutex_unlock(&tr->mutex);
return err;
}
-/* bpf_trampoline_unlink_prog() should never fail. */
-int bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr)
+static int __bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr)
{
enum bpf_tramp_prog_type kind;
int err;
kind = bpf_attach_type_to_tramp(link->link.prog);
- mutex_lock(&tr->mutex);
if (kind == BPF_TRAMP_REPLACE) {
WARN_ON_ONCE(!tr->extension_prog);
err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_JUMP,
tr->extension_prog->bpf_func, NULL);
tr->extension_prog = NULL;
- goto out;
+ return err;
}
hlist_del_init(&link->tramp_hlist);
tr->progs_cnt[kind]--;
- err = bpf_trampoline_update(tr);
-out:
+ return bpf_trampoline_update(tr, true /* lock_direct_mutex */);
+}
+
+/* bpf_trampoline_unlink_prog() should never fail. */
+int bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr)
+{
+ int err;
+
+ mutex_lock(&tr->mutex);
+ err = __bpf_trampoline_unlink_prog(link, tr);
+ mutex_unlock(&tr->mutex);
+ return err;
+}
+
+#if defined(CONFIG_CGROUP_BPF) && defined(CONFIG_BPF_LSM)
+static void bpf_shim_tramp_link_release(struct bpf_link *link)
+{
+ struct bpf_shim_tramp_link *shim_link =
+ container_of(link, struct bpf_shim_tramp_link, link.link);
+
+ /* paired with 'shim_link->trampoline = tr' in bpf_trampoline_link_cgroup_shim */
+ if (!shim_link->trampoline)
+ return;
+
+ WARN_ON_ONCE(bpf_trampoline_unlink_prog(&shim_link->link, shim_link->trampoline));
+ bpf_trampoline_put(shim_link->trampoline);
+}
+
+static void bpf_shim_tramp_link_dealloc(struct bpf_link *link)
+{
+ struct bpf_shim_tramp_link *shim_link =
+ container_of(link, struct bpf_shim_tramp_link, link.link);
+
+ kfree(shim_link);
+}
+
+static const struct bpf_link_ops bpf_shim_tramp_link_lops = {
+ .release = bpf_shim_tramp_link_release,
+ .dealloc = bpf_shim_tramp_link_dealloc,
+};
+
+static struct bpf_shim_tramp_link *cgroup_shim_alloc(const struct bpf_prog *prog,
+ bpf_func_t bpf_func,
+ int cgroup_atype)
+{
+ struct bpf_shim_tramp_link *shim_link = NULL;
+ struct bpf_prog *p;
+
+ shim_link = kzalloc(sizeof(*shim_link), GFP_USER);
+ if (!shim_link)
+ return NULL;
+
+ p = bpf_prog_alloc(1, 0);
+ if (!p) {
+ kfree(shim_link);
+ return NULL;
+ }
+
+ p->jited = false;
+ p->bpf_func = bpf_func;
+
+ p->aux->cgroup_atype = cgroup_atype;
+ p->aux->attach_func_proto = prog->aux->attach_func_proto;
+ p->aux->attach_btf_id = prog->aux->attach_btf_id;
+ p->aux->attach_btf = prog->aux->attach_btf;
+ btf_get(p->aux->attach_btf);
+ p->type = BPF_PROG_TYPE_LSM;
+ p->expected_attach_type = BPF_LSM_MAC;
+ bpf_prog_inc(p);
+ bpf_link_init(&shim_link->link.link, BPF_LINK_TYPE_UNSPEC,
+ &bpf_shim_tramp_link_lops, p);
+ bpf_cgroup_atype_get(p->aux->attach_btf_id, cgroup_atype);
+
+ return shim_link;
+}
+
+static struct bpf_shim_tramp_link *cgroup_shim_find(struct bpf_trampoline *tr,
+ bpf_func_t bpf_func)
+{
+ struct bpf_tramp_link *link;
+ int kind;
+
+ for (kind = 0; kind < BPF_TRAMP_MAX; kind++) {
+ hlist_for_each_entry(link, &tr->progs_hlist[kind], tramp_hlist) {
+ struct bpf_prog *p = link->link.prog;
+
+ if (p->bpf_func == bpf_func)
+ return container_of(link, struct bpf_shim_tramp_link, link);
+ }
+ }
+
+ return NULL;
+}
+
+int bpf_trampoline_link_cgroup_shim(struct bpf_prog *prog,
+ int cgroup_atype)
+{
+ struct bpf_shim_tramp_link *shim_link = NULL;
+ struct bpf_attach_target_info tgt_info = {};
+ struct bpf_trampoline *tr;
+ bpf_func_t bpf_func;
+ u64 key;
+ int err;
+
+ err = bpf_check_attach_target(NULL, prog, NULL,
+ prog->aux->attach_btf_id,
+ &tgt_info);
+ if (err)
+ return err;
+
+ key = bpf_trampoline_compute_key(NULL, prog->aux->attach_btf,
+ prog->aux->attach_btf_id);
+
+ bpf_lsm_find_cgroup_shim(prog, &bpf_func);
+ tr = bpf_trampoline_get(key, &tgt_info);
+ if (!tr)
+ return -ENOMEM;
+
+ mutex_lock(&tr->mutex);
+
+ shim_link = cgroup_shim_find(tr, bpf_func);
+ if (shim_link) {
+ /* Reusing existing shim attached by the other program. */
+ bpf_link_inc(&shim_link->link.link);
+
+ mutex_unlock(&tr->mutex);
+ bpf_trampoline_put(tr); /* bpf_trampoline_get above */
+ return 0;
+ }
+
+ /* Allocate and install new shim. */
+
+ shim_link = cgroup_shim_alloc(prog, bpf_func, cgroup_atype);
+ if (!shim_link) {
+ err = -ENOMEM;
+ goto err;
+ }
+
+ err = __bpf_trampoline_link_prog(&shim_link->link, tr);
+ if (err)
+ goto err;
+
+ shim_link->trampoline = tr;
+ /* note, we're still holding tr refcnt from above */
+
mutex_unlock(&tr->mutex);
+
+ return 0;
+err:
+ mutex_unlock(&tr->mutex);
+
+ if (shim_link)
+ bpf_link_put(&shim_link->link.link);
+
+ /* have to release tr while _not_ holding its mutex */
+ bpf_trampoline_put(tr); /* bpf_trampoline_get above */
+
return err;
}
+void bpf_trampoline_unlink_cgroup_shim(struct bpf_prog *prog)
+{
+ struct bpf_shim_tramp_link *shim_link = NULL;
+ struct bpf_trampoline *tr;
+ bpf_func_t bpf_func;
+ u64 key;
+
+ key = bpf_trampoline_compute_key(NULL, prog->aux->attach_btf,
+ prog->aux->attach_btf_id);
+
+ bpf_lsm_find_cgroup_shim(prog, &bpf_func);
+ tr = bpf_trampoline_lookup(key);
+ if (WARN_ON_ONCE(!tr))
+ return;
+
+ mutex_lock(&tr->mutex);
+ shim_link = cgroup_shim_find(tr, bpf_func);
+ mutex_unlock(&tr->mutex);
+
+ if (shim_link)
+ bpf_link_put(&shim_link->link.link);
+
+ bpf_trampoline_put(tr); /* bpf_trampoline_lookup above */
+}
+#endif
+
struct bpf_trampoline *bpf_trampoline_get(u64 key,
struct bpf_attach_target_info *tgt_info)
{
@@ -535,6 +841,7 @@ void bpf_trampoline_put(struct bpf_trampoline *tr)
* multiple rcu callbacks.
*/
hlist_del(&tr->hlist);
+ kfree(tr->fops);
kfree(tr);
out:
mutex_unlock(&trampoline_mutex);
@@ -625,6 +932,31 @@ void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start, struct bpf_tramp_
rcu_read_unlock();
}
+u64 notrace __bpf_prog_enter_lsm_cgroup(struct bpf_prog *prog,
+ struct bpf_tramp_run_ctx *run_ctx)
+ __acquires(RCU)
+{
+ /* Runtime stats are exported via actual BPF_LSM_CGROUP
+ * programs, not the shims.
+ */
+ rcu_read_lock();
+ migrate_disable();
+
+ run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx);
+
+ return NO_START_TIME;
+}
+
+void notrace __bpf_prog_exit_lsm_cgroup(struct bpf_prog *prog, u64 start,
+ struct bpf_tramp_run_ctx *run_ctx)
+ __releases(RCU)
+{
+ bpf_reset_run_ctx(run_ctx->saved_run_ctx);
+
+ migrate_enable();
+ rcu_read_unlock();
+}
+
u64 notrace __bpf_prog_enter_sleepable(struct bpf_prog *prog, struct bpf_tramp_run_ctx *run_ctx)
{
rcu_read_lock_trace();
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 0efbac0fd126..096fdac70165 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -5533,17 +5533,6 @@ static bool arg_type_is_mem_size(enum bpf_arg_type type)
type == ARG_CONST_SIZE_OR_ZERO;
}
-static bool arg_type_is_alloc_size(enum bpf_arg_type type)
-{
- return type == ARG_CONST_ALLOC_SIZE_OR_ZERO;
-}
-
-static bool arg_type_is_int_ptr(enum bpf_arg_type type)
-{
- return type == ARG_PTR_TO_INT ||
- type == ARG_PTR_TO_LONG;
-}
-
static bool arg_type_is_release(enum bpf_arg_type type)
{
return type & OBJ_RELEASE;
@@ -5847,6 +5836,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
enum bpf_arg_type arg_type = fn->arg_type[arg];
enum bpf_reg_type type = reg->type;
+ u32 *arg_btf_id = NULL;
int err = 0;
if (arg_type == ARG_DONTCARE)
@@ -5883,7 +5873,11 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
*/
goto skip_type_check;
- err = check_reg_type(env, regno, arg_type, fn->arg_btf_id[arg], meta);
+ /* arg_btf_id and arg_size are in a union. */
+ if (base_type(arg_type) == ARG_PTR_TO_BTF_ID)
+ arg_btf_id = fn->arg_btf_id[arg];
+
+ err = check_reg_type(env, regno, arg_type, arg_btf_id, meta);
if (err)
return err;
@@ -5924,7 +5918,8 @@ skip_type_check:
meta->ref_obj_id = reg->ref_obj_id;
}
- if (arg_type == ARG_CONST_MAP_PTR) {
+ switch (base_type(arg_type)) {
+ case ARG_CONST_MAP_PTR:
/* bpf_map_xxx(map_ptr) call: remember that map_ptr */
if (meta->map_ptr) {
/* Use map_uid (which is unique id of inner map) to reject:
@@ -5949,7 +5944,8 @@ skip_type_check:
}
meta->map_ptr = reg->map_ptr;
meta->map_uid = reg->map_uid;
- } else if (arg_type == ARG_PTR_TO_MAP_KEY) {
+ break;
+ case ARG_PTR_TO_MAP_KEY:
/* bpf_map_xxx(..., map_ptr, ..., key) call:
* check that [key, key + map->key_size) are within
* stack limits and initialized
@@ -5966,7 +5962,8 @@ skip_type_check:
err = check_helper_mem_access(env, regno,
meta->map_ptr->key_size, false,
NULL);
- } else if (base_type(arg_type) == ARG_PTR_TO_MAP_VALUE) {
+ break;
+ case ARG_PTR_TO_MAP_VALUE:
if (type_may_be_null(arg_type) && register_is_null(reg))
return 0;
@@ -5982,14 +5979,16 @@ skip_type_check:
err = check_helper_mem_access(env, regno,
meta->map_ptr->value_size, false,
meta);
- } else if (arg_type == ARG_PTR_TO_PERCPU_BTF_ID) {
+ break;
+ case ARG_PTR_TO_PERCPU_BTF_ID:
if (!reg->btf_id) {
verbose(env, "Helper has invalid btf_id in R%d\n", regno);
return -EACCES;
}
meta->ret_btf = reg->btf;
meta->ret_btf_id = reg->btf_id;
- } else if (arg_type == ARG_PTR_TO_SPIN_LOCK) {
+ break;
+ case ARG_PTR_TO_SPIN_LOCK:
if (meta->func_id == BPF_FUNC_spin_lock) {
if (process_spin_lock(env, regno, true))
return -EACCES;
@@ -6000,21 +5999,32 @@ skip_type_check:
verbose(env, "verifier internal error\n");
return -EFAULT;
}
- } else if (arg_type == ARG_PTR_TO_TIMER) {
+ break;
+ case ARG_PTR_TO_TIMER:
if (process_timer_func(env, regno, meta))
return -EACCES;
- } else if (arg_type == ARG_PTR_TO_FUNC) {
+ break;
+ case ARG_PTR_TO_FUNC:
meta->subprogno = reg->subprogno;
- } else if (base_type(arg_type) == ARG_PTR_TO_MEM) {
+ break;
+ case ARG_PTR_TO_MEM:
/* The access to this pointer is only checked when we hit the
* next is_mem_size argument below.
*/
meta->raw_mode = arg_type & MEM_UNINIT;
- } else if (arg_type_is_mem_size(arg_type)) {
- bool zero_size_allowed = (arg_type == ARG_CONST_SIZE_OR_ZERO);
-
- err = check_mem_size_reg(env, reg, regno, zero_size_allowed, meta);
- } else if (arg_type_is_dynptr(arg_type)) {
+ if (arg_type & MEM_FIXED_SIZE) {
+ err = check_helper_mem_access(env, regno,
+ fn->arg_size[arg], false,
+ meta);
+ }
+ break;
+ case ARG_CONST_SIZE:
+ err = check_mem_size_reg(env, reg, regno, false, meta);
+ break;
+ case ARG_CONST_SIZE_OR_ZERO:
+ err = check_mem_size_reg(env, reg, regno, true, meta);
+ break;
+ case ARG_PTR_TO_DYNPTR:
if (arg_type & MEM_UNINIT) {
if (!is_dynptr_reg_valid_uninit(env, reg)) {
verbose(env, "Dynptr has to be an uninitialized dynptr\n");
@@ -6048,21 +6058,28 @@ skip_type_check:
err_extra, arg + 1);
return -EINVAL;
}
- } else if (arg_type_is_alloc_size(arg_type)) {
+ break;
+ case ARG_CONST_ALLOC_SIZE_OR_ZERO:
if (!tnum_is_const(reg->var_off)) {
verbose(env, "R%d is not a known constant'\n",
regno);
return -EACCES;
}
meta->mem_size = reg->var_off.value;
- } else if (arg_type_is_int_ptr(arg_type)) {
+ break;
+ case ARG_PTR_TO_INT:
+ case ARG_PTR_TO_LONG:
+ {
int size = int_ptr_type_to_size(arg_type);
err = check_helper_mem_access(env, regno, size, false, meta);
if (err)
return err;
err = check_ptr_alignment(env, reg, 0, size, true);
- } else if (arg_type == ARG_PTR_TO_CONST_STR) {
+ break;
+ }
+ case ARG_PTR_TO_CONST_STR:
+ {
struct bpf_map *map = reg->map_ptr;
int map_off;
u64 map_addr;
@@ -6101,9 +6118,12 @@ skip_type_check:
verbose(env, "string is not zero-terminated\n");
return -EINVAL;
}
- } else if (arg_type == ARG_PTR_TO_KPTR) {
+ break;
+ }
+ case ARG_PTR_TO_KPTR:
if (process_kptr_func(env, regno, meta))
return -EACCES;
+ break;
}
return err;
@@ -6143,7 +6163,8 @@ static bool may_update_sockmap(struct bpf_verifier_env *env, int func_id)
static bool allow_tail_call_in_subprogs(struct bpf_verifier_env *env)
{
- return env->prog->jit_requested && IS_ENABLED(CONFIG_X86_64);
+ return env->prog->jit_requested &&
+ bpf_jit_supports_subprog_tailcalls();
}
static int check_map_func_compatibility(struct bpf_verifier_env *env,
@@ -6399,11 +6420,19 @@ static bool check_raw_mode_ok(const struct bpf_func_proto *fn)
return count <= 1;
}
-static bool check_args_pair_invalid(enum bpf_arg_type arg_curr,
- enum bpf_arg_type arg_next)
+static bool check_args_pair_invalid(const struct bpf_func_proto *fn, int arg)
{
- return (base_type(arg_curr) == ARG_PTR_TO_MEM) !=
- arg_type_is_mem_size(arg_next);
+ bool is_fixed = fn->arg_type[arg] & MEM_FIXED_SIZE;
+ bool has_size = fn->arg_size[arg] != 0;
+ bool is_next_size = false;
+
+ if (arg + 1 < ARRAY_SIZE(fn->arg_type))
+ is_next_size = arg_type_is_mem_size(fn->arg_type[arg + 1]);
+
+ if (base_type(fn->arg_type[arg]) != ARG_PTR_TO_MEM)
+ return is_next_size;
+
+ return has_size == is_next_size || is_next_size == is_fixed;
}
static bool check_arg_pair_ok(const struct bpf_func_proto *fn)
@@ -6414,11 +6443,11 @@ static bool check_arg_pair_ok(const struct bpf_func_proto *fn)
* helper function specification.
*/
if (arg_type_is_mem_size(fn->arg1_type) ||
- base_type(fn->arg5_type) == ARG_PTR_TO_MEM ||
- check_args_pair_invalid(fn->arg1_type, fn->arg2_type) ||
- check_args_pair_invalid(fn->arg2_type, fn->arg3_type) ||
- check_args_pair_invalid(fn->arg3_type, fn->arg4_type) ||
- check_args_pair_invalid(fn->arg4_type, fn->arg5_type))
+ check_args_pair_invalid(fn, 0) ||
+ check_args_pair_invalid(fn, 1) ||
+ check_args_pair_invalid(fn, 2) ||
+ check_args_pair_invalid(fn, 3) ||
+ check_args_pair_invalid(fn, 4))
return false;
return true;
@@ -6459,7 +6488,10 @@ static bool check_btf_id_ok(const struct bpf_func_proto *fn)
if (base_type(fn->arg_type[i]) == ARG_PTR_TO_BTF_ID && !fn->arg_btf_id[i])
return false;
- if (base_type(fn->arg_type[i]) != ARG_PTR_TO_BTF_ID && fn->arg_btf_id[i])
+ if (base_type(fn->arg_type[i]) != ARG_PTR_TO_BTF_ID && fn->arg_btf_id[i] &&
+ /* arg_btf_id and arg_size are in a union. */
+ (base_type(fn->arg_type[i]) != ARG_PTR_TO_MEM ||
+ !(fn->arg_type[i] & MEM_FIXED_SIZE)))
return false;
}
@@ -7100,9 +7132,45 @@ static int check_get_func_ip(struct bpf_verifier_env *env)
return -ENOTSUPP;
}
+static struct bpf_insn_aux_data *cur_aux(struct bpf_verifier_env *env)
+{
+ return &env->insn_aux_data[env->insn_idx];
+}
+
+static bool loop_flag_is_zero(struct bpf_verifier_env *env)
+{
+ struct bpf_reg_state *regs = cur_regs(env);
+ struct bpf_reg_state *reg = &regs[BPF_REG_4];
+ bool reg_is_null = register_is_null(reg);
+
+ if (reg_is_null)
+ mark_chain_precision(env, BPF_REG_4);
+
+ return reg_is_null;
+}
+
+static void update_loop_inline_state(struct bpf_verifier_env *env, u32 subprogno)
+{
+ struct bpf_loop_inline_state *state = &cur_aux(env)->loop_inline_state;
+
+ if (!state->initialized) {
+ state->initialized = 1;
+ state->fit_for_inline = loop_flag_is_zero(env);
+ state->callback_subprogno = subprogno;
+ return;
+ }
+
+ if (!state->fit_for_inline)
+ return;
+
+ state->fit_for_inline = (loop_flag_is_zero(env) &&
+ state->callback_subprogno == subprogno);
+}
+
static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
int *insn_idx_p)
{
+ enum bpf_prog_type prog_type = resolve_prog_type(env->prog);
const struct bpf_func_proto *fn = NULL;
enum bpf_return_type ret_type;
enum bpf_type_flag ret_flag;
@@ -7252,6 +7320,7 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
err = check_bpf_snprintf_call(env, regs);
break;
case BPF_FUNC_loop:
+ update_loop_inline_state(env, meta.subprogno);
err = __check_func_call(env, insn, insn_idx_p, meta.subprogno,
set_loop_callback_state);
break;
@@ -7261,6 +7330,19 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
reg_type_str(env, regs[BPF_REG_1].type));
return -EACCES;
}
+ break;
+ case BPF_FUNC_set_retval:
+ if (prog_type == BPF_PROG_TYPE_LSM &&
+ env->prog->expected_attach_type == BPF_LSM_CGROUP) {
+ if (!env->prog->aux->attach_func_proto->type) {
+ /* Make sure programs that attach to void
+ * hooks don't try to modify return value.
+ */
+ verbose(env, "BPF_LSM_CGROUP that attach to void LSM hooks can't modify return value!\n");
+ return -EINVAL;
+ }
+ }
+ break;
}
if (err)
@@ -7480,6 +7562,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
int err, insn_idx = *insn_idx_p;
const struct btf_param *args;
struct btf *desc_btf;
+ u32 *kfunc_flags;
bool acq;
/* skip for now, but return error when we find this in fixup_kfunc_call */
@@ -7495,18 +7578,16 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
func_name = btf_name_by_offset(desc_btf, func->name_off);
func_proto = btf_type_by_id(desc_btf, func->type);
- if (!btf_kfunc_id_set_contains(desc_btf, resolve_prog_type(env->prog),
- BTF_KFUNC_TYPE_CHECK, func_id)) {
+ kfunc_flags = btf_kfunc_id_set_contains(desc_btf, resolve_prog_type(env->prog), func_id);
+ if (!kfunc_flags) {
verbose(env, "calling kernel function %s is not allowed\n",
func_name);
return -EACCES;
}
-
- acq = btf_kfunc_id_set_contains(desc_btf, resolve_prog_type(env->prog),
- BTF_KFUNC_TYPE_ACQUIRE, func_id);
+ acq = *kfunc_flags & KF_ACQUIRE;
/* Check the arguments */
- err = btf_check_kfunc_arg_match(env, desc_btf, func_id, regs);
+ err = btf_check_kfunc_arg_match(env, desc_btf, func_id, regs, *kfunc_flags);
if (err < 0)
return err;
/* In case of release function, we get register number of refcounted
@@ -7550,8 +7631,7 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
regs[BPF_REG_0].btf = desc_btf;
regs[BPF_REG_0].type = PTR_TO_BTF_ID;
regs[BPF_REG_0].btf_id = ptr_type_id;
- if (btf_kfunc_id_set_contains(desc_btf, resolve_prog_type(env->prog),
- BTF_KFUNC_TYPE_RET_NULL, func_id)) {
+ if (*kfunc_flags & KF_RET_NULL) {
regs[BPF_REG_0].type |= PTR_MAYBE_NULL;
/* For mark_ptr_or_null_reg, see 93c230e3f5bd6 */
regs[BPF_REG_0].id = ++env->id_gen;
@@ -7658,11 +7738,6 @@ static bool check_reg_sane_offset(struct bpf_verifier_env *env,
return true;
}
-static struct bpf_insn_aux_data *cur_aux(struct bpf_verifier_env *env)
-{
- return &env->insn_aux_data[env->insn_idx];
-}
-
enum {
REASON_BOUNDS = -1,
REASON_TYPE = -2,
@@ -9033,7 +9108,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
if (opcode == BPF_END || opcode == BPF_NEG) {
if (opcode == BPF_NEG) {
- if (BPF_SRC(insn->code) != 0 ||
+ if (BPF_SRC(insn->code) != BPF_K ||
insn->src_reg != BPF_REG_0 ||
insn->off != 0 || insn->imm != 0) {
verbose(env, "BPF_NEG uses reserved fields\n");
@@ -10360,11 +10435,21 @@ static int check_return_code(struct bpf_verifier_env *env)
const bool is_subprog = frame->subprogno;
/* LSM and struct_ops func-ptr's return type could be "void" */
- if (!is_subprog &&
- (prog_type == BPF_PROG_TYPE_STRUCT_OPS ||
- prog_type == BPF_PROG_TYPE_LSM) &&
- !prog->aux->attach_func_proto->type)
- return 0;
+ if (!is_subprog) {
+ switch (prog_type) {
+ case BPF_PROG_TYPE_LSM:
+ if (prog->expected_attach_type == BPF_LSM_CGROUP)
+ /* See below, can be 0 or 0-1 depending on hook. */
+ break;
+ fallthrough;
+ case BPF_PROG_TYPE_STRUCT_OPS:
+ if (!prog->aux->attach_func_proto->type)
+ return 0;
+ break;
+ default:
+ break;
+ }
+ }
/* eBPF calling convention is such that R0 is used
* to return the value from eBPF program.
@@ -10455,6 +10540,22 @@ static int check_return_code(struct bpf_verifier_env *env)
case BPF_PROG_TYPE_SK_LOOKUP:
range = tnum_range(SK_DROP, SK_PASS);
break;
+
+ case BPF_PROG_TYPE_LSM:
+ if (env->prog->expected_attach_type != BPF_LSM_CGROUP) {
+ /* Regular BPF_PROG_TYPE_LSM programs can return
+ * any value.
+ */
+ return 0;
+ }
+ if (!env->prog->aux->attach_func_proto->type) {
+ /* Make sure programs that attach to void
+ * hooks don't try to modify return value.
+ */
+ range = tnum_range(1, 1);
+ }
+ break;
+
case BPF_PROG_TYPE_EXT:
/* freplace program can return anything as its return value
* depends on the to-be-replaced kernel func or bpf program.
@@ -10471,6 +10572,10 @@ static int check_return_code(struct bpf_verifier_env *env)
if (!tnum_in(range, reg->var_off)) {
verbose_invalid_scalar(env, reg, &range, "program exit", "R0");
+ if (prog->expected_attach_type == BPF_LSM_CGROUP &&
+ prog_type == BPF_PROG_TYPE_LSM &&
+ !prog->aux->attach_func_proto->type)
+ verbose(env, "Note, BPF_LSM_CGROUP that attach to void LSM hooks can't modify return value!\n");
return -EINVAL;
}
@@ -10882,7 +10987,7 @@ static int check_btf_func(struct bpf_verifier_env *env,
goto err_free;
ret_type = btf_type_skip_modifiers(btf, func_proto->type, NULL);
scalar_return =
- btf_type_is_small_int(ret_type) || btf_type_is_enum(ret_type);
+ btf_type_is_small_int(ret_type) || btf_is_any_enum(ret_type);
if (i && !scalar_return && env->subprog_info[i].has_ld_abs) {
verbose(env, "LD_ABS is only allowed in functions that return 'int'.\n");
goto err_free;
@@ -12467,6 +12572,7 @@ static bool is_tracing_prog_type(enum bpf_prog_type type)
case BPF_PROG_TYPE_TRACEPOINT:
case BPF_PROG_TYPE_PERF_EVENT:
case BPF_PROG_TYPE_RAW_TRACEPOINT:
+ case BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE:
return true;
default:
return false;
@@ -13525,6 +13631,7 @@ static int jit_subprogs(struct bpf_verifier_env *env)
/* Below members will be freed only at prog->aux */
func[i]->aux->btf = prog->aux->btf;
func[i]->aux->func_info = prog->aux->func_info;
+ func[i]->aux->func_info_cnt = prog->aux->func_info_cnt;
func[i]->aux->poke_tab = prog->aux->poke_tab;
func[i]->aux->size_poke_tab = prog->aux->size_poke_tab;
@@ -13537,9 +13644,6 @@ static int jit_subprogs(struct bpf_verifier_env *env)
poke->aux = func[i]->aux;
}
- /* Use bpf_prog_F_tag to indicate functions in stack traces.
- * Long term would need debug info to populate names
- */
func[i]->aux->name[0] = 'F';
func[i]->aux->stack_depth = env->subprog_info[i].stack_depth;
func[i]->jit_requested = 1;
@@ -14275,6 +14379,142 @@ patch_call_imm:
return 0;
}
+static struct bpf_prog *inline_bpf_loop(struct bpf_verifier_env *env,
+ int position,
+ s32 stack_base,
+ u32 callback_subprogno,
+ u32 *cnt)
+{
+ s32 r6_offset = stack_base + 0 * BPF_REG_SIZE;
+ s32 r7_offset = stack_base + 1 * BPF_REG_SIZE;
+ s32 r8_offset = stack_base + 2 * BPF_REG_SIZE;
+ int reg_loop_max = BPF_REG_6;
+ int reg_loop_cnt = BPF_REG_7;
+ int reg_loop_ctx = BPF_REG_8;
+
+ struct bpf_prog *new_prog;
+ u32 callback_start;
+ u32 call_insn_offset;
+ s32 callback_offset;
+
+ /* This represents an inlined version of bpf_iter.c:bpf_loop,
+ * be careful to modify this code in sync.
+ */
+ struct bpf_insn insn_buf[] = {
+ /* Return error and jump to the end of the patch if
+ * expected number of iterations is too big.
+ */
+ BPF_JMP_IMM(BPF_JLE, BPF_REG_1, BPF_MAX_LOOPS, 2),
+ BPF_MOV32_IMM(BPF_REG_0, -E2BIG),
+ BPF_JMP_IMM(BPF_JA, 0, 0, 16),
+ /* spill R6, R7, R8 to use these as loop vars */
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_6, r6_offset),
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_7, r7_offset),
+ BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_8, r8_offset),
+ /* initialize loop vars */
+ BPF_MOV64_REG(reg_loop_max, BPF_REG_1),
+ BPF_MOV32_IMM(reg_loop_cnt, 0),
+ BPF_MOV64_REG(reg_loop_ctx, BPF_REG_3),
+ /* loop header,
+ * if reg_loop_cnt >= reg_loop_max skip the loop body
+ */
+ BPF_JMP_REG(BPF_JGE, reg_loop_cnt, reg_loop_max, 5),
+ /* callback call,
+ * correct callback offset would be set after patching
+ */
+ BPF_MOV64_REG(BPF_REG_1, reg_loop_cnt),
+ BPF_MOV64_REG(BPF_REG_2, reg_loop_ctx),
+ BPF_CALL_REL(0),
+ /* increment loop counter */
+ BPF_ALU64_IMM(BPF_ADD, reg_loop_cnt, 1),
+ /* jump to loop header if callback returned 0 */
+ BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, -6),
+ /* return value of bpf_loop,
+ * set R0 to the number of iterations
+ */
+ BPF_MOV64_REG(BPF_REG_0, reg_loop_cnt),
+ /* restore original values of R6, R7, R8 */
+ BPF_LDX_MEM(BPF_DW, BPF_REG_6, BPF_REG_10, r6_offset),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_7, BPF_REG_10, r7_offset),
+ BPF_LDX_MEM(BPF_DW, BPF_REG_8, BPF_REG_10, r8_offset),
+ };
+
+ *cnt = ARRAY_SIZE(insn_buf);
+ new_prog = bpf_patch_insn_data(env, position, insn_buf, *cnt);
+ if (!new_prog)
+ return new_prog;
+
+ /* callback start is known only after patching */
+ callback_start = env->subprog_info[callback_subprogno].start;
+ /* Note: insn_buf[12] is an offset of BPF_CALL_REL instruction */
+ call_insn_offset = position + 12;
+ callback_offset = callback_start - call_insn_offset - 1;
+ new_prog->insnsi[call_insn_offset].imm = callback_offset;
+
+ return new_prog;
+}
+
+static bool is_bpf_loop_call(struct bpf_insn *insn)
+{
+ return insn->code == (BPF_JMP | BPF_CALL) &&
+ insn->src_reg == 0 &&
+ insn->imm == BPF_FUNC_loop;
+}
+
+/* For all sub-programs in the program (including main) check
+ * insn_aux_data to see if there are bpf_loop calls that require
+ * inlining. If such calls are found the calls are replaced with a
+ * sequence of instructions produced by `inline_bpf_loop` function and
+ * subprog stack_depth is increased by the size of 3 registers.
+ * This stack space is used to spill values of the R6, R7, R8. These
+ * registers are used to store the loop bound, counter and context
+ * variables.
+ */
+static int optimize_bpf_loop(struct bpf_verifier_env *env)
+{
+ struct bpf_subprog_info *subprogs = env->subprog_info;
+ int i, cur_subprog = 0, cnt, delta = 0;
+ struct bpf_insn *insn = env->prog->insnsi;
+ int insn_cnt = env->prog->len;
+ u16 stack_depth = subprogs[cur_subprog].stack_depth;
+ u16 stack_depth_roundup = round_up(stack_depth, 8) - stack_depth;
+ u16 stack_depth_extra = 0;
+
+ for (i = 0; i < insn_cnt; i++, insn++) {
+ struct bpf_loop_inline_state *inline_state =
+ &env->insn_aux_data[i + delta].loop_inline_state;
+
+ if (is_bpf_loop_call(insn) && inline_state->fit_for_inline) {
+ struct bpf_prog *new_prog;
+
+ stack_depth_extra = BPF_REG_SIZE * 3 + stack_depth_roundup;
+ new_prog = inline_bpf_loop(env,
+ i + delta,
+ -(stack_depth + stack_depth_extra),
+ inline_state->callback_subprogno,
+ &cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ }
+
+ if (subprogs[cur_subprog + 1].start == i + delta + 1) {
+ subprogs[cur_subprog].stack_depth += stack_depth_extra;
+ cur_subprog++;
+ stack_depth = subprogs[cur_subprog].stack_depth;
+ stack_depth_roundup = round_up(stack_depth, 8) - stack_depth;
+ stack_depth_extra = 0;
+ }
+ }
+
+ env->prog->aux->stack_depth = env->subprog_info[0].stack_depth;
+
+ return 0;
+}
+
static void free_states(struct bpf_verifier_env *env)
{
struct bpf_verifier_state_list *sl, *sln;
@@ -14694,6 +14934,7 @@ int bpf_check_attach_target(struct bpf_verifier_log *log,
fallthrough;
case BPF_MODIFY_RETURN:
case BPF_LSM_MAC:
+ case BPF_LSM_CGROUP:
case BPF_TRACE_FENTRY:
case BPF_TRACE_FEXIT:
if (!btf_type_is_func(t)) {
@@ -14810,8 +15051,8 @@ static int check_attach_btf_id(struct bpf_verifier_env *env)
}
if (prog->aux->sleepable && prog->type != BPF_PROG_TYPE_TRACING &&
- prog->type != BPF_PROG_TYPE_LSM) {
- verbose(env, "Only fentry/fexit/fmod_ret and lsm programs can be sleepable\n");
+ prog->type != BPF_PROG_TYPE_LSM && prog->type != BPF_PROG_TYPE_KPROBE) {
+ verbose(env, "Only fentry/fexit/fmod_ret, lsm, and kprobe/uprobe programs can be sleepable\n");
return -EINVAL;
}
@@ -15012,6 +15253,9 @@ skip_full_check:
ret = check_max_stack_depth(env);
/* instruction rewrites happen after this point */
+ if (ret == 0)
+ ret = optimize_bpf_loop(env);
+
if (is_priv) {
if (ret == 0)
opt_hard_wire_dead_code_branches(env);
diff --git a/kernel/events/core.c b/kernel/events/core.c
index c9d32d4d2e20..4e718b93442b 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -10083,26 +10083,30 @@ static inline bool perf_event_is_tracing(struct perf_event *event)
int perf_event_set_bpf_prog(struct perf_event *event, struct bpf_prog *prog,
u64 bpf_cookie)
{
- bool is_kprobe, is_tracepoint, is_syscall_tp;
+ bool is_kprobe, is_uprobe, is_tracepoint, is_syscall_tp;
if (!perf_event_is_tracing(event))
return perf_event_set_bpf_handler(event, prog, bpf_cookie);
- is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_UKPROBE;
+ is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_KPROBE;
+ is_uprobe = event->tp_event->flags & TRACE_EVENT_FL_UPROBE;
is_tracepoint = event->tp_event->flags & TRACE_EVENT_FL_TRACEPOINT;
is_syscall_tp = is_syscall_trace_event(event->tp_event);
- if (!is_kprobe && !is_tracepoint && !is_syscall_tp)
+ if (!is_kprobe && !is_uprobe && !is_tracepoint && !is_syscall_tp)
/* bpf programs can only be attached to u/kprobe or tracepoint */
return -EINVAL;
- if ((is_kprobe && prog->type != BPF_PROG_TYPE_KPROBE) ||
+ if (((is_kprobe || is_uprobe) && prog->type != BPF_PROG_TYPE_KPROBE) ||
(is_tracepoint && prog->type != BPF_PROG_TYPE_TRACEPOINT) ||
(is_syscall_tp && prog->type != BPF_PROG_TYPE_TRACEPOINT))
return -EINVAL;
+ if (prog->type == BPF_PROG_TYPE_KPROBE && prog->aux->sleepable && !is_uprobe)
+ /* only uprobe programs are allowed to be sleepable */
+ return -EINVAL;
+
/* Kprobe override only works for kprobes, not uprobes. */
- if (prog->kprobe_override &&
- !(event->tp_event->flags & TRACE_EVENT_FL_KPROBE))
+ if (prog->kprobe_override && !is_kprobe)
return -EINVAL;
if (is_tracepoint || is_syscall_tp) {
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index fbdf8d3279ac..79a85834ce9d 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -30,6 +30,7 @@
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/bsearch.h>
+#include <linux/btf_ids.h>
/*
* These will be re-linked against their real values
@@ -799,6 +800,96 @@ static const struct seq_operations kallsyms_op = {
.show = s_show
};
+#ifdef CONFIG_BPF_SYSCALL
+
+struct bpf_iter__ksym {
+ __bpf_md_ptr(struct bpf_iter_meta *, meta);
+ __bpf_md_ptr(struct kallsym_iter *, ksym);
+};
+
+static int ksym_prog_seq_show(struct seq_file *m, bool in_stop)
+{
+ struct bpf_iter__ksym ctx;
+ struct bpf_iter_meta meta;
+ struct bpf_prog *prog;
+
+ meta.seq = m;
+ prog = bpf_iter_get_info(&meta, in_stop);
+ if (!prog)
+ return 0;
+
+ ctx.meta = &meta;
+ ctx.ksym = m ? m->private : NULL;
+ return bpf_iter_run_prog(prog, &ctx);
+}
+
+static int bpf_iter_ksym_seq_show(struct seq_file *m, void *p)
+{
+ return ksym_prog_seq_show(m, false);
+}
+
+static void bpf_iter_ksym_seq_stop(struct seq_file *m, void *p)
+{
+ if (!p)
+ (void) ksym_prog_seq_show(m, true);
+ else
+ s_stop(m, p);
+}
+
+static const struct seq_operations bpf_iter_ksym_ops = {
+ .start = s_start,
+ .next = s_next,
+ .stop = bpf_iter_ksym_seq_stop,
+ .show = bpf_iter_ksym_seq_show,
+};
+
+static int bpf_iter_ksym_init(void *priv_data, struct bpf_iter_aux_info *aux)
+{
+ struct kallsym_iter *iter = priv_data;
+
+ reset_iter(iter, 0);
+
+ /* cache here as in kallsyms_open() case; use current process
+ * credentials to tell BPF iterators if values should be shown.
+ */
+ iter->show_value = kallsyms_show_value(current_cred());
+
+ return 0;
+}
+
+DEFINE_BPF_ITER_FUNC(ksym, struct bpf_iter_meta *meta, struct kallsym_iter *ksym)
+
+static const struct bpf_iter_seq_info ksym_iter_seq_info = {
+ .seq_ops = &bpf_iter_ksym_ops,
+ .init_seq_private = bpf_iter_ksym_init,
+ .fini_seq_private = NULL,
+ .seq_priv_size = sizeof(struct kallsym_iter),
+};
+
+static struct bpf_iter_reg ksym_iter_reg_info = {
+ .target = "ksym",
+ .feature = BPF_ITER_RESCHED,
+ .ctx_arg_info_size = 1,
+ .ctx_arg_info = {
+ { offsetof(struct bpf_iter__ksym, ksym),
+ PTR_TO_BTF_ID_OR_NULL },
+ },
+ .seq_info = &ksym_iter_seq_info,
+};
+
+BTF_ID_LIST(btf_ksym_iter_id)
+BTF_ID(struct, kallsym_iter)
+
+static int __init bpf_ksym_iter_register(void)
+{
+ ksym_iter_reg_info.ctx_arg_info[0].btf_id = *btf_ksym_iter_id;
+ return bpf_iter_reg_target(&ksym_iter_reg_info);
+}
+
+late_initcall(bpf_ksym_iter_register);
+
+#endif /* CONFIG_BPF_SYSCALL */
+
static inline int kallsyms_for_perf(void)
{
#ifdef CONFIG_PERF_EVENTS
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 35d034219513..b233714a1c78 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1240,6 +1240,30 @@ static int do_proc_dointvec_ms_jiffies_conv(bool *negp, unsigned long *lvalp,
return 0;
}
+static int do_proc_dointvec_ms_jiffies_minmax_conv(bool *negp, unsigned long *lvalp,
+ int *valp, int write, void *data)
+{
+ int tmp, ret;
+ struct do_proc_dointvec_minmax_conv_param *param = data;
+ /*
+ * If writing, first do so via a temporary local int so we can
+ * bounds-check it before touching *valp.
+ */
+ int *ip = write ? &tmp : valp;
+
+ ret = do_proc_dointvec_ms_jiffies_conv(negp, lvalp, ip, write, data);
+ if (ret)
+ return ret;
+
+ if (write) {
+ if ((param->min && *param->min > tmp) ||
+ (param->max && *param->max < tmp))
+ return -EINVAL;
+ *valp = tmp;
+ }
+ return 0;
+}
+
/**
* proc_dointvec_jiffies - read a vector of integers as seconds
* @table: the sysctl table
@@ -1262,6 +1286,17 @@ int proc_dointvec_jiffies(struct ctl_table *table, int write,
do_proc_dointvec_jiffies_conv,NULL);
}
+int proc_dointvec_ms_jiffies_minmax(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct do_proc_dointvec_minmax_conv_param param = {
+ .min = (int *) table->extra1,
+ .max = (int *) table->extra2,
+ };
+ return do_proc_dointvec(table, write, buffer, lenp, ppos,
+ do_proc_dointvec_ms_jiffies_minmax_conv, &param);
+}
+
/**
* proc_dointvec_userhz_jiffies - read a vector of integers as 1/USER_HZ seconds
* @table: the sysctl table
@@ -1526,6 +1561,12 @@ int proc_dointvec_jiffies(struct ctl_table *table, int write,
return -ENOSYS;
}
+int proc_dointvec_ms_jiffies_minmax(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ return -ENOSYS;
+}
+
int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 0ea8702eb516..23af5eca11b1 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -2311,6 +2311,7 @@ schedule_hrtimeout_range_clock(ktime_t *expires, u64 delta,
return !t.task ? 0 : -EINTR;
}
+EXPORT_SYMBOL_GPL(schedule_hrtimeout_range_clock);
/**
* schedule_hrtimeout_range - sleep until timeout
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 88589d74a892..68e5cdd24cef 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -1936,7 +1936,7 @@ int perf_event_attach_bpf_prog(struct perf_event *event,
event->prog = prog;
event->bpf_cookie = bpf_cookie;
rcu_assign_pointer(event->tp_event->prog_array, new_array);
- bpf_prog_array_free(old_array);
+ bpf_prog_array_free_sleepable(old_array);
unlock:
mutex_unlock(&bpf_event_mutex);
@@ -1962,7 +1962,7 @@ void perf_event_detach_bpf_prog(struct perf_event *event)
bpf_prog_array_delete_safe(old_array, event->prog);
} else {
rcu_assign_pointer(event->tp_event->prog_array, new_array);
- bpf_prog_array_free(old_array);
+ bpf_prog_array_free_sleepable(old_array);
}
bpf_prog_put(event->prog);
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 601ccf1b2f09..bc921a3f7ea8 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1861,6 +1861,8 @@ static void ftrace_hash_rec_enable_modify(struct ftrace_ops *ops,
ftrace_hash_rec_update_modify(ops, filter_hash, 1);
}
+static bool ops_references_ip(struct ftrace_ops *ops, unsigned long ip);
+
/*
* Try to update IPMODIFY flag on each ftrace_rec. Return 0 if it is OK
* or no-needed to update, -EBUSY if it detects a conflict of the flag
@@ -1869,6 +1871,13 @@ static void ftrace_hash_rec_enable_modify(struct ftrace_ops *ops,
* - If the hash is NULL, it hits all recs (if IPMODIFY is set, this is rejected)
* - If the hash is EMPTY_HASH, it hits nothing
* - Anything else hits the recs which match the hash entries.
+ *
+ * DIRECT ops does not have IPMODIFY flag, but we still need to check it
+ * against functions with FTRACE_FL_IPMODIFY. If there is any overlap, call
+ * ops_func(SHARE_IPMODIFY_SELF) to make sure current ops can share with
+ * IPMODIFY. If ops_func(SHARE_IPMODIFY_SELF) returns non-zero, propagate
+ * the return value to the caller and eventually to the owner of the DIRECT
+ * ops.
*/
static int __ftrace_hash_update_ipmodify(struct ftrace_ops *ops,
struct ftrace_hash *old_hash,
@@ -1877,17 +1886,26 @@ static int __ftrace_hash_update_ipmodify(struct ftrace_ops *ops,
struct ftrace_page *pg;
struct dyn_ftrace *rec, *end = NULL;
int in_old, in_new;
+ bool is_ipmodify, is_direct;
/* Only update if the ops has been registered */
if (!(ops->flags & FTRACE_OPS_FL_ENABLED))
return 0;
- if (!(ops->flags & FTRACE_OPS_FL_IPMODIFY))
+ is_ipmodify = ops->flags & FTRACE_OPS_FL_IPMODIFY;
+ is_direct = ops->flags & FTRACE_OPS_FL_DIRECT;
+
+ /* neither IPMODIFY nor DIRECT, skip */
+ if (!is_ipmodify && !is_direct)
+ return 0;
+
+ if (WARN_ON_ONCE(is_ipmodify && is_direct))
return 0;
/*
- * Since the IPMODIFY is a very address sensitive action, we do not
- * allow ftrace_ops to set all functions to new hash.
+ * Since the IPMODIFY and DIRECT are very address sensitive
+ * actions, we do not allow ftrace_ops to set all functions to new
+ * hash.
*/
if (!new_hash || !old_hash)
return -EINVAL;
@@ -1905,12 +1923,32 @@ static int __ftrace_hash_update_ipmodify(struct ftrace_ops *ops,
continue;
if (in_new) {
- /* New entries must ensure no others are using it */
- if (rec->flags & FTRACE_FL_IPMODIFY)
- goto rollback;
- rec->flags |= FTRACE_FL_IPMODIFY;
- } else /* Removed entry */
+ if (rec->flags & FTRACE_FL_IPMODIFY) {
+ int ret;
+
+ /* Cannot have two ipmodify on same rec */
+ if (is_ipmodify)
+ goto rollback;
+
+ FTRACE_WARN_ON(rec->flags & FTRACE_FL_DIRECT);
+
+ /*
+ * Another ops with IPMODIFY is already
+ * attached. We are now attaching a direct
+ * ops. Run SHARE_IPMODIFY_SELF, to check
+ * whether sharing is supported.
+ */
+ if (!ops->ops_func)
+ return -EBUSY;
+ ret = ops->ops_func(ops, FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY_SELF);
+ if (ret)
+ return ret;
+ } else if (is_ipmodify) {
+ rec->flags |= FTRACE_FL_IPMODIFY;
+ }
+ } else if (is_ipmodify) {
rec->flags &= ~FTRACE_FL_IPMODIFY;
+ }
} while_for_each_ftrace_rec();
return 0;
@@ -2454,8 +2492,7 @@ static void call_direct_funcs(unsigned long ip, unsigned long pip,
struct ftrace_ops direct_ops = {
.func = call_direct_funcs,
- .flags = FTRACE_OPS_FL_IPMODIFY
- | FTRACE_OPS_FL_DIRECT | FTRACE_OPS_FL_SAVE_REGS
+ .flags = FTRACE_OPS_FL_DIRECT | FTRACE_OPS_FL_SAVE_REGS
| FTRACE_OPS_FL_PERMANENT,
/*
* By declaring the main trampoline as this trampoline
@@ -3072,14 +3109,14 @@ static inline int ops_traces_mod(struct ftrace_ops *ops)
}
/*
- * Check if the current ops references the record.
+ * Check if the current ops references the given ip.
*
* If the ops traces all functions, then it was already accounted for.
* If the ops does not trace the current record function, skip it.
* If the ops ignores the function via notrace filter, skip it.
*/
-static inline bool
-ops_references_rec(struct ftrace_ops *ops, struct dyn_ftrace *rec)
+static bool
+ops_references_ip(struct ftrace_ops *ops, unsigned long ip)
{
/* If ops isn't enabled, ignore it */
if (!(ops->flags & FTRACE_OPS_FL_ENABLED))
@@ -3091,16 +3128,29 @@ ops_references_rec(struct ftrace_ops *ops, struct dyn_ftrace *rec)
/* The function must be in the filter */
if (!ftrace_hash_empty(ops->func_hash->filter_hash) &&
- !__ftrace_lookup_ip(ops->func_hash->filter_hash, rec->ip))
+ !__ftrace_lookup_ip(ops->func_hash->filter_hash, ip))
return false;
/* If in notrace hash, we ignore it too */
- if (ftrace_lookup_ip(ops->func_hash->notrace_hash, rec->ip))
+ if (ftrace_lookup_ip(ops->func_hash->notrace_hash, ip))
return false;
return true;
}
+/*
+ * Check if the current ops references the record.
+ *
+ * If the ops traces all functions, then it was already accounted for.
+ * If the ops does not trace the current record function, skip it.
+ * If the ops ignores the function via notrace filter, skip it.
+ */
+static bool
+ops_references_rec(struct ftrace_ops *ops, struct dyn_ftrace *rec)
+{
+ return ops_references_ip(ops, rec->ip);
+}
+
static int ftrace_update_code(struct module *mod, struct ftrace_page *new_pgs)
{
bool init_nop = ftrace_need_init_nop();
@@ -5215,6 +5265,8 @@ static struct ftrace_direct_func *ftrace_alloc_direct_func(unsigned long addr)
return direct;
}
+static int register_ftrace_function_nolock(struct ftrace_ops *ops);
+
/**
* register_ftrace_direct - Call a custom trampoline directly
* @ip: The address of the nop at the beginning of a function
@@ -5286,7 +5338,7 @@ int register_ftrace_direct(unsigned long ip, unsigned long addr)
ret = ftrace_set_filter_ip(&direct_ops, ip, 0, 0);
if (!ret && !(direct_ops.flags & FTRACE_OPS_FL_ENABLED)) {
- ret = register_ftrace_function(&direct_ops);
+ ret = register_ftrace_function_nolock(&direct_ops);
if (ret)
ftrace_set_filter_ip(&direct_ops, ip, 1, 0);
}
@@ -5545,8 +5597,7 @@ int modify_ftrace_direct(unsigned long ip,
}
EXPORT_SYMBOL_GPL(modify_ftrace_direct);
-#define MULTI_FLAGS (FTRACE_OPS_FL_IPMODIFY | FTRACE_OPS_FL_DIRECT | \
- FTRACE_OPS_FL_SAVE_REGS)
+#define MULTI_FLAGS (FTRACE_OPS_FL_DIRECT | FTRACE_OPS_FL_SAVE_REGS)
static int check_direct_multi(struct ftrace_ops *ops)
{
@@ -5639,7 +5690,7 @@ int register_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr)
ops->flags = MULTI_FLAGS;
ops->trampoline = FTRACE_REGS_ADDR;
- err = register_ftrace_function(ops);
+ err = register_ftrace_function_nolock(ops);
out_remove:
if (err)
@@ -5691,22 +5742,8 @@ int unregister_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr)
}
EXPORT_SYMBOL_GPL(unregister_ftrace_direct_multi);
-/**
- * modify_ftrace_direct_multi - Modify an existing direct 'multi' call
- * to call something else
- * @ops: The address of the struct ftrace_ops object
- * @addr: The address of the new trampoline to call at @ops functions
- *
- * This is used to unregister currently registered direct caller and
- * register new one @addr on functions registered in @ops object.
- *
- * Note there's window between ftrace_shutdown and ftrace_startup calls
- * where there will be no callbacks called.
- *
- * Returns: zero on success. Non zero on error, which includes:
- * -EINVAL - The @ops object was not properly registered.
- */
-int modify_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr)
+static int
+__modify_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr)
{
struct ftrace_hash *hash;
struct ftrace_func_entry *entry, *iter;
@@ -5717,20 +5754,15 @@ int modify_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr)
int i, size;
int err;
- if (check_direct_multi(ops))
- return -EINVAL;
- if (!(ops->flags & FTRACE_OPS_FL_ENABLED))
- return -EINVAL;
-
- mutex_lock(&direct_mutex);
+ lockdep_assert_held_once(&direct_mutex);
/* Enable the tmp_ops to have the same functions as the direct ops */
ftrace_ops_init(&tmp_ops);
tmp_ops.func_hash = ops->func_hash;
- err = register_ftrace_function(&tmp_ops);
+ err = register_ftrace_function_nolock(&tmp_ops);
if (err)
- goto out_direct;
+ return err;
/*
* Now the ftrace_ops_list_func() is called to do the direct callers.
@@ -5754,7 +5786,64 @@ int modify_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr)
/* Removing the tmp_ops will add the updated direct callers to the functions */
unregister_ftrace_function(&tmp_ops);
- out_direct:
+ return err;
+}
+
+/**
+ * modify_ftrace_direct_multi_nolock - Modify an existing direct 'multi' call
+ * to call something else
+ * @ops: The address of the struct ftrace_ops object
+ * @addr: The address of the new trampoline to call at @ops functions
+ *
+ * This is used to unregister currently registered direct caller and
+ * register new one @addr on functions registered in @ops object.
+ *
+ * Note there's window between ftrace_shutdown and ftrace_startup calls
+ * where there will be no callbacks called.
+ *
+ * Caller should already have direct_mutex locked, so we don't lock
+ * direct_mutex here.
+ *
+ * Returns: zero on success. Non zero on error, which includes:
+ * -EINVAL - The @ops object was not properly registered.
+ */
+int modify_ftrace_direct_multi_nolock(struct ftrace_ops *ops, unsigned long addr)
+{
+ if (check_direct_multi(ops))
+ return -EINVAL;
+ if (!(ops->flags & FTRACE_OPS_FL_ENABLED))
+ return -EINVAL;
+
+ return __modify_ftrace_direct_multi(ops, addr);
+}
+EXPORT_SYMBOL_GPL(modify_ftrace_direct_multi_nolock);
+
+/**
+ * modify_ftrace_direct_multi - Modify an existing direct 'multi' call
+ * to call something else
+ * @ops: The address of the struct ftrace_ops object
+ * @addr: The address of the new trampoline to call at @ops functions
+ *
+ * This is used to unregister currently registered direct caller and
+ * register new one @addr on functions registered in @ops object.
+ *
+ * Note there's window between ftrace_shutdown and ftrace_startup calls
+ * where there will be no callbacks called.
+ *
+ * Returns: zero on success. Non zero on error, which includes:
+ * -EINVAL - The @ops object was not properly registered.
+ */
+int modify_ftrace_direct_multi(struct ftrace_ops *ops, unsigned long addr)
+{
+ int err;
+
+ if (check_direct_multi(ops))
+ return -EINVAL;
+ if (!(ops->flags & FTRACE_OPS_FL_ENABLED))
+ return -EINVAL;
+
+ mutex_lock(&direct_mutex);
+ err = __modify_ftrace_direct_multi(ops, addr);
mutex_unlock(&direct_mutex);
return err;
}
@@ -7965,6 +8054,143 @@ int ftrace_is_dead(void)
return ftrace_disabled;
}
+#ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
+/*
+ * When registering ftrace_ops with IPMODIFY, it is necessary to make sure
+ * it doesn't conflict with any direct ftrace_ops. If there is existing
+ * direct ftrace_ops on a kernel function being patched, call
+ * FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY_PEER on it to enable sharing.
+ *
+ * @ops: ftrace_ops being registered.
+ *
+ * Returns:
+ * 0 on success;
+ * Negative on failure.
+ */
+static int prepare_direct_functions_for_ipmodify(struct ftrace_ops *ops)
+{
+ struct ftrace_func_entry *entry;
+ struct ftrace_hash *hash;
+ struct ftrace_ops *op;
+ int size, i, ret;
+
+ lockdep_assert_held_once(&direct_mutex);
+
+ if (!(ops->flags & FTRACE_OPS_FL_IPMODIFY))
+ return 0;
+
+ hash = ops->func_hash->filter_hash;
+ size = 1 << hash->size_bits;
+ for (i = 0; i < size; i++) {
+ hlist_for_each_entry(entry, &hash->buckets[i], hlist) {
+ unsigned long ip = entry->ip;
+ bool found_op = false;
+
+ mutex_lock(&ftrace_lock);
+ do_for_each_ftrace_op(op, ftrace_ops_list) {
+ if (!(op->flags & FTRACE_OPS_FL_DIRECT))
+ continue;
+ if (ops_references_ip(op, ip)) {
+ found_op = true;
+ break;
+ }
+ } while_for_each_ftrace_op(op);
+ mutex_unlock(&ftrace_lock);
+
+ if (found_op) {
+ if (!op->ops_func)
+ return -EBUSY;
+
+ ret = op->ops_func(op, FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY_PEER);
+ if (ret)
+ return ret;
+ }
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * Similar to prepare_direct_functions_for_ipmodify, clean up after ops
+ * with IPMODIFY is unregistered. The cleanup is optional for most DIRECT
+ * ops.
+ */
+static void cleanup_direct_functions_after_ipmodify(struct ftrace_ops *ops)
+{
+ struct ftrace_func_entry *entry;
+ struct ftrace_hash *hash;
+ struct ftrace_ops *op;
+ int size, i;
+
+ if (!(ops->flags & FTRACE_OPS_FL_IPMODIFY))
+ return;
+
+ mutex_lock(&direct_mutex);
+
+ hash = ops->func_hash->filter_hash;
+ size = 1 << hash->size_bits;
+ for (i = 0; i < size; i++) {
+ hlist_for_each_entry(entry, &hash->buckets[i], hlist) {
+ unsigned long ip = entry->ip;
+ bool found_op = false;
+
+ mutex_lock(&ftrace_lock);
+ do_for_each_ftrace_op(op, ftrace_ops_list) {
+ if (!(op->flags & FTRACE_OPS_FL_DIRECT))
+ continue;
+ if (ops_references_ip(op, ip)) {
+ found_op = true;
+ break;
+ }
+ } while_for_each_ftrace_op(op);
+ mutex_unlock(&ftrace_lock);
+
+ /* The cleanup is optional, ignore any errors */
+ if (found_op && op->ops_func)
+ op->ops_func(op, FTRACE_OPS_CMD_DISABLE_SHARE_IPMODIFY_PEER);
+ }
+ }
+ mutex_unlock(&direct_mutex);
+}
+
+#define lock_direct_mutex() mutex_lock(&direct_mutex)
+#define unlock_direct_mutex() mutex_unlock(&direct_mutex)
+
+#else /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */
+
+static int prepare_direct_functions_for_ipmodify(struct ftrace_ops *ops)
+{
+ return 0;
+}
+
+static void cleanup_direct_functions_after_ipmodify(struct ftrace_ops *ops)
+{
+}
+
+#define lock_direct_mutex() do { } while (0)
+#define unlock_direct_mutex() do { } while (0)
+
+#endif /* CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS */
+
+/*
+ * Similar to register_ftrace_function, except we don't lock direct_mutex.
+ */
+static int register_ftrace_function_nolock(struct ftrace_ops *ops)
+{
+ int ret;
+
+ ftrace_ops_init(ops);
+
+ mutex_lock(&ftrace_lock);
+
+ ret = ftrace_startup(ops, 0);
+
+ mutex_unlock(&ftrace_lock);
+
+ return ret;
+}
+
/**
* register_ftrace_function - register a function for profiling
* @ops: ops structure that holds the function for profiling.
@@ -7980,14 +8206,15 @@ int register_ftrace_function(struct ftrace_ops *ops)
{
int ret;
- ftrace_ops_init(ops);
-
- mutex_lock(&ftrace_lock);
-
- ret = ftrace_startup(ops, 0);
+ lock_direct_mutex();
+ ret = prepare_direct_functions_for_ipmodify(ops);
+ if (ret < 0)
+ goto out_unlock;
- mutex_unlock(&ftrace_lock);
+ ret = register_ftrace_function_nolock(ops);
+out_unlock:
+ unlock_direct_mutex();
return ret;
}
EXPORT_SYMBOL_GPL(register_ftrace_function);
@@ -8006,6 +8233,7 @@ int unregister_ftrace_function(struct ftrace_ops *ops)
ret = ftrace_shutdown(ops, 0);
mutex_unlock(&ftrace_lock);
+ cleanup_direct_functions_after_ipmodify(ops);
return ret;
}
EXPORT_SYMBOL_GPL(unregister_ftrace_function);
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index c3dc4f859a6b..88ba5b4bd0c5 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -16,6 +16,7 @@
#include <linux/namei.h>
#include <linux/string.h>
#include <linux/rculist.h>
+#include <linux/filter.h>
#include "trace_dynevent.h"
#include "trace_probe.h"
@@ -1342,15 +1343,15 @@ static void __uprobe_perf_func(struct trace_uprobe *tu,
int size, esize;
int rctx;
+#ifdef CONFIG_BPF_EVENTS
if (bpf_prog_array_valid(call)) {
u32 ret;
- preempt_disable();
- ret = trace_call_bpf(call, regs);
- preempt_enable();
+ ret = bpf_prog_run_array_sleepable(call->prog_array, regs, bpf_prog_run);
if (!ret)
return;
}
+#endif /* CONFIG_BPF_EVENTS */
esize = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));