Diffstat (limited to 'kernel/bpf/cgroup.c')
-rw-r--r--	kernel/bpf/cgroup.c	| 448 ++++++++++++++++++++++++++++++++++++++++++++++++++++----
1 file changed, 428 insertions(+), 20 deletions(-)
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
index 92a7d0cf8d13..0a00eaca6fae 100644
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -15,19 +15,34 @@
 #include <linux/bpf.h>
 #include <linux/bpf-cgroup.h>
 #include <net/sock.h>
+#include <net/bpf_sk_storage.h>
+
+#include "../cgroup/cgroup-internal.h"
 
 DEFINE_STATIC_KEY_FALSE(cgroup_bpf_enabled_key);
 EXPORT_SYMBOL(cgroup_bpf_enabled_key);
 
+void cgroup_bpf_offline(struct cgroup *cgrp)
+{
+	cgroup_get(cgrp);
+	percpu_ref_kill(&cgrp->bpf.refcnt);
+}
+
 /**
- * cgroup_bpf_put() - put references of all bpf programs
- * @cgrp: the cgroup to modify
+ * cgroup_bpf_release() - put references of all bpf programs and
+ *	release all cgroup bpf data
+ * @work: work structure embedded into the cgroup to modify
  */
-void cgroup_bpf_put(struct cgroup *cgrp)
+static void cgroup_bpf_release(struct work_struct *work)
 {
+	struct cgroup *cgrp = container_of(work, struct cgroup,
+					   bpf.release_work);
 	enum bpf_cgroup_storage_type stype;
+	struct bpf_prog_array *old_array;
 	unsigned int type;
 
+	mutex_lock(&cgroup_mutex);
+
 	for (type = 0; type < ARRAY_SIZE(cgrp->bpf.progs); type++) {
 		struct list_head *progs = &cgrp->bpf.progs[type];
 		struct bpf_prog_list *pl, *tmp;
@@ -42,8 +57,29 @@ void cgroup_bpf_put(struct cgroup *cgrp)
 			kfree(pl);
 			static_branch_dec(&cgroup_bpf_enabled_key);
 		}
-		bpf_prog_array_free(cgrp->bpf.effective[type]);
+		old_array = rcu_dereference_protected(
+				cgrp->bpf.effective[type],
+				lockdep_is_held(&cgroup_mutex));
+		bpf_prog_array_free(old_array);
 	}
+
+	mutex_unlock(&cgroup_mutex);
+
+	percpu_ref_exit(&cgrp->bpf.refcnt);
+	cgroup_put(cgrp);
+}
+
+/**
+ * cgroup_bpf_release_fn() - callback used to schedule releasing
+ *	of bpf cgroup data
+ * @ref: percpu ref counter structure
+ */
+static void cgroup_bpf_release_fn(struct percpu_ref *ref)
+{
+	struct cgroup *cgrp = container_of(ref, struct cgroup, bpf.refcnt);
+
+	INIT_WORK(&cgrp->bpf.release_work, cgroup_bpf_release);
+	queue_work(system_wq, &cgrp->bpf.release_work);
 }
 
 /* count number of elements in the list.
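The hunks above replace a direct cgroup_bpf_put() with a two-stage teardown: percpu_ref_kill() marks the counter dying at offline time, and the release callback, which fires from RCU callback context once the last reference is gone, punts the real cleanup to a workqueue so it may sleep and take cgroup_mutex. A minimal, self-contained sketch of that lifetime pattern; struct obj and all names below are hypothetical, not part of the patch:

/* sketch.c - hypothetical kernel-style example, not from the patch */
#include <linux/percpu-refcount.h>
#include <linux/workqueue.h>
#include <linux/slab.h>

struct obj {
	struct percpu_ref refcnt;
	struct work_struct release_work;
};

static void obj_release_workfn(struct work_struct *work)
{
	struct obj *o = container_of(work, struct obj, release_work);

	/* Process context: safe to sleep and take mutexes here, which
	 * is exactly why cgroup_bpf_release() runs from a work item
	 * rather than from the percpu_ref callback itself.
	 */
	percpu_ref_exit(&o->refcnt);
	kfree(o);
}

static void obj_release_fn(struct percpu_ref *ref)
{
	struct obj *o = container_of(ref, struct obj, refcnt);

	/* Runs from RCU callback context once the last reference is
	 * dropped; defer the heavy lifting to process context.
	 */
	INIT_WORK(&o->release_work, obj_release_workfn);
	queue_work(system_wq, &o->release_work);
}

static struct obj *obj_create(void)
{
	struct obj *o = kzalloc(sizeof(*o), GFP_KERNEL);

	if (!o)
		return NULL;
	if (percpu_ref_init(&o->refcnt, obj_release_fn, 0, GFP_KERNEL)) {
		kfree(o);
		return NULL;
	}
	return o;
}

static void obj_offline(struct obj *o)
{
	/* Switches the ref to atomic mode and drops the initial
	 * reference; obj_release_fn() fires when the count hits zero.
	 */
	percpu_ref_kill(&o->refcnt);
}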
@@ -98,7 +134,7 @@ static bool hierarchy_allows_attach(struct cgroup *cgrp,
  */
 static int compute_effective_progs(struct cgroup *cgrp,
 				   enum bpf_attach_type type,
-				   struct bpf_prog_array __rcu **array)
+				   struct bpf_prog_array **array)
 {
 	enum bpf_cgroup_storage_type stype;
 	struct bpf_prog_array *progs;
@@ -136,17 +172,16 @@ static int compute_effective_progs(struct cgroup *cgrp,
 		}
 	} while ((p = cgroup_parent(p)));
 
-	rcu_assign_pointer(*array, progs);
+	*array = progs;
 	return 0;
 }
 
 static void activate_effective_progs(struct cgroup *cgrp,
 				     enum bpf_attach_type type,
-				     struct bpf_prog_array __rcu *array)
+				     struct bpf_prog_array *old_array)
 {
-	struct bpf_prog_array __rcu *old_array;
-
-	old_array = xchg(&cgrp->bpf.effective[type], array);
+	rcu_swap_protected(cgrp->bpf.effective[type], old_array,
+			   lockdep_is_held(&cgroup_mutex));
 	/* free prog array after grace period, since __cgroup_bpf_run_*()
 	 * might be still walking the array
 	 */
@@ -163,8 +198,13 @@ int cgroup_bpf_inherit(struct cgroup *cgrp)
 	 * that array below is variable length
 	 */
 #define	NR ARRAY_SIZE(cgrp->bpf.effective)
-	struct bpf_prog_array __rcu *arrays[NR] = {};
-	int i;
+	struct bpf_prog_array *arrays[NR] = {};
+	int ret, i;
+
+	ret = percpu_ref_init(&cgrp->bpf.refcnt, cgroup_bpf_release_fn, 0,
+			      GFP_KERNEL);
+	if (ret)
+		return ret;
 
 	for (i = 0; i < NR; i++)
 		INIT_LIST_HEAD(&cgrp->bpf.progs[i]);
@@ -180,6 +220,9 @@ int cgroup_bpf_inherit(struct cgroup *cgrp)
 cleanup:
 	for (i = 0; i < NR; i++)
 		bpf_prog_array_free(arrays[i]);
+
+	percpu_ref_exit(&cgrp->bpf.refcnt);
+
 	return -ENOMEM;
 }
 
@@ -193,6 +236,9 @@ static int update_effective_progs(struct cgroup *cgrp,
 	css_for_each_descendant_pre(css, &cgrp->self) {
 		struct cgroup *desc = container_of(css, struct cgroup, self);
 
+		if (percpu_ref_is_zero(&desc->bpf.refcnt))
+			continue;
+
 		err = compute_effective_progs(desc, type, &desc->bpf.inactive);
 		if (err)
 			goto cleanup;
@@ -202,6 +248,14 @@ static int update_effective_progs(struct cgroup *cgrp,
 	css_for_each_descendant_pre(css, &cgrp->self) {
 		struct cgroup *desc = container_of(css, struct cgroup, self);
 
+		if (percpu_ref_is_zero(&desc->bpf.refcnt)) {
+			if (unlikely(desc->bpf.inactive)) {
+				bpf_prog_array_free(desc->bpf.inactive);
+				desc->bpf.inactive = NULL;
+			}
+			continue;
+		}
+
 		activate_effective_progs(desc, type, desc->bpf.inactive);
 		desc->bpf.inactive = NULL;
 	}
@@ -441,10 +495,14 @@ int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
 	enum bpf_attach_type type = attr->query.attach_type;
 	struct list_head *progs = &cgrp->bpf.progs[type];
 	u32 flags = cgrp->bpf.flags[type];
+	struct bpf_prog_array *effective;
 	int cnt, ret = 0, i;
 
+	effective = rcu_dereference_protected(cgrp->bpf.effective[type],
+					      lockdep_is_held(&cgroup_mutex));
+
 	if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE)
-		cnt = bpf_prog_array_length(cgrp->bpf.effective[type]);
+		cnt = bpf_prog_array_length(effective);
 	else
 		cnt = prog_list_length(progs);
 
@@ -461,8 +519,7 @@ int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
 	}
 	if (attr->query.query_flags & BPF_F_QUERY_EFFECTIVE) {
-		return bpf_prog_array_copy_to_user(cgrp->bpf.effective[type],
-						   prog_ids, cnt);
+		return bpf_prog_array_copy_to_user(effective, prog_ids, cnt);
 	} else {
 		struct bpf_prog_list *pl;
 		u32 id;
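These hunks also tighten the sparse annotations: the scratch pointers lose __rcu because only cgrp->bpf.effective[] is actually RCU-published. Writers update it under cgroup_mutex with rcu_swap_protected() (later renamed rcu_replace_pointer() upstream), and __cgroup_bpf_query() documents its locking with rcu_dereference_protected(). A generic sketch of the same updater/reader split, with hypothetical names:

/* sketch.c - hypothetical example of the publish/read pattern */
#include <linux/mutex.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct cfg {
	int val;
	struct rcu_head rcu;
};

static struct cfg __rcu *active_cfg;
static DEFINE_MUTEX(cfg_mutex);

/* Reader side: may run concurrently with an update. */
static int cfg_read_val(void)
{
	struct cfg *c;
	int val = 0;

	rcu_read_lock();
	c = rcu_dereference(active_cfg);
	if (c)
		val = c->val;
	rcu_read_unlock();

	return val;
}

/* Updater side: publish under the mutex, free the old copy only
 * after a grace period, mirroring activate_effective_progs().
 */
static int cfg_update_val(int val)
{
	struct cfg *new, *old;

	new = kzalloc(sizeof(*new), GFP_KERNEL);
	if (!new)
		return -ENOMEM;
	new->val = val;

	mutex_lock(&cfg_mutex);
	old = rcu_dereference_protected(active_cfg,
					lockdep_is_held(&cfg_mutex));
	rcu_assign_pointer(active_cfg, new);
	mutex_unlock(&cfg_mutex);

	if (old)
		kfree_rcu(old, rcu);

	return 0;
}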
@@ -545,8 +602,16 @@ int cgroup_bpf_prog_query(const union bpf_attr *attr,
  * The program type passed in via @type must be suitable for network
  * filtering. No further check is performed to assert that.
  *
- * This function will return %-EPERM if any if an attached program was found
- * and if it returned != 1 during execution. In all other cases, 0 is returned.
+ * For egress packets, this function can return:
+ *   NET_XMIT_SUCCESS    (0)	- continue with packet output
+ *   NET_XMIT_DROP       (1)	- drop packet and notify TCP to call cwr
+ *   NET_XMIT_CN         (2)	- continue with packet output and notify TCP
+ *				  to call cwr
+ *   -EPERM		- drop packet
+ *
+ * For ingress packets, this function will return -EPERM if any
+ * attached program was found and if it returned != 1 during execution.
+ * Otherwise 0 is returned.
  */
 int __cgroup_bpf_run_filter_skb(struct sock *sk,
 				struct sk_buff *skb,
@@ -572,12 +637,19 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk,
 	/* compute pointers for the bpf prog */
 	bpf_compute_and_save_data_end(skb, &saved_data_end);
 
-	ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], skb,
-				 __bpf_prog_run_save_cb);
+	if (type == BPF_CGROUP_INET_EGRESS) {
+		ret = BPF_PROG_CGROUP_INET_EGRESS_RUN_ARRAY(
+			cgrp->bpf.effective[type], skb, __bpf_prog_run_save_cb);
+	} else {
+		ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[type], skb,
+					  __bpf_prog_run_save_cb);
+		ret = (ret == 1 ? 0 : -EPERM);
+	}
 	bpf_restore_data_end(skb, saved_data_end);
 	__skb_pull(skb, offset);
 	skb->sk = save_sk;
-	return ret == 1 ? 0 : -EPERM;
+
+	return ret;
 }
 EXPORT_SYMBOL(__cgroup_bpf_run_filter_skb);
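On the egress path the verdict is no longer a single allow/deny bit. Assuming the return-value convention from this series' run-array macro (an assumption recalled from the wider patch set, not spelled out in this hunk): bit 0 of the program's return value keeps the packet and bit 1 requests congestion notification, which the function folds into the NET_XMIT_* codes documented above. A hypothetical BPF-side sketch:

// SPDX-License-Identifier: GPL-2.0
/* egress_cn.c - hypothetical cgroup skb egress program */
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

SEC("cgroup_skb/egress")
int egress_cn(struct __sk_buff *skb)
{
	/* Assumed return convention for this series:
	 *   0 = drop (-EPERM)              1 = pass (NET_XMIT_SUCCESS)
	 *   2 = drop + CN (NET_XMIT_DROP)  3 = pass + CN (NET_XMIT_CN)
	 */
	if (skb->len > 1400)
		return 3;	/* pass, but ask TCP to reduce cwnd */

	return 1;		/* pass */
}

char _license[] SEC("license") = "GPL";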
@@ -867,6 +939,190 @@ int __cgroup_bpf_run_filter_sysctl(struct ctl_table_header *head,
 }
 EXPORT_SYMBOL(__cgroup_bpf_run_filter_sysctl);
 
+#ifdef CONFIG_NET
+static bool __cgroup_bpf_prog_array_is_empty(struct cgroup *cgrp,
+					     enum bpf_attach_type attach_type)
+{
+	struct bpf_prog_array *prog_array;
+	bool empty;
+
+	rcu_read_lock();
+	prog_array = rcu_dereference(cgrp->bpf.effective[attach_type]);
+	empty = bpf_prog_array_is_empty(prog_array);
+	rcu_read_unlock();
+
+	return empty;
+}
+
+static int sockopt_alloc_buf(struct bpf_sockopt_kern *ctx, int max_optlen)
+{
+	if (unlikely(max_optlen > PAGE_SIZE) || max_optlen < 0)
+		return -EINVAL;
+
+	ctx->optval = kzalloc(max_optlen, GFP_USER);
+	if (!ctx->optval)
+		return -ENOMEM;
+
+	ctx->optval_end = ctx->optval + max_optlen;
+	ctx->optlen = max_optlen;
+
+	return 0;
+}
+
+static void sockopt_free_buf(struct bpf_sockopt_kern *ctx)
+{
+	kfree(ctx->optval);
+}
+
+int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level,
+				       int *optname, char __user *optval,
+				       int *optlen, char **kernel_optval)
+{
+	struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
+	struct bpf_sockopt_kern ctx = {
+		.sk = sk,
+		.level = *level,
+		.optname = *optname,
+	};
+	int ret;
+
+	/* Opportunistic check to see whether we have any BPF program
+	 * attached to the hook so we don't waste time allocating
+	 * memory and locking the socket.
+	 */
+	if (!cgroup_bpf_enabled ||
+	    __cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_SETSOCKOPT))
+		return 0;
+
+	ret = sockopt_alloc_buf(&ctx, *optlen);
+	if (ret)
+		return ret;
+
+	if (copy_from_user(ctx.optval, optval, *optlen) != 0) {
+		ret = -EFAULT;
+		goto out;
+	}
+
+	lock_sock(sk);
+	ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[BPF_CGROUP_SETSOCKOPT],
+				 &ctx, BPF_PROG_RUN);
+	release_sock(sk);
+
+	if (!ret) {
+		ret = -EPERM;
+		goto out;
+	}
+
+	if (ctx.optlen == -1) {
+		/* optlen set to -1, bypass kernel */
+		ret = 1;
+	} else if (ctx.optlen > *optlen || ctx.optlen < -1) {
+		/* optlen is out of bounds */
+		ret = -EFAULT;
+	} else {
+		/* optlen within bounds, run kernel handler */
+		ret = 0;
+
+		/* export any potential modifications */
+		*level = ctx.level;
+		*optname = ctx.optname;
+		*optlen = ctx.optlen;
+		*kernel_optval = ctx.optval;
+	}
+
+out:
+	if (ret)
+		sockopt_free_buf(&ctx);
+	return ret;
+}
+EXPORT_SYMBOL(__cgroup_bpf_run_filter_setsockopt);
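From the BPF program's side, the branches above map directly onto struct bpf_sockopt: returning 0 surfaces as -EPERM to the setsockopt() caller, returning 1 hands the (possibly rewritten) buffer to the kernel handler, and setting ctx->optlen to -1 bypasses the kernel handler entirely. A hypothetical cgroup/setsockopt program; the SOL_IP/IP_TTL constants are redefined locally in case the uapi headers are not pulled in:

// SPDX-License-Identifier: GPL-2.0
/* deny_ip_ttl.c - hypothetical cgroup/setsockopt program */
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

#ifndef SOL_IP
#define SOL_IP	0
#endif
#ifndef IP_TTL
#define IP_TTL	2
#endif

SEC("cgroup/setsockopt")
int deny_ip_ttl(struct bpf_sockopt *ctx)
{
	/* Reject attempts to change IP_TTL: the kernel converts a 0
	 * return into -EPERM for the setsockopt() caller.
	 */
	if (ctx->level == SOL_IP && ctx->optname == IP_TTL)
		return 0;

	/* Setting ctx->optlen = -1 here would skip the kernel
	 * handler entirely, per the hunk above.
	 */
	return 1;	/* hand the buffer to the kernel handler */
}

char _license[] SEC("license") = "GPL";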
+int __cgroup_bpf_run_filter_getsockopt(struct sock *sk, int level,
+				       int optname, char __user *optval,
+				       int __user *optlen, int max_optlen,
+				       int retval)
+{
+	struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
+	struct bpf_sockopt_kern ctx = {
+		.sk = sk,
+		.level = level,
+		.optname = optname,
+		.retval = retval,
+	};
+	int ret;
+
+	/* Opportunistic check to see whether we have any BPF program
+	 * attached to the hook so we don't waste time allocating
+	 * memory and locking the socket.
+	 */
+	if (!cgroup_bpf_enabled ||
+	    __cgroup_bpf_prog_array_is_empty(cgrp, BPF_CGROUP_GETSOCKOPT))
+		return retval;
+
+	ret = sockopt_alloc_buf(&ctx, max_optlen);
+	if (ret)
+		return ret;
+
+	if (!retval) {
+		/* If kernel getsockopt finished successfully,
+		 * copy whatever was returned to the user back
+		 * into our temporary buffer. Set optlen to the
+		 * one that kernel returned as well to let
+		 * BPF programs inspect the value.
+		 */
+
+		if (get_user(ctx.optlen, optlen)) {
+			ret = -EFAULT;
+			goto out;
+		}
+
+		if (ctx.optlen > max_optlen)
+			ctx.optlen = max_optlen;
+
+		if (copy_from_user(ctx.optval, optval, ctx.optlen) != 0) {
+			ret = -EFAULT;
+			goto out;
+		}
+	}
+
+	lock_sock(sk);
+	ret = BPF_PROG_RUN_ARRAY(cgrp->bpf.effective[BPF_CGROUP_GETSOCKOPT],
+				 &ctx, BPF_PROG_RUN);
+	release_sock(sk);
+
+	if (!ret) {
+		ret = -EPERM;
+		goto out;
+	}
+
+	if (ctx.optlen > max_optlen) {
+		ret = -EFAULT;
+		goto out;
+	}
+
+	/* BPF programs only allowed to set retval to 0, not some
+	 * arbitrary value.
+	 */
+	if (ctx.retval != 0 && ctx.retval != retval) {
+		ret = -EFAULT;
+		goto out;
+	}
+
+	if (copy_to_user(optval, ctx.optval, ctx.optlen) ||
+	    put_user(ctx.optlen, optlen)) {
+		ret = -EFAULT;
+		goto out;
+	}
+
+	ret = ctx.retval;
+
+out:
+	sockopt_free_buf(&ctx);
+	return ret;
+}
+EXPORT_SYMBOL(__cgroup_bpf_run_filter_getsockopt);
+#endif
+
 static ssize_t sysctl_cpy_dir(const struct ctl_dir *dir, char **bufp,
 			      size_t *lenp)
 {
@@ -1127,3 +1383,155 @@ const struct bpf_verifier_ops cg_sysctl_verifier_ops = {
 
 const struct bpf_prog_ops cg_sysctl_prog_ops = {
 };
+
+static const struct bpf_func_proto *
+cg_sockopt_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+	switch (func_id) {
+#ifdef CONFIG_NET
+	case BPF_FUNC_sk_storage_get:
+		return &bpf_sk_storage_get_proto;
+	case BPF_FUNC_sk_storage_delete:
+		return &bpf_sk_storage_delete_proto;
+#endif
+#ifdef CONFIG_INET
+	case BPF_FUNC_tcp_sock:
+		return &bpf_tcp_sock_proto;
+#endif
+	default:
+		return cgroup_base_func_proto(func_id, prog);
+	}
+}
+
+static bool cg_sockopt_is_valid_access(int off, int size,
+				       enum bpf_access_type type,
+				       const struct bpf_prog *prog,
+				       struct bpf_insn_access_aux *info)
+{
+	const int size_default = sizeof(__u32);
+
+	if (off < 0 || off >= sizeof(struct bpf_sockopt))
+		return false;
+
+	if (off % size != 0)
+		return false;
+
+	if (type == BPF_WRITE) {
+		switch (off) {
+		case offsetof(struct bpf_sockopt, retval):
+			if (size != size_default)
+				return false;
+			return prog->expected_attach_type ==
+				BPF_CGROUP_GETSOCKOPT;
+		case offsetof(struct bpf_sockopt, optname):
+			/* fallthrough */
+		case offsetof(struct bpf_sockopt, level):
+			if (size != size_default)
+				return false;
+			return prog->expected_attach_type ==
+				BPF_CGROUP_SETSOCKOPT;
+		case offsetof(struct bpf_sockopt, optlen):
+			return size == size_default;
+		default:
+			return false;
+		}
+	}
+
+	switch (off) {
+	case offsetof(struct bpf_sockopt, sk):
+		if (size != sizeof(__u64))
+			return false;
+		info->reg_type = PTR_TO_SOCKET;
+		break;
+	case offsetof(struct bpf_sockopt, optval):
+		if (size != sizeof(__u64))
+			return false;
+		info->reg_type = PTR_TO_PACKET;
+		break;
+	case offsetof(struct bpf_sockopt, optval_end):
+		if (size != sizeof(__u64))
+			return false;
+		info->reg_type = PTR_TO_PACKET_END;
+		break;
+	case offsetof(struct bpf_sockopt, retval):
+		if (size != size_default)
+			return false;
+		return prog->expected_attach_type == BPF_CGROUP_GETSOCKOPT;
+	default:
+		if (size != size_default)
+			return false;
+		break;
+	}
+	return true;
+}
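Because optval is typed PTR_TO_PACKET and optval_end PTR_TO_PACKET_END, getsockopt programs must bounds-check the option buffer exactly as XDP programs check packet data, and per the write rules above may store to retval only when attached as BPF_CGROUP_GETSOCKOPT. A hypothetical program exercising both rules; SOL_CUSTOM is an invented option level:

// SPDX-License-Identifier: GPL-2.0
/* getsockopt_clamp.c - hypothetical cgroup/getsockopt program */
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

#define SOL_CUSTOM	0xdeadbeef	/* invented option level */

SEC("cgroup/getsockopt")
int getsockopt_clamp(struct bpf_sockopt *ctx)
{
	__u8 *optval = ctx->optval;
	__u8 *optval_end = ctx->optval_end;

	/* optval is PTR_TO_PACKET: the verifier rejects any access
	 * that is not preceded by a bounds check against optval_end.
	 */
	if (optval + 1 > optval_end)
		return 1;	/* not enough data, let it through */

	if (ctx->level == SOL_CUSTOM) {
		optval[0] = 1;		/* rewrite the returned byte */
		ctx->optlen = 1;
		ctx->retval = 0;	/* only 0 may be stored here */
	}

	return 1;
}

char _license[] SEC("license") = "GPL";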
+
+#define CG_SOCKOPT_ACCESS_FIELD(T, F)					\
+	T(BPF_FIELD_SIZEOF(struct bpf_sockopt_kern, F),			\
+	  si->dst_reg, si->src_reg,					\
+	  offsetof(struct bpf_sockopt_kern, F))
+
+static u32 cg_sockopt_convert_ctx_access(enum bpf_access_type type,
+					 const struct bpf_insn *si,
+					 struct bpf_insn *insn_buf,
+					 struct bpf_prog *prog,
+					 u32 *target_size)
+{
+	struct bpf_insn *insn = insn_buf;
+
+	switch (si->off) {
+	case offsetof(struct bpf_sockopt, sk):
+		*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, sk);
+		break;
+	case offsetof(struct bpf_sockopt, level):
+		if (type == BPF_WRITE)
+			*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, level);
+		else
+			*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, level);
+		break;
+	case offsetof(struct bpf_sockopt, optname):
+		if (type == BPF_WRITE)
+			*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, optname);
+		else
+			*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optname);
+		break;
+	case offsetof(struct bpf_sockopt, optlen):
+		if (type == BPF_WRITE)
+			*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, optlen);
+		else
+			*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optlen);
+		break;
+	case offsetof(struct bpf_sockopt, retval):
+		if (type == BPF_WRITE)
+			*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_STX_MEM, retval);
+		else
+			*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, retval);
+		break;
+	case offsetof(struct bpf_sockopt, optval):
+		*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optval);
+		break;
+	case offsetof(struct bpf_sockopt, optval_end):
+		*insn++ = CG_SOCKOPT_ACCESS_FIELD(BPF_LDX_MEM, optval_end);
+		break;
+	}
+
+	return insn - insn_buf;
+}
+
+static int cg_sockopt_get_prologue(struct bpf_insn *insn_buf,
+				   bool direct_write,
+				   const struct bpf_prog *prog)
+{
+	/* Nothing to do for sockopt argument. The data is kzalloc'ated.
+	 */
+	return 0;
+}
+
+const struct bpf_verifier_ops cg_sockopt_verifier_ops = {
+	.get_func_proto		= cg_sockopt_func_proto,
+	.is_valid_access	= cg_sockopt_is_valid_access,
+	.convert_ctx_access	= cg_sockopt_convert_ctx_access,
+	.gen_prologue		= cg_sockopt_get_prologue,
+};
+
+const struct bpf_prog_ops cg_sockopt_prog_ops = {
+};
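Nothing in this file attaches the new hooks from userspace; for completeness, wiring up a compiled cgroup/setsockopt program with libbpf of the same era would look roughly like this (the object and cgroup paths are made up, and error handling is trimmed):

/* attach.c - hypothetical userspace loader using libbpf */
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <bpf/bpf.h>
#include <bpf/libbpf.h>

int main(void)
{
	struct bpf_object *obj;
	struct bpf_program *prog;
	int cg_fd, prog_fd;

	cg_fd = open("/sys/fs/cgroup/unified/test", O_RDONLY);
	if (cg_fd < 0)
		return 1;

	obj = bpf_object__open_file("deny_ip_ttl.o", NULL);
	if (libbpf_get_error(obj) || bpf_object__load(obj))
		return 1;

	prog = bpf_object__find_program_by_title(obj, "cgroup/setsockopt");
	if (!prog)
		return 1;
	prog_fd = bpf_program__fd(prog);

	/* BPF_F_ALLOW_MULTI keeps descendant cgroups attachable too */
	if (bpf_prog_attach(prog_fd, cg_fd, BPF_CGROUP_SETSOCKOPT,
			    BPF_F_ALLOW_MULTI)) {
		fprintf(stderr, "attach failed: %d\n", -errno);
		return 1;
	}

	return 0;
}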