diff options
Diffstat (limited to 'kernel')
48 files changed, 731 insertions, 364 deletions
diff --git a/kernel/bpf/Kconfig b/kernel/bpf/Kconfig new file mode 100644 index 000000000000..bd04f4a44c01 --- /dev/null +++ b/kernel/bpf/Kconfig @@ -0,0 +1,89 @@ +# SPDX-License-Identifier: GPL-2.0-only + +# BPF interpreter that, for example, classic socket filters depend on. +config BPF + bool + +# Used by archs to tell that they support BPF JIT compiler plus which +# flavour. Only one of the two can be selected for a specific arch since +# eBPF JIT supersedes the cBPF JIT. + +# Classic BPF JIT (cBPF) +config HAVE_CBPF_JIT + bool + +# Extended BPF JIT (eBPF) +config HAVE_EBPF_JIT + bool + +# Used by archs to tell that they want the BPF JIT compiler enabled by +# default for kernels that were compiled with BPF JIT support. +config ARCH_WANT_DEFAULT_BPF_JIT + bool + +menu "BPF subsystem" + +config BPF_SYSCALL + bool "Enable bpf() system call" + select BPF + select IRQ_WORK + select TASKS_TRACE_RCU + select BINARY_PRINTF + select NET_SOCK_MSG if INET + default n + help + Enable the bpf() system call that allows to manipulate BPF programs + and maps via file descriptors. + +config BPF_JIT + bool "Enable BPF Just In Time compiler" + depends on BPF + depends on HAVE_CBPF_JIT || HAVE_EBPF_JIT + depends on MODULES + help + BPF programs are normally handled by a BPF interpreter. This option + allows the kernel to generate native code when a program is loaded + into the kernel. This will significantly speed-up processing of BPF + programs. + + Note, an admin should enable this feature changing: + /proc/sys/net/core/bpf_jit_enable + /proc/sys/net/core/bpf_jit_harden (optional) + /proc/sys/net/core/bpf_jit_kallsyms (optional) + +config BPF_JIT_ALWAYS_ON + bool "Permanently enable BPF JIT and remove BPF interpreter" + depends on BPF_SYSCALL && HAVE_EBPF_JIT && BPF_JIT + help + Enables BPF JIT and removes BPF interpreter to avoid speculative + execution of BPF instructions by the interpreter. 
+ +config BPF_JIT_DEFAULT_ON + def_bool ARCH_WANT_DEFAULT_BPF_JIT || BPF_JIT_ALWAYS_ON + depends on HAVE_EBPF_JIT && BPF_JIT + +config BPF_UNPRIV_DEFAULT_OFF + bool "Disable unprivileged BPF by default" + depends on BPF_SYSCALL + help + Disables unprivileged BPF by default by setting the corresponding + /proc/sys/kernel/unprivileged_bpf_disabled knob to 2. An admin can + still reenable it by setting it to 0 later on, or permanently + disable it by setting it to 1 (from which no other transition to + 0 is possible anymore). + +source "kernel/bpf/preload/Kconfig" + +config BPF_LSM + bool "Enable BPF LSM Instrumentation" + depends on BPF_EVENTS + depends on BPF_SYSCALL + depends on SECURITY + depends on BPF_JIT + help + Enables instrumentation of the security hooks with BPF programs for + implementing dynamic MAC and Audit Policies. + + If you are unsure how to answer this question, answer N. + +endmenu # "BPF subsystem" diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index 5efb2b24012c..da471bf01b97 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -107,10 +107,12 @@ bpf_lsm_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_inode_storage_get_proto; case BPF_FUNC_inode_storage_delete: return &bpf_inode_storage_delete_proto; +#ifdef CONFIG_NET case BPF_FUNC_sk_storage_get: return &bpf_sk_storage_get_proto; case BPF_FUNC_sk_storage_delete: return &bpf_sk_storage_delete_proto; +#endif /* CONFIG_NET */ case BPF_FUNC_spin_lock: return &bpf_spin_lock_proto; case BPF_FUNC_spin_unlock: diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 0600ed325fa0..f982a9f0dbc4 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -5206,6 +5206,12 @@ int btf_distill_func_proto(struct bpf_verifier_log *log, m->ret_size = ret; for (i = 0; i < nargs; i++) { + if (i == nargs - 1 && args[i].type == 0) { + bpf_log(log, + "The function %s with variable args is unsupported.\n", + tname); + return -EINVAL; + } ret = __get_type_size(btf, 
args[i].type, &t); if (ret < 0) { bpf_log(log, @@ -5213,6 +5219,12 @@ int btf_distill_func_proto(struct bpf_verifier_log *log, tname, i, btf_kind_str[BTF_INFO_KIND(t->info)]); return -EINVAL; } + if (ret == 0) { + bpf_log(log, + "The function %s has malformed void argument.\n", + tname); + return -EINVAL; + } m->arg_size[i] = ret; } m->nr_args = nargs; diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 544773970dbc..a2f1f15ce432 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -14,6 +14,7 @@ #include <linux/jiffies.h> #include <linux/pid_namespace.h> #include <linux/proc_ns.h> +#include <linux/security.h> #include "../../lib/kstrtox.h" @@ -692,38 +693,41 @@ static int bpf_trace_copy_string(char *buf, void *unsafe_ptr, char fmt_ptype, return -EINVAL; } -/* Per-cpu temp buffers which can be used by printf-like helpers for %s or %p +/* Per-cpu temp buffers used by printf-like helpers to store the bprintf binary + * arguments representation. */ -#define MAX_PRINTF_BUF_LEN 512 +#define MAX_BPRINTF_BUF_LEN 512 -struct bpf_printf_buf { - char tmp_buf[MAX_PRINTF_BUF_LEN]; +/* Support executing three nested bprintf helper calls on a given CPU */ +#define MAX_BPRINTF_NEST_LEVEL 3 +struct bpf_bprintf_buffers { + char tmp_bufs[MAX_BPRINTF_NEST_LEVEL][MAX_BPRINTF_BUF_LEN]; }; -static DEFINE_PER_CPU(struct bpf_printf_buf, bpf_printf_buf); -static DEFINE_PER_CPU(int, bpf_printf_buf_used); +static DEFINE_PER_CPU(struct bpf_bprintf_buffers, bpf_bprintf_bufs); +static DEFINE_PER_CPU(int, bpf_bprintf_nest_level); static int try_get_fmt_tmp_buf(char **tmp_buf) { - struct bpf_printf_buf *bufs; - int used; + struct bpf_bprintf_buffers *bufs; + int nest_level; preempt_disable(); - used = this_cpu_inc_return(bpf_printf_buf_used); - if (WARN_ON_ONCE(used > 1)) { - this_cpu_dec(bpf_printf_buf_used); + nest_level = this_cpu_inc_return(bpf_bprintf_nest_level); + if (WARN_ON_ONCE(nest_level > MAX_BPRINTF_NEST_LEVEL)) { + this_cpu_dec(bpf_bprintf_nest_level); 
preempt_enable(); return -EBUSY; } - bufs = this_cpu_ptr(&bpf_printf_buf); - *tmp_buf = bufs->tmp_buf; + bufs = this_cpu_ptr(&bpf_bprintf_bufs); + *tmp_buf = bufs->tmp_bufs[nest_level - 1]; return 0; } void bpf_bprintf_cleanup(void) { - if (this_cpu_read(bpf_printf_buf_used)) { - this_cpu_dec(bpf_printf_buf_used); + if (this_cpu_read(bpf_bprintf_nest_level)) { + this_cpu_dec(bpf_bprintf_nest_level); preempt_enable(); } } @@ -760,7 +764,7 @@ int bpf_bprintf_prepare(char *fmt, u32 fmt_size, const u64 *raw_args, if (num_args && try_get_fmt_tmp_buf(&tmp_buf)) return -EBUSY; - tmp_buf_end = tmp_buf + MAX_PRINTF_BUF_LEN; + tmp_buf_end = tmp_buf + MAX_BPRINTF_BUF_LEN; *bin_args = (u32 *)tmp_buf; } @@ -1066,11 +1070,13 @@ bpf_base_func_proto(enum bpf_func_id func_id) case BPF_FUNC_probe_read_user: return &bpf_probe_read_user_proto; case BPF_FUNC_probe_read_kernel: - return &bpf_probe_read_kernel_proto; + return security_locked_down(LOCKDOWN_BPF_READ) < 0 ? + NULL : &bpf_probe_read_kernel_proto; case BPF_FUNC_probe_read_user_str: return &bpf_probe_read_user_str_proto; case BPF_FUNC_probe_read_kernel_str: - return &bpf_probe_read_kernel_str_proto; + return security_locked_down(LOCKDOWN_BPF_READ) < 0 ? 
+ NULL : &bpf_probe_read_kernel_str_proto; case BPF_FUNC_snprintf_btf: return &bpf_snprintf_btf_proto; case BPF_FUNC_snprintf: diff --git a/kernel/bpf/ringbuf.c b/kernel/bpf/ringbuf.c index f25b719ac786..84b3b35fc0d0 100644 --- a/kernel/bpf/ringbuf.c +++ b/kernel/bpf/ringbuf.c @@ -221,25 +221,20 @@ static int ringbuf_map_get_next_key(struct bpf_map *map, void *key, return -ENOTSUPP; } -static size_t bpf_ringbuf_mmap_page_cnt(const struct bpf_ringbuf *rb) -{ - size_t data_pages = (rb->mask + 1) >> PAGE_SHIFT; - - /* consumer page + producer page + 2 x data pages */ - return RINGBUF_POS_PAGES + 2 * data_pages; -} - static int ringbuf_map_mmap(struct bpf_map *map, struct vm_area_struct *vma) { struct bpf_ringbuf_map *rb_map; - size_t mmap_sz; rb_map = container_of(map, struct bpf_ringbuf_map, map); - mmap_sz = bpf_ringbuf_mmap_page_cnt(rb_map->rb) << PAGE_SHIFT; - - if (vma->vm_pgoff * PAGE_SIZE + (vma->vm_end - vma->vm_start) > mmap_sz) - return -EINVAL; + if (vma->vm_flags & VM_WRITE) { + /* allow writable mapping for the consumer_pos only */ + if (vma->vm_pgoff != 0 || vma->vm_end - vma->vm_start != PAGE_SIZE) + return -EPERM; + } else { + vma->vm_flags &= ~VM_MAYWRITE; + } + /* remap_vmalloc_range() checks size and offset constraints */ return remap_vmalloc_range(vma, rb_map->rb, vma->vm_pgoff + RINGBUF_PGOFF); } @@ -315,6 +310,9 @@ static void *__bpf_ringbuf_reserve(struct bpf_ringbuf *rb, u64 size) return NULL; len = round_up(size + BPF_RINGBUF_HDR_SZ, 8); + if (len > rb->mask + 1) + return NULL; + cons_pos = smp_load_acquire(&rb->consumer_pos); if (in_nmi()) { diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 941ca06d9dfa..ea04b0deb5ce 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -50,7 +50,8 @@ static DEFINE_SPINLOCK(map_idr_lock); static DEFINE_IDR(link_idr); static DEFINE_SPINLOCK(link_idr_lock); -int sysctl_unprivileged_bpf_disabled __read_mostly; +int sysctl_unprivileged_bpf_disabled __read_mostly = + 
IS_BUILTIN(CONFIG_BPF_UNPRIV_DEFAULT_OFF) ? 2 : 0; static const struct bpf_map_ops * const bpf_map_types[] = { #define BPF_PROG_TYPE(_id, _name, prog_ctx_type, kern_ctx_type) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 757476c91c98..c6a27574242d 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6409,18 +6409,10 @@ enum { }; static int retrieve_ptr_limit(const struct bpf_reg_state *ptr_reg, - const struct bpf_reg_state *off_reg, - u32 *alu_limit, u8 opcode) + u32 *alu_limit, bool mask_to_left) { - bool off_is_neg = off_reg->smin_value < 0; - bool mask_to_left = (opcode == BPF_ADD && off_is_neg) || - (opcode == BPF_SUB && !off_is_neg); u32 max = 0, ptr_limit = 0; - if (!tnum_is_const(off_reg->var_off) && - (off_reg->smin_value < 0) != (off_reg->smax_value < 0)) - return REASON_BOUNDS; - switch (ptr_reg->type) { case PTR_TO_STACK: /* Offset 0 is out-of-bounds, but acceptable start for the @@ -6486,15 +6478,41 @@ static bool sanitize_needed(u8 opcode) return opcode == BPF_ADD || opcode == BPF_SUB; } +struct bpf_sanitize_info { + struct bpf_insn_aux_data aux; + bool mask_to_left; +}; + +static struct bpf_verifier_state * +sanitize_speculative_path(struct bpf_verifier_env *env, + const struct bpf_insn *insn, + u32 next_idx, u32 curr_idx) +{ + struct bpf_verifier_state *branch; + struct bpf_reg_state *regs; + + branch = push_stack(env, next_idx, curr_idx, true); + if (branch && insn) { + regs = branch->frame[branch->curframe]->regs; + if (BPF_SRC(insn->code) == BPF_K) { + mark_reg_unknown(env, regs, insn->dst_reg); + } else if (BPF_SRC(insn->code) == BPF_X) { + mark_reg_unknown(env, regs, insn->dst_reg); + mark_reg_unknown(env, regs, insn->src_reg); + } + } + return branch; +} + static int sanitize_ptr_alu(struct bpf_verifier_env *env, struct bpf_insn *insn, const struct bpf_reg_state *ptr_reg, const struct bpf_reg_state *off_reg, struct bpf_reg_state *dst_reg, - struct bpf_insn_aux_data *tmp_aux, + struct bpf_sanitize_info *info, 
const bool commit_window) { - struct bpf_insn_aux_data *aux = commit_window ? cur_aux(env) : tmp_aux; + struct bpf_insn_aux_data *aux = commit_window ? cur_aux(env) : &info->aux; struct bpf_verifier_state *vstate = env->cur_state; bool off_is_imm = tnum_is_const(off_reg->var_off); bool off_is_neg = off_reg->smin_value < 0; @@ -6515,7 +6533,16 @@ static int sanitize_ptr_alu(struct bpf_verifier_env *env, if (vstate->speculative) goto do_sim; - err = retrieve_ptr_limit(ptr_reg, off_reg, &alu_limit, opcode); + if (!commit_window) { + if (!tnum_is_const(off_reg->var_off) && + (off_reg->smin_value < 0) != (off_reg->smax_value < 0)) + return REASON_BOUNDS; + + info->mask_to_left = (opcode == BPF_ADD && off_is_neg) || + (opcode == BPF_SUB && !off_is_neg); + } + + err = retrieve_ptr_limit(ptr_reg, &alu_limit, info->mask_to_left); if (err < 0) return err; @@ -6523,8 +6550,8 @@ static int sanitize_ptr_alu(struct bpf_verifier_env *env, /* In commit phase we narrow the masking window based on * the observed pointer move after the simulated operation. */ - alu_state = tmp_aux->alu_state; - alu_limit = abs(tmp_aux->alu_limit - alu_limit); + alu_state = info->aux.alu_state; + alu_limit = abs(info->aux.alu_limit - alu_limit); } else { alu_state = off_is_neg ? BPF_ALU_NEG_VALUE : 0; alu_state |= off_is_imm ? BPF_ALU_IMMEDIATE : 0; @@ -6539,8 +6566,12 @@ do_sim: /* If we're in commit phase, we're done here given we already * pushed the truncated dst_reg into the speculative verification * stack. + * + * Also, when register is a known constant, we rewrite register-based + * operation to immediate-based, and thus do not need masking (and as + * a consequence, do not need to simulate the zero-truncation either). 
*/ - if (commit_window) + if (commit_window || off_is_imm) return 0; /* Simulate and find potential out-of-bounds access under @@ -6556,12 +6587,26 @@ do_sim: tmp = *dst_reg; *dst_reg = *ptr_reg; } - ret = push_stack(env, env->insn_idx + 1, env->insn_idx, true); + ret = sanitize_speculative_path(env, NULL, env->insn_idx + 1, + env->insn_idx); if (!ptr_is_dst_reg && ret) *dst_reg = tmp; return !ret ? REASON_STACK : 0; } +static void sanitize_mark_insn_seen(struct bpf_verifier_env *env) +{ + struct bpf_verifier_state *vstate = env->cur_state; + + /* If we simulate paths under speculation, we don't update the + * insn as 'seen' such that when we verify unreachable paths in + * the non-speculative domain, sanitize_dead_code() can still + * rewrite/sanitize them. + */ + if (!vstate->speculative) + env->insn_aux_data[env->insn_idx].seen = env->pass_cnt; +} + static int sanitize_err(struct bpf_verifier_env *env, const struct bpf_insn *insn, int reason, const struct bpf_reg_state *off_reg, @@ -6685,7 +6730,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, smin_ptr = ptr_reg->smin_value, smax_ptr = ptr_reg->smax_value; u64 umin_val = off_reg->umin_value, umax_val = off_reg->umax_value, umin_ptr = ptr_reg->umin_value, umax_ptr = ptr_reg->umax_value; - struct bpf_insn_aux_data tmp_aux = {}; + struct bpf_sanitize_info info = {}; u8 opcode = BPF_OP(insn->code); u32 dst = insn->dst_reg; int ret; @@ -6754,7 +6799,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, if (sanitize_needed(opcode)) { ret = sanitize_ptr_alu(env, insn, ptr_reg, off_reg, dst_reg, - &tmp_aux, false); + &info, false); if (ret < 0) return sanitize_err(env, insn, ret, off_reg, dst_reg); } @@ -6895,7 +6940,7 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, return -EACCES; if (sanitize_needed(opcode)) { ret = sanitize_ptr_alu(env, insn, dst_reg, off_reg, dst_reg, - &tmp_aux, true); + &info, true); if (ret < 0) return sanitize_err(env, insn, ret, off_reg, 
dst_reg); } @@ -7084,11 +7129,10 @@ static void scalar32_min_max_and(struct bpf_reg_state *dst_reg, s32 smin_val = src_reg->s32_min_value; u32 umax_val = src_reg->u32_max_value; - /* Assuming scalar64_min_max_and will be called so its safe - * to skip updating register for known 32-bit case. - */ - if (src_known && dst_known) + if (src_known && dst_known) { + __mark_reg32_known(dst_reg, var32_off.value); return; + } /* We get our minimum from the var_off, since that's inherently * bitwise. Our maximum is the minimum of the operands' maxima. @@ -7108,7 +7152,6 @@ static void scalar32_min_max_and(struct bpf_reg_state *dst_reg, dst_reg->s32_min_value = dst_reg->u32_min_value; dst_reg->s32_max_value = dst_reg->u32_max_value; } - } static void scalar_min_max_and(struct bpf_reg_state *dst_reg, @@ -7155,11 +7198,10 @@ static void scalar32_min_max_or(struct bpf_reg_state *dst_reg, s32 smin_val = src_reg->s32_min_value; u32 umin_val = src_reg->u32_min_value; - /* Assuming scalar64_min_max_or will be called so it is safe - * to skip updating register for known case. - */ - if (src_known && dst_known) + if (src_known && dst_known) { + __mark_reg32_known(dst_reg, var32_off.value); return; + } /* We get our maximum from the var_off, and our minimum is the * maximum of the operands' minima @@ -7224,11 +7266,10 @@ static void scalar32_min_max_xor(struct bpf_reg_state *dst_reg, struct tnum var32_off = tnum_subreg(dst_reg->var_off); s32 smin_val = src_reg->s32_min_value; - /* Assuming scalar64_min_max_xor will be called so it is safe - * to skip updating register for known case. - */ - if (src_known && dst_known) + if (src_known && dst_known) { + __mark_reg32_known(dst_reg, var32_off.value); return; + } /* We get both minimum and maximum from the var32_off. 
*/ dst_reg->u32_min_value = var32_off.value; @@ -8744,14 +8785,28 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, if (err) return err; } + if (pred == 1) { - /* only follow the goto, ignore fall-through */ + /* Only follow the goto, ignore fall-through. If needed, push + * the fall-through branch for simulation under speculative + * execution. + */ + if (!env->bypass_spec_v1 && + !sanitize_speculative_path(env, insn, *insn_idx + 1, + *insn_idx)) + return -EFAULT; *insn_idx += insn->off; return 0; } else if (pred == 0) { - /* only follow fall-through branch, since - * that's where the program will go + /* Only follow the fall-through branch, since that's where the + * program will go. If needed, push the goto branch for + * simulation under speculative execution. */ + if (!env->bypass_spec_v1 && + !sanitize_speculative_path(env, insn, + *insn_idx + insn->off + 1, + *insn_idx)) + return -EFAULT; return 0; } @@ -10624,7 +10679,7 @@ static int do_check(struct bpf_verifier_env *env) } regs = cur_regs(env); - env->insn_aux_data[env->insn_idx].seen = env->pass_cnt; + sanitize_mark_insn_seen(env); prev_insn_idx = env->insn_idx; if (class == BPF_ALU || class == BPF_ALU64) { @@ -10851,7 +10906,7 @@ process_bpf_exit: return err; env->insn_idx++; - env->insn_aux_data[env->insn_idx].seen = env->pass_cnt; + sanitize_mark_insn_seen(env); } else { verbose(env, "invalid BPF_LD mode\n"); return -EINVAL; @@ -11360,6 +11415,7 @@ static int adjust_insn_aux_data(struct bpf_verifier_env *env, { struct bpf_insn_aux_data *new_data, *old_data = env->insn_aux_data; struct bpf_insn *insn = new_prog->insnsi; + u32 old_seen = old_data[off].seen; u32 prog_len; int i; @@ -11380,7 +11436,8 @@ static int adjust_insn_aux_data(struct bpf_verifier_env *env, memcpy(new_data + off + cnt - 1, old_data + off, sizeof(struct bpf_insn_aux_data) * (prog_len - off - cnt + 1)); for (i = off; i < off + cnt - 1; i++) { - new_data[i].seen = env->pass_cnt; + /* Expand insni[off]'s seen count to the 
patched range. */ + new_data[i].seen = old_seen; new_data[i].zext_dst = insn_has_def32(env, insn + i); } env->insn_aux_data = new_data; @@ -12704,6 +12761,9 @@ static void free_states(struct bpf_verifier_env *env) * insn_aux_data was touched. These variables are compared to clear temporary * data from failed pass. For testing and experiments do_check_common() can be * run multiple times even when prior attempt to verify is unsuccessful. + * + * Note that special handling is needed on !env->bypass_spec_v1 if this is + * ever called outside of error path with subsequent program rejection. */ static void sanitize_insn_aux_data(struct bpf_verifier_env *env) { @@ -13200,6 +13260,17 @@ int bpf_check_attach_target(struct bpf_verifier_log *log, return 0; } +BTF_SET_START(btf_id_deny) +BTF_ID_UNUSED +#ifdef CONFIG_SMP +BTF_ID(func, migrate_disable) +BTF_ID(func, migrate_enable) +#endif +#if !defined CONFIG_PREEMPT_RCU && !defined CONFIG_TINY_RCU +BTF_ID(func, rcu_read_unlock_strict) +#endif +BTF_SET_END(btf_id_deny) + static int check_attach_btf_id(struct bpf_verifier_env *env) { struct bpf_prog *prog = env->prog; @@ -13259,6 +13330,9 @@ static int check_attach_btf_id(struct bpf_verifier_env *env) ret = bpf_lsm_verify_prog(&env->log, prog); if (ret < 0) return ret; + } else if (prog->type == BPF_PROG_TYPE_TRACING && + btf_id_set_contains(&btf_id_deny, btf_id)) { + return -EINVAL; } key = bpf_trampoline_compute_key(tgt_prog, prog->aux->attach_btf, btf_id); @@ -13358,12 +13432,6 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, if (is_priv) env->test_state_freq = attr->prog_flags & BPF_F_TEST_STATE_FREQ; - if (bpf_prog_is_dev_bound(env->prog->aux)) { - ret = bpf_prog_offload_verifier_prep(env->prog); - if (ret) - goto skip_full_check; - } - env->explored_states = kvcalloc(state_htab_size(env), sizeof(struct bpf_verifier_state_list *), GFP_USER); @@ -13391,6 +13459,12 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr, if (ret < 0) goto skip_full_check; 
+ if (bpf_prog_is_dev_bound(env->prog->aux)) { + ret = bpf_prog_offload_verifier_prep(env->prog); + if (ret) + goto skip_full_check; + } + ret = check_cfg(env); if (ret < 0) goto skip_full_check; diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 391aa570369b..1f274d7fc934 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -820,6 +820,10 @@ static int cgroup1_rename(struct kernfs_node *kn, struct kernfs_node *new_parent struct cgroup *cgrp = kn->priv; int ret; + /* do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable */ + if (strchr(new_name_str, '\n')) + return -EINVAL; + if (kernfs_type(kn) != KERNFS_DIR) return -ENOTDIR; if (kn->parent != new_parent) @@ -1001,7 +1005,7 @@ static int check_cgroupfs_options(struct fs_context *fc) ctx->subsys_mask &= enabled; /* - * In absense of 'none', 'name=' or subsystem name options, + * In absence of 'none', 'name=' and subsystem name options, * let's default to 'all'. */ if (!ctx->subsys_mask && !ctx->none && !ctx->name) diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index e049edd66776..21ecc6ee6a6d 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -468,7 +468,7 @@ static struct cgroup_subsys_state *cgroup_css(struct cgroup *cgrp, * @cgrp: the cgroup of interest * @ss: the subsystem of interest * - * Find and get @cgrp's css assocaited with @ss. If the css doesn't exist + * Find and get @cgrp's css associated with @ss. If the css doesn't exist * or is offline, %NULL is returned. */ static struct cgroup_subsys_state *cgroup_tryget_css(struct cgroup *cgrp, @@ -1633,7 +1633,7 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) /** * css_clear_dir - remove subsys files in a cgroup directory - * @css: taget css + * @css: target css */ static void css_clear_dir(struct cgroup_subsys_state *css) { @@ -5350,7 +5350,7 @@ out_unlock: /* * This is called when the refcnt of a css is confirmed to be killed. 
* css_tryget_online() is now guaranteed to fail. Tell the subsystem to - * initate destruction and put the css ref from kill_css(). + * initiate destruction and put the css ref from kill_css(). */ static void css_killed_work_fn(struct work_struct *work) { @@ -5634,8 +5634,6 @@ int __init cgroup_init_early(void) return 0; } -static u16 cgroup_disable_mask __initdata; - /** * cgroup_init - cgroup initialization * @@ -5694,12 +5692,8 @@ int __init cgroup_init(void) * disabled flag and cftype registration needs kmalloc, * both of which aren't available during early_init. */ - if (cgroup_disable_mask & (1 << ssid)) { - static_branch_disable(cgroup_subsys_enabled_key[ssid]); - printk(KERN_INFO "Disabling %s control group subsystem\n", - ss->name); + if (!cgroup_ssid_enabled(ssid)) continue; - } if (cgroup1_ssid_disabled(ssid)) printk(KERN_INFO "Disabling %s control group subsystem in v1 mounts\n", @@ -6058,7 +6052,7 @@ out_revert: * @kargs: the arguments passed to create the child process * * This calls the cancel_fork() callbacks if a fork failed *after* - * cgroup_can_fork() succeded and cleans up references we took to + * cgroup_can_fork() succeeded and cleans up references we took to * prepare a new css_set for the child process in cgroup_can_fork(). */ void cgroup_cancel_fork(struct task_struct *child, @@ -6214,7 +6208,10 @@ static int __init cgroup_disable(char *str) if (strcmp(token, ss->name) && strcmp(token, ss->legacy_name)) continue; - cgroup_disable_mask |= 1 << i; + + static_branch_disable(cgroup_subsys_enabled_key[i]); + pr_info("Disabling %s control group subsystem\n", + ss->name); } } return 1; diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index a945504c0ae7..adb5190c4429 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -3376,7 +3376,7 @@ nodemask_t cpuset_mems_allowed(struct task_struct *tsk) } /** - * cpuset_nodemask_valid_mems_allowed - check nodemask vs. 
curremt mems_allowed + * cpuset_nodemask_valid_mems_allowed - check nodemask vs. current mems_allowed * @nodemask: the nodemask to be checked * * Are any of the nodes in the nodemask allowed in current->mems_allowed? diff --git a/kernel/cgroup/rdma.c b/kernel/cgroup/rdma.c index ae042c347c64..3135406608c7 100644 --- a/kernel/cgroup/rdma.c +++ b/kernel/cgroup/rdma.c @@ -244,7 +244,7 @@ EXPORT_SYMBOL(rdmacg_uncharge); * This function follows charging resource in hierarchical way. * It will fail if the charge would cause the new value to exceed the * hierarchical limit. - * Returns 0 if the charge succeded, otherwise -EAGAIN, -ENOMEM or -EINVAL. + * Returns 0 if the charge succeeded, otherwise -EAGAIN, -ENOMEM or -EINVAL. * Returns pointer to rdmacg for this resource when charging is successful. * * Charger needs to account resources on two criteria. diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index 3a3fd2993a65..cee265cb535c 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -75,7 +75,7 @@ void cgroup_rstat_updated(struct cgroup *cgrp, int cpu) * @root: root of the tree to traversal * @cpu: target cpu * - * Walks the udpated rstat_cpu tree on @cpu from @root. %NULL @pos starts + * Walks the updated rstat_cpu tree on @cpu from @root. %NULL @pos starts * the traversal and %NULL return indicates the end. During traversal, * each returned cgroup is unlinked from the tree. Must be called with the * matching cgroup_rstat_cpu_lock held. 
diff --git a/kernel/crash_core.c b/kernel/crash_core.c index 825284baaf46..684a6061a13a 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -464,6 +464,7 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS); VMCOREINFO_STRUCT_SIZE(mem_section); VMCOREINFO_OFFSET(mem_section, section_mem_map); + VMCOREINFO_NUMBER(SECTION_SIZE_BITS); VMCOREINFO_NUMBER(MAX_PHYSMEM_BITS); #endif VMCOREINFO_STRUCT_SIZE(page); diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 8ca7d505d61c..e50df8d8f87e 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -335,6 +335,14 @@ void __init swiotlb_exit(void) } /* + * Return the offset into a iotlb slot required to keep the device happy. + */ +static unsigned int swiotlb_align_offset(struct device *dev, u64 addr) +{ + return addr & dma_get_min_align_mask(dev) & (IO_TLB_SIZE - 1); +} + +/* * Bounce: copy the swiotlb buffer from or back to the original dma location */ static void swiotlb_bounce(struct device *dev, phys_addr_t tlb_addr, size_t size, @@ -346,10 +354,17 @@ static void swiotlb_bounce(struct device *dev, phys_addr_t tlb_addr, size_t size size_t alloc_size = mem->slots[index].alloc_size; unsigned long pfn = PFN_DOWN(orig_addr); unsigned char *vaddr = phys_to_virt(tlb_addr); + unsigned int tlb_offset; if (orig_addr == INVALID_PHYS_ADDR) return; + tlb_offset = (tlb_addr & (IO_TLB_SIZE - 1)) - + swiotlb_align_offset(dev, orig_addr); + + orig_addr += tlb_offset; + alloc_size -= tlb_offset; + if (size > alloc_size) { dev_WARN_ONCE(dev, 1, "Buffer overflow detected. Allocation size: %zu. Mapping size: %zu.\n", @@ -391,14 +406,6 @@ static void swiotlb_bounce(struct device *dev, phys_addr_t tlb_addr, size_t size #define slot_addr(start, idx) ((start) + ((idx) << IO_TLB_SHIFT)) /* - * Return the offset into a iotlb slot required to keep the device happy. 
- */ -static unsigned int swiotlb_align_offset(struct device *dev, u64 addr) -{ - return addr & dma_get_min_align_mask(dev) & (IO_TLB_SIZE - 1); -} - -/* * Carefully handle integer overflow which can occur when boundary_mask == ~0UL. */ static inline unsigned long get_max_slots(unsigned long boundary_mask) diff --git a/kernel/entry/common.c b/kernel/entry/common.c index a0b3b04fb596..bf16395b9e13 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -5,6 +5,7 @@ #include <linux/highmem.h> #include <linux/livepatch.h> #include <linux/audit.h> +#include <linux/tick.h> #include "common.h" @@ -186,7 +187,7 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs, local_irq_disable_exit_to_user(); /* Check if any of the above work has queued a deferred wakeup */ - rcu_nocb_flush_deferred_wakeup(); + tick_nohz_user_enter_prepare(); ti_work = READ_ONCE(current_thread_info()->flags); } @@ -202,7 +203,7 @@ static void exit_to_user_mode_prepare(struct pt_regs *regs) lockdep_assert_irqs_disabled(); /* Flush pending rcuog wakeup before the last need_resched() check */ - rcu_nocb_flush_deferred_wakeup(); + tick_nohz_user_enter_prepare(); if (unlikely(ti_work & EXIT_TO_USER_MODE_WORK)) ti_work = exit_to_user_mode_loop(regs, ti_work); diff --git a/kernel/events/core.c b/kernel/events/core.c index 2e947a485898..fe88d6eea3c2 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4609,7 +4609,9 @@ find_get_context(struct pmu *pmu, struct task_struct *task, cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); ctx = &cpuctx->ctx; get_ctx(ctx); + raw_spin_lock_irqsave(&ctx->lock, flags); ++ctx->pin_count; + raw_spin_unlock_irqrestore(&ctx->lock, flags); return ctx; } @@ -6389,8 +6391,6 @@ void perf_event_wakeup(struct perf_event *event) static void perf_sigtrap(struct perf_event *event) { - struct kernel_siginfo info; - /* * We'd expect this to only occur if the irq_work is delayed and either * ctx->task or current has changed in the meantime. 
This can be the @@ -6405,13 +6405,8 @@ static void perf_sigtrap(struct perf_event *event) if (current->flags & PF_EXITING) return; - clear_siginfo(&info); - info.si_signo = SIGTRAP; - info.si_code = TRAP_PERF; - info.si_errno = event->attr.type; - info.si_perf = event->attr.sig_data; - info.si_addr = (void __user *)event->pending_addr; - force_sig_info(&info); + force_sig_perf((void __user *)event->pending_addr, + event->attr.type, event->attr.sig_data); } static void perf_pending_event_disable(struct perf_event *event) diff --git a/kernel/exit.c b/kernel/exit.c index fd1c04193e18..65809fac3038 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -162,7 +162,6 @@ static void __exit_signal(struct task_struct *tsk) flush_sigqueue(&sig->shared_pending); tty_kref_put(tty); } - exit_task_sigqueue_cache(tsk); } static void delayed_put_task_struct(struct rcu_head *rhp) diff --git a/kernel/fork.c b/kernel/fork.c index dc06afd725cb..a070caed5c8e 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2008,7 +2008,6 @@ static __latent_entropy struct task_struct *copy_process( spin_lock_init(&p->alloc_lock); init_sigpending(&p->pending); - p->sigqueue_cache = NULL; p->utime = p->stime = p->gtime = 0; #ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME diff --git a/kernel/futex.c b/kernel/futex.c index f832b6434625..2ecb07575055 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -35,7 +35,6 @@ #include <linux/jhash.h> #include <linux/pagemap.h> #include <linux/syscalls.h> -#include <linux/hugetlb.h> #include <linux/freezer.h> #include <linux/memblock.h> #include <linux/fault-inject.h> @@ -650,7 +649,7 @@ again: key->both.offset |= FUT_OFF_INODE; /* inode-based key */ key->shared.i_seq = get_inode_sequence_number(inode); - key->shared.pgoff = basepage_index(tail); + key->shared.pgoff = page_to_pgoff(tail); rcu_read_unlock(); } diff --git a/kernel/irq_work.c b/kernel/irq_work.c index 23a7a0ba1388..db8c248ebc8c 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c @@ -70,9 +70,6 @@ bool 
irq_work_queue(struct irq_work *work) if (!irq_work_claim(work)) return false; - /*record irq_work call stack in order to print it in KASAN reports*/ - kasan_record_aux_stack(work); - /* Queue the entry and raise the IPI if needed. */ preempt_disable(); __irq_work_queue_local(work); diff --git a/kernel/jump_label.c b/kernel/jump_label.c index ba39fbb1f8e7..bdb0681bece8 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -309,7 +309,7 @@ EXPORT_SYMBOL_GPL(jump_label_rate_limit); static int addr_conflict(struct jump_entry *entry, void *start, void *end) { if (jump_entry_code(entry) <= (unsigned long)end && - jump_entry_code(entry) + JUMP_LABEL_NOP_SIZE > (unsigned long)start) + jump_entry_code(entry) + jump_entry_size(entry) > (unsigned long)start) return 1; return 0; @@ -483,13 +483,14 @@ void __init jump_label_init(void) for (iter = iter_start; iter < iter_stop; iter++) { struct static_key *iterk; + bool in_init; /* rewrite NOPs */ if (jump_label_type(iter) == JUMP_LABEL_NOP) arch_jump_label_transform_static(iter, JUMP_LABEL_NOP); - if (init_section_contains((void *)jump_entry_code(iter), 1)) - jump_entry_set_init(iter); + in_init = init_section_contains((void *)jump_entry_code(iter), 1); + jump_entry_set_init(iter, in_init); iterk = jump_entry_key(iter); if (iterk == key) @@ -634,9 +635,10 @@ static int jump_label_add_module(struct module *mod) for (iter = iter_start; iter < iter_stop; iter++) { struct static_key *iterk; + bool in_init; - if (within_module_init(jump_entry_code(iter), mod)) - jump_entry_set_init(iter); + in_init = within_module_init(jump_entry_code(iter), mod); + jump_entry_set_init(iter, in_init); iterk = jump_entry_key(iter); if (iterk == key) diff --git a/kernel/kcsan/debugfs.c b/kernel/kcsan/debugfs.c index c1dd02f3be8b..e65de172ccf7 100644 --- a/kernel/kcsan/debugfs.c +++ b/kernel/kcsan/debugfs.c @@ -266,9 +266,10 @@ static const struct file_operations debugfs_ops = .release = single_release }; -static void __init 
kcsan_debugfs_init(void) +static int __init kcsan_debugfs_init(void) { debugfs_create_file("kcsan", 0644, NULL, NULL, &debugfs_ops); + return 0; } late_initcall(kcsan_debugfs_init); diff --git a/kernel/kthread.c b/kernel/kthread.c index fe3f2a40d61e..0fccf7d0c6a1 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -1093,8 +1093,38 @@ void kthread_flush_work(struct kthread_work *work) EXPORT_SYMBOL_GPL(kthread_flush_work); /* - * This function removes the work from the worker queue. Also it makes sure - * that it won't get queued later via the delayed work's timer. + * Make sure that the timer is neither set nor running and could + * not manipulate the work list_head any longer. + * + * The function is called under worker->lock. The lock is temporary + * released but the timer can't be set again in the meantime. + */ +static void kthread_cancel_delayed_work_timer(struct kthread_work *work, + unsigned long *flags) +{ + struct kthread_delayed_work *dwork = + container_of(work, struct kthread_delayed_work, work); + struct kthread_worker *worker = work->worker; + + /* + * del_timer_sync() must be called to make sure that the timer + * callback is not running. The lock must be temporary released + * to avoid a deadlock with the callback. In the meantime, + * any queuing is blocked by setting the canceling counter. + */ + work->canceling++; + raw_spin_unlock_irqrestore(&worker->lock, *flags); + del_timer_sync(&dwork->timer); + raw_spin_lock_irqsave(&worker->lock, *flags); + work->canceling--; +} + +/* + * This function removes the work from the worker queue. + * + * It is called under worker->lock. The caller must make sure that + * the timer used by delayed work is not running, e.g. by calling + * kthread_cancel_delayed_work_timer(). * * The work might still be in use when this function finishes. See the * current_work proceed by the worker. 
@@ -1102,28 +1132,8 @@ EXPORT_SYMBOL_GPL(kthread_flush_work); * Return: %true if @work was pending and successfully canceled, * %false if @work was not pending */ -static bool __kthread_cancel_work(struct kthread_work *work, bool is_dwork, - unsigned long *flags) +static bool __kthread_cancel_work(struct kthread_work *work) { - /* Try to cancel the timer if exists. */ - if (is_dwork) { - struct kthread_delayed_work *dwork = - container_of(work, struct kthread_delayed_work, work); - struct kthread_worker *worker = work->worker; - - /* - * del_timer_sync() must be called to make sure that the timer - * callback is not running. The lock must be temporary released - * to avoid a deadlock with the callback. In the meantime, - * any queuing is blocked by setting the canceling counter. - */ - work->canceling++; - raw_spin_unlock_irqrestore(&worker->lock, *flags); - del_timer_sync(&dwork->timer); - raw_spin_lock_irqsave(&worker->lock, *flags); - work->canceling--; - } - /* * Try to remove the work from a worker list. It might either * be from worker->work_list or from worker->delayed_work_list. @@ -1176,11 +1186,23 @@ bool kthread_mod_delayed_work(struct kthread_worker *worker, /* Work must not be used with >1 worker, see kthread_queue_work() */ WARN_ON_ONCE(work->worker != worker); - /* Do not fight with another command that is canceling this work. */ + /* + * Temporary cancel the work but do not fight with another command + * that is canceling the work as well. + * + * It is a bit tricky because of possible races with another + * mod_delayed_work() and cancel_delayed_work() callers. + * + * The timer must be canceled first because worker->lock is released + * when doing so. But the work can be removed from the queue (list) + * only when it can be queued again so that the return value can + * be used for reference counting. 
+ */ + kthread_cancel_delayed_work_timer(work, &flags); if (work->canceling) goto out; + ret = __kthread_cancel_work(work); - ret = __kthread_cancel_work(work, true, &flags); fast_queue: __kthread_queue_delayed_work(worker, dwork, delay); out: @@ -1202,7 +1224,10 @@ static bool __kthread_cancel_work_sync(struct kthread_work *work, bool is_dwork) /* Work must not be used with >1 worker, see kthread_queue_work(). */ WARN_ON_ONCE(work->worker != worker); - ret = __kthread_cancel_work(work, is_dwork, &flags); + if (is_dwork) + kthread_cancel_delayed_work_timer(work, &flags); + + ret = __kthread_cancel_work(work); if (worker->current_work != work) goto out_fast; diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 095c87f97a31..0c0524bfff99 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -843,7 +843,7 @@ static int count_matching_names(struct lock_class *new_class) } /* used from NMI context -- must be lockless */ -static __always_inline struct lock_class * +static noinstr struct lock_class * look_up_lock_class(const struct lockdep_map *lock, unsigned int subclass) { struct lockdep_subclass_key *key; @@ -851,12 +851,14 @@ look_up_lock_class(const struct lockdep_map *lock, unsigned int subclass) struct lock_class *class; if (unlikely(subclass >= MAX_LOCKDEP_SUBCLASSES)) { + instrumentation_begin(); debug_locks_off(); printk(KERN_ERR "BUG: looking up invalid subclass: %u\n", subclass); printk(KERN_ERR "turning off the locking correctness validator.\n"); dump_stack(); + instrumentation_end(); return NULL; } @@ -5847,7 +5849,7 @@ void lock_contended(struct lockdep_map *lock, unsigned long ip) { unsigned long flags; - trace_lock_acquired(lock, ip); + trace_lock_contended(lock, ip); if (unlikely(!lock_stat || !lockdep_enabled())) return; @@ -5865,7 +5867,7 @@ void lock_acquired(struct lockdep_map *lock, unsigned long ip) { unsigned long flags; - trace_lock_contended(lock, ip); + trace_lock_acquired(lock, ip); if (unlikely(!lock_stat 
|| !lockdep_enabled())) return; diff --git a/kernel/locking/mutex-debug.c b/kernel/locking/mutex-debug.c index a7276aaf2abc..db9301591e3f 100644 --- a/kernel/locking/mutex-debug.c +++ b/kernel/locking/mutex-debug.c @@ -57,7 +57,7 @@ void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter, task->blocked_on = waiter; } -void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter, +void debug_mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter, struct task_struct *task) { DEBUG_LOCKS_WARN_ON(list_empty(&waiter->list)); @@ -65,7 +65,7 @@ void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter, DEBUG_LOCKS_WARN_ON(task->blocked_on != waiter); task->blocked_on = NULL; - list_del_init(&waiter->list); + INIT_LIST_HEAD(&waiter->list); waiter->task = NULL; } diff --git a/kernel/locking/mutex-debug.h b/kernel/locking/mutex-debug.h index 1edd3f45a4ec..53e631e1d76d 100644 --- a/kernel/locking/mutex-debug.h +++ b/kernel/locking/mutex-debug.h @@ -22,7 +22,7 @@ extern void debug_mutex_free_waiter(struct mutex_waiter *waiter); extern void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter, struct task_struct *task); -extern void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter, +extern void debug_mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter, struct task_struct *task); extern void debug_mutex_unlock(struct mutex *lock); extern void debug_mutex_init(struct mutex *lock, const char *name, diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index cb6b112ce155..013e1b08a1bf 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -194,7 +194,7 @@ static inline bool __mutex_waiter_is_first(struct mutex *lock, struct mutex_wait * Add @waiter to a given location in the lock wait_list and set the * FLAG_WAITERS flag if it's the first waiter. 
*/ -static void __sched +static void __mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter, struct list_head *list) { @@ -205,6 +205,16 @@ __mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter, __mutex_set_flag(lock, MUTEX_FLAG_WAITERS); } +static void +__mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter) +{ + list_del(&waiter->list); + if (likely(list_empty(&lock->wait_list))) + __mutex_clear_flag(lock, MUTEX_FLAGS); + + debug_mutex_remove_waiter(lock, waiter, current); +} + /* * Give up ownership to a specific task, when @task = NULL, this is equivalent * to a regular unlock. Sets PICKUP on a handoff, clears HANDOFF, preserves @@ -1061,9 +1071,7 @@ acquired: __ww_mutex_check_waiters(lock, ww_ctx); } - mutex_remove_waiter(lock, &waiter, current); - if (likely(list_empty(&lock->wait_list))) - __mutex_clear_flag(lock, MUTEX_FLAGS); + __mutex_remove_waiter(lock, &waiter); debug_mutex_free_waiter(&waiter); @@ -1080,7 +1088,7 @@ skip_wait: err: __set_current_state(TASK_RUNNING); - mutex_remove_waiter(lock, &waiter, current); + __mutex_remove_waiter(lock, &waiter); err_early_kill: spin_unlock(&lock->wait_lock); debug_mutex_free_waiter(&waiter); diff --git a/kernel/locking/mutex.h b/kernel/locking/mutex.h index 1c2287d3fa71..f0c710b1d192 100644 --- a/kernel/locking/mutex.h +++ b/kernel/locking/mutex.h @@ -10,12 +10,10 @@ * !CONFIG_DEBUG_MUTEXES case. 
Most of them are NOPs: */ -#define mutex_remove_waiter(lock, waiter, task) \ - __list_del((waiter)->list.prev, (waiter)->list.next) - #define debug_mutex_wake_waiter(lock, waiter) do { } while (0) #define debug_mutex_free_waiter(waiter) do { } while (0) #define debug_mutex_add_waiter(lock, waiter, ti) do { } while (0) +#define debug_mutex_remove_waiter(lock, waiter, ti) do { } while (0) #define debug_mutex_unlock(lock) do { } while (0) #define debug_mutex_init(lock, name, key) do { } while (0) diff --git a/kernel/module.c b/kernel/module.c index b5dd92e35b02..927d46cb8eb9 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -266,9 +266,18 @@ static void module_assert_mutex_or_preempt(void) #endif } +#ifdef CONFIG_MODULE_SIG static bool sig_enforce = IS_ENABLED(CONFIG_MODULE_SIG_FORCE); module_param(sig_enforce, bool_enable_only, 0644); +void set_module_sig_enforced(void) +{ + sig_enforce = true; +} +#else +#define sig_enforce false +#endif + /* * Export sig_enforce kernel cmdline parameter to allow other subsystems rely * on that instead of directly to CONFIG_MODULE_SIG_FORCE config. @@ -279,11 +288,6 @@ bool is_module_sig_enforced(void) } EXPORT_SYMBOL(is_module_sig_enforced); -void set_module_sig_enforced(void) -{ - sig_enforce = true; -} - /* Block module loading/unloading? */ int modules_disabled = 0; core_param(nomodule, modules_disabled, bint, 0); @@ -2401,6 +2405,15 @@ static long get_offset(struct module *mod, unsigned int *size, return ret; } +static bool module_init_layout_section(const char *sname) +{ +#ifndef CONFIG_MODULE_UNLOAD + if (module_exit_section(sname)) + return true; +#endif + return module_init_section(sname); +} + /* * Lay out the SHF_ALLOC sections in a way not dissimilar to how ld * might -- code, read-only data, read-write data, small data. 
Tally @@ -2435,7 +2448,7 @@ static void layout_sections(struct module *mod, struct load_info *info) if ((s->sh_flags & masks[m][0]) != masks[m][0] || (s->sh_flags & masks[m][1]) || s->sh_entsize != ~0UL - || module_init_section(sname)) + || module_init_layout_section(sname)) continue; s->sh_entsize = get_offset(mod, &mod->core_layout.size, s, i); pr_debug("\t%s\n", sname); @@ -2468,7 +2481,7 @@ static void layout_sections(struct module *mod, struct load_info *info) if ((s->sh_flags & masks[m][0]) != masks[m][0] || (s->sh_flags & masks[m][1]) || s->sh_entsize != ~0UL - || !module_init_section(sname)) + || !module_init_layout_section(sname)) continue; s->sh_entsize = (get_offset(mod, &mod->init_layout.size, s, i) | INIT_OFFSET_MASK); @@ -2807,11 +2820,7 @@ void * __weak module_alloc(unsigned long size) bool __weak module_init_section(const char *name) { -#ifndef CONFIG_MODULE_UNLOAD - return strstarts(name, ".init") || module_exit_section(name); -#else return strstarts(name, ".init"); -#endif } bool __weak module_exit_section(const char *name) diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c index 7a1414622051..94232186fccb 100644 --- a/kernel/printk/printk_safe.c +++ b/kernel/printk/printk_safe.c @@ -391,6 +391,7 @@ asmlinkage int vprintk(const char *fmt, va_list args) /* No obstacles. */ return vprintk_default(fmt, args); } +EXPORT_SYMBOL(vprintk); void __init printk_safe_init(void) { @@ -411,4 +412,3 @@ void __init printk_safe_init(void) /* Flush pending messages that did not have scheduled IRQ works. 
*/ printk_safe_flush(); } -EXPORT_SYMBOL(vprintk); diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 76f09456ec4b..2997ca600d18 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -170,6 +170,21 @@ void __ptrace_unlink(struct task_struct *child) spin_unlock(&child->sighand->siglock); } +static bool looks_like_a_spurious_pid(struct task_struct *task) +{ + if (task->exit_code != ((PTRACE_EVENT_EXEC << 8) | SIGTRAP)) + return false; + + if (task_pid_vnr(task) == task->ptrace_message) + return false; + /* + * The tracee changed its pid but the PTRACE_EVENT_EXEC event + * was not wait()'ed, most probably debugger targets the old + * leader which was destroyed in de_thread(). + */ + return true; +} + /* Ensure that nothing can wake it up, even SIGKILL */ static bool ptrace_freeze_traced(struct task_struct *task) { @@ -180,7 +195,8 @@ static bool ptrace_freeze_traced(struct task_struct *task) return ret; spin_lock_irq(&task->sighand->siglock); - if (task_is_traced(task) && !__fatal_signal_pending(task)) { + if (task_is_traced(task) && !looks_like_a_spurious_pid(task) && + !__fatal_signal_pending(task)) { task->state = __TASK_TRACED; ret = true; } diff --git a/kernel/reboot.c b/kernel/reboot.c index a6ad5eb2fa73..f7440c0c7e43 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c @@ -7,6 +7,7 @@ #define pr_fmt(fmt) "reboot: " fmt +#include <linux/atomic.h> #include <linux/ctype.h> #include <linux/export.h> #include <linux/kexec.h> @@ -518,6 +519,84 @@ void orderly_reboot(void) } EXPORT_SYMBOL_GPL(orderly_reboot); +/** + * hw_failure_emergency_poweroff_func - emergency poweroff work after a known delay + * @work: work_struct associated with the emergency poweroff function + * + * This function is called in very critical situations to force + * a kernel poweroff after a configurable timeout value. + */ +static void hw_failure_emergency_poweroff_func(struct work_struct *work) +{ + /* + * We have reached here after the emergency shutdown waiting period has + * expired. 
This means orderly_poweroff has not been able to shut off + * the system for some reason. + * + * Try to shut down the system immediately using kernel_power_off + * if populated + */ + pr_emerg("Hardware protection timed-out. Trying forced poweroff\n"); + kernel_power_off(); + + /* + * Worst of the worst case trigger emergency restart + */ + pr_emerg("Hardware protection shutdown failed. Trying emergency restart\n"); + emergency_restart(); +} + +static DECLARE_DELAYED_WORK(hw_failure_emergency_poweroff_work, + hw_failure_emergency_poweroff_func); + +/** + * hw_failure_emergency_poweroff - Trigger an emergency system poweroff + * + * This may be called from any critical situation to trigger a system shutdown + * after a given period of time. If time is negative this is not scheduled. + */ +static void hw_failure_emergency_poweroff(int poweroff_delay_ms) +{ + if (poweroff_delay_ms <= 0) + return; + schedule_delayed_work(&hw_failure_emergency_poweroff_work, + msecs_to_jiffies(poweroff_delay_ms)); +} + +/** + * hw_protection_shutdown - Trigger an emergency system poweroff + * + * @reason: Reason of emergency shutdown to be printed. + * @ms_until_forced: Time to wait for orderly shutdown before triggering a + * forced shutdown. Negative value disables the forced + * shutdown. + * + * Initiate an emergency system shutdown in order to protect hardware from + * further damage. Usage examples include a thermal protection or voltage or + * current regulator failures. + * NOTE: The request is ignored if protection shutdown is already pending even + * if the previous request has given a large timeout for forced shutdown. + * Can be called from any context. + */ +void hw_protection_shutdown(const char *reason, int ms_until_forced) +{ + static atomic_t allow_proceed = ATOMIC_INIT(1); + + pr_emerg("HARDWARE PROTECTION shutdown (%s)\n", reason); + + /* Shutdown should be initiated only once.
*/ + if (!atomic_dec_and_test(&allow_proceed)) + return; + + /* + * Queue a backup emergency shutdown in the event of + * orderly_poweroff failure + */ + hw_failure_emergency_poweroff(ms_until_forced); + orderly_poweroff(true); +} +EXPORT_SYMBOL_GPL(hw_protection_shutdown); + static int __init reboot_setup(char *str) { for (;;) { diff --git a/kernel/resource.c b/kernel/resource.c index 028a5ab18818..ca9f5198a01f 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -1805,7 +1805,7 @@ static struct resource *__request_free_mem_region(struct device *dev, REGION_DISJOINT) continue; - if (!__request_region_locked(res, &iomem_resource, addr, size, + if (__request_region_locked(res, &iomem_resource, addr, size, name, 0)) break; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 5226cc26a095..4ca80df205ce 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6389,7 +6389,6 @@ int sched_setattr_nocheck(struct task_struct *p, const struct sched_attr *attr) { return __sched_setscheduler(p, attr, false, true); } -EXPORT_SYMBOL_GPL(sched_setattr_nocheck); /** * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace. 
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 9c882f20803e..c5aacbd492a1 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -885,6 +885,7 @@ static const struct seq_operations sched_debug_sops = { #define __PS(S, F) SEQ_printf(m, "%-45s:%21Ld\n", S, (long long)(F)) #define __P(F) __PS(#F, F) #define P(F) __PS(#F, p->F) +#define PM(F, M) __PS(#F, p->F & (M)) #define __PSN(S, F) SEQ_printf(m, "%-45s:%14Ld.%06ld\n", S, SPLIT_NS((long long)(F))) #define __PN(F) __PSN(#F, F) #define PN(F) __PSN(#F, p->F) @@ -1011,7 +1012,7 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, P(se.avg.util_avg); P(se.avg.last_update_time); P(se.avg.util_est.ewma); - P(se.avg.util_est.enqueued); + PM(se.avg.util_est.enqueued, ~UTIL_AVG_UNCHANGED); #endif #ifdef CONFIG_UCLAMP_TASK __PS("uclamp.min", p->uclamp_req[UCLAMP_MIN].value); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 20aa234ffe04..23663318fb81 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3298,6 +3298,52 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags) #ifdef CONFIG_SMP #ifdef CONFIG_FAIR_GROUP_SCHED +/* + * Because list_add_leaf_cfs_rq always places a child cfs_rq on the list + * immediately before a parent cfs_rq, and cfs_rqs are removed from the list + * bottom-up, we only have to test whether the cfs_rq before us on the list + * is our child. + * If cfs_rq is not on the list, test whether a child needs its to be added to + * connect a branch to the tree * (see list_add_leaf_cfs_rq() for details). 
+ */ +static inline bool child_cfs_rq_on_list(struct cfs_rq *cfs_rq) +{ + struct cfs_rq *prev_cfs_rq; + struct list_head *prev; + + if (cfs_rq->on_list) { + prev = cfs_rq->leaf_cfs_rq_list.prev; + } else { + struct rq *rq = rq_of(cfs_rq); + + prev = rq->tmp_alone_branch; + } + + prev_cfs_rq = container_of(prev, struct cfs_rq, leaf_cfs_rq_list); + + return (prev_cfs_rq->tg->parent == cfs_rq->tg); +} + +static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq) +{ + if (cfs_rq->load.weight) + return false; + + if (cfs_rq->avg.load_sum) + return false; + + if (cfs_rq->avg.util_sum) + return false; + + if (cfs_rq->avg.runnable_sum) + return false; + + if (child_cfs_rq_on_list(cfs_rq)) + return false; + + return true; +} + /** * update_tg_load_avg - update the tg's load avg * @cfs_rq: the cfs_rq whose avg changed @@ -3499,10 +3545,9 @@ update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cf static inline void update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq) { - long delta_avg, running_sum, runnable_sum = gcfs_rq->prop_runnable_sum; + long delta, running_sum, runnable_sum = gcfs_rq->prop_runnable_sum; unsigned long load_avg; u64 load_sum = 0; - s64 delta_sum; u32 divider; if (!runnable_sum) @@ -3549,13 +3594,13 @@ update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq load_sum = (s64)se_weight(se) * runnable_sum; load_avg = div_s64(load_sum, divider); - delta_sum = load_sum - (s64)se_weight(se) * se->avg.load_sum; - delta_avg = load_avg - se->avg.load_avg; + delta = load_avg - se->avg.load_avg; se->avg.load_sum = runnable_sum; se->avg.load_avg = load_avg; - add_positive(&cfs_rq->avg.load_avg, delta_avg); - add_positive(&cfs_rq->avg.load_sum, delta_sum); + + add_positive(&cfs_rq->avg.load_avg, delta); + cfs_rq->avg.load_sum = cfs_rq->avg.load_avg * divider; } static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum) @@ -3766,11 +3811,17 @@ static void 
attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s */ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { + /* + * cfs_rq->avg.period_contrib can be used for both cfs_rq and se. + * See ___update_load_avg() for details. + */ + u32 divider = get_pelt_divider(&cfs_rq->avg); + dequeue_load_avg(cfs_rq, se); sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg); - sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum); + cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * divider; sub_positive(&cfs_rq->avg.runnable_avg, se->avg.runnable_avg); - sub_positive(&cfs_rq->avg.runnable_sum, se->avg.runnable_sum); + cfs_rq->avg.runnable_sum = cfs_rq->avg.runnable_avg * divider; add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum); @@ -3902,7 +3953,7 @@ static inline unsigned long _task_util_est(struct task_struct *p) { struct util_est ue = READ_ONCE(p->se.avg.util_est); - return (max(ue.ewma, ue.enqueued) | UTIL_AVG_UNCHANGED); + return max(ue.ewma, (ue.enqueued & ~UTIL_AVG_UNCHANGED)); } static inline unsigned long task_util_est(struct task_struct *p) @@ -4002,7 +4053,7 @@ static inline void util_est_update(struct cfs_rq *cfs_rq, * Reset EWMA on utilization increases, the moving average is used only * to smooth utilization decreases. 
*/ - ue.enqueued = (task_util(p) | UTIL_AVG_UNCHANGED); + ue.enqueued = task_util(p); if (sched_feat(UTIL_EST_FASTUP)) { if (ue.ewma < ue.enqueued) { ue.ewma = ue.enqueued; @@ -4051,6 +4102,7 @@ static inline void util_est_update(struct cfs_rq *cfs_rq, ue.ewma += last_ewma_diff; ue.ewma >>= UTIL_EST_WEIGHT_SHIFT; done: + ue.enqueued |= UTIL_AVG_UNCHANGED; WRITE_ONCE(p->se.avg.util_est, ue); trace_sched_util_est_se_tp(&p->se); @@ -4085,6 +4137,11 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq) #else /* CONFIG_SMP */ +static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq) +{ + return true; +} + #define UPDATE_TG 0x0 #define SKIP_AGE_LOAD 0x0 #define DO_ATTACH 0x0 @@ -4743,8 +4800,8 @@ static int tg_unthrottle_up(struct task_group *tg, void *data) cfs_rq->throttled_clock_task_time += rq_clock_task(rq) - cfs_rq->throttled_clock_task; - /* Add cfs_rq with already running entity in the list */ - if (cfs_rq->nr_running >= 1) + /* Add cfs_rq with load or one or more already running entities to the list */ + if (!cfs_rq_is_decayed(cfs_rq) || cfs_rq->nr_running) list_add_leaf_cfs_rq(cfs_rq); } @@ -6217,7 +6274,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool } if (has_idle_core) - set_idle_cores(this, false); + set_idle_cores(target, false); if (sched_feat(SIS_PROP) && !has_idle_core) { time = cpu_clock(this) - time; @@ -7990,23 +8047,6 @@ static bool __update_blocked_others(struct rq *rq, bool *done) #ifdef CONFIG_FAIR_GROUP_SCHED -static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq) -{ - if (cfs_rq->load.weight) - return false; - - if (cfs_rq->avg.load_sum) - return false; - - if (cfs_rq->avg.util_sum) - return false; - - if (cfs_rq->avg.runnable_sum) - return false; - - return true; -} - static bool __update_blocked_fair(struct rq *rq, bool *done) { struct cfs_rq *cfs_rq, *pos; @@ -8030,7 +8070,7 @@ static bool __update_blocked_fair(struct rq *rq, bool *done) /* Propagate pending load changes 
to the parent, if any: */ se = cfs_rq->tg->se[cpu]; if (se && !skip_blocked_update(se)) - update_load_avg(cfs_rq_of(se), se, 0); + update_load_avg(cfs_rq_of(se), se, UPDATE_TG); /* * There can be a lot of idle CPU cgroups. Don't let fully diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h index 1462846d244e..cfe94ffd2b38 100644 --- a/kernel/sched/pelt.h +++ b/kernel/sched/pelt.h @@ -42,15 +42,6 @@ static inline u32 get_pelt_divider(struct sched_avg *avg) return LOAD_AVG_MAX - 1024 + avg->period_contrib; } -/* - * When a task is dequeued, its estimated utilization should not be update if - * its util_avg has not been updated at least once. - * This flag is used to synchronize util_avg updates with util_est updates. - * We map this information into the LSB bit of the utilization saved at - * dequeue time (i.e. util_est.dequeued). - */ -#define UTIL_AVG_UNCHANGED 0x1 - static inline void cfs_se_util_change(struct sched_avg *avg) { unsigned int enqueued; @@ -58,7 +49,7 @@ static inline void cfs_se_util_change(struct sched_avg *avg) if (!sched_feat(UTIL_EST)) return; - /* Avoid store if the flag has been already set */ + /* Avoid store if the flag has been already reset */ enqueued = avg->util_est.enqueued; if (!(enqueued & UTIL_AVG_UNCHANGED)) return; diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 6ecd3f3a52b5..9f58049ac16d 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -1105,28 +1105,30 @@ static int seccomp_do_user_notification(int this_syscall, up(&match->notif->request); wake_up_poll(&match->wqh, EPOLLIN | EPOLLRDNORM); - mutex_unlock(&match->notify_lock); /* * This is where we wait for a reply from userspace. 
*/ -wait: - err = wait_for_completion_interruptible(&n.ready); - mutex_lock(&match->notify_lock); - if (err == 0) { - /* Check if we were woken up by a addfd message */ + do { + mutex_unlock(&match->notify_lock); + err = wait_for_completion_interruptible(&n.ready); + mutex_lock(&match->notify_lock); + if (err != 0) + goto interrupted; + addfd = list_first_entry_or_null(&n.addfd, struct seccomp_kaddfd, list); - if (addfd && n.state != SECCOMP_NOTIFY_REPLIED) { + /* Check if we were woken up by a addfd message */ + if (addfd) seccomp_handle_addfd(addfd); - mutex_unlock(&match->notify_lock); - goto wait; - } - ret = n.val; - err = n.error; - flags = n.flags; - } + } while (n.state != SECCOMP_NOTIFY_REPLIED); + + ret = n.val; + err = n.error; + flags = n.flags; + +interrupted: /* If there were any pending addfd calls, clear them out */ list_for_each_entry_safe(addfd, tmp, &n.addfd, list) { /* The process went away before we got a chance to handle it */ diff --git a/kernel/signal.c b/kernel/signal.c index 66e88649cf74..30a0bee5ff9b 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -431,16 +431,7 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t gfp_flags, rcu_read_unlock(); if (override_rlimit || likely(sigpending <= task_rlimit(t, RLIMIT_SIGPENDING))) { - /* - * Preallocation does not hold sighand::siglock so it can't - * use the cache. The lockless caching requires that only - * one consumer and only one producer run at a time. 
- */ - q = READ_ONCE(t->sigqueue_cache); - if (!q || sigqueue_flags) - q = kmem_cache_alloc(sigqueue_cachep, gfp_flags); - else - WRITE_ONCE(t->sigqueue_cache, NULL); + q = kmem_cache_alloc(sigqueue_cachep, gfp_flags); } else { print_dropped_signal(sig); } @@ -457,44 +448,13 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t gfp_flags, return q; } -void exit_task_sigqueue_cache(struct task_struct *tsk) -{ - /* Race free because @tsk is mopped up */ - struct sigqueue *q = tsk->sigqueue_cache; - - if (q) { - tsk->sigqueue_cache = NULL; - /* - * Hand it back to the cache as the task might - * be self reaping which would leak the object. - */ - kmem_cache_free(sigqueue_cachep, q); - } -} - -static void sigqueue_cache_or_free(struct sigqueue *q) -{ - /* - * Cache one sigqueue per task. This pairs with the consumer side - * in __sigqueue_alloc() and needs READ/WRITE_ONCE() to prevent the - * compiler from store tearing and to tell KCSAN that the data race - * is intentional when run without holding current->sighand->siglock, - * which is fine as current obviously cannot run __sigqueue_free() - * concurrently. 
- */ - if (!READ_ONCE(current->sigqueue_cache)) - WRITE_ONCE(current->sigqueue_cache, q); - else - kmem_cache_free(sigqueue_cachep, q); -} - static void __sigqueue_free(struct sigqueue *q) { if (q->flags & SIGQUEUE_PREALLOC) return; if (atomic_dec_and_test(&q->user->sigpending)) free_uid(q->user); - sigqueue_cache_or_free(q); + kmem_cache_free(sigqueue_cachep, q); } void flush_sigqueue(struct sigpending *queue) @@ -1236,6 +1196,7 @@ static inline bool has_si_pid_and_uid(struct kernel_siginfo *info) case SIL_TIMER: case SIL_POLL: case SIL_FAULT: + case SIL_FAULT_TRAPNO: case SIL_FAULT_MCEERR: case SIL_FAULT_BNDERR: case SIL_FAULT_PKUERR: @@ -1804,6 +1765,21 @@ int force_sig_pkuerr(void __user *addr, u32 pkey) } #endif +int force_sig_perf(void __user *addr, u32 type, u64 sig_data) +{ + struct kernel_siginfo info; + + clear_siginfo(&info); + info.si_signo = SIGTRAP; + info.si_errno = 0; + info.si_code = TRAP_PERF; + info.si_addr = addr; + info.si_perf_data = sig_data; + info.si_perf_type = type; + + return force_sig_info(&info); +} + /* For the crazy architectures that include trap information in * the errno field, instead of an actual errno value. 
*/ @@ -2564,6 +2540,7 @@ static void hide_si_addr_tag_bits(struct ksignal *ksig) { switch (siginfo_layout(ksig->sig, ksig->info.si_code)) { case SIL_FAULT: + case SIL_FAULT_TRAPNO: case SIL_FAULT_MCEERR: case SIL_FAULT_BNDERR: case SIL_FAULT_PKUERR: @@ -3251,6 +3228,10 @@ enum siginfo_layout siginfo_layout(unsigned sig, int si_code) #endif else if ((sig == SIGTRAP) && (si_code == TRAP_PERF)) layout = SIL_PERF_EVENT; +#ifdef __ARCH_SI_TRAPNO + else if (layout == SIL_FAULT) + layout = SIL_FAULT_TRAPNO; +#endif } else if (si_code <= NSIGPOLL) layout = SIL_POLL; @@ -3354,35 +3335,28 @@ void copy_siginfo_to_external32(struct compat_siginfo *to, break; case SIL_FAULT: to->si_addr = ptr_to_compat(from->si_addr); -#ifdef __ARCH_SI_TRAPNO + break; + case SIL_FAULT_TRAPNO: + to->si_addr = ptr_to_compat(from->si_addr); to->si_trapno = from->si_trapno; -#endif break; case SIL_FAULT_MCEERR: to->si_addr = ptr_to_compat(from->si_addr); -#ifdef __ARCH_SI_TRAPNO - to->si_trapno = from->si_trapno; -#endif to->si_addr_lsb = from->si_addr_lsb; break; case SIL_FAULT_BNDERR: to->si_addr = ptr_to_compat(from->si_addr); -#ifdef __ARCH_SI_TRAPNO - to->si_trapno = from->si_trapno; -#endif to->si_lower = ptr_to_compat(from->si_lower); to->si_upper = ptr_to_compat(from->si_upper); break; case SIL_FAULT_PKUERR: to->si_addr = ptr_to_compat(from->si_addr); -#ifdef __ARCH_SI_TRAPNO - to->si_trapno = from->si_trapno; -#endif to->si_pkey = from->si_pkey; break; case SIL_PERF_EVENT: to->si_addr = ptr_to_compat(from->si_addr); - to->si_perf = from->si_perf; + to->si_perf_data = from->si_perf_data; + to->si_perf_type = from->si_perf_type; break; case SIL_CHLD: to->si_pid = from->si_pid; @@ -3438,35 +3412,28 @@ static int post_copy_siginfo_from_user32(kernel_siginfo_t *to, break; case SIL_FAULT: to->si_addr = compat_ptr(from->si_addr); -#ifdef __ARCH_SI_TRAPNO + break; + case SIL_FAULT_TRAPNO: + to->si_addr = compat_ptr(from->si_addr); to->si_trapno = from->si_trapno; -#endif break; case 
SIL_FAULT_MCEERR: to->si_addr = compat_ptr(from->si_addr); -#ifdef __ARCH_SI_TRAPNO - to->si_trapno = from->si_trapno; -#endif to->si_addr_lsb = from->si_addr_lsb; break; case SIL_FAULT_BNDERR: to->si_addr = compat_ptr(from->si_addr); -#ifdef __ARCH_SI_TRAPNO - to->si_trapno = from->si_trapno; -#endif to->si_lower = compat_ptr(from->si_lower); to->si_upper = compat_ptr(from->si_upper); break; case SIL_FAULT_PKUERR: to->si_addr = compat_ptr(from->si_addr); -#ifdef __ARCH_SI_TRAPNO - to->si_trapno = from->si_trapno; -#endif to->si_pkey = from->si_pkey; break; case SIL_PERF_EVENT: to->si_addr = compat_ptr(from->si_addr); - to->si_perf = from->si_perf; + to->si_perf_data = from->si_perf_data; + to->si_perf_type = from->si_perf_type; break; case SIL_CHLD: to->si_pid = from->si_pid; @@ -4644,11 +4611,13 @@ static inline void siginfo_buildtime_checks(void) /* sigfault */ CHECK_OFFSET(si_addr); + CHECK_OFFSET(si_trapno); CHECK_OFFSET(si_addr_lsb); CHECK_OFFSET(si_lower); CHECK_OFFSET(si_upper); CHECK_OFFSET(si_pkey); - CHECK_OFFSET(si_perf); + CHECK_OFFSET(si_perf_data); + CHECK_OFFSET(si_perf_type); /* sigpoll */ CHECK_OFFSET(si_band); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 14edf84cc571..d4a78e08f6d8 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -225,7 +225,27 @@ static int bpf_stats_handler(struct ctl_table *table, int write, mutex_unlock(&bpf_stats_enabled_mutex); return ret; } -#endif + +static int bpf_unpriv_handler(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + int ret, unpriv_enable = *(int *)table->data; + bool locked_state = unpriv_enable == 1; + struct ctl_table tmp = *table; + + if (write && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + tmp.data = &unpriv_enable; + ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); + if (write && !ret) { + if (locked_state && unpriv_enable != 1) + return -EPERM; + *(int *)table->data = unpriv_enable; + } + return ret; +} +#endif /* CONFIG_BPF_SYSCALL && 
CONFIG_SYSCTL */ /* * /proc/sys support @@ -2600,10 +2620,9 @@ static struct ctl_table kern_table[] = { .data = &sysctl_unprivileged_bpf_disabled, .maxlen = sizeof(sysctl_unprivileged_bpf_disabled), .mode = 0644, - /* only handle a transition from default "0" to "1" */ - .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ONE, - .extra2 = SYSCTL_ONE, + .proc_handler = bpf_unpriv_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = &two, }, { .procname = "bpf_stats_enabled", diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index bea9d08b1698..5897828b9d7e 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -92,7 +92,7 @@ static int alarmtimer_rtc_add_device(struct device *dev, if (rtcdev) return -EBUSY; - if (!rtc->ops->set_alarm) + if (!test_bit(RTC_FEATURE_ALARM, rtc->features)) return -1; if (!device_may_wakeup(rtc->dev.parent)) return -1; diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 828b091501ca..6784f27a3099 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -230,6 +230,7 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs) #ifdef CONFIG_NO_HZ_FULL cpumask_var_t tick_nohz_full_mask; +EXPORT_SYMBOL_GPL(tick_nohz_full_mask); bool tick_nohz_full_running; EXPORT_SYMBOL_GPL(tick_nohz_full_running); static atomic_t tick_dep_mask; diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index d2d7cf6cfe83..7a52bc172841 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -215,16 +215,11 @@ const struct bpf_func_proto bpf_probe_read_user_str_proto = { static __always_inline int bpf_probe_read_kernel_common(void *dst, u32 size, const void *unsafe_ptr) { - int ret = security_locked_down(LOCKDOWN_BPF_READ); + int ret; - if (unlikely(ret < 0)) - goto fail; ret = copy_from_kernel_nofault(dst, unsafe_ptr, size); if (unlikely(ret < 0)) - goto fail; - return ret; -fail: - memset(dst, 0, size); + memset(dst, 0, size); return ret; } @@ -246,10 
+241,7 @@ const struct bpf_func_proto bpf_probe_read_kernel_proto = { static __always_inline int bpf_probe_read_kernel_str_common(void *dst, u32 size, const void *unsafe_ptr) { - int ret = security_locked_down(LOCKDOWN_BPF_READ); - - if (unlikely(ret < 0)) - goto fail; + int ret; /* * The strncpy_from_kernel_nofault() call will likely not fill the @@ -262,11 +254,7 @@ bpf_probe_read_kernel_str_common(void *dst, u32 size, const void *unsafe_ptr) */ ret = strncpy_from_kernel_nofault(dst, unsafe_ptr, size); if (unlikely(ret < 0)) - goto fail; - - return ret; -fail: - memset(dst, 0, size); + memset(dst, 0, size); return ret; } @@ -1011,16 +999,20 @@ bpf_tracing_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_probe_read_user: return &bpf_probe_read_user_proto; case BPF_FUNC_probe_read_kernel: - return &bpf_probe_read_kernel_proto; + return security_locked_down(LOCKDOWN_BPF_READ) < 0 ? + NULL : &bpf_probe_read_kernel_proto; case BPF_FUNC_probe_read_user_str: return &bpf_probe_read_user_str_proto; case BPF_FUNC_probe_read_kernel_str: - return &bpf_probe_read_kernel_str_proto; + return security_locked_down(LOCKDOWN_BPF_READ) < 0 ? + NULL : &bpf_probe_read_kernel_str_proto; #ifdef CONFIG_ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE case BPF_FUNC_probe_read: - return &bpf_probe_read_compat_proto; + return security_locked_down(LOCKDOWN_BPF_READ) < 0 ? + NULL : &bpf_probe_read_compat_proto; case BPF_FUNC_probe_read_str: - return &bpf_probe_read_compat_str_proto; + return security_locked_down(LOCKDOWN_BPF_READ) < 0 ? 
+ NULL : &bpf_probe_read_compat_str_proto; #endif #ifdef CONFIG_CGROUPS case BPF_FUNC_get_current_cgroup_id: diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 2e8a3fde7104..72ef4dccbcc4 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1967,12 +1967,18 @@ static int ftrace_hash_ipmodify_update(struct ftrace_ops *ops, static void print_ip_ins(const char *fmt, const unsigned char *p) { + char ins[MCOUNT_INSN_SIZE]; int i; + if (copy_from_kernel_nofault(ins, p, MCOUNT_INSN_SIZE)) { + printk(KERN_CONT "%s[FAULT] %px\n", fmt, p); + return; + } + printk(KERN_CONT "%s", fmt); for (i = 0; i < MCOUNT_INSN_SIZE; i++) - printk(KERN_CONT "%s%02x", i ? ":" : "", p[i]); + printk(KERN_CONT "%s%02x", i ? ":" : "", ins[i]); } enum ftrace_bug_type ftrace_bug_type; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 560e4c8d3825..d23a09d3eb37 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2198,9 +2198,6 @@ struct saved_cmdlines_buffer { }; static struct saved_cmdlines_buffer *savedcmd; -/* temporary disable recording */ -static atomic_t trace_record_taskinfo_disabled __read_mostly; - static inline char *get_saved_cmdlines(int idx) { return &savedcmd->saved_cmdlines[idx * TASK_COMM_LEN]; @@ -2486,8 +2483,6 @@ static bool tracing_record_taskinfo_skip(int flags) { if (unlikely(!(flags & (TRACE_RECORD_CMDLINE | TRACE_RECORD_TGID)))) return true; - if (atomic_read(&trace_record_taskinfo_disabled) || !tracing_is_on()) - return true; if (!__this_cpu_read(trace_taskinfo_save)) return true; return false; @@ -2736,7 +2731,7 @@ trace_event_buffer_lock_reserve(struct trace_buffer **current_rb, (entry = this_cpu_read(trace_buffered_event))) { /* Try to use the per cpu buffer first */ val = this_cpu_inc_return(trace_buffered_event_cnt); - if ((len < (PAGE_SIZE - sizeof(*entry))) && val == 1) { + if ((len < (PAGE_SIZE - sizeof(*entry) - sizeof(entry->array[0]))) && val == 1) { trace_event_setup(entry, type, trace_ctx); entry->array[0] = 
len; return entry; @@ -3704,6 +3699,9 @@ void trace_check_vprintf(struct trace_iterator *iter, const char *fmt, goto print; while (*p) { + bool star = false; + int len = 0; + j = 0; /* We only care about %s and variants */ @@ -3725,13 +3723,17 @@ void trace_check_vprintf(struct trace_iterator *iter, const char *fmt, /* Need to test cases like %08.*s */ for (j = 1; p[i+j]; j++) { if (isdigit(p[i+j]) || - p[i+j] == '*' || p[i+j] == '.') continue; + if (p[i+j] == '*') { + star = true; + continue; + } break; } if (p[i+j] == 's') break; + star = false; } j = 0; } @@ -3744,6 +3746,9 @@ void trace_check_vprintf(struct trace_iterator *iter, const char *fmt, iter->fmt[i] = '\0'; trace_seq_vprintf(&iter->seq, iter->fmt, ap); + if (star) + len = va_arg(ap, int); + /* The ap now points to the string data of the %s */ str = va_arg(ap, const char *); @@ -3762,8 +3767,18 @@ void trace_check_vprintf(struct trace_iterator *iter, const char *fmt, int ret; /* Try to safely read the string */ - ret = strncpy_from_kernel_nofault(iter->fmt, str, - iter->fmt_size); + if (star) { + if (len + 1 > iter->fmt_size) + len = iter->fmt_size - 1; + if (len < 0) + len = 0; + ret = copy_from_kernel_nofault(iter->fmt, str, len); + iter->fmt[len] = 0; + star = false; + } else { + ret = strncpy_from_kernel_nofault(iter->fmt, str, + iter->fmt_size); + } if (ret < 0) trace_seq_printf(&iter->seq, "(0x%px)", str); else @@ -3775,7 +3790,10 @@ void trace_check_vprintf(struct trace_iterator *iter, const char *fmt, strncpy(iter->fmt, p + i, j + 1); iter->fmt[j+1] = '\0'; } - trace_seq_printf(&iter->seq, iter->fmt, str); + if (star) + trace_seq_printf(&iter->seq, iter->fmt, len, str); + else + trace_seq_printf(&iter->seq, iter->fmt, str); p += i + j + 1; } @@ -3975,9 +3993,6 @@ static void *s_start(struct seq_file *m, loff_t *pos) return ERR_PTR(-EBUSY); #endif - if (!iter->snapshot) - atomic_inc(&trace_record_taskinfo_disabled); - if (*pos != iter->pos) { iter->ent = NULL; iter->cpu = 0; @@ -4020,9 +4035,6 @@ 
static void s_stop(struct seq_file *m, void *p) return; #endif - if (!iter->snapshot) - atomic_dec(&trace_record_taskinfo_disabled); - trace_access_unlock(iter->cpu_file); trace_event_read_unlock(); } diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c index c1637f90c8a3..4702efb00ff2 100644 --- a/kernel/trace/trace_clock.c +++ b/kernel/trace/trace_clock.c @@ -115,9 +115,9 @@ u64 notrace trace_clock_global(void) prev_time = READ_ONCE(trace_clock_struct.prev_time); now = sched_clock_cpu(this_cpu); - /* Make sure that now is always greater than prev_time */ + /* Make sure that now is always greater than or equal to prev_time */ if ((s64)(now - prev_time) < 0) - now = prev_time + 1; + now = prev_time; /* * If in an NMI context then dont risk lockups and simply return @@ -131,7 +131,7 @@ u64 notrace trace_clock_global(void) /* Reread prev_time in case it was already updated */ prev_time = READ_ONCE(trace_clock_struct.prev_time); if ((s64)(now - prev_time) < 0) - now = prev_time + 1; + now = prev_time; trace_clock_struct.prev_time = now; diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 7c397907d0e9..92d3bcc5a5e0 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -302,10 +302,10 @@ void touch_softlockup_watchdog_sync(void) __this_cpu_write(watchdog_report_ts, SOFTLOCKUP_DELAY_REPORT); } -static int is_softlockup(unsigned long touch_ts, unsigned long period_ts) +static int is_softlockup(unsigned long touch_ts, + unsigned long period_ts, + unsigned long now) { - unsigned long now = get_timestamp(); - if ((watchdog_enabled & SOFT_WATCHDOG_ENABLED) && watchdog_thresh){ /* Warn about unreasonable delays. 
*/ + if (time_after(now, period_ts + get_softlockup_thresh())) @@ -353,8 +353,7 @@ static int softlockup_fn(void *data) /* watchdog kicker functions */ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) { - unsigned long touch_ts = __this_cpu_read(watchdog_touch_ts); - unsigned long period_ts = __this_cpu_read(watchdog_report_ts); + unsigned long touch_ts, period_ts, now; struct pt_regs *regs = get_irq_regs(); int duration; int softlockup_all_cpu_backtrace = sysctl_softlockup_all_cpu_backtrace; @@ -377,11 +376,22 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) hrtimer_forward_now(hrtimer, ns_to_ktime(sample_period)); /* + * Read the current timestamp first. It might become invalid anytime + * when a virtual machine is stopped by the host or when the watchdog + * is touched from NMI. + */ + now = get_timestamp(); + /* * If a virtual machine is stopped by the host it can look to - * the watchdog like a soft lockup. Check to see if the host - * stopped the vm before we process the timestamps. + * the watchdog like a soft lockup. This function touches the watchdog. */ kvm_check_and_clear_guest_paused(); + /* + * The stored timestamp is comparable with @now only when not touched. + * It might get touched anytime from NMI. Make sure that is_softlockup() + * uses the same (valid) value. + */ + period_ts = READ_ONCE(*this_cpu_ptr(&watchdog_report_ts)); /* Reset the interval when touched by known problematic code. */ if (period_ts == SOFTLOCKUP_DELAY_REPORT) { @@ -398,13 +408,9 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) return HRTIMER_RESTART; } - /* check for a softlockup - * This is done by making sure a high priority task is - * being scheduled. The task touches the watchdog to - * indicate it is getting cpu time. If it hasn't then - * this is a good indication some task is hogging the cpu - */ - duration = is_softlockup(touch_ts, period_ts); + /* Check for a softlockup. 
*/ + touch_ts = __this_cpu_read(watchdog_touch_ts); + duration = is_softlockup(touch_ts, period_ts, now); if (unlikely(duration)) { /* * Prevent multiple soft-lockup reports if one cpu is already diff --git a/kernel/workqueue.c b/kernel/workqueue.c index b19d759e55a5..50142fc08902 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -50,6 +50,7 @@ #include <linux/uaccess.h> #include <linux/sched/isolation.h> #include <linux/nmi.h> +#include <linux/kvm_para.h> #include "workqueue_internal.h" @@ -5772,6 +5773,7 @@ static void wq_watchdog_timer_fn(struct timer_list *unused) { unsigned long thresh = READ_ONCE(wq_watchdog_thresh) * HZ; bool lockup_detected = false; + unsigned long now = jiffies; struct worker_pool *pool; int pi; @@ -5786,6 +5788,12 @@ static void wq_watchdog_timer_fn(struct timer_list *unused) if (list_empty(&pool->worklist)) continue; + /* + * If a virtual machine is stopped by the host it can look to + * the watchdog like a stall. + */ + kvm_check_and_clear_guest_paused(); + /* get the latest of pool and touched timestamps */ if (pool->cpu >= 0) touched = READ_ONCE(per_cpu(wq_watchdog_touched_cpu, pool->cpu)); @@ -5799,12 +5807,12 @@ static void wq_watchdog_timer_fn(struct timer_list *unused) ts = touched; /* did we stall? */ - if (time_after(jiffies, ts + thresh)) { + if (time_after(now, ts + thresh)) { lockup_detected = true; pr_emerg("BUG: workqueue lockup - pool"); pr_cont_pool_info(pool); pr_cont(" stuck for %us!\n", - jiffies_to_msecs(jiffies - pool_ts) / 1000); + jiffies_to_msecs(now - pool_ts) / 1000); } } |