aboutsummaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
authorJakub Kicinski2023-11-23 12:19:49 -0800
committerJakub Kicinski2023-11-23 12:20:58 -0800
commit45c226dde742a92e22dcd65b96bf7e02620a9c19 (patch)
treeabaedb7f2ddf75914659c7b9a48af34ca89a9208 /kernel
parentc5b9f4792ea6b9abfcfb9486ba256f55e296aaa7 (diff)
parentd3fa86b1a7b4cdc4367acacea16b72e0a200b3d7 (diff)
Merge git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net
Cross-merge networking fixes after downstream PR. Conflicts: drivers/net/ethernet/intel/ice/ice_main.c c9663f79cd82 ("ice: adjust switchdev rebuild path") 7758017911a4 ("ice: restore timestamp configuration after device reset") https://lore.kernel.org/all/20231121211259.3348630-1-anthony.l.nguyen@intel.com/ Adjacent changes: kernel/bpf/verifier.c bb124da69c47 ("bpf: keep track of max number of bpf_loop callback iterations") 5f99f312bd3b ("bpf: add register bounds sanity checks and sanitization") Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Diffstat (limited to 'kernel')
-rw-r--r--kernel/audit_watch.c2
-rw-r--r--kernel/bpf/verifier.c402
-rw-r--r--kernel/cgroup/cgroup.c12
-rw-r--r--kernel/cpu.c8
-rw-r--r--kernel/events/core.c17
-rw-r--r--kernel/futex/core.c9
-rw-r--r--kernel/sched/fair.c161
-rw-r--r--kernel/sys.c4
-rw-r--r--kernel/time/hrtimer.c33
9 files changed, 448 insertions, 200 deletions
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index 91e82e34b51e..7a98cd176a12 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -531,7 +531,7 @@ int audit_exe_compare(struct task_struct *tsk, struct audit_fsnotify_mark *mark)
if (tsk != current)
return 0;
- if (WARN_ON_ONCE(!current->mm))
+ if (!current->mm)
return 0;
exe_file = get_mm_exe_file(current->mm);
if (!exe_file)
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 1340921ea311..405da1f9e724 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -465,13 +465,12 @@ static bool is_dynptr_ref_function(enum bpf_func_id func_id)
return func_id == BPF_FUNC_dynptr_data;
}
-static bool is_callback_calling_kfunc(u32 btf_id);
+static bool is_sync_callback_calling_kfunc(u32 btf_id);
static bool is_bpf_throw_kfunc(struct bpf_insn *insn);
-static bool is_callback_calling_function(enum bpf_func_id func_id)
+static bool is_sync_callback_calling_function(enum bpf_func_id func_id)
{
return func_id == BPF_FUNC_for_each_map_elem ||
- func_id == BPF_FUNC_timer_set_callback ||
func_id == BPF_FUNC_find_vma ||
func_id == BPF_FUNC_loop ||
func_id == BPF_FUNC_user_ringbuf_drain;
@@ -482,6 +481,18 @@ static bool is_async_callback_calling_function(enum bpf_func_id func_id)
return func_id == BPF_FUNC_timer_set_callback;
}
+static bool is_callback_calling_function(enum bpf_func_id func_id)
+{
+ return is_sync_callback_calling_function(func_id) ||
+ is_async_callback_calling_function(func_id);
+}
+
+static bool is_sync_callback_calling_insn(struct bpf_insn *insn)
+{
+ return (bpf_helper_call(insn) && is_sync_callback_calling_function(insn->imm)) ||
+ (bpf_pseudo_kfunc_call(insn) && is_sync_callback_calling_kfunc(insn->imm));
+}
+
static bool is_storage_get_function(enum bpf_func_id func_id)
{
return func_id == BPF_FUNC_sk_storage_get ||
@@ -1348,6 +1359,7 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state,
dst_state->first_insn_idx = src->first_insn_idx;
dst_state->last_insn_idx = src->last_insn_idx;
dst_state->dfs_depth = src->dfs_depth;
+ dst_state->callback_unroll_depth = src->callback_unroll_depth;
dst_state->used_as_loop_entry = src->used_as_loop_entry;
for (i = 0; i <= src->curframe; i++) {
dst = dst_state->frame[i];
@@ -3131,13 +3143,11 @@ static void mark_insn_zext(struct bpf_verifier_env *env,
reg->subreg_def = DEF_NOT_SUBREG;
}
-static int check_reg_arg(struct bpf_verifier_env *env, u32 regno,
- enum reg_arg_type t)
+static int __check_reg_arg(struct bpf_verifier_env *env, struct bpf_reg_state *regs, u32 regno,
+ enum reg_arg_type t)
{
- struct bpf_verifier_state *vstate = env->cur_state;
- struct bpf_func_state *state = vstate->frame[vstate->curframe];
struct bpf_insn *insn = env->prog->insnsi + env->insn_idx;
- struct bpf_reg_state *reg, *regs = state->regs;
+ struct bpf_reg_state *reg;
bool rw64;
if (regno >= MAX_BPF_REG) {
@@ -3178,6 +3188,15 @@ static int check_reg_arg(struct bpf_verifier_env *env, u32 regno,
return 0;
}
+static int check_reg_arg(struct bpf_verifier_env *env, u32 regno,
+ enum reg_arg_type t)
+{
+ struct bpf_verifier_state *vstate = env->cur_state;
+ struct bpf_func_state *state = vstate->frame[vstate->curframe];
+
+ return __check_reg_arg(env, state->regs, regno, t);
+}
+
static void mark_jmp_point(struct bpf_verifier_env *env, int idx)
{
env->insn_aux_data[idx].jmp_point = true;
@@ -3416,6 +3435,8 @@ static void fmt_stack_mask(char *buf, ssize_t buf_sz, u64 stack_mask)
}
}
+static bool calls_callback(struct bpf_verifier_env *env, int insn_idx);
+
/* For given verifier state backtrack_insn() is called from the last insn to
* the first insn. Its purpose is to compute a bitmask of registers and
* stack slots that needs precision in the parent verifier state.
@@ -3591,16 +3612,13 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx,
return -EFAULT;
return 0;
}
- } else if ((bpf_helper_call(insn) &&
- is_callback_calling_function(insn->imm) &&
- !is_async_callback_calling_function(insn->imm)) ||
- (bpf_pseudo_kfunc_call(insn) && is_callback_calling_kfunc(insn->imm))) {
- /* callback-calling helper or kfunc call, which means
- * we are exiting from subprog, but unlike the subprog
- * call handling above, we shouldn't propagate
- * precision of r1-r5 (if any requested), as they are
- * not actually arguments passed directly to callback
- * subprogs
+ } else if (is_sync_callback_calling_insn(insn) && idx != subseq_idx - 1) {
+ /* exit from callback subprog to callback-calling helper or
+ * kfunc call. Use idx/subseq_idx check to discern it from
+ * straight line code backtracking.
+ * Unlike the subprog call handling above, we shouldn't
+ * propagate precision of r1-r5 (if any requested), as they are
+ * not actually arguments passed directly to callback subprogs
*/
if (bt_reg_mask(bt) & ~BPF_REGMASK_ARGS) {
verbose(env, "BUG regs %x\n", bt_reg_mask(bt));
@@ -3635,10 +3653,18 @@ static int backtrack_insn(struct bpf_verifier_env *env, int idx, int subseq_idx,
} else if (opcode == BPF_EXIT) {
bool r0_precise;
+ /* Backtracking to a nested function call, 'idx' is a part of
+ * the inner frame 'subseq_idx' is a part of the outer frame.
+ * In case of a regular function call, instructions giving
+ * precision to registers R1-R5 should have been found already.
+ * In case of a callback, it is ok to have R1-R5 marked for
+ * backtracking, as these registers are set by the function
+ * invoking callback.
+ */
+ if (subseq_idx >= 0 && calls_callback(env, subseq_idx))
+ for (i = BPF_REG_1; i <= BPF_REG_5; i++)
+ bt_clear_reg(bt, i);
if (bt_reg_mask(bt) & BPF_REGMASK_ARGS) {
- /* if backtracing was looking for registers R1-R5
- * they should have been found already.
- */
verbose(env, "BUG regs %x\n", bt_reg_mask(bt));
WARN_ONCE(1, "verifier backtracking bug");
return -EFAULT;
@@ -9090,7 +9116,7 @@ static void clear_caller_saved_regs(struct bpf_verifier_env *env,
/* after the call registers r0 - r5 were scratched */
for (i = 0; i < CALLER_SAVED_REGS; i++) {
mark_reg_not_init(env, regs, caller_saved[i]);
- check_reg_arg(env, caller_saved[i], DST_OP_NO_MARK);
+ __check_reg_arg(env, regs, caller_saved[i], DST_OP_NO_MARK);
}
}
@@ -9103,11 +9129,10 @@ static int set_callee_state(struct bpf_verifier_env *env,
struct bpf_func_state *caller,
struct bpf_func_state *callee, int insn_idx);
-static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
- int *insn_idx, int subprog,
- set_callee_state_fn set_callee_state_cb)
+static int setup_func_entry(struct bpf_verifier_env *env, int subprog, int callsite,
+ set_callee_state_fn set_callee_state_cb,
+ struct bpf_verifier_state *state)
{
- struct bpf_verifier_state *state = env->cur_state;
struct bpf_func_state *caller, *callee;
int err;
@@ -9117,54 +9142,72 @@ static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn
return -E2BIG;
}
- caller = state->frame[state->curframe];
if (state->frame[state->curframe + 1]) {
verbose(env, "verifier bug. Frame %d already allocated\n",
state->curframe + 1);
return -EFAULT;
}
+ caller = state->frame[state->curframe];
+ callee = kzalloc(sizeof(*callee), GFP_KERNEL);
+ if (!callee)
+ return -ENOMEM;
+ state->frame[state->curframe + 1] = callee;
+
+ /* callee cannot access r0, r6 - r9 for reading and has to write
+ * into its own stack before reading from it.
+ * callee can read/write into caller's stack
+ */
+ init_func_state(env, callee,
+ /* remember the callsite, it will be used by bpf_exit */
+ callsite,
+ state->curframe + 1 /* frameno within this callchain */,
+ subprog /* subprog number within this prog */);
+ /* Transfer references to the callee */
+ err = copy_reference_state(callee, caller);
+ err = err ?: set_callee_state_cb(env, caller, callee, callsite);
+ if (err)
+ goto err_out;
+
+ /* only increment it after check_reg_arg() finished */
+ state->curframe++;
+
+ return 0;
+
+err_out:
+ free_func_state(callee);
+ state->frame[state->curframe + 1] = NULL;
+ return err;
+}
+
+static int push_callback_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
+ int insn_idx, int subprog,
+ set_callee_state_fn set_callee_state_cb)
+{
+ struct bpf_verifier_state *state = env->cur_state, *callback_state;
+ struct bpf_func_state *caller, *callee;
+ int err;
+
+ caller = state->frame[state->curframe];
err = btf_check_subprog_call(env, subprog, caller->regs);
if (err == -EFAULT)
return err;
- if (subprog_is_global(env, subprog)) {
- if (err) {
- verbose(env, "Caller passes invalid args into func#%d\n",
- subprog);
- return err;
- } else {
- if (env->log.level & BPF_LOG_LEVEL)
- verbose(env,
- "Func#%d is global and valid. Skipping.\n",
- subprog);
- clear_caller_saved_regs(env, caller->regs);
-
- /* All global functions return a 64-bit SCALAR_VALUE */
- mark_reg_unknown(env, caller->regs, BPF_REG_0);
- caller->regs[BPF_REG_0].subreg_def = DEF_NOT_SUBREG;
-
- /* continue with next insn after call */
- return 0;
- }
- }
/* set_callee_state is used for direct subprog calls, but we are
* interested in validating only BPF helpers that can call subprogs as
* callbacks
*/
- if (set_callee_state_cb != set_callee_state) {
- env->subprog_info[subprog].is_cb = true;
- if (bpf_pseudo_kfunc_call(insn) &&
- !is_callback_calling_kfunc(insn->imm)) {
- verbose(env, "verifier bug: kfunc %s#%d not marked as callback-calling\n",
- func_id_name(insn->imm), insn->imm);
- return -EFAULT;
- } else if (!bpf_pseudo_kfunc_call(insn) &&
- !is_callback_calling_function(insn->imm)) { /* helper */
- verbose(env, "verifier bug: helper %s#%d not marked as callback-calling\n",
- func_id_name(insn->imm), insn->imm);
- return -EFAULT;
- }
+ env->subprog_info[subprog].is_cb = true;
+ if (bpf_pseudo_kfunc_call(insn) &&
+ !is_sync_callback_calling_kfunc(insn->imm)) {
+ verbose(env, "verifier bug: kfunc %s#%d not marked as callback-calling\n",
+ func_id_name(insn->imm), insn->imm);
+ return -EFAULT;
+ } else if (!bpf_pseudo_kfunc_call(insn) &&
+ !is_callback_calling_function(insn->imm)) { /* helper */
+ verbose(env, "verifier bug: helper %s#%d not marked as callback-calling\n",
+ func_id_name(insn->imm), insn->imm);
+ return -EFAULT;
}
if (insn->code == (BPF_JMP | BPF_CALL) &&
@@ -9175,53 +9218,83 @@ static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn
/* there is no real recursion here. timer callbacks are async */
env->subprog_info[subprog].is_async_cb = true;
async_cb = push_async_cb(env, env->subprog_info[subprog].start,
- *insn_idx, subprog);
+ insn_idx, subprog);
if (!async_cb)
return -EFAULT;
callee = async_cb->frame[0];
callee->async_entry_cnt = caller->async_entry_cnt + 1;
/* Convert bpf_timer_set_callback() args into timer callback args */
- err = set_callee_state_cb(env, caller, callee, *insn_idx);
+ err = set_callee_state_cb(env, caller, callee, insn_idx);
if (err)
return err;
+ return 0;
+ }
+
+ /* for callback functions enqueue entry to callback and
+ * proceed with next instruction within current frame.
+ */
+ callback_state = push_stack(env, env->subprog_info[subprog].start, insn_idx, false);
+ if (!callback_state)
+ return -ENOMEM;
+
+ err = setup_func_entry(env, subprog, insn_idx, set_callee_state_cb,
+ callback_state);
+ if (err)
+ return err;
+
+ callback_state->callback_unroll_depth++;
+ callback_state->frame[callback_state->curframe - 1]->callback_depth++;
+ caller->callback_depth = 0;
+ return 0;
+}
+
+static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
+ int *insn_idx)
+{
+ struct bpf_verifier_state *state = env->cur_state;
+ struct bpf_func_state *caller;
+ int err, subprog, target_insn;
+
+ target_insn = *insn_idx + insn->imm + 1;
+ subprog = find_subprog(env, target_insn);
+ if (subprog < 0) {
+ verbose(env, "verifier bug. No program starts at insn %d\n", target_insn);
+ return -EFAULT;
+ }
+
+ caller = state->frame[state->curframe];
+ err = btf_check_subprog_call(env, subprog, caller->regs);
+ if (err == -EFAULT)
+ return err;
+ if (subprog_is_global(env, subprog)) {
+ if (err) {
+ verbose(env, "Caller passes invalid args into func#%d\n", subprog);
+ return err;
+ }
+
+ if (env->log.level & BPF_LOG_LEVEL)
+ verbose(env, "Func#%d is global and valid. Skipping.\n", subprog);
clear_caller_saved_regs(env, caller->regs);
+
+ /* All global functions return a 64-bit SCALAR_VALUE */
mark_reg_unknown(env, caller->regs, BPF_REG_0);
caller->regs[BPF_REG_0].subreg_def = DEF_NOT_SUBREG;
+
/* continue with next insn after call */
return 0;
}
- callee = kzalloc(sizeof(*callee), GFP_KERNEL);
- if (!callee)
- return -ENOMEM;
- state->frame[state->curframe + 1] = callee;
-
- /* callee cannot access r0, r6 - r9 for reading and has to write
- * into its own stack before reading from it.
- * callee can read/write into caller's stack
+ /* for regular function entry setup new frame and continue
+ * from that frame.
*/
- init_func_state(env, callee,
- /* remember the callsite, it will be used by bpf_exit */
- *insn_idx /* callsite */,
- state->curframe + 1 /* frameno within this callchain */,
- subprog /* subprog number within this prog */);
-
- /* Transfer references to the callee */
- err = copy_reference_state(callee, caller);
+ err = setup_func_entry(env, subprog, *insn_idx, set_callee_state, state);
if (err)
- goto err_out;
-
- err = set_callee_state_cb(env, caller, callee, *insn_idx);
- if (err)
- goto err_out;
+ return err;
clear_caller_saved_regs(env, caller->regs);
- /* only increment it after check_reg_arg() finished */
- state->curframe++;
-
/* and go analyze first insn of the callee */
*insn_idx = env->subprog_info[subprog].start - 1;
@@ -9229,14 +9302,10 @@ static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn
verbose(env, "caller:\n");
print_verifier_state(env, caller, true);
verbose(env, "callee:\n");
- print_verifier_state(env, callee, true);
+ print_verifier_state(env, state->frame[state->curframe], true);
}
- return 0;
-err_out:
- free_func_state(callee);
- state->frame[state->curframe + 1] = NULL;
- return err;
+ return 0;
}
int map_set_for_each_callback_args(struct bpf_verifier_env *env,
@@ -9280,22 +9349,6 @@ static int set_callee_state(struct bpf_verifier_env *env,
return 0;
}
-static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
- int *insn_idx)
-{
- int subprog, target_insn;
-
- target_insn = *insn_idx + insn->imm + 1;
- subprog = find_subprog(env, target_insn);
- if (subprog < 0) {
- verbose(env, "verifier bug. No program starts at insn %d\n",
- target_insn);
- return -EFAULT;
- }
-
- return __check_func_call(env, insn, insn_idx, subprog, set_callee_state);
-}
-
static int set_map_elem_callback_state(struct bpf_verifier_env *env,
struct bpf_func_state *caller,
struct bpf_func_state *callee,
@@ -9488,9 +9541,10 @@ static bool in_rbtree_lock_required_cb(struct bpf_verifier_env *env)
static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
{
- struct bpf_verifier_state *state = env->cur_state;
+ struct bpf_verifier_state *state = env->cur_state, *prev_st;
struct bpf_func_state *caller, *callee;
struct bpf_reg_state *r0;
+ bool in_callback_fn;
int err;
callee = state->frame[state->curframe];
@@ -9519,6 +9573,11 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
verbose_invalid_scalar(env, r0, &range, "callback return", "R0");
return -EINVAL;
}
+ if (!calls_callback(env, callee->callsite)) {
+ verbose(env, "BUG: in callback at %d, callsite %d !calls_callback\n",
+ *insn_idx, callee->callsite);
+ return -EFAULT;
+ }
} else {
/* return to the caller whatever r0 had in the callee */
caller->regs[BPF_REG_0] = *r0;
@@ -9536,7 +9595,16 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
return err;
}
- *insn_idx = callee->callsite + 1;
+ /* for callbacks like bpf_loop or bpf_for_each_map_elem go back to callsite,
+ * there function call logic would reschedule callback visit. If iteration
+ * converges is_state_visited() would prune that visit eventually.
+ */
+ in_callback_fn = callee->in_callback_fn;
+ if (in_callback_fn)
+ *insn_idx = callee->callsite;
+ else
+ *insn_idx = callee->callsite + 1;
+
if (env->log.level & BPF_LOG_LEVEL) {
verbose(env, "returning from callee:\n");
print_verifier_state(env, callee, true);
@@ -9547,6 +9615,24 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
* bpf_throw, this will be done by copy_verifier_state for extra frames. */
free_func_state(callee);
state->frame[state->curframe--] = NULL;
+
+ /* for callbacks widen imprecise scalars to make programs like below verify:
+ *
+ * struct ctx { int i; }
+ * void cb(int idx, struct ctx *ctx) { ctx->i++; ... }
+ * ...
+ * struct ctx = { .i = 0; }
+ * bpf_loop(100, cb, &ctx, 0);
+ *
+ * This is similar to what is done in process_iter_next_call() for open
+ * coded iterators.
+ */
+ prev_st = in_callback_fn ? find_prev_entry(env, state, *insn_idx) : NULL;
+ if (prev_st) {
+ err = widen_imprecise_scalars(env, prev_st, state);
+ if (err)
+ return err;
+ }
return 0;
}
@@ -9952,24 +10038,37 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
}
break;
case BPF_FUNC_for_each_map_elem:
- err = __check_func_call(env, insn, insn_idx_p, meta.subprogno,
- set_map_elem_callback_state);
+ err = push_callback_call(env, insn, insn_idx, meta.subprogno,
+ set_map_elem_callback_state);
break;
case BPF_FUNC_timer_set_callback:
- err = __check_func_call(env, insn, insn_idx_p, meta.subprogno,
- set_timer_callback_state);
+ err = push_callback_call(env, insn, insn_idx, meta.subprogno,
+ set_timer_callback_state);
break;
case BPF_FUNC_find_vma:
- err = __check_func_call(env, insn, insn_idx_p, meta.subprogno,
- set_find_vma_callback_state);
+ err = push_callback_call(env, insn, insn_idx, meta.subprogno,
+ set_find_vma_callback_state);
break;
case BPF_FUNC_snprintf:
err = check_bpf_snprintf_call(env, regs);
break;
case BPF_FUNC_loop:
update_loop_inline_state(env, meta.subprogno);
- err = __check_func_call(env, insn, insn_idx_p, meta.subprogno,
- set_loop_callback_state);
+ /* Verifier relies on R1 value to determine if bpf_loop() iteration
+ * is finished, thus mark it precise.
+ */
+ err = mark_chain_precision(env, BPF_REG_1);
+ if (err)
+ return err;
+ if (cur_func(env)->callback_depth < regs[BPF_REG_1].umax_value) {
+ err = push_callback_call(env, insn, insn_idx, meta.subprogno,
+ set_loop_callback_state);
+ } else {
+ cur_func(env)->callback_depth = 0;
+ if (env->log.level & BPF_LOG_LEVEL2)
+ verbose(env, "frame%d bpf_loop iteration limit reached\n",
+ env->cur_state->curframe);
+ }
break;
case BPF_FUNC_dynptr_from_mem:
if (regs[BPF_REG_1].type != PTR_TO_MAP_VALUE) {
@@ -10065,8 +10164,8 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
break;
}
case BPF_FUNC_user_ringbuf_drain:
- err = __check_func_call(env, insn, insn_idx_p, meta.subprogno,
- set_user_ringbuf_callback_state);
+ err = push_callback_call(env, insn, insn_idx, meta.subprogno,
+ set_user_ringbuf_callback_state);
break;
}
@@ -10965,7 +11064,7 @@ static bool is_bpf_graph_api_kfunc(u32 btf_id)
btf_id == special_kfunc_list[KF_bpf_refcount_acquire_impl];
}
-static bool is_callback_calling_kfunc(u32 btf_id)
+static bool is_sync_callback_calling_kfunc(u32 btf_id)
{
return btf_id == special_kfunc_list[KF_bpf_rbtree_add_impl];
}
@@ -11727,6 +11826,21 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
return -EACCES;
}
+ /* Check the arguments */
+ err = check_kfunc_args(env, &meta, insn_idx);
+ if (err < 0)
+ return err;
+
+ if (meta.func_id == special_kfunc_list[KF_bpf_rbtree_add_impl]) {
+ err = push_callback_call(env, insn, insn_idx, meta.subprogno,
+ set_rbtree_add_callback_state);
+ if (err) {
+ verbose(env, "kfunc %s#%d failed callback verification\n",
+ func_name, meta.func_id);
+ return err;
+ }
+ }
+
rcu_lock = is_kfunc_bpf_rcu_read_lock(&meta);
rcu_unlock = is_kfunc_bpf_rcu_read_unlock(&meta);
@@ -11762,10 +11876,6 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
return -EINVAL;
}
- /* Check the arguments */
- err = check_kfunc_args(env, &meta, insn_idx);
- if (err < 0)
- return err;
/* In case of release function, we get register number of refcounted
* PTR_TO_BTF_ID in bpf_kfunc_arg_meta, do the release now.
*/
@@ -11799,16 +11909,6 @@ static int check_kfunc_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
}
}
- if (meta.func_id == special_kfunc_list[KF_bpf_rbtree_add_impl]) {
- err = __check_func_call(env, insn, insn_idx_p, meta.subprogno,
- set_rbtree_add_callback_state);
- if (err) {
- verbose(env, "kfunc %s#%d failed callback verification\n",
- func_name, meta.func_id);
- return err;
- }
- }
-
if (meta.func_id == special_kfunc_list[KF_bpf_throw]) {
if (!bpf_jit_supports_exceptions()) {
verbose(env, "JIT does not support calling kfunc %s#%d\n",
@@ -15079,6 +15179,15 @@ static bool is_force_checkpoint(struct bpf_verifier_env *env, int insn_idx)
return env->insn_aux_data[insn_idx].force_checkpoint;
}
+static void mark_calls_callback(struct bpf_verifier_env *env, int idx)
+{
+ env->insn_aux_data[idx].calls_callback = true;
+}
+
+static bool calls_callback(struct bpf_verifier_env *env, int insn_idx)
+{
+ return env->insn_aux_data[insn_idx].calls_callback;
+}
enum {
DONE_EXPLORING = 0,
@@ -15192,6 +15301,21 @@ static int visit_insn(int t, struct bpf_verifier_env *env)
* async state will be pushed for further exploration.
*/
mark_prune_point(env, t);
+ /* For functions that invoke callbacks it is not known how many times
+ * callback would be called. Verifier models callback calling functions
+ * by repeatedly visiting callback bodies and returning to origin call
+ * instruction.
+ * In order to stop such iteration verifier needs to identify when a
+ * state identical some state from a previous iteration is reached.
+ * Check below forces creation of checkpoint before callback calling
+ * instruction to allow search for such identical states.
+ */
+ if (is_sync_callback_calling_insn(insn)) {
+ mark_calls_callback(env, t);
+ mark_force_checkpoint(env, t);
+ mark_prune_point(env, t);
+ mark_jmp_point(env, t);
+ }
if (insn->src_reg == BPF_PSEUDO_KFUNC_CALL) {
struct bpf_kfunc_call_arg_meta meta;
@@ -16661,10 +16785,16 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
}
goto skip_inf_loop_check;
}
+ if (calls_callback(env, insn_idx)) {
+ if (states_equal(env, &sl->state, cur, true))
+ goto hit;
+ goto skip_inf_loop_check;
+ }
/* attempt to detect infinite loop to avoid unnecessary doomed work */
if (states_maybe_looping(&sl->state, cur) &&
states_equal(env, &sl->state, cur, false) &&
- !iter_active_depths_differ(&sl->state, cur)) {
+ !iter_active_depths_differ(&sl->state, cur) &&
+ sl->state.callback_unroll_depth == cur->callback_unroll_depth) {
verbose_linfo(env, insn_idx, "; ");
verbose(env, "infinite loop detected at insn %d\n", insn_idx);
verbose(env, "cur state:");
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 4e610863cc37..8f3cef1a4d8a 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -3898,14 +3898,6 @@ static __poll_t cgroup_pressure_poll(struct kernfs_open_file *of,
return psi_trigger_poll(&ctx->psi.trigger, of->file, pt);
}
-static int cgroup_pressure_open(struct kernfs_open_file *of)
-{
- if (of->file->f_mode & FMODE_WRITE && !capable(CAP_SYS_RESOURCE))
- return -EPERM;
-
- return 0;
-}
-
static void cgroup_pressure_release(struct kernfs_open_file *of)
{
struct cgroup_file_ctx *ctx = of->priv;
@@ -5312,7 +5304,6 @@ static struct cftype cgroup_psi_files[] = {
{
.name = "io.pressure",
.file_offset = offsetof(struct cgroup, psi_files[PSI_IO]),
- .open = cgroup_pressure_open,
.seq_show = cgroup_io_pressure_show,
.write = cgroup_io_pressure_write,
.poll = cgroup_pressure_poll,
@@ -5321,7 +5312,6 @@ static struct cftype cgroup_psi_files[] = {
{
.name = "memory.pressure",
.file_offset = offsetof(struct cgroup, psi_files[PSI_MEM]),
- .open = cgroup_pressure_open,
.seq_show = cgroup_memory_pressure_show,
.write = cgroup_memory_pressure_write,
.poll = cgroup_pressure_poll,
@@ -5330,7 +5320,6 @@ static struct cftype cgroup_psi_files[] = {
{
.name = "cpu.pressure",
.file_offset = offsetof(struct cgroup, psi_files[PSI_CPU]),
- .open = cgroup_pressure_open,
.seq_show = cgroup_cpu_pressure_show,
.write = cgroup_cpu_pressure_write,
.poll = cgroup_pressure_poll,
@@ -5340,7 +5329,6 @@ static struct cftype cgroup_psi_files[] = {
{
.name = "irq.pressure",
.file_offset = offsetof(struct cgroup, psi_files[PSI_IRQ]),
- .open = cgroup_pressure_open,
.seq_show = cgroup_irq_pressure_show,
.write = cgroup_irq_pressure_write,
.poll = cgroup_pressure_poll,
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 9e4c6780adde..a86972a91991 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -2113,7 +2113,7 @@ static struct cpuhp_step cpuhp_hp_states[] = {
[CPUHP_HRTIMERS_PREPARE] = {
.name = "hrtimers:prepare",
.startup.single = hrtimers_prepare_cpu,
- .teardown.single = hrtimers_dead_cpu,
+ .teardown.single = NULL,
},
[CPUHP_SMPCFD_PREPARE] = {
.name = "smpcfd:prepare",
@@ -2205,6 +2205,12 @@ static struct cpuhp_step cpuhp_hp_states[] = {
.startup.single = NULL,
.teardown.single = smpcfd_dying_cpu,
},
+ [CPUHP_AP_HRTIMERS_DYING] = {
+ .name = "hrtimers:dying",
+ .startup.single = NULL,
+ .teardown.single = hrtimers_cpu_dying,
+ },
+
/* Entry state on starting. Interrupts enabled from here on. Transient
* state for synchronsization */
[CPUHP_AP_ONLINE] = {
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 683dc086ef10..b704d83a28b2 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -4828,6 +4828,11 @@ find_get_pmu_context(struct pmu *pmu, struct perf_event_context *ctx,
void *task_ctx_data = NULL;
if (!ctx->task) {
+ /*
+ * perf_pmu_migrate_context() / __perf_pmu_install_event()
+ * relies on the fact that find_get_pmu_context() cannot fail
+ * for CPU contexts.
+ */
struct perf_cpu_pmu_context *cpc;
cpc = per_cpu_ptr(pmu->cpu_pmu_context, event->cpu);
@@ -12889,6 +12894,9 @@ static void __perf_pmu_install_event(struct pmu *pmu,
int cpu, struct perf_event *event)
{
struct perf_event_pmu_context *epc;
+ struct perf_event_context *old_ctx = event->ctx;
+
+ get_ctx(ctx); /* normally find_get_context() */
event->cpu = cpu;
epc = find_get_pmu_context(pmu, ctx, event);
@@ -12897,6 +12905,11 @@ static void __perf_pmu_install_event(struct pmu *pmu,
if (event->state >= PERF_EVENT_STATE_OFF)
event->state = PERF_EVENT_STATE_INACTIVE;
perf_install_in_context(ctx, event, cpu);
+
+ /*
+ * Now that event->ctx is updated and visible, put the old ctx.
+ */
+ put_ctx(old_ctx);
}
static void __perf_pmu_install(struct perf_event_context *ctx,
@@ -12935,6 +12948,10 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
struct perf_event_context *src_ctx, *dst_ctx;
LIST_HEAD(events);
+ /*
+ * Since per-cpu context is persistent, no need to grab an extra
+ * reference.
+ */
src_ctx = &per_cpu_ptr(&perf_cpu_context, src_cpu)->ctx;
dst_ctx = &per_cpu_ptr(&perf_cpu_context, dst_cpu)->ctx;
diff --git a/kernel/futex/core.c b/kernel/futex/core.c
index 52695c59d041..dad981a865b8 100644
--- a/kernel/futex/core.c
+++ b/kernel/futex/core.c
@@ -700,7 +700,8 @@ retry:
owner = uval & FUTEX_TID_MASK;
if (pending_op && !pi && !owner) {
- futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY);
+ futex_wake(uaddr, FLAGS_SIZE_32 | FLAGS_SHARED, 1,
+ FUTEX_BITSET_MATCH_ANY);
return 0;
}
@@ -752,8 +753,10 @@ retry:
* Wake robust non-PI futexes here. The wakeup of
* PI futexes happens in exit_pi_state():
*/
- if (!pi && (uval & FUTEX_WAITERS))
- futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY);
+ if (!pi && (uval & FUTEX_WAITERS)) {
+ futex_wake(uaddr, FLAGS_SIZE_32 | FLAGS_SHARED, 1,
+ FUTEX_BITSET_MATCH_ANY);
+ }
return 0;
}
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 2048138ce54b..d7a3c63a2171 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3666,41 +3666,140 @@ static inline void
dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
#endif
+static void reweight_eevdf(struct cfs_rq *cfs_rq, struct sched_entity *se,
+ unsigned long weight)
+{
+ unsigned long old_weight = se->load.weight;
+ u64 avruntime = avg_vruntime(cfs_rq);
+ s64 vlag, vslice;
+
+ /*
+ * VRUNTIME
+ * ========
+ *
+ * COROLLARY #1: The virtual runtime of the entity needs to be
+ * adjusted if re-weight at !0-lag point.
+ *
+ * Proof: For contradiction assume this is not true, so we can
+ * re-weight without changing vruntime at !0-lag point.
+ *
+ * Weight VRuntime Avg-VRuntime
+ * before w v V
+ * after w' v' V'
+ *
+ * Since lag needs to be preserved through re-weight:
+ *
+ * lag = (V - v)*w = (V'- v')*w', where v = v'
+ * ==> V' = (V - v)*w/w' + v (1)
+ *
+ * Let W be the total weight of the entities before reweight,
+ * since V' is the new weighted average of entities:
+ *
+ * V' = (WV + w'v - wv) / (W + w' - w) (2)
+ *
+ * by using (1) & (2) we obtain:
+ *
+ * (WV + w'v - wv) / (W + w' - w) = (V - v)*w/w' + v
+ * ==> (WV-Wv+Wv+w'v-wv)/(W+w'-w) = (V - v)*w/w' + v
+ * ==> (WV - Wv)/(W + w' - w) + v = (V - v)*w/w' + v
+ * ==> (V - v)*W/(W + w' - w) = (V - v)*w/w' (3)
+ *
+ * Since we are doing at !0-lag point which means V != v, we
+ * can simplify (3):
+ *
+ * ==> W / (W + w' - w) = w / w'
+ * ==> Ww' = Ww + ww' - ww
+ * ==> W * (w' - w) = w * (w' - w)
+ * ==> W = w (re-weight indicates w' != w)
+ *
+ * So the cfs_rq contains only one entity, hence vruntime of
+ * the entity @v should always equal to the cfs_rq's weighted
+ * average vruntime @V, which means we will always re-weight
+ * at 0-lag point, thus breach assumption. Proof completed.
+ *
+ *
+ * COROLLARY #2: Re-weight does NOT affect weighted average
+ * vruntime of all the entities.
+ *
+ * Proof: According to corollary #1, Eq. (1) should be:
+ *
+ * (V - v)*w = (V' - v')*w'
+ * ==> v' = V' - (V - v)*w/w' (4)
+ *
+ * According to the weighted average formula, we have:
+ *
+ * V' = (WV - wv + w'v') / (W - w + w')
+ * = (WV - wv + w'(V' - (V - v)w/w')) / (W - w + w')
+ * = (WV - wv + w'V' - Vw + wv) / (W - w + w')
+ * = (WV + w'V' - Vw) / (W - w + w')
+ *
+ * ==> V'*(W - w + w') = WV + w'V' - Vw
+ * ==> V' * (W - w) = (W - w) * V (5)
+ *
+ * If the entity is the only one in the cfs_rq, then reweight
+ * always occurs at 0-lag point, so V won't change. Or else
+ * there are other entities, hence W != w, then Eq. (5) turns
+ * into V' = V. So V won't change in either case, proof done.
+ *
+ *
+ * So according to corollary #1 & #2, the effect of re-weight
+ * on vruntime should be:
+ *
+ * v' = V' - (V - v) * w / w' (4)
+ * = V - (V - v) * w / w'
+ * = V - vl * w / w'
+ * = V - vl'
+ */
+ if (avruntime != se->vruntime) {
+ vlag = (s64)(avruntime - se->vruntime);
+ vlag = div_s64(vlag * old_weight, weight);
+ se->vruntime = avruntime - vlag;
+ }
+
+ /*
+ * DEADLINE
+ * ========
+ *
+ * When the weight changes, the virtual time slope changes and
+ * we should adjust the relative virtual deadline accordingly.
+ *
+ * d' = v' + (d - v)*w/w'
+ * = V' - (V - v)*w/w' + (d - v)*w/w'
+ * = V - (V - v)*w/w' + (d - v)*w/w'
+ * = V + (d - V)*w/w'
+ */
+ vslice = (s64)(se->deadline - avruntime);
+ vslice = div_s64(vslice * old_weight, weight);
+ se->deadline = avruntime + vslice;
+}
+
static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
unsigned long weight)
{
- unsigned long old_weight = se->load.weight;
+ bool curr = cfs_rq->curr == se;
if (se->on_rq) {
/* commit outstanding execution time */
- if (cfs_rq->curr == se)
+ if (curr)
update_curr(cfs_rq);
else
- avg_vruntime_sub(cfs_rq, se);
+ __dequeue_entity(cfs_rq, se);
update_load_sub(&cfs_rq->load, se->load.weight);
}
dequeue_load_avg(cfs_rq, se);
- update_load_set(&se->load, weight);
-
if (!se->on_rq) {
/*
* Because we keep se->vlag = V - v_i, while: lag_i = w_i*(V - v_i),
* we need to scale se->vlag when w_i changes.
*/
- se->vlag = div_s64(se->vlag * old_weight, weight);
+ se->vlag = div_s64(se->vlag * se->load.weight, weight);
} else {
- s64 deadline = se->deadline - se->vruntime;
- /*
- * When the weight changes, the virtual time slope changes and
- * we should adjust the relative virtual deadline accordingly.
- */
- deadline = div_s64(deadline * old_weight, weight);
- se->deadline = se->vruntime + deadline;
- if (se != cfs_rq->curr)
- min_deadline_cb_propagate(&se->run_node, NULL);
+ reweight_eevdf(cfs_rq, se, weight);
}
+ update_load_set(&se->load, weight);
+
#ifdef CONFIG_SMP
do {
u32 divider = get_pelt_divider(&se->avg);
@@ -3712,8 +3811,17 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
enqueue_load_avg(cfs_rq, se);
if (se->on_rq) {
update_load_add(&cfs_rq->load, se->load.weight);
- if (cfs_rq->curr != se)
- avg_vruntime_add(cfs_rq, se);
+ if (!curr) {
+ /*
+ * The entity's vruntime has been adjusted, so let's check
+ * whether the rq-wide min_vruntime needs updated too. Since
+ * the calculations above require stable min_vruntime rather
+ * than up-to-date one, we do the update at the end of the
+ * reweight process.
+ */
+ __enqueue_entity(cfs_rq, se);
+ update_min_vruntime(cfs_rq);
+ }
}
}
@@ -3857,14 +3965,11 @@ static void update_cfs_group(struct sched_entity *se)
#ifndef CONFIG_SMP
shares = READ_ONCE(gcfs_rq->tg->shares);
-
- if (likely(se->load.weight == shares))
- return;
#else
- shares = calc_group_shares(gcfs_rq);
+ shares = calc_group_shares(gcfs_rq);
#endif
-
- reweight_entity(cfs_rq_of(se), se, shares);
+ if (unlikely(se->load.weight != shares))
+ reweight_entity(cfs_rq_of(se), se, shares);
}
#else /* CONFIG_FAIR_GROUP_SCHED */
@@ -11079,12 +11184,16 @@ static int should_we_balance(struct lb_env *env)
continue;
}
- /* Are we the first idle CPU? */
+ /*
+ * Are we the first idle core in a non-SMT domain or higher,
+ * or the first idle CPU in a SMT domain?
+ */
return cpu == env->dst_cpu;
}
- if (idle_smt == env->dst_cpu)
- return true;
+ /* Are we the first idle CPU with busy siblings? */
+ if (idle_smt != -1)
+ return idle_smt == env->dst_cpu;
/* Are we the first CPU of this group ? */
return group_balance_cpu(sg) == env->dst_cpu;
diff --git a/kernel/sys.c b/kernel/sys.c
index 420d9cb9cc8e..e219fcfa112d 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2394,6 +2394,10 @@ static inline int prctl_set_mdwe(unsigned long bits, unsigned long arg3,
if (bits & PR_MDWE_NO_INHERIT && !(bits & PR_MDWE_REFUSE_EXEC_GAIN))
return -EINVAL;
+ /* PARISC cannot allow mdwe as it needs writable stacks */
+ if (IS_ENABLED(CONFIG_PARISC))
+ return -EINVAL;
+
current_bits = get_current_mdwe();
if (current_bits && current_bits != bits)
return -EPERM; /* Cannot unset the flags */
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 238262e4aba7..760793998cdd 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -2219,29 +2219,22 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
}
}
-int hrtimers_dead_cpu(unsigned int scpu)
+int hrtimers_cpu_dying(unsigned int dying_cpu)
{
struct hrtimer_cpu_base *old_base, *new_base;
- int i;
+ int i, ncpu = cpumask_first(cpu_active_mask);
- BUG_ON(cpu_online(scpu));
- tick_cancel_sched_timer(scpu);
+ tick_cancel_sched_timer(dying_cpu);
+
+ old_base = this_cpu_ptr(&hrtimer_bases);
+ new_base = &per_cpu(hrtimer_bases, ncpu);
- /*
- * this BH disable ensures that raise_softirq_irqoff() does
- * not wakeup ksoftirqd (and acquire the pi-lock) while
- * holding the cpu_base lock
- */
- local_bh_disable();
- local_irq_disable();
- old_base = &per_cpu(hrtimer_bases, scpu);
- new_base = this_cpu_ptr(&hrtimer_bases);
/*
* The caller is globally serialized and nobody else
* takes two locks at once, deadlock is not possible.
*/
- raw_spin_lock(&new_base->lock);
- raw_spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
+ raw_spin_lock(&old_base->lock);
+ raw_spin_lock_nested(&new_base->lock, SINGLE_DEPTH_NESTING);
for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
migrate_hrtimer_list(&old_base->clock_base[i],
@@ -2252,15 +2245,13 @@ int hrtimers_dead_cpu(unsigned int scpu)
* The migration might have changed the first expiring softirq
* timer on this CPU. Update it.
*/
- hrtimer_update_softirq_timer(new_base, false);
+ __hrtimer_get_next_event(new_base, HRTIMER_ACTIVE_SOFT);
+ /* Tell the other CPU to retrigger the next event */
+ smp_call_function_single(ncpu, retrigger_next_event, NULL, 0);
- raw_spin_unlock(&old_base->lock);
raw_spin_unlock(&new_base->lock);
+ raw_spin_unlock(&old_base->lock);
- /* Check, if we got expired work to do */
- __hrtimer_peek_ahead_timers();
- local_irq_enable();
- local_bh_enable();
return 0;
}