From d78973c32a210b0057b51880f7119bf2b61d5a65 Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Mon, 12 Dec 2016 18:01:45 -0800 Subject: llist: Clarify comments about when locking is needed llist.h comments are confusing about when locking is needed versus when it isn't. Clarify these comments by being more descriptive about why locking is needed for llist_del_first. Cc: Ingo Molnar Cc: Will Deacon Cc: Paul McKenney Acked-by: Huang Ying Acked-by: Mathieu Desnoyers Signed-off-by: Joel Fernandes Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- include/linux/llist.h | 37 +++++++++++++++++++++---------------- 1 file changed, 21 insertions(+), 16 deletions(-) (limited to 'include') diff --git a/include/linux/llist.h b/include/linux/llist.h index fd4ca0b4fe0f..171baa90f6f6 100644 --- a/include/linux/llist.h +++ b/include/linux/llist.h @@ -3,28 +3,33 @@ /* * Lock-less NULL terminated single linked list * - * If there are multiple producers and multiple consumers, llist_add - * can be used in producers and llist_del_all can be used in - * consumers. They can work simultaneously without lock. But - * llist_del_first can not be used here. Because llist_del_first - * depends on list->first->next does not changed if list->first is not - * changed during its operation, but llist_del_first, llist_add, - * llist_add (or llist_del_all, llist_add, llist_add) sequence in - * another consumer may violate that. - * - * If there are multiple producers and one consumer, llist_add can be - * used in producers and llist_del_all or llist_del_first can be used - * in the consumer. - * - * This can be summarized as follow: + * Cases where locking is not needed: + * If there are multiple producers and multiple consumers, llist_add can be + * used in producers and llist_del_all can be used in consumers simultaneously + * without locking. Also a single consumer can use llist_del_first while + * multiple producers simultaneously use llist_add, without any locking. + * + * Cases where locking is needed: + * If we have multiple consumers with llist_del_first used in one consumer, and + * llist_del_first or llist_del_all used in other consumers, then a lock is + * needed. This is because llist_del_first depends on list->first->next not + * changing, but without lock protection, there's no way to be sure about that + * if a preemption happens in the middle of the delete operation and on being + * preempted back, the list->first is the same as before causing the cmpxchg in + * llist_del_first to succeed. For example, while a llist_del_first operation + * is in progress in one consumer, then a llist_del_first, llist_add, + * llist_add (or llist_del_all, llist_add, llist_add) sequence in another + * consumer may cause violations. + * + * This can be summarized as follows: * * | add | del_first | del_all * add | - | - | - * del_first | | L | L * del_all | | | - * - * Where "-" stands for no lock is needed, while "L" stands for lock - * is needed. + * Where, a particular row's operation can happen concurrently with a column's + * operation, with "-" being no lock needed, while "L" being lock is needed. * * The list entries deleted via llist_del_all can be traversed with * traversing function such as llist_for_each etc. But the list -- cgit v1.2.3 From 02a5c550b2738f2bfea8e1e00aa75944d71c9e18 Mon Sep 17 00:00:00 2001 From: Paul E. McKenney Date: Wed, 2 Nov 2016 17:25:06 -0700 Subject: rcu: Abstract extended quiescent state determination This commit is the fourth step towards full abstraction of all accesses to the ->dynticks counter, implementing previously open-coded checks and comparisons in new rcu_dynticks_in_eqs() and rcu_dynticks_in_eqs_since() functions. This abstraction will ease changes to the ->dynticks counter operation. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- include/linux/rcutiny.h | 6 ++++++ kernel/rcu/tree.c | 52 +++++++++++++++++++++++++++++++++++------------- kernel/rcu/tree.h | 2 ++ kernel/rcu/tree_exp.h | 6 +++--- kernel/rcu/tree_plugin.h | 2 +- kernel/rcu/tree_trace.c | 2 +- 6 files changed, 51 insertions(+), 19 deletions(-) (limited to 'include') diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index ac81e4063b40..4f9b2fa2173d 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -27,6 +27,12 @@ #include +struct rcu_dynticks; +static inline int rcu_dynticks_snap(struct rcu_dynticks *rdtp) +{ + return 0; +} + static inline unsigned long get_state_synchronize_rcu(void) { return 0; diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 3169d5a21b55..8b970319c75b 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -336,17 +336,48 @@ static void rcu_dynticks_eqs_online(void) atomic_add(0x1, &rdtp->dynticks); } +/* + * Is the current CPU in an extended quiescent state? + * + * No ordering, as we are sampling CPU-local information. + */ +bool rcu_dynticks_curr_cpu_in_eqs(void) +{ + struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); + + return !(atomic_read(&rdtp->dynticks) & 0x1); +} + /* * Snapshot the ->dynticks counter with full ordering so as to allow * stable comparison of this counter with past and future snapshots. */ -static int rcu_dynticks_snap(struct rcu_dynticks *rdtp) +int rcu_dynticks_snap(struct rcu_dynticks *rdtp) { int snap = atomic_add_return(0, &rdtp->dynticks); return snap; } +/* + * Return true if the snapshot returned from rcu_dynticks_snap() + * indicates that RCU is in an extended quiescent state. + */ +static bool rcu_dynticks_in_eqs(int snap) +{ + return !(snap & 0x1); +} + +/* + * Return true if the CPU corresponding to the specified rcu_dynticks + * structure has spent some time in an extended quiescent state since + * rcu_dynticks_snap() returned the specified snapshot. + */ +static bool rcu_dynticks_in_eqs_since(struct rcu_dynticks *rdtp, int snap) +{ + return snap != rcu_dynticks_snap(rdtp); +} + /* * Do a double-increment of the ->dynticks counter to emulate a * momentary idle-CPU quiescent state. @@ -1045,7 +1076,7 @@ void rcu_nmi_enter(void) * to be in the outermost NMI handler that interrupted an RCU-idle * period (observation due to Andy Lutomirski). */ - if (!(atomic_read(&rdtp->dynticks) & 0x1)) { + if (rcu_dynticks_curr_cpu_in_eqs()) { rcu_dynticks_eqs_exit(); incby = 1; } @@ -1071,7 +1102,7 @@ void rcu_nmi_exit(void) * to us!) */ WARN_ON_ONCE(rdtp->dynticks_nmi_nesting <= 0); - WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); + WARN_ON_ONCE(rcu_dynticks_curr_cpu_in_eqs()); /* * If the nesting level is not 1, the CPU wasn't RCU-idle, so @@ -1097,9 +1128,7 @@ void rcu_nmi_exit(void) */ bool notrace __rcu_is_watching(void) { - struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); - - return atomic_read(&rdtp->dynticks) & 0x1; + return !rcu_dynticks_curr_cpu_in_eqs(); } /** @@ -1184,7 +1213,7 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp, { rdp->dynticks_snap = rcu_dynticks_snap(rdp->dynticks); rcu_sysidle_check_cpu(rdp, isidle, maxj); - if ((rdp->dynticks_snap & 0x1) == 0) { + if (rcu_dynticks_in_eqs(rdp->dynticks_snap)) { trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti")); if (ULONG_CMP_LT(READ_ONCE(rdp->gpnum) + ULONG_MAX / 4, rdp->mynode->gpnum)) @@ -1203,12 +1232,7 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp, static int rcu_implicit_dynticks_qs(struct rcu_data *rdp, bool *isidle, unsigned long *maxj) { - unsigned int curr; int *rcrmp; - unsigned int snap; - - curr = (unsigned int)rcu_dynticks_snap(rdp->dynticks); - snap = (unsigned int)rdp->dynticks_snap; /* * If the CPU passed through or entered a dynticks idle phase with @@ -1218,7 +1242,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp, * read-side critical section that started before the beginning * of the current RCU grace period. */ - if ((curr & 0x1) == 0 || UINT_CMP_GE(curr, snap + 2)) { + if (rcu_dynticks_in_eqs_since(rdp->dynticks, rdp->dynticks_snap)) { trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti")); rdp->dynticks_fqs++; return 1; @@ -3807,7 +3831,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) rdp->grpmask = leaf_node_cpu_bit(rdp->mynode, cpu); rdp->dynticks = &per_cpu(rcu_dynticks, cpu); WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE); - WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1); + WARN_ON_ONCE(rcu_dynticks_in_eqs(rcu_dynticks_snap(rdp->dynticks))); rdp->cpu = cpu; rdp->rsp = rsp; rcu_boot_init_nocb_percpu_data(rdp); diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index fe98dd24adf8..3b953dcf6afc 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -595,6 +595,8 @@ extern struct rcu_state rcu_bh_state; extern struct rcu_state rcu_preempt_state; #endif /* #ifdef CONFIG_PREEMPT_RCU */ +int rcu_dynticks_snap(struct rcu_dynticks *rdtp); + #ifdef CONFIG_RCU_BOOST DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status); DECLARE_PER_CPU(int, rcu_cpu_kthread_cpu); diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 011f626b2fd8..e155a465cf84 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -360,7 +360,7 @@ static void sync_rcu_exp_select_cpus(struct rcu_state *rsp, rdp->exp_dynticks_snap = rcu_dynticks_snap(rdp->dynticks); if (raw_smp_processor_id() == cpu || - !(rdp->exp_dynticks_snap & 0x1) || + rcu_dynticks_in_eqs(rdp->exp_dynticks_snap) || !(rnp->qsmaskinitnext & rdp->grpmask)) mask_ofl_test |= rdp->grpmask; } @@ -383,8 +383,8 @@ static void sync_rcu_exp_select_cpus(struct rcu_state *rsp, if (!(mask_ofl_ipi & mask)) continue; retry_ipi: - if (rcu_dynticks_snap(rdp->dynticks) != - rdp->exp_dynticks_snap) { + if (rcu_dynticks_in_eqs_since(rdp->dynticks, + rdp->exp_dynticks_snap)) { mask_ofl_test |= mask; continue; } diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 56583e764ebf..652209589adf 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1643,7 +1643,7 @@ static void print_cpu_stall_info(struct rcu_state *rsp, int cpu) "o."[!!(rdp->grpmask & rdp->mynode->qsmaskinit)], "N."[!!(rdp->grpmask & rdp->mynode->qsmaskinitnext)], ticks_value, ticks_title, - atomic_read(&rdtp->dynticks) & 0xfff, + rcu_dynticks_snap(rdtp) & 0xfff, rdtp->dynticks_nesting, rdtp->dynticks_nmi_nesting, rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu), READ_ONCE(rsp->n_force_qs) - rsp->n_force_qs_gpstart, diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c index b1f28972872c..b833cd0a29e8 100644 --- a/kernel/rcu/tree_trace.c +++ b/kernel/rcu/tree_trace.c @@ -124,7 +124,7 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) rdp->rcu_qs_ctr_snap == per_cpu(rcu_qs_ctr, rdp->cpu), rdp->core_needs_qs); seq_printf(m, " dt=%d/%llx/%d df=%lu", - atomic_read(&rdp->dynticks->dynticks), + rcu_dynticks_snap(rdp->dynticks), rdp->dynticks->dynticks_nesting, rdp->dynticks->dynticks_nmi_nesting, rdp->dynticks_fqs); -- cgit v1.2.3 From 3a19b46a5c17b12ef0691df19c676ba3da330a57 Mon Sep 17 00:00:00 2001 From: Paul E. McKenney Date: Wed, 30 Nov 2016 11:21:21 -0800 Subject: rcu: Check cond_resched_rcu_qs() state less often to reduce GP overhead Commit 4a81e8328d37 ("rcu: Reduce overhead of cond_resched() checks for RCU") moved quiescent-state generation out of cond_resched() and commit bde6c3aa9930 ("rcu: Provide cond_resched_rcu_qs() to force quiescent states in long loops") introduced cond_resched_rcu_qs(), and commit 5cd37193ce85 ("rcu: Make cond_resched_rcu_qs() apply to normal RCU flavors") introduced the per-CPU rcu_qs_ctr variable, which is frequently polled by the RCU core state machine. This frequent polling can increase grace-period rate, which in turn increases grace-period overhead, which is visible in some benchmarks (for example, the "open1" benchmark in Anton Blanchard's "will it scale" suite). This commit therefore reduces the rate at which rcu_qs_ctr is polled by moving that polling into the force-quiescent-state (FQS) machinery, and by further polling it only after the grace period has been in effect for at least jiffies_till_sched_qs jiffies. Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- include/trace/events/rcu.h | 10 +++++----- kernel/rcu/tree.c | 46 ++++++++++++++++++++++++++++++++++------------ 2 files changed, 39 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index 9d4f9b3a2b7b..e3facb356838 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -385,11 +385,11 @@ TRACE_EVENT(rcu_quiescent_state_report, /* * Tracepoint for quiescent states detected by force_quiescent_state(). - * These trace events include the type of RCU, the grace-period number - * that was blocked by the CPU, the CPU itself, and the type of quiescent - * state, which can be "dti" for dyntick-idle mode, "ofl" for CPU offline, - * or "kick" when kicking a CPU that has been in dyntick-idle mode for - * too long. + * These trace events include the type of RCU, the grace-period number that + * was blocked by the CPU, the CPU itself, and the type of quiescent state, + * which can be "dti" for dyntick-idle mode, "ofl" for CPU offline, "kick" + * when kicking a CPU that has been in dyntick-idle mode for too long, or + * "rqc" if the CPU got a quiescent state via its rcu_qs_ctr. */ TRACE_EVENT(rcu_fqs, diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 8b970319c75b..d8245cbd08f9 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1232,7 +1232,10 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp, static int rcu_implicit_dynticks_qs(struct rcu_data *rdp, bool *isidle, unsigned long *maxj) { + unsigned long jtsq; int *rcrmp; + unsigned long rjtsc; + struct rcu_node *rnp; /* * If the CPU passed through or entered a dynticks idle phase with @@ -1248,6 +1251,31 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp, return 1; } + /* Compute and saturate jiffies_till_sched_qs. */ + jtsq = jiffies_till_sched_qs; + rjtsc = rcu_jiffies_till_stall_check(); + if (jtsq > rjtsc / 2) { + WRITE_ONCE(jiffies_till_sched_qs, rjtsc); + jtsq = rjtsc / 2; + } else if (jtsq < 1) { + WRITE_ONCE(jiffies_till_sched_qs, 1); + jtsq = 1; + } + + /* + * Has this CPU encountered a cond_resched_rcu_qs() since the + * beginning of the grace period? For this to be the case, + * the CPU has to have noticed the current grace period. This + * might not be the case for nohz_full CPUs looping in the kernel. + */ + rnp = rdp->mynode; + if (time_after(jiffies, rdp->rsp->gp_start + jtsq) && + READ_ONCE(rdp->rcu_qs_ctr_snap) != per_cpu(rcu_qs_ctr, rdp->cpu) && + READ_ONCE(rdp->gpnum) == rnp->gpnum && !rdp->gpwrap) { + trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("rqc")); + return 1; + } + /* * Check for the CPU being offline, but only if the grace period * is old enough. We don't need to worry about the CPU changing @@ -1290,9 +1318,8 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp, * warning delay. */ rcrmp = &per_cpu(rcu_sched_qs_mask, rdp->cpu); - if (ULONG_CMP_GE(jiffies, - rdp->rsp->gp_start + jiffies_till_sched_qs) || - ULONG_CMP_GE(jiffies, rdp->rsp->jiffies_resched)) { + if (time_after(jiffies, rdp->rsp->gp_start + jtsq) || + time_after(jiffies, rdp->rsp->jiffies_resched)) { if (!(READ_ONCE(*rcrmp) & rdp->rsp->flavor_mask)) { WRITE_ONCE(rdp->cond_resched_completed, READ_ONCE(rdp->mynode->completed)); @@ -2550,10 +2577,8 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) rnp = rdp->mynode; raw_spin_lock_irqsave_rcu_node(rnp, flags); - if ((rdp->cpu_no_qs.b.norm && - rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) || - rdp->gpnum != rnp->gpnum || rnp->completed == rnp->gpnum || - rdp->gpwrap) { + if (rdp->cpu_no_qs.b.norm || rdp->gpnum != rnp->gpnum || + rnp->completed == rnp->gpnum || rdp->gpwrap) { /* * The grace period in which this quiescent state was @@ -2608,8 +2633,7 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp) * Was there a quiescent state since the beginning of the grace * period? If no, then exit and wait for the next call. */ - if (rdp->cpu_no_qs.b.norm && - rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) + if (rdp->cpu_no_qs.b.norm) return; /* @@ -3563,9 +3587,7 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) rdp->core_needs_qs && rdp->cpu_no_qs.b.norm && rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) { rdp->n_rp_core_needs_qs++; - } else if (rdp->core_needs_qs && - (!rdp->cpu_no_qs.b.norm || - rdp->rcu_qs_ctr_snap != __this_cpu_read(rcu_qs_ctr))) { + } else if (rdp->core_needs_qs && !rdp->cpu_no_qs.b.norm) { rdp->n_rp_report_qs++; return 1; } -- cgit v1.2.3 From f2c4689640e9a34bc45c013032185ed4ce47e7ff Mon Sep 17 00:00:00 2001 From: Lance Roy Date: Mon, 23 Jan 2017 13:35:18 -0800 Subject: srcu: Implement more-efficient reader counts SRCU uses two per-cpu counters: a nesting counter to count the number of active critical sections, and a sequence counter to ensure that the nesting counters don't change while they are being added together in srcu_readers_active_idx_check(). This patch instead uses per-cpu lock and unlock counters. Because both counters only increase and srcu_readers_active_idx_check() reads the unlock counter before the lock counter, this achieves the same end without having to increment two different counters in srcu_read_lock(). This also saves a smp_mb() in srcu_readers_active_idx_check(). Possible bug: There is no guarantee that the lock counter won't overflow during srcu_readers_active_idx_check(), as there are no memory barriers around srcu_flip() (see comment in srcu_readers_active_idx_check() for details). However, this problem was already present before this patch. Suggested-by: Mathieu Desnoyers Signed-off-by: Lance Roy Cc: Paul E. McKenney Cc: Lai Jiangshan Cc: Peter Zijlstra Signed-off-by: Paul E. McKenney --- include/linux/srcu.h | 10 ++-- kernel/rcu/rcutorture.c | 19 +++++++- kernel/rcu/srcu.c | 122 +++++++++++++++++------------------------------- 3 files changed, 66 insertions(+), 85 deletions(-) (limited to 'include') diff --git a/include/linux/srcu.h b/include/linux/srcu.h index dc8eb63c6568..a598cf3ac70c 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -33,9 +33,9 @@ #include #include -struct srcu_struct_array { - unsigned long c[2]; - unsigned long seq[2]; +struct srcu_array { + unsigned long lock_count[2]; + unsigned long unlock_count[2]; }; struct rcu_batch { @@ -46,7 +46,7 @@ struct rcu_batch { struct srcu_struct { unsigned long completed; - struct srcu_struct_array __percpu *per_cpu_ref; + struct srcu_array __percpu *per_cpu_ref; spinlock_t queue_lock; /* protect ->batch_queue, ->running */ bool running; /* callbacks just queued */ @@ -118,7 +118,7 @@ void process_srcu(struct work_struct *work); * See include/linux/percpu-defs.h for the rules on per-CPU variables. */ #define __DEFINE_SRCU(name, is_static) \ - static DEFINE_PER_CPU(struct srcu_struct_array, name##_srcu_array);\ + static DEFINE_PER_CPU(struct srcu_array, name##_srcu_array);\ is_static struct srcu_struct name = __SRCU_STRUCT_INIT(name) #define DEFINE_SRCU(name) __DEFINE_SRCU(name, /* not static */) #define DEFINE_STATIC_SRCU(name) __DEFINE_SRCU(name, static) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 87c51225ceec..d81345be730e 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -564,10 +564,25 @@ static void srcu_torture_stats(void) pr_alert("%s%s per-CPU(idx=%d):", torture_type, TORTURE_FLAG, idx); for_each_possible_cpu(cpu) { + unsigned long l0, l1; + unsigned long u0, u1; long c0, c1; + struct srcu_array *counts = per_cpu_ptr(srcu_ctlp->per_cpu_ref, cpu); - c0 = (long)per_cpu_ptr(srcu_ctlp->per_cpu_ref, cpu)->c[!idx]; - c1 = (long)per_cpu_ptr(srcu_ctlp->per_cpu_ref, cpu)->c[idx]; + u0 = counts->unlock_count[!idx]; + u1 = counts->unlock_count[idx]; + + /* + * Make sure that a lock is always counted if the corresponding + * unlock is counted. + */ + smp_rmb(); + + l0 = counts->lock_count[!idx]; + l1 = counts->lock_count[idx]; + + c0 = l0 - u0; + c1 = l1 - u1; pr_cont(" %d(%ld,%ld)", cpu, c0, c1); } pr_cont("\n"); diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c index 9b9cdd549caa..c9a0015e1c2e 100644 --- a/kernel/rcu/srcu.c +++ b/kernel/rcu/srcu.c @@ -106,7 +106,7 @@ static int init_srcu_struct_fields(struct srcu_struct *sp) rcu_batch_init(&sp->batch_check1); rcu_batch_init(&sp->batch_done); INIT_DELAYED_WORK(&sp->work, process_srcu); - sp->per_cpu_ref = alloc_percpu(struct srcu_struct_array); + sp->per_cpu_ref = alloc_percpu(struct srcu_array); return sp->per_cpu_ref ? 0 : -ENOMEM; } @@ -141,114 +141,77 @@ EXPORT_SYMBOL_GPL(init_srcu_struct); #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ /* - * Returns approximate total of the readers' ->seq[] values for the + * Returns approximate total of the readers' ->lock_count[] values for the * rank of per-CPU counters specified by idx. */ -static unsigned long srcu_readers_seq_idx(struct srcu_struct *sp, int idx) +static unsigned long srcu_readers_lock_idx(struct srcu_struct *sp, int idx) { int cpu; unsigned long sum = 0; - unsigned long t; for_each_possible_cpu(cpu) { - t = READ_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->seq[idx]); - sum += t; + struct srcu_array *cpuc = per_cpu_ptr(sp->per_cpu_ref, cpu); + + sum += READ_ONCE(cpuc->lock_count[idx]); } return sum; } /* - * Returns approximate number of readers active on the specified rank - * of the per-CPU ->c[] counters. + * Returns approximate total of the readers' ->unlock_count[] values for the + * rank of per-CPU counters specified by idx. */ -static unsigned long srcu_readers_active_idx(struct srcu_struct *sp, int idx) +static unsigned long srcu_readers_unlock_idx(struct srcu_struct *sp, int idx) { int cpu; unsigned long sum = 0; - unsigned long t; for_each_possible_cpu(cpu) { - t = READ_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[idx]); - sum += t; + struct srcu_array *cpuc = per_cpu_ptr(sp->per_cpu_ref, cpu); + + sum += READ_ONCE(cpuc->unlock_count[idx]); } return sum; } /* * Return true if the number of pre-existing readers is determined to - * be stably zero. An example unstable zero can occur if the call - * to srcu_readers_active_idx() misses an __srcu_read_lock() increment, - * but due to task migration, sees the corresponding __srcu_read_unlock() - * decrement. This can happen because srcu_readers_active_idx() takes - * time to sum the array, and might in fact be interrupted or preempted - * partway through the summation. + * be zero. */ static bool srcu_readers_active_idx_check(struct srcu_struct *sp, int idx) { - unsigned long seq; + unsigned long unlocks; - seq = srcu_readers_seq_idx(sp, idx); + unlocks = srcu_readers_unlock_idx(sp, idx); /* - * The following smp_mb() A pairs with the smp_mb() B located in - * __srcu_read_lock(). This pairing ensures that if an - * __srcu_read_lock() increments its counter after the summation - * in srcu_readers_active_idx(), then the corresponding SRCU read-side - * critical section will see any changes made prior to the start - * of the current SRCU grace period. + * Make sure that a lock is always counted if the corresponding unlock + * is counted. Needs to be a smp_mb() as the read side may contain a + * read from a variable that is written to before the synchronize_srcu() + * in the write side. In this case smp_mb()s A and B act like the store + * buffering pattern. * - * Also, if the above call to srcu_readers_seq_idx() saw the - * increment of ->seq[], then the call to srcu_readers_active_idx() - * must see the increment of ->c[]. + * This smp_mb() also pairs with smp_mb() C to prevent accesses after the + * synchronize_srcu() from being executed before the grace period ends. */ smp_mb(); /* A */ /* - * Note that srcu_readers_active_idx() can incorrectly return - * zero even though there is a pre-existing reader throughout. - * To see this, suppose that task A is in a very long SRCU - * read-side critical section that started on CPU 0, and that - * no other reader exists, so that the sum of the counters - * is equal to one. Then suppose that task B starts executing - * srcu_readers_active_idx(), summing up to CPU 1, and then that - * task C starts reading on CPU 0, so that its increment is not - * summed, but finishes reading on CPU 2, so that its decrement - * -is- summed. Then when task B completes its sum, it will - * incorrectly get zero, despite the fact that task A has been - * in its SRCU read-side critical section the whole time. + * If the locks are the same as the unlocks, then there must have + * been no readers on this index at some time in between. This does not + * mean that there are no more readers, as one could have read the + * current index but not have incremented the lock counter yet. * - * We therefore do a validation step should srcu_readers_active_idx() - * return zero. + * Possible bug: There is no guarantee that there haven't been ULONG_MAX + * increments of ->lock_count[] since the unlocks were counted, meaning + * that this could return true even if there are still active readers. + * Since there are no memory barriers around srcu_flip(), the CPU is not + * required to increment ->completed before running + * srcu_readers_unlock_idx(), which means that there could be an + * arbitrarily large number of critical sections that execute after + * srcu_readers_unlock_idx() but use the old value of ->completed. */ - if (srcu_readers_active_idx(sp, idx) != 0) - return false; - - /* - * The remainder of this function is the validation step. - * The following smp_mb() D pairs with the smp_mb() C in - * __srcu_read_unlock(). If the __srcu_read_unlock() was seen - * by srcu_readers_active_idx() above, then any destructive - * operation performed after the grace period will happen after - * the corresponding SRCU read-side critical section. - * - * Note that there can be at most NR_CPUS worth of readers using - * the old index, which is not enough to overflow even a 32-bit - * integer. (Yes, this does mean that systems having more than - * a billion or so CPUs need to be 64-bit systems.) Therefore, - * the sum of the ->seq[] counters cannot possibly overflow. - * Therefore, the only way that the return values of the two - * calls to srcu_readers_seq_idx() can be equal is if there were - * no increments of the corresponding rank of ->seq[] counts - * in the interim. But the missed-increment scenario laid out - * above includes an increment of the ->seq[] counter by - * the corresponding __srcu_read_lock(). Therefore, if this - * scenario occurs, the return values from the two calls to - * srcu_readers_seq_idx() will differ, and thus the validation - * step below suffices. - */ - smp_mb(); /* D */ - - return srcu_readers_seq_idx(sp, idx) == seq; + return srcu_readers_lock_idx(sp, idx) == unlocks; } /** @@ -266,8 +229,12 @@ static bool srcu_readers_active(struct srcu_struct *sp) unsigned long sum = 0; for_each_possible_cpu(cpu) { - sum += READ_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[0]); - sum += READ_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[1]); + struct srcu_array *cpuc = per_cpu_ptr(sp->per_cpu_ref, cpu); + + sum += READ_ONCE(cpuc->lock_count[0]); + sum += READ_ONCE(cpuc->lock_count[1]); + sum -= READ_ONCE(cpuc->unlock_count[0]); + sum -= READ_ONCE(cpuc->unlock_count[1]); } return sum; } @@ -298,9 +265,8 @@ int __srcu_read_lock(struct srcu_struct *sp) int idx; idx = READ_ONCE(sp->completed) & 0x1; - __this_cpu_inc(sp->per_cpu_ref->c[idx]); + __this_cpu_inc(sp->per_cpu_ref->lock_count[idx]); smp_mb(); /* B */ /* Avoid leaking the critical section. */ - __this_cpu_inc(sp->per_cpu_ref->seq[idx]); return idx; } EXPORT_SYMBOL_GPL(__srcu_read_lock); @@ -314,7 +280,7 @@ EXPORT_SYMBOL_GPL(__srcu_read_lock); void __srcu_read_unlock(struct srcu_struct *sp, int idx) { smp_mb(); /* C */ /* Avoid leaking the critical section. */ - this_cpu_dec(sp->per_cpu_ref->c[idx]); + this_cpu_inc(sp->per_cpu_ref->unlock_count[idx]); } EXPORT_SYMBOL_GPL(__srcu_read_unlock); @@ -349,7 +315,7 @@ static bool try_check_zero(struct srcu_struct *sp, int idx, int trycount) /* * Increment the ->completed counter so that future SRCU readers will - * use the other rank of the ->c[] and ->seq[] arrays. This allows + * use the other rank of the ->(un)lock_count[] arrays. This allows * us to wait for pre-existing readers in a starvation-free manner. */ static void srcu_flip(struct srcu_struct *sp) -- cgit v1.2.3 From d85b62f18d543c663cbdd6061054efeb9e66cee7 Mon Sep 17 00:00:00 2001 From: Paul E. McKenney Date: Mon, 28 Nov 2016 12:08:49 -0800 Subject: srcu: Force full grace-period ordering If a process invokes synchronize_srcu(), is delayed just the right amount of time, and thus does not sleep when waiting for the grace period to complete, there is no ordering between the end of the grace period and the code following the synchronize_srcu(). Similarly, there can be a lack of ordering between the end of the SRCU grace period and callback invocation. This commit adds the necessary ordering. Reported-by: Lance Roy Signed-off-by: Paul E. McKenney [ paulmck: Further smp_mb() adjustment per email with Lance Roy. ] --- include/linux/rcupdate.h | 12 ++++++++++++ kernel/rcu/srcu.c | 10 ++++++++-- kernel/rcu/tree.h | 12 ------------ 3 files changed, 20 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 01f71e1d2e94..6ade6a52d9d4 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -1161,5 +1161,17 @@ do { \ ftrace_dump(oops_dump_mode); \ } while (0) +/* + * Place this after a lock-acquisition primitive to guarantee that + * an UNLOCK+LOCK pair acts as a full barrier. This guarantee applies + * if the UNLOCK and LOCK are executed by the same CPU or if the + * UNLOCK and LOCK operate on the same lock variable. + */ +#ifdef CONFIG_PPC +#define smp_mb__after_unlock_lock() smp_mb() /* Full ordering for lock. */ +#else /* #ifdef CONFIG_PPC */ +#define smp_mb__after_unlock_lock() do { } while (0) +#endif /* #else #ifdef CONFIG_PPC */ + #endif /* __LINUX_RCUPDATE_H */ diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c index c9a0015e1c2e..665bc9951523 100644 --- a/kernel/rcu/srcu.c +++ b/kernel/rcu/srcu.c @@ -358,6 +358,7 @@ void call_srcu(struct srcu_struct *sp, struct rcu_head *head, head->next = NULL; head->func = func; spin_lock_irqsave(&sp->queue_lock, flags); + smp_mb__after_unlock_lock(); /* Caller's prior accesses before GP. */ rcu_batch_queue(&sp->batch_queue, head); if (!sp->running) { sp->running = true; @@ -391,6 +392,7 @@ static void __synchronize_srcu(struct srcu_struct *sp, int trycount) head->next = NULL; head->func = wakeme_after_rcu; spin_lock_irq(&sp->queue_lock); + smp_mb__after_unlock_lock(); /* Caller's prior accesses before GP. */ if (!sp->running) { /* steal the processing owner */ sp->running = true; @@ -410,8 +412,11 @@ static void __synchronize_srcu(struct srcu_struct *sp, int trycount) spin_unlock_irq(&sp->queue_lock); } - if (!done) + if (!done) { wait_for_completion(&rcu.completion); + smp_mb(); /* Caller's later accesses after GP. */ + } + } /** @@ -579,7 +584,8 @@ static void srcu_advance_batches(struct srcu_struct *sp, int trycount) /* * Invoke a limited number of SRCU callbacks that have passed through * their grace period. If there are more to do, SRCU will reschedule - * the workqueue. + * the workqueue. Note that needed memory barriers have been executed + * in this task's context by srcu_readers_active_idx_check(). */ static void srcu_invoke_callbacks(struct srcu_struct *sp) { diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index fe98dd24adf8..abcc25bdcb29 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -687,18 +687,6 @@ static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll) } #endif /* #ifdef CONFIG_RCU_TRACE */ -/* - * Place this after a lock-acquisition primitive to guarantee that - * an UNLOCK+LOCK pair act as a full barrier. This guarantee applies - * if the UNLOCK and LOCK are executed by the same CPU or if the - * UNLOCK and LOCK operate on the same lock variable. - */ -#ifdef CONFIG_PPC -#define smp_mb__after_unlock_lock() smp_mb() /* Full ordering for lock. */ -#else /* #ifdef CONFIG_PPC */ -#define smp_mb__after_unlock_lock() do { } while (0) -#endif /* #else #ifdef CONFIG_PPC */ - /* * Wrappers for the rcu_node::lock acquire and release. * -- cgit v1.2.3