From ebc3505d507cf0aafdc31e4b2359c9b22b3927c8 Mon Sep 17 00:00:00 2001 From: Paul E. McKenney Date: Wed, 17 Jun 2020 13:25:26 -0700 Subject: rcu: Remove KCSAN stubs KCSAN is now in mainline, so this commit removes the stubs for the data_race(), ASSERT_EXCLUSIVE_WRITER(), and ASSERT_EXCLUSIVE_ACCESS() macros. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 13 ------------- 1 file changed, 13 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 8ce77d9ac716..eb36779697ca 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -70,19 +70,6 @@ #endif #define MODULE_PARAM_PREFIX "rcutree." -#ifndef data_race -#define data_race(expr) \ - ({ \ - expr; \ - }) -#endif -#ifndef ASSERT_EXCLUSIVE_WRITER -#define ASSERT_EXCLUSIVE_WRITER(var) do { } while (0) -#endif -#ifndef ASSERT_EXCLUSIVE_ACCESS -#define ASSERT_EXCLUSIVE_ACCESS(var) do { } while (0) -#endif - /* Data structures. */ /* -- cgit v1.2.3 From beb27bd649a08655b6e15b71265fccad9c00bd2c Mon Sep 17 00:00:00 2001 From: Paul E. McKenney Date: Wed, 17 Jun 2020 13:26:20 -0700 Subject: rcu: Remove KCSAN stubs from update.c KCSAN is now in mainline, so this commit removes the stubs for the data_race(), ASSERT_EXCLUSIVE_WRITER(), and ASSERT_EXCLUSIVE_ACCESS() macros. Signed-off-by: Paul E. McKenney --- kernel/rcu/update.c | 13 ------------- 1 file changed, 13 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 2de49b5d8dd2..5f7713a27dbb 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -53,19 +53,6 @@ #endif #define MODULE_PARAM_PREFIX "rcupdate." -#ifndef data_race -#define data_race(expr) \ - ({ \ - expr; \ - }) -#endif -#ifndef ASSERT_EXCLUSIVE_WRITER -#define ASSERT_EXCLUSIVE_WRITER(var) do { } while (0) -#endif -#ifndef ASSERT_EXCLUSIVE_ACCESS -#define ASSERT_EXCLUSIVE_ACCESS(var) do { } while (0) -#endif - #ifndef CONFIG_TINY_RCU module_param(rcu_expedited, int, 0); module_param(rcu_normal, int, 0); -- cgit v1.2.3 From d9b60741318f6f8bcb2adc4beaef724c923fcb93 Mon Sep 17 00:00:00 2001 From: Paul E. McKenney Date: Wed, 17 Jun 2020 13:24:04 -0700 Subject: srcu: Remove KCSAN stubs KCSAN is now in mainline, so this commit removes the stubs for the data_race(), ASSERT_EXCLUSIVE_WRITER(), and ASSERT_EXCLUSIVE_ACCESS() macros. Signed-off-by: Paul E. McKenney --- kernel/rcu/srcutree.c | 13 ------------- 1 file changed, 13 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index c100acf332ed..c13348ee80a5 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -29,19 +29,6 @@ #include "rcu.h" #include "rcu_segcblist.h" -#ifndef data_race -#define data_race(expr) \ - ({ \ - expr; \ - }) -#endif -#ifndef ASSERT_EXCLUSIVE_WRITER -#define ASSERT_EXCLUSIVE_WRITER(var) do { } while (0) -#endif -#ifndef ASSERT_EXCLUSIVE_ACCESS -#define ASSERT_EXCLUSIVE_ACCESS(var) do { } while (0) -#endif - /* Holdoff in nanoseconds for auto-expediting. */ #define DEFAULT_SRCU_EXP_HOLDOFF (25 * 1000) static ulong exp_holdoff = DEFAULT_SRCU_EXP_HOLDOFF; -- cgit v1.2.3 From 7487ea07dfa9bd782a13469cab18973ea0ab8c57 Mon Sep 17 00:00:00 2001 From: Paul E. McKenney Date: Thu, 18 Jun 2020 09:51:12 -0700 Subject: rcu: Initialize at declaration time in rcu_exp_handler() This commit moves the initialization of the CONFIG_PREEMPT=n version of the rcu_exp_handler() function's rdp and rnp local variables into their respective declarations to save a couple lines of code. Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tree_exp.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 1888c0eb1216..8760b6ead770 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -732,11 +732,9 @@ static void rcu_exp_need_qs(void) /* Invoked on each online non-idle CPU for expedited quiescent state. */ static void rcu_exp_handler(void *unused) { - struct rcu_data *rdp; - struct rcu_node *rnp; + struct rcu_data *rdp = this_cpu_ptr(&rcu_data); + struct rcu_node *rnp = rdp->mynode; - rdp = this_cpu_ptr(&rcu_data); - rnp = rdp->mynode; if (!(READ_ONCE(rnp->expmask) & rdp->grpmask) || __this_cpu_read(rcu_data.cpu_no_qs.b.exp)) return; -- cgit v1.2.3 From a7886e899fd8334a03d37e66ad10295d175725ea Mon Sep 17 00:00:00 2001 From: Joel Fernandes (Google) Date: Thu, 18 Jun 2020 21:36:40 -0400 Subject: rcu/trace: Use gp_seq_req in acceleration's rcu_grace_period tracepoint During acceleration of CB, the rsp's gp_seq is rcu_seq_snap'd. This is the value used for acceleration - it is the value of gp_seq at which it is safe the execute all callbacks in the callback list. The rdp's gp_seq is not very useful for this scenario. Make rcu_grace_period report the gp_seq_req instead as it allows one to reason about how the acceleration works. Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index eb36779697ca..896912034982 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1483,9 +1483,10 @@ static bool rcu_accelerate_cbs(struct rcu_node *rnp, struct rcu_data *rdp) /* Trace depending on how much we were able to accelerate. */ if (rcu_segcblist_restempty(&rdp->cblist, RCU_WAIT_TAIL)) - trace_rcu_grace_period(rcu_state.name, rdp->gp_seq, TPS("AccWaitCB")); + trace_rcu_grace_period(rcu_state.name, gp_seq_req, TPS("AccWaitCB")); else - trace_rcu_grace_period(rcu_state.name, rdp->gp_seq, TPS("AccReadyCB")); + trace_rcu_grace_period(rcu_state.name, gp_seq_req, TPS("AccReadyCB")); + return ret; } -- cgit v1.2.3 From e082c7b38185af0f59e55efff840939c35391f85 Mon Sep 17 00:00:00 2001 From: Paul E. McKenney Date: Mon, 22 Jun 2020 09:25:34 -0700 Subject: nocb: Clarify RCU nocb CPU error message A message of the form "rcu: !!! lDTs ." can be tracked down, but doing so is not trivial. This commit therefore eases this process by adding text so that this error message now reads as follows: "rcu: nocb GP activity on CB-only CPU!!! lDTs ." Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 982fc5be5269..bbc0c07ce56e 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2417,7 +2417,7 @@ static void show_rcu_nocb_state(struct rcu_data *rdp) !waslocked && !wastimer && !wassleep) return; /* Nothing untowards. */ - pr_info(" !!! %c%c%c%c %c\n", + pr_info(" nocb GP activity on CB-only CPU!!! 
%c%c%c%c %c\n", "lL"[waslocked], "dD"[!!rdp->nocb_defer_wakeup], "tT"[wastimer], -- cgit v1.2.3 From 9c39245382de4d52a122641952900709d4a9950b Mon Sep 17 00:00:00 2001 From: Neeraj Upadhyay Date: Mon, 22 Jun 2020 00:07:27 +0530 Subject: rcu/tree: Force quiescent state on callback overload On callback overload, it is necessary to quickly detect idle CPUs, and rcu_gp_fqs_check_wake() checks for this condition. Unfortunately, the code following the call to this function does not repeat this check, which means that in reality no actual quiescent-state forcing, instead only a couple of quick and pointless wakeups at the beginning of the grace period. This commit therefore adds a check for the RCU_GP_FLAG_OVLD flag in the post-wakeup "if" statement in rcu_gp_fqs_loop(). Fixes: 1fca4d12f4637 ("rcu: Expedite first two FQS scans under callback-overload conditions") Reviewed-by: Joel Fernandes (Google) Signed-off-by: Neeraj Upadhyay Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 896912034982..4770d7709dc2 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1884,7 +1884,7 @@ static void rcu_gp_fqs_loop(void) break; /* If time for quiescent-state forcing, do it. */ if (!time_after(rcu_state.jiffies_force_qs, jiffies) || - (gf & RCU_GP_FLAG_FQS)) { + (gf & (RCU_GP_FLAG_FQS | RCU_GP_FLAG_OVLD))) { trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("fqsstart")); rcu_gp_fqs(first_gp_fqs); -- cgit v1.2.3 From 9b1ce0acb5e65e9ea1e6b322562d072f9f7d1f6e Mon Sep 17 00:00:00 2001 From: Neeraj Upadhyay Date: Mon, 22 Jun 2020 23:37:03 +0530 Subject: rcu/tree: Remove CONFIG_PREMPT_RCU check in force_qs_rnp() Originally, the call to rcu_preempt_blocked_readers_cgp() from force_qs_rnp() had to be conditioned on CONFIG_PREEMPT_RCU=y, as in commit a77da14ce9af ("rcu: Yet another fix for preemption and CPU hotplug"). However, there is now a CONFIG_PREEMPT_RCU=n definition of rcu_preempt_blocked_readers_cgp() that unconditionally returns zero, so invoking it is now safe. In addition, the CONFIG_PREEMPT_RCU=n definition of rcu_initiate_boost() simply releases the rcu_node structure's ->lock, which is what happens when the "if" condition evaluates to false. This commit therefore drops the IS_ENABLED(CONFIG_PREEMPT_RCU) check, so that rcu_initiate_boost() is called only in CONFIG_PREEMPT_RCU=y kernels when there are readers blocking the current grace period. This does not change the behavior, but reduces code-reader confusion by eliminating non-CONFIG_PREEMPT_RCU=y calls to rcu_initiate_boost(). Signed-off-by: Neeraj Upadhyay Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 4770d7709dc2..acc926f07dc1 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2533,8 +2533,7 @@ static void force_qs_rnp(int (*f)(struct rcu_data *rdp)) raw_spin_lock_irqsave_rcu_node(rnp, flags); rcu_state.cbovldnext |= !!rnp->cbovldmask; if (rnp->qsmask == 0) { - if (!IS_ENABLED(CONFIG_PREEMPT_RCU) || - rcu_preempt_blocked_readers_cgp(rnp)) { + if (rcu_preempt_blocked_readers_cgp(rnp)) { /* * No point in scanning bits because they * are all zero. But we might need to -- cgit v1.2.3 From 2130c6b4f610ea65e9df71dfa79ee08f2fc17743 Mon Sep 17 00:00:00 2001 From: Paul E. 
McKenney Date: Mon, 22 Jun 2020 16:46:43 -0700 Subject: nocb: Remove show_rcu_nocb_state() false positive printout The rcu_data structure's ->nocb_timer field is used to defer wakeups of the corresponding no-CBs CPU's grace-period kthread ("rcuog*"), and that structure's ->nocb_defer_wakeup field is used to track such deferral. This means that the show_rcu_nocb_state() printing an error when those fields are set for a CPU not corresponding to a no-CBs grace-period kthread is erroneous. This commit therefore switches the check from ->nocb_timer to ->nocb_bypass_timer and removes the check of ->nocb_defer_wakeup. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index bbc0c07ce56e..4d63ee3de7a9 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -2411,10 +2411,9 @@ static void show_rcu_nocb_state(struct rcu_data *rdp) return; waslocked = raw_spin_is_locked(&rdp->nocb_gp_lock); - wastimer = timer_pending(&rdp->nocb_timer); + wastimer = timer_pending(&rdp->nocb_bypass_timer); wassleep = swait_active(&rdp->nocb_gp_wq); - if (!rdp->nocb_defer_wakeup && !rdp->nocb_gp_sleep && - !waslocked && !wastimer && !wassleep) + if (!rdp->nocb_gp_sleep && !waslocked && !wastimer && !wassleep) return; /* Nothing untowards. */ pr_info(" nocb GP activity on CB-only CPU!!! %c%c%c%c %c\n", -- cgit v1.2.3 From b5374b2df0ac1c78895b8eb8d9582a7bdc67257d Mon Sep 17 00:00:00 2001 From: Paul E. McKenney Date: Tue, 23 Jun 2020 17:09:27 -0700 Subject: rcu: Add READ_ONCE() to rcu_do_batch() access to rcu_divisor Given that sysfs can change the value of rcu_divisor at any time, this commit adds a READ_ONCE to the sole access to that variable. While in the area, this commit also adds bounds checking, clamping the value to a shift that makes sense for a signed long. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index acc926f07dc1..1dca14cf66f9 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2362,6 +2362,7 @@ int rcutree_dead_cpu(unsigned int cpu) */ static void rcu_do_batch(struct rcu_data *rdp) { + int div; unsigned long flags; const bool offloaded = IS_ENABLED(CONFIG_RCU_NOCB_CPU) && rcu_segcblist_is_offloaded(&rdp->cblist); @@ -2390,7 +2391,9 @@ static void rcu_do_batch(struct rcu_data *rdp) rcu_nocb_lock(rdp); WARN_ON_ONCE(cpu_is_offline(smp_processor_id())); pending = rcu_segcblist_n_cbs(&rdp->cblist); - bl = max(rdp->blimit, pending >> rcu_divisor); + div = READ_ONCE(rcu_divisor); + div = div < 0 ? 7 : div > sizeof(long) * 8 - 2 ? sizeof(long) * 8 - 2 : div; + bl = max(rdp->blimit, pending >> div); if (unlikely(bl > 100)) tlimit = local_clock() + rcu_resched_ns; trace_rcu_batch_start(rcu_state.name, -- cgit v1.2.3 From a2b354b9950bb859d8d959f951dda26725b041fb Mon Sep 17 00:00:00 2001 From: Paul E. McKenney Date: Tue, 23 Jun 2020 17:49:40 -0700 Subject: rcu: Add READ_ONCE() to rcu_do_batch() access to rcu_resched_ns Given that sysfs can change the value of rcu_resched_ns at any time, this commit adds a READ_ONCE() to the sole access to that variable. While in the area, this commit also adds bounds checking, clamping the value to at least a millisecond, but no longer than a second. Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tree.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 1dca14cf66f9..da05afc53493 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2394,8 +2394,12 @@ static void rcu_do_batch(struct rcu_data *rdp) div = READ_ONCE(rcu_divisor); div = div < 0 ? 7 : div > sizeof(long) * 8 - 2 ? sizeof(long) * 8 - 2 : div; bl = max(rdp->blimit, pending >> div); - if (unlikely(bl > 100)) - tlimit = local_clock() + rcu_resched_ns; + if (unlikely(bl > 100)) { + long rrn = READ_ONCE(rcu_resched_ns); + + rrn = rrn < NSEC_PER_MSEC ? NSEC_PER_MSEC : rrn > NSEC_PER_SEC ? NSEC_PER_SEC : rrn; + tlimit = local_clock() + rrn; + } trace_rcu_batch_start(rcu_state.name, rcu_segcblist_n_cbs(&rdp->cblist), bl); rcu_segcblist_extract_done_cbs(&rdp->cblist, &rcl); -- cgit v1.2.3 From fe63b723cc7ca3a91ea91274e0f2cba29452b3fa Mon Sep 17 00:00:00 2001 From: Paul E. McKenney Date: Tue, 23 Jun 2020 18:04:45 -0700 Subject: rcu: Add READ_ONCE() to rcu_do_batch() access to rcu_kick_kthreads Given that sysfs can change the value of rcu_kick_kthreads at any time, this commit adds a READ_ONCE() to the sole access to that variable. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_stall.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index b5d3b4794db4..a1780a621b5e 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -158,7 +158,7 @@ static void rcu_stall_kick_kthreads(void) { unsigned long j; - if (!rcu_kick_kthreads) + if (!READ_ONCE(rcu_kick_kthreads)) return; j = READ_ONCE(rcu_state.jiffies_kick_kthreads); if (time_after(jiffies, j) && rcu_state.gp_kthread && @@ -580,7 +580,7 @@ static void check_cpu_stall(struct rcu_data *rdp) unsigned long js; struct rcu_node *rnp; - if ((rcu_stall_is_suppressed() && !rcu_kick_kthreads) || + if ((rcu_stall_is_suppressed() && !READ_ONCE(rcu_kick_kthreads)) || !rcu_gp_in_progress()) return; rcu_stall_kick_kthreads(); -- cgit v1.2.3 From 1ef5a442a113d140580b3b8bbd6f50c9f7746397 Mon Sep 17 00:00:00 2001 From: Paul E. McKenney Date: Tue, 23 Jun 2020 20:57:59 -0700 Subject: rcu: Add READ_ONCE() to rcu_do_batch() access to rcu_cpu_stall_ftrace_dump Given that sysfs can change the value of rcu_cpu_stall_ftrace_dump at any time, this commit adds a READ_ONCE() to the accesses to that variable. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_stall.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h index a1780a621b5e..0fde39b8daab 100644 --- a/kernel/rcu/tree_stall.h +++ b/kernel/rcu/tree_stall.h @@ -623,7 +623,7 @@ static void check_cpu_stall(struct rcu_data *rdp) /* We haven't checked in, so go dump stack. */ print_cpu_stall(gps); - if (rcu_cpu_stall_ftrace_dump) + if (READ_ONCE(rcu_cpu_stall_ftrace_dump)) rcu_ftrace_dump(DUMP_ALL); } else if (rcu_gp_in_progress() && @@ -632,7 +632,7 @@ static void check_cpu_stall(struct rcu_data *rdp) /* They had a few time units to dump stack, so complain. */ print_other_cpu_stall(gs2, gps); - if (rcu_cpu_stall_ftrace_dump) + if (READ_ONCE(rcu_cpu_stall_ftrace_dump)) rcu_ftrace_dump(DUMP_ALL); } } -- cgit v1.2.3 From c0f97f20e5d97a1358ade650fcf6a322c0c9bc72 Mon Sep 17 00:00:00 2001 From: Paul E. 
McKenney Date: Fri, 24 Jul 2020 20:22:05 -0700 Subject: rcu: Move rcu_cpu_started per-CPU variable to rcu_data When the rcu_cpu_started per-CPU variable was added by commit f64c6013a202 ("rcu/x86: Provide early rcu_cpu_starting() callback"), there were multiple sets of per-CPU rcu_data structures. Therefore, the rcu_cpu_started flag was added as a separate per-CPU variable. But now there is only one set of per-CPU rcu_data structures, so this commit moves rcu_cpu_started to a new ->cpu_started field in that structure. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 11 ++++------- kernel/rcu/tree.h | 1 + 2 files changed, 5 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index da05afc53493..52108dd92169 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3967,8 +3967,6 @@ int rcutree_offline_cpu(unsigned int cpu) return 0; } -static DEFINE_PER_CPU(int, rcu_cpu_started); - /* * Mark the specified CPU as being online so that subsequent grace periods * (both expedited and normal) will wait on it. Note that this means that @@ -3988,12 +3986,11 @@ void rcu_cpu_starting(unsigned int cpu) struct rcu_node *rnp; bool newcpu; - if (per_cpu(rcu_cpu_started, cpu)) + rdp = per_cpu_ptr(&rcu_data, cpu); + if (rdp->cpu_started) return; + rdp->cpu_started = true; - per_cpu(rcu_cpu_started, cpu) = 1; - - rdp = per_cpu_ptr(&rcu_data, cpu); rnp = rdp->mynode; mask = rdp->grpmask; raw_spin_lock_irqsave_rcu_node(rnp, flags); @@ -4053,7 +4050,7 @@ void rcu_report_dead(unsigned int cpu) raw_spin_unlock_irqrestore_rcu_node(rnp, flags); raw_spin_unlock(&rcu_state.ofl_lock); - per_cpu(rcu_cpu_started, cpu) = 0; + rdp->cpu_started = false; } /* diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index c96ae351688b..309bc7f41d35 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -156,6 +156,7 @@ struct rcu_data { bool beenonline; /* CPU online at least once. */ bool gpwrap; /* Possible ->gp_seq wrap. */ bool exp_deferred_qs; /* This CPU awaiting a deferred QS? */ + bool cpu_started; /* RCU watching this onlining CPU. */ struct rcu_node *mynode; /* This CPU's leaf of hierarchy */ unsigned long grpmask; /* Mask to apply to leaf qsmask. */ unsigned long ticks_this_gp; /* The number of scheduling-clock */ -- cgit v1.2.3 From 4569c5ee95d5695bfd794ae968c2d59b3e69129a Mon Sep 17 00:00:00 2001 From: Paul E. McKenney Date: Wed, 5 Aug 2020 10:35:16 -0700 Subject: rcu/nocb: Add a warning for non-GP kthread running GP code This commit increases RCU's ability to defend itself by emitting a warning if one of the nocb CB kthreads invokes the GP kthread's wait function. This warning augments a similar check that is carried out at the end of rcutorture testing and when RCU CPU stall warnings are emitted. The problem with those checks is that the miscreants have long since departed and disposed of any and all evidence. Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 4d63ee3de7a9..cb1e8c8befb9 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1926,6 +1926,7 @@ static void nocb_gp_wait(struct rcu_data *my_rdp) * nearest grace period (if any) to wait for next. The CB kthreads * and the global grace-period kthread are awakened if needed. 
*/ + WARN_ON_ONCE(my_rdp->nocb_gp_rdp != my_rdp); for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_cb_rdp) { trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("Check")); rcu_nocb_lock_irqsave(rdp, flags); -- cgit v1.2.3 From f37599e6f06da47e49c3408afe66c5b6e83a90bd Mon Sep 17 00:00:00 2001 From: Joel Fernandes (Google) Date: Fri, 7 Aug 2020 13:07:19 -0400 Subject: rcu: Clarify comments about FQS loop reporting quiescent states Since at least v4.19, the FQS loop no longer reports quiescent states for offline CPUs except in emergency situations. This commit therefore fixes the comment in rcu_gp_init() to match the current code. Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 52108dd92169..2c7afe491c45 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1706,10 +1706,13 @@ static bool rcu_gp_init(void) raw_spin_unlock_irq_rcu_node(rnp); /* - * Apply per-leaf buffered online and offline operations to the - * rcu_node tree. Note that this new grace period need not wait - * for subsequent online CPUs, and that quiescent-state forcing - * will handle subsequent offline CPUs. + * Apply per-leaf buffered online and offline operations to + * the rcu_node tree. Note that this new grace period need not + * wait for subsequent online CPUs, and that RCU hooks in the CPU + * offlining path, when combined with checks in this function, + * will handle CPUs that are currently going offline or that will + * go offline later. Please also refer to "Hotplug CPU" section + * of RCU's Requirements documentation. */ rcu_state.gp_state = RCU_GP_ONOFF; rcu_for_each_leaf_node(rnp) { -- cgit v1.2.3 From 666ca2907e6b75960ce2f0fe50afc5d8a46f296d Mon Sep 17 00:00:00 2001 From: Joel Fernandes (Google) Date: Fri, 7 Aug 2020 13:07:20 -0400 Subject: rcu: Make FQS more aggressive in complaining about offline CPUs The RCU grace-period kthread's force-quiescent state (FQS) loop should never see an offline CPU that has not yet reported a quiescent state. After all, the offline CPU should have reported a quiescent state during the CPU-offline process, or, failing that, by rcu_gp_init() if it ran concurrently with either the CPU going offline or the last task on a leaf rcu_node structure exiting its RCU read-side critical section while all CPUs corresponding to that structure are offline. The FQS loop should therefore complain if it does see an offline CPU that has not yet reported a quiescent state. And it does, but only once the grace period has been in force for a full second. This commit therefore makes this warning more aggressive, so that it will trigger as soon as the condition makes its appearance. Light testing with TREE03 and hotplug shows no warnings. This commit also converts the warning to WARN_ON_ONCE() in order to stave off possible log spam. Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 2c7afe491c45..396abe0e0d01 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1214,13 +1214,28 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) return 1; } - /* If waiting too long on an offline CPU, complain. 
*/ - if (!(rdp->grpmask & rcu_rnp_online_cpus(rnp)) && - time_after(jiffies, rcu_state.gp_start + HZ)) { + /* + * Complain if a CPU that is considered to be offline from RCU's + * perspective has not yet reported a quiescent state. After all, + * the offline CPU should have reported a quiescent state during + * the CPU-offline process, or, failing that, by rcu_gp_init() + * if it ran concurrently with either the CPU going offline or the + * last task on a leaf rcu_node structure exiting its RCU read-side + * critical section while all CPUs corresponding to that structure + * are offline. This added warning detects bugs in any of these + * code paths. + * + * The rcu_node structure's ->lock is held here, which excludes + * the relevant portions the CPU-hotplug code, the grace-period + * initialization code, and the rcu_read_unlock() code paths. + * + * For more detail, please refer to the "Hotplug CPU" section + * of RCU's Requirements documentation. + */ + if (WARN_ON_ONCE(!(rdp->grpmask & rcu_rnp_online_cpus(rnp)))) { bool onl; struct rcu_node *rnp1; - WARN_ON(1); /* Offline CPUs are supposed to report QS! */ pr_info("%s: grp: %d-%d level: %d ->gp_seq %ld ->completedqs %ld\n", __func__, rnp->grplo, rnp->grphi, rnp->level, (long)rnp->gp_seq, (long)rnp->completedqs); -- cgit v1.2.3 From 7f2a53c231fe5d9522c3b695ab454203904031ac Mon Sep 17 00:00:00 2001 From: Paul E. McKenney Date: Mon, 17 Aug 2020 10:37:22 -0700 Subject: rcu: Remove unused __rcu_is_watching() function The x86/entry work removed all uses of __rcu_is_watching(), therefore this commit removes it entirely. Cc: Andy Lutomirski Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: "H. Peter Anvin" Cc: Signed-off-by: Paul E. McKenney --- include/linux/rcutiny.h | 1 - include/linux/rcutree.h | 1 - kernel/entry/common.c | 2 +- kernel/rcu/tree.c | 5 ----- 4 files changed, 1 insertion(+), 8 deletions(-) (limited to 'kernel') diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index 5cc9637cac16..7c1ecdb356d8 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -103,7 +103,6 @@ static inline void rcu_scheduler_starting(void) { } static inline void rcu_end_inkernel_boot(void) { } static inline bool rcu_inkernel_boot_has_ended(void) { return true; } static inline bool rcu_is_watching(void) { return true; } -static inline bool __rcu_is_watching(void) { return true; } static inline void rcu_momentary_dyntick_idle(void) { } static inline void kfree_rcu_scheduler_running(void) { } static inline bool rcu_gp_might_be_stalled(void) { return false; } diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index d2f4064ebd1d..59eb5cd567d7 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -64,7 +64,6 @@ extern int rcu_scheduler_active __read_mostly; void rcu_end_inkernel_boot(void); bool rcu_inkernel_boot_has_ended(void); bool rcu_is_watching(void); -bool __rcu_is_watching(void); #ifndef CONFIG_PREEMPTION void rcu_all_qs(void); #endif diff --git a/kernel/entry/common.c b/kernel/entry/common.c index 9852e0d62d95..ad794a10fa80 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -278,7 +278,7 @@ noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs) * terminate a grace period, if and only if the timer interrupt is * not nested into another interrupt. * - * Checking for __rcu_is_watching() here would prevent the nesting + * Checking for rcu_is_watching() here would prevent the nesting * interrupt to invoke rcu_irq_enter(). 
If that nested interrupt is * the tick then rcu_flavor_sched_clock_irq() would wrongfully * assume that it is the first interupt and eventually claim diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 396abe0e0d01..232362293678 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1077,11 +1077,6 @@ static void rcu_disable_urgency_upon_qs(struct rcu_data *rdp) } } -noinstr bool __rcu_is_watching(void) -{ - return !rcu_dynticks_curr_cpu_in_eqs(); -} - /** * rcu_is_watching - see if RCU thinks that the current CPU is not idle * -- cgit v1.2.3 From e9d338a0b1799c988b678e8ccb66a442272e6aa3 Mon Sep 17 00:00:00 2001 From: Paul E. McKenney Date: Wed, 24 Jun 2020 15:59:59 -0700 Subject: scftorture: Add smp_call_function() torture test This commit adds an smp_call_function() torture test that repeatedly invokes this function and complains if things go badly awry. Signed-off-by: Paul E. McKenney --- Documentation/admin-guide/kernel-parameters.txt | 92 +++++++ kernel/Makefile | 2 + kernel/scftorture.c | 350 ++++++++++++++++++++++++ lib/Kconfig.debug | 10 + 4 files changed, 454 insertions(+) create mode 100644 kernel/scftorture.c (limited to 'kernel') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index bdc1f33fd3d1..91a56382ae56 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -4637,6 +4637,98 @@ Format: integer between 0 and 10 Default is 0. + scftorture.holdoff= [KNL] + Number of seconds to hold off before starting + test. Defaults to zero for module insertion and + to 10 seconds for built-in smp_call_function() + tests. + + scftorture.longwait= [KNL] + Request ridiculously long waits randomly selected + up to the chosen limit in seconds. Zero (the + default) disables this feature. Please note + that requesting even small non-zero numbers of + seconds can result in RCU CPU stall warnings, + softlockup complaints, and so on. + + scftorture.nthreads= [KNL] + Number of kthreads to spawn to invoke the + smp_call_function() family of functions. + The default of -1 specifies a number of kthreads + equal to the number of CPUs. + + scftorture.onoff_holdoff= [KNL] + Number seconds to wait after the start of the + test before initiating CPU-hotplug operations. + + scftorture.onoff_interval= [KNL] + Number seconds to wait between successive + CPU-hotplug operations. Specifying zero (which + is the default) disables CPU-hotplug operations. + + scftorture.shutdown_secs= [KNL] + The number of seconds following the start of the + test after which to shut down the system. The + default of zero avoids shutting down the system. + Non-zero values are useful for automated tests. + + scftorture.stat_interval= [KNL] + The number of seconds between outputting the + current test statistics to the console. A value + of zero disables statistics output. + + scftorture.stutter_cpus= [KNL] + The number of jiffies to wait between each change + to the set of CPUs under test. + + scftorture.use_cpus_read_lock= [KNL] + Use use_cpus_read_lock() instead of the default + preempt_disable() to disable CPU hotplug + while invoking one of the smp_call_function*() + functions. + + scftorture.verbose= [KNL] + Enable additional printk() statements. + + scftorture.weight_single= [KNL] + The probability weighting to use for the + smp_call_function_single() function with a zero + "wait" parameter. A value of -1 selects the + default if all other weights are -1. 
However, + if at least one weight has some other value, a + value of -1 will instead select a weight of zero. + + scftorture.weight_single_wait= [KNL] + The probability weighting to use for the + smp_call_function_single() function with a + non-zero "wait" parameter. See weight_single. + + scftorture.weight_many= [KNL] + The probability weighting to use for the + smp_call_function_many() function with a zero + "wait" parameter. See weight_single. + Note well that setting a high probability for + this weighting can place serious IPI load + on the system. + + scftorture.weight_many_wait= [KNL] + The probability weighting to use for the + smp_call_function_many() function with a + non-zero "wait" parameter. See weight_single + and weight_many. + + scftorture.weight_all= [KNL] + The probability weighting to use for the + smp_call_function_all() function with a zero + "wait" parameter. See weight_single and + weight_many. + + scftorture.weight_all_wait= [KNL] + The probability weighting to use for the + smp_call_function_all() function with a + non-zero "wait" parameter. See weight_single + and weight_many. + skew_tick= [KNL] Offset the periodic timer tick per cpu to mitigate xtime_lock contention on larger systems, and/or RCU lock contention on all systems with CONFIG_MAXSMP set. diff --git a/kernel/Makefile b/kernel/Makefile index 9a20016d4900..c45f551deaaa 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -133,6 +133,8 @@ KASAN_SANITIZE_stackleak.o := n KCSAN_SANITIZE_stackleak.o := n KCOV_INSTRUMENT_stackleak.o := n +obj-$(CONFIG_SCF_TORTURE_TEST) += scftorture.o + $(obj)/configs.o: $(obj)/config_data.gz targets += config_data.gz diff --git a/kernel/scftorture.c b/kernel/scftorture.c new file mode 100644 index 000000000000..44f1e49ba6e9 --- /dev/null +++ b/kernel/scftorture.c @@ -0,0 +1,350 @@ +// SPDX-License-Identifier: GPL-2.0+ +// +// Torture test for smp_call_function() and friends. +// +// Copyright (C) Facebook, 2020. +// +// Author: Paul E. McKenney + +#define pr_fmt(fmt) fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define SCFTORT_STRING "scftorture" +#define SCFTORT_FLAG SCFTORT_STRING ": " + +#define SCFTORTOUT(s, x...) \ + pr_alert(SCFTORT_FLAG s, ## x) + +#define VERBOSE_SCFTORTOUT(s, x...) \ + do { if (verbose) pr_alert(SCFTORT_FLAG s, ## x); } while (0) + +#define VERBOSE_SCFTORTOUT_ERRSTRING(s, x...) \ + do { if (verbose) pr_alert(SCFTORT_FLAG "!!! " s, ## x); } while (0) + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Paul E. McKenney "); + +// Wait until there are multiple CPUs before starting test. +torture_param(int, holdoff, IS_BUILTIN(CONFIG_SCF_TORTURE_TEST) ? 10 : 0, + "Holdoff time before test start (s)"); +torture_param(int, longwait, 0, "Include ridiculously long waits? 
(seconds)"); +torture_param(int, nthreads, -1, "# threads, defaults to -1 for all CPUs."); +torture_param(int, onoff_holdoff, 0, "Time after boot before CPU hotplugs (s)"); +torture_param(int, onoff_interval, 0, "Time between CPU hotplugs (s), 0=disable"); +torture_param(int, shutdown_secs, 0, "Shutdown time (ms), <= zero to disable."); +torture_param(int, stat_interval, 60, "Number of seconds between stats printk()s."); +torture_param(int, stutter_cpus, 5, "Number of jiffies to change CPUs under test, 0=disable"); +torture_param(bool, use_cpus_read_lock, 0, "Use cpus_read_lock() to exclude CPU hotplug."); +torture_param(int, verbose, 0, "Enable verbose debugging printk()s"); +torture_param(int, weight_single, -1, "Testing weight for single-CPU no-wait operations."); +torture_param(int, weight_single_wait, -1, "Testing weight for single-CPU operations."); +torture_param(int, weight_mult, -1, "Testing weight for multi-CPU no-wait operations."); +torture_param(int, weight_mult_wait, -1, "Testing weight for multi-CPU operations."); +torture_param(int, weight_all, -1, "Testing weight for all-CPU no-wait operations."); +torture_param(int, weight_all_wait, -1, "Testing weight for all-CPU operations."); + +char *torture_type = ""; + +#ifdef MODULE +# define SCFTORT_SHUTDOWN 0 +#else +# define SCFTORT_SHUTDOWN 1 +#endif + +torture_param(bool, shutdown, SCFTORT_SHUTDOWN, "Shutdown at end of torture test."); + +struct scf_statistics { + struct task_struct *task; + int cpu; + long long n_single; + long long n_single_wait; + long long n_multi; + long long n_multi_wait; + long long n_all; + long long n_all_wait; +}; + +static struct scf_statistics *scf_stats_p; +static struct task_struct *scf_torture_stats_task; +static DEFINE_PER_CPU(long long, scf_invoked_count); + +// Use to wait for all threads to start. +static atomic_t n_started; +static atomic_t n_errs; +static bool scfdone; + +DEFINE_TORTURE_RANDOM_PERCPU(scf_torture_rand); + +// Print torture statistics. Caller must ensure serialization. +static void scf_torture_stats_print(void) +{ + int cpu; + long long invoked_count = 0; + bool isdone = READ_ONCE(scfdone); + + for_each_possible_cpu(cpu) + invoked_count += data_race(per_cpu(scf_invoked_count, cpu)); + pr_alert("%s scf_invoked_count %s: %lld ", + SCFTORT_FLAG, isdone ? "VER" : "ver", invoked_count); + torture_onoff_stats(); + pr_cont("\n"); +} + +// Periodically prints torture statistics, if periodic statistics printing +// was specified via the stat_interval module parameter. +static int +scf_torture_stats(void *arg) +{ + VERBOSE_TOROUT_STRING("scf_torture_stats task started"); + do { + schedule_timeout_interruptible(stat_interval * HZ); + scf_torture_stats_print(); + torture_shutdown_absorb("scf_torture_stats"); + } while (!torture_must_stop()); + torture_kthread_stopping("scf_torture_stats"); + return 0; +} + +// Update statistics and occasionally burn up mass quantities of CPU time, +// if told to do so via scftorture.longwait. Otherwise, occasionally burn +// a little bit. 
+static void scf_handler(void *unused) +{ + int i; + int j; + unsigned long r = torture_random(this_cpu_ptr(&scf_torture_rand)); + + this_cpu_inc(scf_invoked_count); + if (longwait <= 0) { + if (!(r & 0xffc0)) + udelay(r & 0x3f); + return; + } + if (r & 0xfff) + return; + r = (r >> 12); + if (longwait <= 0) { + udelay((r & 0xff) + 1); + return; + } + r = r % longwait + 1; + for (i = 0; i < r; i++) { + for (j = 0; j < 1000; j++) { + udelay(1000); + cpu_relax(); + } + } +} + +// Randomly do an smp_call_function*() invocation. +static void scftorture_invoke_one(struct scf_statistics *scfp,struct torture_random_state *trsp) +{ + if (use_cpus_read_lock) + cpus_read_lock(); + else + preempt_disable(); + scfp->n_all++; + smp_call_function(scf_handler, NULL, 0); + if (use_cpus_read_lock) + cpus_read_unlock(); + else + preempt_enable(); + if (!(torture_random(trsp) & 0xfff)) + schedule_timeout_uninterruptible(1); +} + +// SCF test kthread. Repeatedly does calls to members of the +// smp_call_function() family of functions. +static int scftorture_invoker(void *arg) +{ + DEFINE_TORTURE_RANDOM(rand); + struct scf_statistics *scfp = (struct scf_statistics *)arg; + + VERBOSE_SCFTORTOUT("scftorture_invoker %d: task started", scfp->cpu); + set_cpus_allowed_ptr(current, cpumask_of(scfp->cpu % nr_cpu_ids)); + set_user_nice(current, MAX_NICE); + if (holdoff) + schedule_timeout_interruptible(holdoff * HZ); + + VERBOSE_SCFTORTOUT("scftorture_invoker %d: Waiting for all SCF torturers from cpu %d", scfp->cpu, smp_processor_id()); + + // Make sure that the CPU is affinitized appropriately during testing. + WARN_ON_ONCE(smp_processor_id() != scfp->cpu); + + if (!atomic_dec_return(&n_started)) + while (atomic_read_acquire(&n_started)) { + if (torture_must_stop()) { + VERBOSE_SCFTORTOUT("scftorture_invoker %d ended before starting", scfp->cpu); + goto end; + } + schedule_timeout_uninterruptible(1); + } + + VERBOSE_SCFTORTOUT("scftorture_invoker %d started", scfp->cpu); + + do { + scftorture_invoke_one(scfp, &rand); + } while (!torture_must_stop()); + + VERBOSE_SCFTORTOUT("scftorture_invoker %d ended", scfp->cpu); +end: + torture_kthread_stopping("scftorture_invoker"); + return 0; +} + +static void +scftorture_print_module_parms(const char *tag) +{ + pr_alert(SCFTORT_FLAG + "--- %s: verbose=%d holdoff=%d longwait=%d nthreads=%d onoff_holdoff=%d onoff_interval=%d shutdown_secs=%d stat_interval=%d stutter_cpus=%d use_cpus_read_lock=%d, weight_single=%d, weight_single_wait=%d, weight_mult=%d, weight_mult_wait=%d, weight_all=%d, weight_all_wait=%d\n", tag, + verbose, holdoff, longwait, nthreads, onoff_holdoff, onoff_interval, shutdown, stat_interval, stutter_cpus, use_cpus_read_lock, weight_single, weight_single_wait, weight_mult, weight_mult_wait, weight_all, weight_all_wait); +} + +static void scf_cleanup_handler(void *unused) +{ +} + +static void scf_torture_cleanup(void) +{ + int i; + + if (torture_cleanup_begin()) + return; + + WRITE_ONCE(scfdone, true); + if (nthreads) + for (i = 0; i < nthreads; i++) + torture_stop_kthread("scftorture_invoker", scf_stats_p[i].task); + else + goto end; + kfree(scf_stats_p); + scf_stats_p = NULL; + smp_call_function(scf_cleanup_handler, NULL, 0); + torture_stop_kthread(scf_torture_stats, scf_torture_stats_task); + scf_torture_stats_print(); // -After- the stats thread is stopped! 
+ + if (atomic_read(&n_errs)) + scftorture_print_module_parms("End of test: FAILURE"); + else if (torture_onoff_failures()) + scftorture_print_module_parms("End of test: LOCK_HOTPLUG"); + else + scftorture_print_module_parms("End of test: SUCCESS"); + +end: + torture_cleanup_end(); +} + +static int __init scf_torture_init(void) +{ + long i; + int firsterr = 0; + + if (!torture_init_begin(SCFTORT_STRING, verbose)) + return -EBUSY; + + scftorture_print_module_parms("Start of test"); + + if (weight_single == -1 && weight_single_wait == -1 && + weight_mult == -1 && weight_mult_wait == -1 && + weight_all == -1 && weight_all_wait == -1) { + weight_single = 1; + weight_single_wait = 1; + weight_mult = 1; + weight_mult_wait = 1; + weight_all = 1; + weight_all_wait = 1; + } else { + if (weight_single == -1) + weight_single = 0; + if (weight_single_wait == -1) + weight_single_wait = 0; + if (weight_mult == -1) + weight_mult = 0; + if (weight_mult_wait == -1) + weight_mult_wait = 0; + if (weight_all == -1) + weight_all = 0; + if (weight_all_wait == -1) + weight_all_wait = 0; + } + if (weight_single == 0 && weight_single_wait == 0 && + weight_mult == 0 && weight_mult_wait == 0 && + weight_all == 0 && weight_all_wait == 0) { + firsterr = -EINVAL; + goto unwind; + } + + if (onoff_interval > 0) { + firsterr = torture_onoff_init(onoff_holdoff * HZ, onoff_interval, NULL); + if (firsterr) + goto unwind; + } + if (shutdown_secs > 0) { + firsterr = torture_shutdown_init(shutdown_secs, scf_torture_cleanup); + if (firsterr) + goto unwind; + } + + // Worker tasks invoking smp_call_function(). + if (nthreads < 0) + nthreads = num_online_cpus(); + scf_stats_p = kcalloc(nthreads, sizeof(scf_stats_p[0]), GFP_KERNEL); + if (!scf_stats_p) { + VERBOSE_SCFTORTOUT_ERRSTRING("out of memory"); + firsterr = -ENOMEM; + goto unwind; + } + + VERBOSE_SCFTORTOUT("Starting %d smp_call_function() threads\n", nthreads); + + atomic_set(&n_started, nthreads); + for (i = 0; i < nthreads; i++) { + scf_stats_p[i].cpu = i; + firsterr = torture_create_kthread(scftorture_invoker, (void *)&scf_stats_p[i], + scf_stats_p[i].task); + if (firsterr) + goto unwind; + } + if (stat_interval > 0) { + firsterr = torture_create_kthread(scf_torture_stats, NULL, scf_torture_stats_task); + if (firsterr) + goto unwind; + } + + torture_init_end(); + return 0; + +unwind: + torture_init_end(); + scf_torture_cleanup(); + return firsterr; +} + +module_init(scf_torture_init); +module_exit(scf_torture_cleanup); diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index e068c3c7189a..0c3a6c752ede 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1367,6 +1367,16 @@ config WW_MUTEX_SELFTEST Say M if you want these self tests to build as a module. Say N if you are unsure. +config SCF_TORTURE_TEST + tristate "torture tests for smp_call_function*()" + depends on DEBUG_KERNEL + select TORTURE_TEST + help + This option provides a kernel module that runs torture tests + on the smp_call_function() family of primitives. The kernel + module may be built after the fact on the running kernel to + be tested, if desired. + endmenu # lock debugging config TRACE_IRQFLAGS -- cgit v1.2.3 From 5022b8ac608f8b80b042a8041fe2738c4b9ea8cf Mon Sep 17 00:00:00 2001 From: Paul E. McKenney Date: Thu, 25 Jun 2020 17:05:58 -0700 Subject: scftorture: Implement weighted primitive selection This commit uses the scftorture.weight* kernel parameters to randomly chooses between smp_call_function_single(), smp_call_function_many(), and smp_call_function(). 
For each variant, it also randomly chooses whether to invoke it synchronously (wait=1) or asynchronously (wait=0). The percentage weighting for each option are dumped to the console log (search for "scf_sel_dump"). This accumulates statistics, which a later commit will dump out at the end of the run. Signed-off-by: Paul E. McKenney --- kernel/scftorture.c | 182 ++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 155 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/kernel/scftorture.c b/kernel/scftorture.c index 44f1e49ba6e9..5f1984522ae4 100644 --- a/kernel/scftorture.c +++ b/kernel/scftorture.c @@ -64,8 +64,8 @@ torture_param(bool, use_cpus_read_lock, 0, "Use cpus_read_lock() to exclude CPU torture_param(int, verbose, 0, "Enable verbose debugging printk()s"); torture_param(int, weight_single, -1, "Testing weight for single-CPU no-wait operations."); torture_param(int, weight_single_wait, -1, "Testing weight for single-CPU operations."); -torture_param(int, weight_mult, -1, "Testing weight for multi-CPU no-wait operations."); -torture_param(int, weight_mult_wait, -1, "Testing weight for multi-CPU operations."); +torture_param(int, weight_many, -1, "Testing weight for multi-CPU no-wait operations."); +torture_param(int, weight_many_wait, -1, "Testing weight for multi-CPU operations."); torture_param(int, weight_all, -1, "Testing weight for all-CPU no-wait operations."); torture_param(int, weight_all_wait, -1, "Testing weight for all-CPU operations."); @@ -83,9 +83,11 @@ struct scf_statistics { struct task_struct *task; int cpu; long long n_single; + long long n_single_ofl; long long n_single_wait; - long long n_multi; - long long n_multi_wait; + long long n_single_wait_ofl; + long long n_many; + long long n_many_wait; long long n_all; long long n_all_wait; }; @@ -94,6 +96,27 @@ static struct scf_statistics *scf_stats_p; static struct task_struct *scf_torture_stats_task; static DEFINE_PER_CPU(long long, scf_invoked_count); +// Data for random primitive selection +#define SCF_PRIM_SINGLE 0 +#define SCF_PRIM_MANY 1 +#define SCF_PRIM_ALL 2 +#define SCF_NPRIMS (2 * 3) // Need wait and no-wait versions of each. + +static char *scf_prim_name[] = { + "smp_call_function_single", + "smp_call_function_many", + "smp_call_function", +}; + +struct scf_selector { + unsigned long scfs_weight; + int scfs_prim; + bool scfs_wait; +}; +static struct scf_selector scf_sel_array[SCF_NPRIMS]; +static int scf_sel_array_len; +static unsigned long scf_sel_totweight; + // Use to wait for all threads to start. static atomic_t n_started; static atomic_t n_errs; @@ -131,6 +154,57 @@ scf_torture_stats(void *arg) return 0; } +// Add a primitive to the scf_sel_array[]. +static void scf_sel_add(unsigned long weight, int prim, bool wait) +{ + struct scf_selector *scfsp = &scf_sel_array[scf_sel_array_len]; + + // If no weight, if array would overflow, if computing three-place + // percentages would overflow, or if the scf_prim_name[] array would + // overflow, don't bother. In the last three two cases, complain. + if (!weight || + WARN_ON_ONCE(scf_sel_array_len >= ARRAY_SIZE(scf_sel_array)) || + WARN_ON_ONCE(0 - 100000 * weight <= 100000 * scf_sel_totweight) || + WARN_ON_ONCE(prim >= ARRAY_SIZE(scf_prim_name))) + return; + scf_sel_totweight += weight; + scfsp->scfs_weight = scf_sel_totweight; + scfsp->scfs_prim = prim; + scfsp->scfs_wait = wait; + scf_sel_array_len++; +} + +// Dump out weighting percentages for scf_prim_name[] array. 
+static void scf_sel_dump(void) +{ + int i; + unsigned long oldw = 0; + struct scf_selector *scfsp; + unsigned long w; + + for (i = 0; i < scf_sel_array_len; i++) { + scfsp = &scf_sel_array[i]; + w = (scfsp->scfs_weight - oldw) * 100000 / scf_sel_totweight; + pr_info("%s: %3lu.%03lu %s(%s)\n", __func__, w / 1000, w % 1000, + scf_prim_name[scfsp->scfs_prim], + scfsp->scfs_wait ? "wait" : "nowait"); + oldw = scfsp->scfs_weight; + } +} + +// Randomly pick a primitive and wait/nowait, based on weightings. +static struct scf_selector *scf_sel_rand(struct torture_random_state *trsp) +{ + int i; + unsigned long w = torture_random(trsp) % (scf_sel_totweight + 1); + + for (i = 0; i < scf_sel_array_len; i++) + if (scf_sel_array[i].scfs_weight >= w) + return &scf_sel_array[i]; + WARN_ON_ONCE(1); + return &scf_sel_array[0]; +} + // Update statistics and occasionally burn up mass quantities of CPU time, // if told to do so via scftorture.longwait. Otherwise, occasionally burn // a little bit. @@ -162,15 +236,55 @@ static void scf_handler(void *unused) } } +// As above, but check for correct CPU. +static void scf_handler_1(void *me) +{ + if (WARN_ON_ONCE(smp_processor_id() != (uintptr_t)me)) + atomic_inc(&n_errs); + scf_handler(NULL); +} + // Randomly do an smp_call_function*() invocation. -static void scftorture_invoke_one(struct scf_statistics *scfp,struct torture_random_state *trsp) +static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_random_state *trsp) { + uintptr_t cpu; + int ret; + struct scf_selector *scfsp = scf_sel_rand(trsp); + if (use_cpus_read_lock) cpus_read_lock(); else preempt_disable(); - scfp->n_all++; - smp_call_function(scf_handler, NULL, 0); + switch (scfsp->scfs_prim) { + case SCF_PRIM_SINGLE: + cpu = torture_random(trsp) % nr_cpu_ids; + if (scfsp->scfs_wait) + scfp->n_single_wait++; + else + scfp->n_single++; + ret = smp_call_function_single(cpu, scf_handler_1, (void *)cpu, scfsp->scfs_wait); + if (ret) { + if (scfsp->scfs_wait) + scfp->n_single_wait_ofl++; + else + scfp->n_single_ofl++; + } + break; + case SCF_PRIM_MANY: + if (scfsp->scfs_wait) + scfp->n_many_wait++; + else + scfp->n_many++; + smp_call_function_many(cpu_online_mask, scf_handler, NULL, scfsp->scfs_wait); + break; + case SCF_PRIM_ALL: + if (scfsp->scfs_wait) + scfp->n_all_wait++; + else + scfp->n_all++; + smp_call_function(scf_handler, NULL, scfsp->scfs_wait); + break; + } if (use_cpus_read_lock) cpus_read_unlock(); else @@ -222,8 +336,8 @@ static void scftorture_print_module_parms(const char *tag) { pr_alert(SCFTORT_FLAG - "--- %s: verbose=%d holdoff=%d longwait=%d nthreads=%d onoff_holdoff=%d onoff_interval=%d shutdown_secs=%d stat_interval=%d stutter_cpus=%d use_cpus_read_lock=%d, weight_single=%d, weight_single_wait=%d, weight_mult=%d, weight_mult_wait=%d, weight_all=%d, weight_all_wait=%d\n", tag, - verbose, holdoff, longwait, nthreads, onoff_holdoff, onoff_interval, shutdown, stat_interval, stutter_cpus, use_cpus_read_lock, weight_single, weight_single_wait, weight_mult, weight_mult_wait, weight_all, weight_all_wait); + "--- %s: verbose=%d holdoff=%d longwait=%d nthreads=%d onoff_holdoff=%d onoff_interval=%d shutdown_secs=%d stat_interval=%d stutter_cpus=%d use_cpus_read_lock=%d, weight_single=%d, weight_single_wait=%d, weight_many=%d, weight_many_wait=%d, weight_all=%d, weight_all_wait=%d\n", tag, + verbose, holdoff, longwait, nthreads, onoff_holdoff, onoff_interval, shutdown, stat_interval, stutter_cpus, use_cpus_read_lock, weight_single, weight_single_wait, weight_many, 
weight_many_wait, weight_all, weight_all_wait); } static void scf_cleanup_handler(void *unused) @@ -264,6 +378,12 @@ static int __init scf_torture_init(void) { long i; int firsterr = 0; + unsigned long weight_single1 = weight_single; + unsigned long weight_single_wait1 = weight_single_wait; + unsigned long weight_many1 = weight_many; + unsigned long weight_many_wait1 = weight_many_wait; + unsigned long weight_all1 = weight_all; + unsigned long weight_all_wait1 = weight_all_wait; if (!torture_init_begin(SCFTORT_STRING, verbose)) return -EBUSY; @@ -271,34 +391,42 @@ static int __init scf_torture_init(void) scftorture_print_module_parms("Start of test"); if (weight_single == -1 && weight_single_wait == -1 && - weight_mult == -1 && weight_mult_wait == -1 && + weight_many == -1 && weight_many_wait == -1 && weight_all == -1 && weight_all_wait == -1) { - weight_single = 1; - weight_single_wait = 1; - weight_mult = 1; - weight_mult_wait = 1; - weight_all = 1; - weight_all_wait = 1; + weight_single1 = 2 * nr_cpu_ids; + weight_single_wait1 = 2 * nr_cpu_ids; + weight_many1 = 2; + weight_many_wait1 = 2; + weight_all1 = 1; + weight_all_wait1 = 1; } else { if (weight_single == -1) - weight_single = 0; + weight_single1 = 0; if (weight_single_wait == -1) - weight_single_wait = 0; - if (weight_mult == -1) - weight_mult = 0; - if (weight_mult_wait == -1) - weight_mult_wait = 0; + weight_single_wait1 = 0; + if (weight_many == -1) + weight_many1 = 0; + if (weight_many_wait == -1) + weight_many_wait1 = 0; if (weight_all == -1) - weight_all = 0; + weight_all1 = 0; if (weight_all_wait == -1) - weight_all_wait = 0; + weight_all_wait1 = 0; } - if (weight_single == 0 && weight_single_wait == 0 && - weight_mult == 0 && weight_mult_wait == 0 && - weight_all == 0 && weight_all_wait == 0) { + if (weight_single1 == 0 && weight_single_wait1 == 0 && + weight_many1 == 0 && weight_many_wait1 == 0 && + weight_all1 == 0 && weight_all_wait1 == 0) { + VERBOSE_SCFTORTOUT_ERRSTRING("all zero weights makes no sense"); firsterr = -EINVAL; goto unwind; } + scf_sel_add(weight_single1, SCF_PRIM_SINGLE, false); + scf_sel_add(weight_single_wait1, SCF_PRIM_SINGLE, true); + scf_sel_add(weight_many1, SCF_PRIM_MANY, false); + scf_sel_add(weight_many_wait1, SCF_PRIM_MANY, true); + scf_sel_add(weight_all1, SCF_PRIM_ALL, false); + scf_sel_add(weight_all_wait1, SCF_PRIM_ALL, true); + scf_sel_dump(); if (onoff_interval > 0) { firsterr = torture_onoff_init(onoff_holdoff * HZ, onoff_interval, NULL); -- cgit v1.2.3 From bca37119c57bdc2c68c84b313a5118005e8693cf Mon Sep 17 00:00:00 2001 From: Paul E. McKenney Date: Fri, 26 Jun 2020 13:39:41 -0700 Subject: tick-sched: Clarify "NOHZ: local_softirq_pending" warning Currently, can_stop_idle_tick() prints "NOHZ: local_softirq_pending HH" (where "HH" is the hexadecimal softirq vector number) when one or more non-RCU softirq handlers are still enabled when checking to stop the scheduler-tick interrupt. This message is not as enlightening as one might hope, so this commit changes it to "NOHZ tick-stop error: Non-RCU local softirq work is pending, handler #HH". Reported-by: Andy Lutomirski Cc: Frederic Weisbecker Cc: Thomas Gleixner Cc: Ingo Molnar Signed-off-by: Paul E. 
McKenney --- kernel/time/tick-sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index f0199a4ba1ad..81632cd5e3b7 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -927,7 +927,7 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) if (ratelimit < 10 && (local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK)) { - pr_warn("NOHZ: local_softirq_pending %02x\n", + pr_warn("NOHZ tick-stop error: Non-RCU local softirq work is pending, handler #%02x!!!\n", (unsigned int) local_softirq_pending()); ratelimit++; } -- cgit v1.2.3 From dba3142b37f343734bf61dbce2914acb76e69fb6 Mon Sep 17 00:00:00 2001 From: Paul E. McKenney Date: Tue, 30 Jun 2020 16:13:37 -0700 Subject: scftorture: Summarize per-thread statistics This commit summarizes the per-thread statistics, providing counts of the number of single, many, and all calls, both no-wait and wait, and, for the single case, the number where the target CPU was offline. Signed-off-by: Paul E. McKenney --- kernel/scftorture.c | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/scftorture.c b/kernel/scftorture.c index 5f1984522ae4..09a62424bb8c 100644 --- a/kernel/scftorture.c +++ b/kernel/scftorture.c @@ -128,13 +128,27 @@ DEFINE_TORTURE_RANDOM_PERCPU(scf_torture_rand); static void scf_torture_stats_print(void) { int cpu; + int i; long long invoked_count = 0; bool isdone = READ_ONCE(scfdone); + struct scf_statistics scfs = {}; for_each_possible_cpu(cpu) invoked_count += data_race(per_cpu(scf_invoked_count, cpu)); - pr_alert("%s scf_invoked_count %s: %lld ", - SCFTORT_FLAG, isdone ? "VER" : "ver", invoked_count); + for (i = 0; i < nthreads; i++) { + scfs.n_single += scf_stats_p[i].n_single; + scfs.n_single_ofl += scf_stats_p[i].n_single_ofl; + scfs.n_single_wait += scf_stats_p[i].n_single_wait; + scfs.n_single_wait_ofl += scf_stats_p[i].n_single_wait_ofl; + scfs.n_many += scf_stats_p[i].n_many; + scfs.n_many_wait += scf_stats_p[i].n_many_wait; + scfs.n_all += scf_stats_p[i].n_all; + scfs.n_all_wait += scf_stats_p[i].n_all_wait; + } + pr_alert("%s scf_invoked_count %s: %lld single: %lld/%lld single_ofl: %lld/%lld many: %lld/%lld all: %lld/%lld ", + SCFTORT_FLAG, isdone ? "VER" : "ver", invoked_count, + scfs.n_single, scfs.n_single_wait, scfs.n_single_ofl, scfs.n_single_wait_ofl, + scfs.n_many, scfs.n_many_wait, scfs.n_all, scfs.n_all_wait); torture_onoff_stats(); pr_cont("\n"); } @@ -357,11 +371,11 @@ static void scf_torture_cleanup(void) torture_stop_kthread("scftorture_invoker", scf_stats_p[i].task); else goto end; - kfree(scf_stats_p); - scf_stats_p = NULL; smp_call_function(scf_cleanup_handler, NULL, 0); torture_stop_kthread(scf_torture_stats, scf_torture_stats_task); scf_torture_stats_print(); // -After- the stats thread is stopped! + kfree(scf_stats_p); // -After- the last stats print has completed! + scf_stats_p = NULL; if (atomic_read(&n_errs)) scftorture_print_module_parms("End of test: FAILURE"); -- cgit v1.2.3 From b93e21a51e1c8ed3816da888d34f88193ad1b917 Mon Sep 17 00:00:00 2001 From: Paul E. McKenney Date: Tue, 30 Jun 2020 20:49:50 -0700 Subject: scftorture: Add smp_call_function_single() memory-ordering checks This commit adds checks for memory misordering across calls to smp_call_function_single() and also across returns in the case where the caller waits. Misordering results in a splat. [ paulmck: s/GFP_KERNEL/GFP_ATOMIC/ per kernel test robot feedback. 
] Signed-off-by: Paul E. McKenney --- kernel/scftorture.c | 56 +++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 48 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/scftorture.c b/kernel/scftorture.c index 09a62424bb8c..9b42271d64f1 100644 --- a/kernel/scftorture.c +++ b/kernel/scftorture.c @@ -117,9 +117,20 @@ static struct scf_selector scf_sel_array[SCF_NPRIMS]; static int scf_sel_array_len; static unsigned long scf_sel_totweight; +// Communicate between caller and handler. +struct scf_check { + bool scfc_in; + bool scfc_out; + int scfc_cpu; // -1 for not _single(). + bool scfc_wait; +}; + // Use to wait for all threads to start. static atomic_t n_started; static atomic_t n_errs; +static atomic_t n_mb_in_errs; +static atomic_t n_mb_out_errs; +static atomic_t n_alloc_errs; static bool scfdone; DEFINE_TORTURE_RANDOM_PERCPU(scf_torture_rand); @@ -222,24 +233,27 @@ static struct scf_selector *scf_sel_rand(struct torture_random_state *trsp) // Update statistics and occasionally burn up mass quantities of CPU time, // if told to do so via scftorture.longwait. Otherwise, occasionally burn // a little bit. -static void scf_handler(void *unused) +static void scf_handler(void *scfc_in) { int i; int j; unsigned long r = torture_random(this_cpu_ptr(&scf_torture_rand)); + struct scf_check *scfcp = scfc_in; + if (likely(scfcp) && WARN_ON_ONCE(unlikely(!READ_ONCE(scfcp->scfc_in)))) + atomic_inc(&n_mb_in_errs); this_cpu_inc(scf_invoked_count); if (longwait <= 0) { if (!(r & 0xffc0)) udelay(r & 0x3f); - return; + goto out; } if (r & 0xfff) - return; + goto out; r = (r >> 12); if (longwait <= 0) { udelay((r & 0xff) + 1); - return; + goto out; } r = r % longwait + 1; for (i = 0; i < r; i++) { @@ -248,14 +262,24 @@ static void scf_handler(void *unused) cpu_relax(); } } +out: + if (unlikely(!scfcp)) + return; + if (scfcp->scfc_wait) + WRITE_ONCE(scfcp->scfc_out, true); + else + kfree(scfcp); } // As above, but check for correct CPU. -static void scf_handler_1(void *me) +static void scf_handler_1(void *scfc_in) { - if (WARN_ON_ONCE(smp_processor_id() != (uintptr_t)me)) + struct scf_check *scfcp = scfc_in; + + if (likely(scfcp) && WARN_ONCE(smp_processor_id() != scfcp->scfc_cpu, "%s: Wanted CPU %d got CPU %d\n", __func__, scfcp->scfc_cpu, smp_processor_id())) { atomic_inc(&n_errs); - scf_handler(NULL); + } + scf_handler(scfcp); } // Randomly do an smp_call_function*() invocation. 
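/*
 * Editor's illustrative sketch, not part of the patch above or below: a
 * minimal userspace analogue of the scf_check handshake that these
 * scftorture changes wrap around smp_call_function_single().  The caller
 * sets ->scfc_in before handing the structure to the handler, and the
 * handler sets ->scfc_out before a waited-for call returns; if either
 * flag is not observed, memory ordering across the call or return path
 * is broken.  Here pthread_create()/pthread_join() stand in for the IPI
 * and the wait, so the userspace setting and these stand-ins are
 * assumptions for illustration only, not the kernel implementation.
 * Build with something like: cc -pthread demo.c
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct scf_check_demo {
	bool scfc_in;	/* Set by the caller before the "call". */
	bool scfc_out;	/* Set by the handler before the "return". */
};

/* Stand-in for scf_handler(): complain if the caller's store is not seen. */
static void *demo_handler(void *arg)
{
	struct scf_check_demo *scfcp = arg;

	if (!scfcp->scfc_in)
		fprintf(stderr, "Misordering into handler detected!\n");
	scfcp->scfc_out = true;	/* Must be visible once the caller resumes. */
	return NULL;
}

int main(void)
{
	struct scf_check_demo *scfcp = calloc(1, sizeof(*scfcp));
	pthread_t handler;

	if (!scfcp)
		return EXIT_FAILURE;
	scfcp->scfc_out = false;
	scfcp->scfc_in = true;			/* Analogous to the pre-call store. */
	if (pthread_create(&handler, NULL, demo_handler, scfcp))	/* The "IPI". */
		return EXIT_FAILURE;
	pthread_join(handler, NULL);		/* Analogous to the waited-for return. */
	if (!scfcp->scfc_out)
		fprintf(stderr, "Misordering out of handler detected!\n");
	else
		printf("Handshake ordered as expected.\n");
	free(scfcp);
	return EXIT_SUCCESS;
}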
@@ -263,6 +287,7 @@ static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_ra { uintptr_t cpu; int ret; + struct scf_check *scfcp = NULL; struct scf_selector *scfsp = scf_sel_rand(trsp); if (use_cpus_read_lock) @@ -271,17 +296,32 @@ static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_ra preempt_disable(); switch (scfsp->scfs_prim) { case SCF_PRIM_SINGLE: + scfcp = kmalloc(sizeof(*scfcp), GFP_ATOMIC); + if (WARN_ON_ONCE(!scfcp)) + atomic_inc(&n_alloc_errs); cpu = torture_random(trsp) % nr_cpu_ids; if (scfsp->scfs_wait) scfp->n_single_wait++; else scfp->n_single++; - ret = smp_call_function_single(cpu, scf_handler_1, (void *)cpu, scfsp->scfs_wait); + if (scfcp) { + scfcp->scfc_cpu = cpu; + scfcp->scfc_wait = scfsp->scfs_wait; + scfcp->scfc_out = false; + scfcp->scfc_in = true; + } + ret = smp_call_function_single(cpu, scf_handler_1, (void *)scfcp, scfsp->scfs_wait); if (ret) { if (scfsp->scfs_wait) scfp->n_single_wait_ofl++; else scfp->n_single_ofl++; + kfree(scfcp); + } else if (scfcp && scfsp->scfs_wait) { + if (WARN_ON_ONCE(!scfcp->scfc_out)) + atomic_inc(&n_mb_out_errs); // Leak rather than trash! + else + kfree(scfcp); } break; case SCF_PRIM_MANY: -- cgit v1.2.3 From 980205ee8489d53c4380f7762debac87312b0fb3 Mon Sep 17 00:00:00 2001 From: Paul E. McKenney Date: Wed, 1 Jul 2020 12:30:02 -0700 Subject: scftorture: Add smp_call_function_many() memory-ordering checks This commit adds checks for memory misordering across calls to and returns from smp_call_function_many() in the case where the caller waits. Misordering results in a splat. Note that in contrast to smp_call_function_single(), this code does not test memory ordering into the handler in the no-wait case because none of the handlers would be able to free the scf_check structure without introducing heavy synchronization to work out which was last. [ paulmck: s/GFP_KERNEL/GFP_ATOMIC/ per kernel test robot feedback. ] Signed-off-by: Paul E. McKenney --- kernel/scftorture.c | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/scftorture.c b/kernel/scftorture.c index 9b42271d64f1..3519ad1b3278 100644 --- a/kernel/scftorture.c +++ b/kernel/scftorture.c @@ -240,8 +240,11 @@ static void scf_handler(void *scfc_in) unsigned long r = torture_random(this_cpu_ptr(&scf_torture_rand)); struct scf_check *scfcp = scfc_in; - if (likely(scfcp) && WARN_ON_ONCE(unlikely(!READ_ONCE(scfcp->scfc_in)))) - atomic_inc(&n_mb_in_errs); + if (likely(scfcp)) { + WRITE_ONCE(scfcp->scfc_out, false); // For multiple receivers. + if (WARN_ON_ONCE(unlikely(!READ_ONCE(scfcp->scfc_in)))) + atomic_inc(&n_mb_in_errs); + } this_cpu_inc(scf_invoked_count); if (longwait <= 0) { if (!(r & 0xffc0)) @@ -325,11 +328,28 @@ static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_ra } break; case SCF_PRIM_MANY: + if (scfsp->scfs_wait) { + scfcp = kmalloc(sizeof(*scfcp), GFP_ATOMIC); + if (WARN_ON_ONCE(!scfcp)) + atomic_inc(&n_alloc_errs); + } if (scfsp->scfs_wait) scfp->n_many_wait++; else scfp->n_many++; - smp_call_function_many(cpu_online_mask, scf_handler, NULL, scfsp->scfs_wait); + if (scfcp) { + scfcp->scfc_cpu = -1; + scfcp->scfc_wait = true; + scfcp->scfc_out = false; + scfcp->scfc_in = true; + } + smp_call_function_many(cpu_online_mask, scf_handler, scfcp, scfsp->scfs_wait); + if (scfcp) { + if (WARN_ON_ONCE(!scfcp->scfc_out)) + atomic_inc(&n_mb_out_errs); // Leak rather than trash! 
+ else + kfree(scfcp); + } break; case SCF_PRIM_ALL: if (scfsp->scfs_wait) -- cgit v1.2.3 From 34e8c4837adb579962e528a4f7dd1f75cb120be4 Mon Sep 17 00:00:00 2001 From: Paul E. McKenney Date: Wed, 1 Jul 2020 13:49:06 -0700 Subject: scftorture: Add smp_call_function() memory-ordering checks This commit adds checks for memory misordering across calls to and returns from smp_call_function() in the case where the caller waits. Misordering results in a splat. Note that in contrast to smp_call_function_single(), this code does not test memory ordering into the handler in the no-wait case because none of the handlers would be able to free the scf_check structure without introducing heavy synchronization to work out which was last. [ paulmck: s/GFP_KERNEL/GFP_ATOMIC/ per kernel test robot feedback. ] Signed-off-by: Paul E. McKenney --- kernel/scftorture.c | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/scftorture.c b/kernel/scftorture.c index 3519ad1b3278..0d7299d32dd0 100644 --- a/kernel/scftorture.c +++ b/kernel/scftorture.c @@ -297,11 +297,13 @@ static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_ra cpus_read_lock(); else preempt_disable(); - switch (scfsp->scfs_prim) { - case SCF_PRIM_SINGLE: + if (scfsp->scfs_prim == SCF_PRIM_SINGLE || scfsp->scfs_wait) { scfcp = kmalloc(sizeof(*scfcp), GFP_ATOMIC); if (WARN_ON_ONCE(!scfcp)) atomic_inc(&n_alloc_errs); + } + switch (scfsp->scfs_prim) { + case SCF_PRIM_SINGLE: cpu = torture_random(trsp) % nr_cpu_ids; if (scfsp->scfs_wait) scfp->n_single_wait++; @@ -328,11 +330,6 @@ static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_ra } break; case SCF_PRIM_MANY: - if (scfsp->scfs_wait) { - scfcp = kmalloc(sizeof(*scfcp), GFP_ATOMIC); - if (WARN_ON_ONCE(!scfcp)) - atomic_inc(&n_alloc_errs); - } if (scfsp->scfs_wait) scfp->n_many_wait++; else @@ -356,7 +353,19 @@ static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_ra scfp->n_all_wait++; else scfp->n_all++; - smp_call_function(scf_handler, NULL, scfsp->scfs_wait); + if (scfcp) { + scfcp->scfc_cpu = -1; + scfcp->scfc_wait = true; + scfcp->scfc_out = false; + scfcp->scfc_in = true; + } + smp_call_function(scf_handler, scfcp, scfsp->scfs_wait); + if (scfcp) { + if (WARN_ON_ONCE(!scfcp->scfc_out)) + atomic_inc(&n_mb_out_errs); // Leak rather than trash! + else + kfree(scfcp); + } break; } if (use_cpus_read_lock) -- cgit v1.2.3 From 676e5469643e716df7f39ef77ba8f09c85b0c4f8 Mon Sep 17 00:00:00 2001 From: Paul E. McKenney Date: Wed, 1 Jul 2020 14:13:02 -0700 Subject: scftorture: Consolidate scftorture_invoke_one() check and kfree() This commit moves checking of the ->scfc_out field and the freeing of the scf_check structure down below the end of switch statement, thus saving a few lines of code. Signed-off-by: Paul E. 
McKenney --- kernel/scftorture.c | 26 ++++++++------------------ 1 file changed, 8 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/scftorture.c b/kernel/scftorture.c index 0d7299d32dd0..f220cd364e23 100644 --- a/kernel/scftorture.c +++ b/kernel/scftorture.c @@ -289,7 +289,7 @@ static void scf_handler_1(void *scfc_in) static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_random_state *trsp) { uintptr_t cpu; - int ret; + int ret = 0; struct scf_check *scfcp = NULL; struct scf_selector *scfsp = scf_sel_rand(trsp); @@ -322,11 +322,7 @@ static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_ra else scfp->n_single_ofl++; kfree(scfcp); - } else if (scfcp && scfsp->scfs_wait) { - if (WARN_ON_ONCE(!scfcp->scfc_out)) - atomic_inc(&n_mb_out_errs); // Leak rather than trash! - else - kfree(scfcp); + scfcp = NULL; } break; case SCF_PRIM_MANY: @@ -341,12 +337,6 @@ static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_ra scfcp->scfc_in = true; } smp_call_function_many(cpu_online_mask, scf_handler, scfcp, scfsp->scfs_wait); - if (scfcp) { - if (WARN_ON_ONCE(!scfcp->scfc_out)) - atomic_inc(&n_mb_out_errs); // Leak rather than trash! - else - kfree(scfcp); - } break; case SCF_PRIM_ALL: if (scfsp->scfs_wait) @@ -360,14 +350,14 @@ static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_ra scfcp->scfc_in = true; } smp_call_function(scf_handler, scfcp, scfsp->scfs_wait); - if (scfcp) { - if (WARN_ON_ONCE(!scfcp->scfc_out)) - atomic_inc(&n_mb_out_errs); // Leak rather than trash! - else - kfree(scfcp); - } break; } + if (scfcp && scfsp->scfs_wait) { + if (WARN_ON_ONCE(!scfcp->scfc_out)) + atomic_inc(&n_mb_out_errs); // Leak rather than trash! + else + kfree(scfcp); + } if (use_cpus_read_lock) cpus_read_unlock(); else -- cgit v1.2.3 From 4df55bddc1a360e94c86e227fe417ac9422cb615 Mon Sep 17 00:00:00 2001 From: Paul E. McKenney Date: Thu, 9 Jul 2020 13:58:32 -0700 Subject: scftorture: Consolidate scftorture_invoke_one() scf_check initialization This commit hoists much of the initialization of the scf_check structure out of the switch statement, thus saving a few lines of code. The initialization of the ->scfc_in field remains in each leg of the switch statement in order to more heavily stress memory ordering. Signed-off-by: Paul E. 
McKenney --- kernel/scftorture.c | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/scftorture.c b/kernel/scftorture.c index f220cd364e23..8ab72e545a61 100644 --- a/kernel/scftorture.c +++ b/kernel/scftorture.c @@ -299,8 +299,13 @@ static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_ra preempt_disable(); if (scfsp->scfs_prim == SCF_PRIM_SINGLE || scfsp->scfs_wait) { scfcp = kmalloc(sizeof(*scfcp), GFP_ATOMIC); - if (WARN_ON_ONCE(!scfcp)) + if (WARN_ON_ONCE(!scfcp)) { atomic_inc(&n_alloc_errs); + } else { + scfcp->scfc_cpu = -1; + scfcp->scfc_wait = scfsp->scfs_wait; + scfcp->scfc_out = false; + } } switch (scfsp->scfs_prim) { case SCF_PRIM_SINGLE: @@ -311,8 +316,6 @@ static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_ra scfp->n_single++; if (scfcp) { scfcp->scfc_cpu = cpu; - scfcp->scfc_wait = scfsp->scfs_wait; - scfcp->scfc_out = false; scfcp->scfc_in = true; } ret = smp_call_function_single(cpu, scf_handler_1, (void *)scfcp, scfsp->scfs_wait); @@ -330,12 +333,8 @@ static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_ra scfp->n_many_wait++; else scfp->n_many++; - if (scfcp) { - scfcp->scfc_cpu = -1; - scfcp->scfc_wait = true; - scfcp->scfc_out = false; + if (scfcp) scfcp->scfc_in = true; - } smp_call_function_many(cpu_online_mask, scf_handler, scfcp, scfsp->scfs_wait); break; case SCF_PRIM_ALL: @@ -343,12 +342,8 @@ static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_ra scfp->n_all_wait++; else scfp->n_all++; - if (scfcp) { - scfcp->scfc_cpu = -1; - scfcp->scfc_wait = true; - scfcp->scfc_out = false; + if (scfcp) scfcp->scfc_in = true; - } smp_call_function(scf_handler, scfcp, scfsp->scfs_wait); break; } -- cgit v1.2.3 From dbf83b655a7853bc430af10e9a3e7eb1f4c90f86 Mon Sep 17 00:00:00 2001 From: Paul E. McKenney Date: Wed, 1 Jul 2020 16:06:22 -0700 Subject: scftorture: Flag errors in torture-compatible manner This commit prints error counts on the statistics line and also adds a "!!!" if any of the counters are non-zero. Allocation failures are (somewhat) forgiven, but all other errors result in a "FAILURE" print at the end of the test. Signed-off-by: Paul E. McKenney --- kernel/scftorture.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/scftorture.c b/kernel/scftorture.c index 8ab72e545a61..880c2cef13e7 100644 --- a/kernel/scftorture.c +++ b/kernel/scftorture.c @@ -132,6 +132,7 @@ static atomic_t n_mb_in_errs; static atomic_t n_mb_out_errs; static atomic_t n_alloc_errs; static bool scfdone; +static char *bangstr = ""; DEFINE_TORTURE_RANDOM_PERCPU(scf_torture_rand); @@ -156,12 +157,17 @@ static void scf_torture_stats_print(void) scfs.n_all += scf_stats_p[i].n_all; scfs.n_all_wait += scf_stats_p[i].n_all_wait; } - pr_alert("%s scf_invoked_count %s: %lld single: %lld/%lld single_ofl: %lld/%lld many: %lld/%lld all: %lld/%lld ", - SCFTORT_FLAG, isdone ? "VER" : "ver", invoked_count, + if (atomic_read(&n_errs) || atomic_read(&n_mb_in_errs) || + atomic_read(&n_mb_out_errs) || atomic_read(&n_alloc_errs)) + bangstr = "!!! "; + pr_alert("%s %sscf_invoked_count %s: %lld single: %lld/%lld single_ofl: %lld/%lld many: %lld/%lld all: %lld/%lld ", + SCFTORT_FLAG, bangstr, isdone ? 
"VER" : "ver", invoked_count, scfs.n_single, scfs.n_single_wait, scfs.n_single_ofl, scfs.n_single_wait_ofl, scfs.n_many, scfs.n_many_wait, scfs.n_all, scfs.n_all_wait); torture_onoff_stats(); - pr_cont("\n"); + pr_cont("ste: %d stnmie: %d stnmoe: %d staf: %d\n", atomic_read(&n_errs), + atomic_read(&n_mb_in_errs), atomic_read(&n_mb_out_errs), + atomic_read(&n_alloc_errs)); } // Periodically prints torture statistics, if periodic statistics printing @@ -431,7 +437,7 @@ static void scf_torture_cleanup(void) kfree(scf_stats_p); // -After- the last stats print has completed! scf_stats_p = NULL; - if (atomic_read(&n_errs)) + if (atomic_read(&n_errs) || atomic_read(&n_mb_in_errs) || atomic_read(&n_mb_out_errs)) scftorture_print_module_parms("End of test: FAILURE"); else if (torture_onoff_failures()) scftorture_print_module_parms("End of test: LOCK_HOTPLUG"); -- cgit v1.2.3 From ee7035d29576dcb59b1191e5f609517cacab1e56 Mon Sep 17 00:00:00 2001 From: Paul E. McKenney Date: Wed, 1 Jul 2020 16:38:16 -0700 Subject: scftorture: Prevent compiler from reducing race probabilities Detecting smp_call_function() memory misordering requires close timing, so it is necessary to have the checks immediately before and after the call to the smp_call_function*() function under test. This commit therefore inserts barrier() calls to prevent the compiler from optimizing memory-misordering detection down into the zone of extreme improbability. Signed-off-by: Paul E. McKenney --- kernel/scftorture.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/scftorture.c b/kernel/scftorture.c index 880c2cef13e7..83496810fc48 100644 --- a/kernel/scftorture.c +++ b/kernel/scftorture.c @@ -322,6 +322,7 @@ static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_ra scfp->n_single++; if (scfcp) { scfcp->scfc_cpu = cpu; + barrier(); // Prevent race-reduction compiler optimizations. scfcp->scfc_in = true; } ret = smp_call_function_single(cpu, scf_handler_1, (void *)scfcp, scfsp->scfs_wait); @@ -339,8 +340,10 @@ static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_ra scfp->n_many_wait++; else scfp->n_many++; - if (scfcp) + if (scfcp) { + barrier(); // Prevent race-reduction compiler optimizations. scfcp->scfc_in = true; + } smp_call_function_many(cpu_online_mask, scf_handler, scfcp, scfsp->scfs_wait); break; case SCF_PRIM_ALL: @@ -348,8 +351,10 @@ static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_ra scfp->n_all_wait++; else scfp->n_all++; - if (scfcp) + if (scfcp) { + barrier(); // Prevent race-reduction compiler optimizations. scfcp->scfc_in = true; + } smp_call_function(scf_handler, scfcp, scfsp->scfs_wait); break; } @@ -358,6 +363,7 @@ static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_ra atomic_inc(&n_mb_out_errs); // Leak rather than trash! else kfree(scfcp); + barrier(); // Prevent race-reduction compiler optimizations. } if (use_cpus_read_lock) cpus_read_unlock(); -- cgit v1.2.3 From 9a52a574676f8d4aa55f69319231ce6c343b00bb Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Thu, 2 Jul 2020 09:56:50 -0700 Subject: scftorture: Make symbol 'scf_torture_rand' static The sparse tool complains as follows kernel/scftorture.c:124:1: warning: symbol '__pcpu_scope_scf_torture_rand' was not declared. Should it be static? And this per-CPU variable is not used outside of scftorture.c, so this commit marks it static. Reported-by: Hulk Robot Signed-off-by: Wei Yongjun Signed-off-by: Paul E. 
McKenney --- kernel/scftorture.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/scftorture.c b/kernel/scftorture.c index 83496810fc48..9180de73e4e8 100644 --- a/kernel/scftorture.c +++ b/kernel/scftorture.c @@ -134,7 +134,7 @@ static atomic_t n_alloc_errs; static bool scfdone; static char *bangstr = ""; -DEFINE_TORTURE_RANDOM_PERCPU(scf_torture_rand); +static DEFINE_TORTURE_RANDOM_PERCPU(scf_torture_rand); // Print torture statistics. Caller must ensure serialization. static void scf_torture_stats_print(void) -- cgit v1.2.3 From de77d4da54d10df97d265e7e99112bfc2fef7d4a Mon Sep 17 00:00:00 2001 From: Paul E. McKenney Date: Thu, 2 Jul 2020 12:15:37 -0700 Subject: scftorture: Check unexpected "switch" statement value This commit adds a "default" case to the switch statement in scftorture_invoke_one() which contains a WARN_ON_ONCE() and an assignment to ->scfc_out to suppress knock-on warnings. These knock-on warnings could otherwise cause the user to think that there was a memory-ordering problem in smp_call_function() instead of a bug in scftorture.c itself. Signed-off-by: Paul E. McKenney --- kernel/scftorture.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/scftorture.c b/kernel/scftorture.c index 9180de73e4e8..d9c01c722e2a 100644 --- a/kernel/scftorture.c +++ b/kernel/scftorture.c @@ -357,6 +357,10 @@ static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_ra } smp_call_function(scf_handler, scfcp, scfsp->scfs_wait); break; + default: + WARN_ON_ONCE(1); + if (scfcp) + scfcp->scfc_out = true; } if (scfcp && scfsp->scfs_wait) { if (WARN_ON_ONCE(!scfcp->scfc_out)) -- cgit v1.2.3 From a7c072ef26644b632241d549869f10f8d2dd3b5c Mon Sep 17 00:00:00 2001 From: Paul E. McKenney Date: Thu, 2 Jul 2020 14:15:33 -0700 Subject: scftorture: Block scftorture_invoker() kthreads for offline CPUs Currently, CPU-hotplug operations might result in all but two of (say) 100 CPUs being offline, which in turn might result in false-positive diagnostics due to overload. This commit therefore causes scftorture_invoker() kthreads for offline CPUs to loop blocking for 200 milliseconds at a time, thus continuously adjusting the number of threads to match the number of online CPUs. Signed-off-by: Paul E. McKenney --- kernel/scftorture.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/scftorture.c b/kernel/scftorture.c index d9c01c722e2a..04d3a4279413 100644 --- a/kernel/scftorture.c +++ b/kernel/scftorture.c @@ -381,11 +381,14 @@ static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_ra // smp_call_function() family of functions. 
static int scftorture_invoker(void *arg) { + int cpu; DEFINE_TORTURE_RANDOM(rand); struct scf_statistics *scfp = (struct scf_statistics *)arg; + bool was_offline = false; VERBOSE_SCFTORTOUT("scftorture_invoker %d: task started", scfp->cpu); - set_cpus_allowed_ptr(current, cpumask_of(scfp->cpu % nr_cpu_ids)); + cpu = scfp->cpu % nr_cpu_ids; + set_cpus_allowed_ptr(current, cpumask_of(cpu)); set_user_nice(current, MAX_NICE); if (holdoff) schedule_timeout_interruptible(holdoff * HZ); @@ -408,6 +411,14 @@ static int scftorture_invoker(void *arg) do { scftorture_invoke_one(scfp, &rand); + while (cpu_is_offline(cpu) && !torture_must_stop()) { + schedule_timeout_interruptible(HZ / 5); + was_offline = true; + } + if (was_offline) { + set_cpus_allowed_ptr(current, cpumask_of(cpu)); + was_offline = false; + } } while (!torture_must_stop()); VERBOSE_SCFTORTOUT("scftorture_invoker %d ended", scfp->cpu); -- cgit v1.2.3 From 9e66bf03f9c538863e614a72c5799bcd9579630e Mon Sep 17 00:00:00 2001 From: Paul E. McKenney Date: Fri, 3 Jul 2020 15:23:19 -0700 Subject: scftorture: Adapt memory-ordering test to UP operation On uniprocessor systems, smp_call_function() does nothing. This commit therefore avoids complaining about the lack of handler accesses in the single-CPU case where there is no handler. Signed-off-by: Paul E. McKenney --- kernel/scftorture.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/scftorture.c b/kernel/scftorture.c index 04d3a4279413..fc22bcc9a589 100644 --- a/kernel/scftorture.c +++ b/kernel/scftorture.c @@ -363,7 +363,8 @@ static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_ra scfcp->scfc_out = true; } if (scfcp && scfsp->scfs_wait) { - if (WARN_ON_ONCE(!scfcp->scfc_out)) + if (WARN_ON_ONCE((num_online_cpus() > 1 || scfsp->scfs_prim == SCF_PRIM_SINGLE) && + !scfcp->scfc_out)) atomic_inc(&n_mb_out_errs); // Leak rather than trash! else kfree(scfcp); -- cgit v1.2.3 From 65bd77f554336407f5fd7ced7a6df686767fba21 Mon Sep 17 00:00:00 2001 From: Paul E. McKenney Date: Thu, 23 Jul 2020 15:53:02 -0700 Subject: scftorture: Add cond_resched() to test loop Although the test loop does randomly delay, which would provide quiescent states and so forth, it is possible for there to be a series of long smp_call_function*() handler runtimes with no delays, which results in softlockup and RCU CPU stall warning messages. This commit therefore inserts a cond_resched() into the main test loop. Signed-off-by: Paul E. McKenney --- kernel/scftorture.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/scftorture.c b/kernel/scftorture.c index fc22bcc9a589..554a521ee235 100644 --- a/kernel/scftorture.c +++ b/kernel/scftorture.c @@ -420,6 +420,7 @@ static int scftorture_invoker(void *arg) set_cpus_allowed_ptr(current, cpumask_of(cpu)); was_offline = false; } + cond_resched(); } while (!torture_must_stop()); VERBOSE_SCFTORTOUT("scftorture_invoker %d ended", scfp->cpu); -- cgit v1.2.3 From 4e88ec4a9eb17527e640b063f79e5b875733eb53 Mon Sep 17 00:00:00 2001 From: Paul E. McKenney Date: Tue, 11 Aug 2020 21:18:12 -0700 Subject: rcuperf: Change rcuperf to rcuscale This commit further avoids conflation of rcuperf with the kernel's perf feature by renaming kernel/rcu/rcuperf.c to kernel/rcu/rcuscale.c, and also by similarly renaming the functions and variables inside this file. This has the side effect of changing the names of the kernel boot parameters, so kernel-parameters.txt and ver_functions.sh are also updated. 
The rcutorture --torture type was also updated from rcuperf to rcuscale. [ paulmck: Fix bugs located by Stephen Rothwell. ] Reported-by: Ingo Molnar Signed-off-by: Paul E. McKenney --- Documentation/admin-guide/kernel-parameters.txt | 36 +- MAINTAINERS | 3 +- kernel/rcu/Kconfig.debug | 2 +- kernel/rcu/Makefile | 2 +- kernel/rcu/rcuperf.c | 853 --------------------- kernel/rcu/rcuscale.c | 853 +++++++++++++++++++++ .../rcutorture/bin/kvm-recheck-rcuperf-ftrace.sh | 109 --- .../rcutorture/bin/kvm-recheck-rcuperf.sh | 83 -- .../rcutorture/bin/kvm-recheck-rcuscale-ftrace.sh | 109 +++ .../rcutorture/bin/kvm-recheck-rcuscale.sh | 83 ++ tools/testing/selftests/rcutorture/bin/kvm.sh | 8 +- .../selftests/rcutorture/bin/parse-console.sh | 4 +- .../selftests/rcutorture/configs/rcuperf/CFLIST | 1 - .../selftests/rcutorture/configs/rcuperf/CFcommon | 2 - .../selftests/rcutorture/configs/rcuperf/TINY | 16 - .../selftests/rcutorture/configs/rcuperf/TREE | 19 - .../selftests/rcutorture/configs/rcuperf/TREE54 | 22 - .../rcutorture/configs/rcuperf/ver_functions.sh | 16 - .../selftests/rcutorture/configs/rcuscale/CFLIST | 1 + .../selftests/rcutorture/configs/rcuscale/CFcommon | 2 + .../selftests/rcutorture/configs/rcuscale/TINY | 16 + .../selftests/rcutorture/configs/rcuscale/TREE | 19 + .../selftests/rcutorture/configs/rcuscale/TREE54 | 22 + .../rcutorture/configs/rcuscale/ver_functions.sh | 16 + 24 files changed, 1149 insertions(+), 1148 deletions(-) delete mode 100644 kernel/rcu/rcuperf.c create mode 100644 kernel/rcu/rcuscale.c delete mode 100755 tools/testing/selftests/rcutorture/bin/kvm-recheck-rcuperf-ftrace.sh delete mode 100755 tools/testing/selftests/rcutorture/bin/kvm-recheck-rcuperf.sh create mode 100755 tools/testing/selftests/rcutorture/bin/kvm-recheck-rcuscale-ftrace.sh create mode 100755 tools/testing/selftests/rcutorture/bin/kvm-recheck-rcuscale.sh delete mode 100644 tools/testing/selftests/rcutorture/configs/rcuperf/CFLIST delete mode 100644 tools/testing/selftests/rcutorture/configs/rcuperf/CFcommon delete mode 100644 tools/testing/selftests/rcutorture/configs/rcuperf/TINY delete mode 100644 tools/testing/selftests/rcutorture/configs/rcuperf/TREE delete mode 100644 tools/testing/selftests/rcutorture/configs/rcuperf/TREE54 delete mode 100644 tools/testing/selftests/rcutorture/configs/rcuperf/ver_functions.sh create mode 100644 tools/testing/selftests/rcutorture/configs/rcuscale/CFLIST create mode 100644 tools/testing/selftests/rcutorture/configs/rcuscale/CFcommon create mode 100644 tools/testing/selftests/rcutorture/configs/rcuscale/TINY create mode 100644 tools/testing/selftests/rcutorture/configs/rcuscale/TREE create mode 100644 tools/testing/selftests/rcutorture/configs/rcuscale/TREE54 create mode 100644 tools/testing/selftests/rcutorture/configs/rcuscale/ver_functions.sh (limited to 'kernel') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 91a56382ae56..c27bbe95e7cb 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -4157,41 +4157,41 @@ rcu_node tree with an eye towards determining why a new grace period has not yet started. - rcuperf.gp_async= [KNL] + rcuscale.gp_async= [KNL] Measure performance of asynchronous grace-period primitives such as call_rcu(). - rcuperf.gp_async_max= [KNL] + rcuscale.gp_async_max= [KNL] Specify the maximum number of outstanding callbacks per writer thread. 
When a writer thread exceeds this limit, it invokes the corresponding flavor of rcu_barrier() to allow previously posted callbacks to drain. - rcuperf.gp_exp= [KNL] + rcuscale.gp_exp= [KNL] Measure performance of expedited synchronous grace-period primitives. - rcuperf.holdoff= [KNL] + rcuscale.holdoff= [KNL] Set test-start holdoff period. The purpose of this parameter is to delay the start of the test until boot completes in order to avoid interference. - rcuperf.kfree_rcu_test= [KNL] + rcuscale.kfree_rcu_test= [KNL] Set to measure performance of kfree_rcu() flooding. - rcuperf.kfree_nthreads= [KNL] + rcuscale.kfree_nthreads= [KNL] The number of threads running loops of kfree_rcu(). - rcuperf.kfree_alloc_num= [KNL] + rcuscale.kfree_alloc_num= [KNL] Number of allocations and frees done in an iteration. - rcuperf.kfree_loops= [KNL] - Number of loops doing rcuperf.kfree_alloc_num number + rcuscale.kfree_loops= [KNL] + Number of loops doing rcuscale.kfree_alloc_num number of allocations and frees. - rcuperf.nreaders= [KNL] + rcuscale.nreaders= [KNL] Set number of RCU readers. The value -1 selects N, where N is the number of CPUs. A value "n" less than -1 selects N-n+1, where N is again @@ -4200,23 +4200,23 @@ A value of "n" less than or equal to -N selects a single reader. - rcuperf.nwriters= [KNL] + rcuscale.nwriters= [KNL] Set number of RCU writers. The values operate - the same as for rcuperf.nreaders. + the same as for rcuscale.nreaders. N, where N is the number of CPUs - rcuperf.perf_type= [KNL] + rcuscale.perf_type= [KNL] Specify the RCU implementation to test. - rcuperf.shutdown= [KNL] + rcuscale.shutdown= [KNL] Shut the system down after performance tests complete. This is useful for hands-off automated testing. - rcuperf.verbose= [KNL] + rcuscale.verbose= [KNL] Enable additional printk() statements. - rcuperf.writer_holdoff= [KNL] + rcuscale.writer_holdoff= [KNL] Write-side holdoff between grace periods, in microseconds. The default of zero says no holdoff. @@ -4490,8 +4490,8 @@ refscale.shutdown= [KNL] Shut down the system at the end of the performance test. This defaults to 1 (shut it down) when - rcuperf is built into the kernel and to 0 (leave - it running) when rcuperf is built as a module. + refscale is built into the kernel and to 0 (leave + it running) when refscale is built as a module. refscale.verbose= [KNL] Enable additional printk() statements. 
diff --git a/MAINTAINERS b/MAINTAINERS index deaafb617361..d299e3bb10ad 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -17510,8 +17510,9 @@ S: Supported T: git git://git.kernel.org/pub/scm/linux/kernel/git/paulmck/linux-rcu.git dev F: Documentation/RCU/torture.rst F: kernel/locking/locktorture.c -F: kernel/rcu/rcuperf.c +F: kernel/rcu/rcuscale.c F: kernel/rcu/rcutorture.c +F: kernel/rcu/refscale.c F: kernel/torture.c TOSHIBA ACPI EXTRAS DRIVER diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug index 3cf6132a4bb9..5cb175df6ece 100644 --- a/kernel/rcu/Kconfig.debug +++ b/kernel/rcu/Kconfig.debug @@ -23,7 +23,7 @@ config TORTURE_TEST tristate default n -config RCU_PERF_TEST +config RCU_SCALE_TEST tristate "performance tests for RCU" depends on DEBUG_KERNEL select TORTURE_TEST diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile index 95f5117ef8da..0cfb009a99b9 100644 --- a/kernel/rcu/Makefile +++ b/kernel/rcu/Makefile @@ -11,7 +11,7 @@ obj-y += update.o sync.o obj-$(CONFIG_TREE_SRCU) += srcutree.o obj-$(CONFIG_TINY_SRCU) += srcutiny.o obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o -obj-$(CONFIG_RCU_PERF_TEST) += rcuperf.o +obj-$(CONFIG_RCU_SCALE_TEST) += rcuscale.o obj-$(CONFIG_RCU_REF_SCALE_TEST) += refscale.o obj-$(CONFIG_TREE_RCU) += tree.o obj-$(CONFIG_TINY_RCU) += tiny.o diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c deleted file mode 100644 index 21448d3374e2..000000000000 --- a/kernel/rcu/rcuperf.c +++ /dev/null @@ -1,853 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0+ -/* - * Read-Copy Update module-based performance-test facility - * - * Copyright (C) IBM Corporation, 2015 - * - * Authors: Paul E. McKenney - */ - -#define pr_fmt(fmt) fmt - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "rcu.h" - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Paul E. McKenney "); - -#define PERF_FLAG "-perf:" -#define PERFOUT_STRING(s) \ - pr_alert("%s" PERF_FLAG " %s\n", perf_type, s) -#define VERBOSE_PERFOUT_STRING(s) \ - do { if (verbose) pr_alert("%s" PERF_FLAG " %s\n", perf_type, s); } while (0) -#define VERBOSE_PERFOUT_ERRSTRING(s) \ - do { if (verbose) pr_alert("%s" PERF_FLAG "!!! %s\n", perf_type, s); } while (0) - -/* - * The intended use cases for the nreaders and nwriters module parameters - * are as follows: - * - * 1. Specify only the nr_cpus kernel boot parameter. This will - * set both nreaders and nwriters to the value specified by - * nr_cpus for a mixed reader/writer test. - * - * 2. Specify the nr_cpus kernel boot parameter, but set - * rcuperf.nreaders to zero. This will set nwriters to the - * value specified by nr_cpus for an update-only test. - * - * 3. Specify the nr_cpus kernel boot parameter, but set - * rcuperf.nwriters to zero. This will set nreaders to the - * value specified by nr_cpus for a read-only test. - * - * Various other use cases may of course be specified. - * - * Note that this test's readers are intended only as a test load for - * the writers. The reader performance statistics will be overly - * pessimistic due to the per-critical-section interrupt disabling, - * test-end checks, and the pair of calls through pointers. 
- */ - -#ifdef MODULE -# define RCUPERF_SHUTDOWN 0 -#else -# define RCUPERF_SHUTDOWN 1 -#endif - -torture_param(bool, gp_async, false, "Use asynchronous GP wait primitives"); -torture_param(int, gp_async_max, 1000, "Max # outstanding waits per reader"); -torture_param(bool, gp_exp, false, "Use expedited GP wait primitives"); -torture_param(int, holdoff, 10, "Holdoff time before test start (s)"); -torture_param(int, nreaders, -1, "Number of RCU reader threads"); -torture_param(int, nwriters, -1, "Number of RCU updater threads"); -torture_param(bool, shutdown, RCUPERF_SHUTDOWN, - "Shutdown at end of performance tests."); -torture_param(int, verbose, 1, "Enable verbose debugging printk()s"); -torture_param(int, writer_holdoff, 0, "Holdoff (us) between GPs, zero to disable"); -torture_param(int, kfree_rcu_test, 0, "Do we run a kfree_rcu() perf test?"); -torture_param(int, kfree_mult, 1, "Multiple of kfree_obj size to allocate."); - -static char *perf_type = "rcu"; -module_param(perf_type, charp, 0444); -MODULE_PARM_DESC(perf_type, "Type of RCU to performance-test (rcu, srcu, ...)"); - -static int nrealreaders; -static int nrealwriters; -static struct task_struct **writer_tasks; -static struct task_struct **reader_tasks; -static struct task_struct *shutdown_task; - -static u64 **writer_durations; -static int *writer_n_durations; -static atomic_t n_rcu_perf_reader_started; -static atomic_t n_rcu_perf_writer_started; -static atomic_t n_rcu_perf_writer_finished; -static wait_queue_head_t shutdown_wq; -static u64 t_rcu_perf_writer_started; -static u64 t_rcu_perf_writer_finished; -static unsigned long b_rcu_gp_test_started; -static unsigned long b_rcu_gp_test_finished; -static DEFINE_PER_CPU(atomic_t, n_async_inflight); - -#define MAX_MEAS 10000 -#define MIN_MEAS 100 - -/* - * Operations vector for selecting different types of tests. - */ - -struct rcu_perf_ops { - int ptype; - void (*init)(void); - void (*cleanup)(void); - int (*readlock)(void); - void (*readunlock)(int idx); - unsigned long (*get_gp_seq)(void); - unsigned long (*gp_diff)(unsigned long new, unsigned long old); - unsigned long (*exp_completed)(void); - void (*async)(struct rcu_head *head, rcu_callback_t func); - void (*gp_barrier)(void); - void (*sync)(void); - void (*exp_sync)(void); - const char *name; -}; - -static struct rcu_perf_ops *cur_ops; - -/* - * Definitions for rcu perf testing. - */ - -static int rcu_perf_read_lock(void) __acquires(RCU) -{ - rcu_read_lock(); - return 0; -} - -static void rcu_perf_read_unlock(int idx) __releases(RCU) -{ - rcu_read_unlock(); -} - -static unsigned long __maybe_unused rcu_no_completed(void) -{ - return 0; -} - -static void rcu_sync_perf_init(void) -{ -} - -static struct rcu_perf_ops rcu_ops = { - .ptype = RCU_FLAVOR, - .init = rcu_sync_perf_init, - .readlock = rcu_perf_read_lock, - .readunlock = rcu_perf_read_unlock, - .get_gp_seq = rcu_get_gp_seq, - .gp_diff = rcu_seq_diff, - .exp_completed = rcu_exp_batches_completed, - .async = call_rcu, - .gp_barrier = rcu_barrier, - .sync = synchronize_rcu, - .exp_sync = synchronize_rcu_expedited, - .name = "rcu" -}; - -/* - * Definitions for srcu perf testing. 
- */ - -DEFINE_STATIC_SRCU(srcu_ctl_perf); -static struct srcu_struct *srcu_ctlp = &srcu_ctl_perf; - -static int srcu_perf_read_lock(void) __acquires(srcu_ctlp) -{ - return srcu_read_lock(srcu_ctlp); -} - -static void srcu_perf_read_unlock(int idx) __releases(srcu_ctlp) -{ - srcu_read_unlock(srcu_ctlp, idx); -} - -static unsigned long srcu_perf_completed(void) -{ - return srcu_batches_completed(srcu_ctlp); -} - -static void srcu_call_rcu(struct rcu_head *head, rcu_callback_t func) -{ - call_srcu(srcu_ctlp, head, func); -} - -static void srcu_rcu_barrier(void) -{ - srcu_barrier(srcu_ctlp); -} - -static void srcu_perf_synchronize(void) -{ - synchronize_srcu(srcu_ctlp); -} - -static void srcu_perf_synchronize_expedited(void) -{ - synchronize_srcu_expedited(srcu_ctlp); -} - -static struct rcu_perf_ops srcu_ops = { - .ptype = SRCU_FLAVOR, - .init = rcu_sync_perf_init, - .readlock = srcu_perf_read_lock, - .readunlock = srcu_perf_read_unlock, - .get_gp_seq = srcu_perf_completed, - .gp_diff = rcu_seq_diff, - .exp_completed = srcu_perf_completed, - .async = srcu_call_rcu, - .gp_barrier = srcu_rcu_barrier, - .sync = srcu_perf_synchronize, - .exp_sync = srcu_perf_synchronize_expedited, - .name = "srcu" -}; - -static struct srcu_struct srcud; - -static void srcu_sync_perf_init(void) -{ - srcu_ctlp = &srcud; - init_srcu_struct(srcu_ctlp); -} - -static void srcu_sync_perf_cleanup(void) -{ - cleanup_srcu_struct(srcu_ctlp); -} - -static struct rcu_perf_ops srcud_ops = { - .ptype = SRCU_FLAVOR, - .init = srcu_sync_perf_init, - .cleanup = srcu_sync_perf_cleanup, - .readlock = srcu_perf_read_lock, - .readunlock = srcu_perf_read_unlock, - .get_gp_seq = srcu_perf_completed, - .gp_diff = rcu_seq_diff, - .exp_completed = srcu_perf_completed, - .async = srcu_call_rcu, - .gp_barrier = srcu_rcu_barrier, - .sync = srcu_perf_synchronize, - .exp_sync = srcu_perf_synchronize_expedited, - .name = "srcud" -}; - -/* - * Definitions for RCU-tasks perf testing. - */ - -static int tasks_perf_read_lock(void) -{ - return 0; -} - -static void tasks_perf_read_unlock(int idx) -{ -} - -static struct rcu_perf_ops tasks_ops = { - .ptype = RCU_TASKS_FLAVOR, - .init = rcu_sync_perf_init, - .readlock = tasks_perf_read_lock, - .readunlock = tasks_perf_read_unlock, - .get_gp_seq = rcu_no_completed, - .gp_diff = rcu_seq_diff, - .async = call_rcu_tasks, - .gp_barrier = rcu_barrier_tasks, - .sync = synchronize_rcu_tasks, - .exp_sync = synchronize_rcu_tasks, - .name = "tasks" -}; - -static unsigned long rcuperf_seq_diff(unsigned long new, unsigned long old) -{ - if (!cur_ops->gp_diff) - return new - old; - return cur_ops->gp_diff(new, old); -} - -/* - * If performance tests complete, wait for shutdown to commence. - */ -static void rcu_perf_wait_shutdown(void) -{ - cond_resched_tasks_rcu_qs(); - if (atomic_read(&n_rcu_perf_writer_finished) < nrealwriters) - return; - while (!torture_must_stop()) - schedule_timeout_uninterruptible(1); -} - -/* - * RCU perf reader kthread. Repeatedly does empty RCU read-side critical - * section, minimizing update-side interference. However, the point of - * this test is not to evaluate reader performance, but instead to serve - * as a test load for update-side performance testing. 
- */ -static int -rcu_perf_reader(void *arg) -{ - unsigned long flags; - int idx; - long me = (long)arg; - - VERBOSE_PERFOUT_STRING("rcu_perf_reader task started"); - set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids)); - set_user_nice(current, MAX_NICE); - atomic_inc(&n_rcu_perf_reader_started); - - do { - local_irq_save(flags); - idx = cur_ops->readlock(); - cur_ops->readunlock(idx); - local_irq_restore(flags); - rcu_perf_wait_shutdown(); - } while (!torture_must_stop()); - torture_kthread_stopping("rcu_perf_reader"); - return 0; -} - -/* - * Callback function for asynchronous grace periods from rcu_perf_writer(). - */ -static void rcu_perf_async_cb(struct rcu_head *rhp) -{ - atomic_dec(this_cpu_ptr(&n_async_inflight)); - kfree(rhp); -} - -/* - * RCU perf writer kthread. Repeatedly does a grace period. - */ -static int -rcu_perf_writer(void *arg) -{ - int i = 0; - int i_max; - long me = (long)arg; - struct rcu_head *rhp = NULL; - bool started = false, done = false, alldone = false; - u64 t; - u64 *wdp; - u64 *wdpp = writer_durations[me]; - - VERBOSE_PERFOUT_STRING("rcu_perf_writer task started"); - WARN_ON(!wdpp); - set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids)); - sched_set_fifo_low(current); - - if (holdoff) - schedule_timeout_uninterruptible(holdoff * HZ); - - /* - * Wait until rcu_end_inkernel_boot() is called for normal GP tests - * so that RCU is not always expedited for normal GP tests. - * The system_state test is approximate, but works well in practice. - */ - while (!gp_exp && system_state != SYSTEM_RUNNING) - schedule_timeout_uninterruptible(1); - - t = ktime_get_mono_fast_ns(); - if (atomic_inc_return(&n_rcu_perf_writer_started) >= nrealwriters) { - t_rcu_perf_writer_started = t; - if (gp_exp) { - b_rcu_gp_test_started = - cur_ops->exp_completed() / 2; - } else { - b_rcu_gp_test_started = cur_ops->get_gp_seq(); - } - } - - do { - if (writer_holdoff) - udelay(writer_holdoff); - wdp = &wdpp[i]; - *wdp = ktime_get_mono_fast_ns(); - if (gp_async) { -retry: - if (!rhp) - rhp = kmalloc(sizeof(*rhp), GFP_KERNEL); - if (rhp && atomic_read(this_cpu_ptr(&n_async_inflight)) < gp_async_max) { - atomic_inc(this_cpu_ptr(&n_async_inflight)); - cur_ops->async(rhp, rcu_perf_async_cb); - rhp = NULL; - } else if (!kthread_should_stop()) { - cur_ops->gp_barrier(); - goto retry; - } else { - kfree(rhp); /* Because we are stopping. */ - } - } else if (gp_exp) { - cur_ops->exp_sync(); - } else { - cur_ops->sync(); - } - t = ktime_get_mono_fast_ns(); - *wdp = t - *wdp; - i_max = i; - if (!started && - atomic_read(&n_rcu_perf_writer_started) >= nrealwriters) - started = true; - if (!done && i >= MIN_MEAS) { - done = true; - sched_set_normal(current, 0); - pr_alert("%s%s rcu_perf_writer %ld has %d measurements\n", - perf_type, PERF_FLAG, me, MIN_MEAS); - if (atomic_inc_return(&n_rcu_perf_writer_finished) >= - nrealwriters) { - schedule_timeout_interruptible(10); - rcu_ftrace_dump(DUMP_ALL); - PERFOUT_STRING("Test complete"); - t_rcu_perf_writer_finished = t; - if (gp_exp) { - b_rcu_gp_test_finished = - cur_ops->exp_completed() / 2; - } else { - b_rcu_gp_test_finished = - cur_ops->get_gp_seq(); - } - if (shutdown) { - smp_mb(); /* Assign before wake. 
*/ - wake_up(&shutdown_wq); - } - } - } - if (done && !alldone && - atomic_read(&n_rcu_perf_writer_finished) >= nrealwriters) - alldone = true; - if (started && !alldone && i < MAX_MEAS - 1) - i++; - rcu_perf_wait_shutdown(); - } while (!torture_must_stop()); - if (gp_async) { - cur_ops->gp_barrier(); - } - writer_n_durations[me] = i_max; - torture_kthread_stopping("rcu_perf_writer"); - return 0; -} - -static void -rcu_perf_print_module_parms(struct rcu_perf_ops *cur_ops, const char *tag) -{ - pr_alert("%s" PERF_FLAG - "--- %s: nreaders=%d nwriters=%d verbose=%d shutdown=%d\n", - perf_type, tag, nrealreaders, nrealwriters, verbose, shutdown); -} - -static void -rcu_perf_cleanup(void) -{ - int i; - int j; - int ngps = 0; - u64 *wdp; - u64 *wdpp; - - /* - * Would like warning at start, but everything is expedited - * during the mid-boot phase, so have to wait till the end. - */ - if (rcu_gp_is_expedited() && !rcu_gp_is_normal() && !gp_exp) - VERBOSE_PERFOUT_ERRSTRING("All grace periods expedited, no normal ones to measure!"); - if (rcu_gp_is_normal() && gp_exp) - VERBOSE_PERFOUT_ERRSTRING("All grace periods normal, no expedited ones to measure!"); - if (gp_exp && gp_async) - VERBOSE_PERFOUT_ERRSTRING("No expedited async GPs, so went with async!"); - - if (torture_cleanup_begin()) - return; - if (!cur_ops) { - torture_cleanup_end(); - return; - } - - if (reader_tasks) { - for (i = 0; i < nrealreaders; i++) - torture_stop_kthread(rcu_perf_reader, - reader_tasks[i]); - kfree(reader_tasks); - } - - if (writer_tasks) { - for (i = 0; i < nrealwriters; i++) { - torture_stop_kthread(rcu_perf_writer, - writer_tasks[i]); - if (!writer_n_durations) - continue; - j = writer_n_durations[i]; - pr_alert("%s%s writer %d gps: %d\n", - perf_type, PERF_FLAG, i, j); - ngps += j; - } - pr_alert("%s%s start: %llu end: %llu duration: %llu gps: %d batches: %ld\n", - perf_type, PERF_FLAG, - t_rcu_perf_writer_started, t_rcu_perf_writer_finished, - t_rcu_perf_writer_finished - - t_rcu_perf_writer_started, - ngps, - rcuperf_seq_diff(b_rcu_gp_test_finished, - b_rcu_gp_test_started)); - for (i = 0; i < nrealwriters; i++) { - if (!writer_durations) - break; - if (!writer_n_durations) - continue; - wdpp = writer_durations[i]; - if (!wdpp) - continue; - for (j = 0; j <= writer_n_durations[i]; j++) { - wdp = &wdpp[j]; - pr_alert("%s%s %4d writer-duration: %5d %llu\n", - perf_type, PERF_FLAG, - i, j, *wdp); - if (j % 100 == 0) - schedule_timeout_uninterruptible(1); - } - kfree(writer_durations[i]); - } - kfree(writer_tasks); - kfree(writer_durations); - kfree(writer_n_durations); - } - - /* Do torture-type-specific cleanup operations. */ - if (cur_ops->cleanup != NULL) - cur_ops->cleanup(); - - torture_cleanup_end(); -} - -/* - * Return the number if non-negative. If -1, the number of CPUs. - * If less than -1, that much less than the number of CPUs, but - * at least one. - */ -static int compute_real(int n) -{ - int nr; - - if (n >= 0) { - nr = n; - } else { - nr = num_online_cpus() + 1 + n; - if (nr <= 0) - nr = 1; - } - return nr; -} - -/* - * RCU perf shutdown kthread. Just waits to be awakened, then shuts - * down system. - */ -static int -rcu_perf_shutdown(void *arg) -{ - wait_event(shutdown_wq, - atomic_read(&n_rcu_perf_writer_finished) >= nrealwriters); - smp_mb(); /* Wake before output. 
*/ - rcu_perf_cleanup(); - kernel_power_off(); - return -EINVAL; -} - -/* - * kfree_rcu() performance tests: Start a kfree_rcu() loop on all CPUs for number - * of iterations and measure total time and number of GP for all iterations to complete. - */ - -torture_param(int, kfree_nthreads, -1, "Number of threads running loops of kfree_rcu()."); -torture_param(int, kfree_alloc_num, 8000, "Number of allocations and frees done in an iteration."); -torture_param(int, kfree_loops, 10, "Number of loops doing kfree_alloc_num allocations and frees."); - -static struct task_struct **kfree_reader_tasks; -static int kfree_nrealthreads; -static atomic_t n_kfree_perf_thread_started; -static atomic_t n_kfree_perf_thread_ended; - -struct kfree_obj { - char kfree_obj[8]; - struct rcu_head rh; -}; - -static int -kfree_perf_thread(void *arg) -{ - int i, loop = 0; - long me = (long)arg; - struct kfree_obj *alloc_ptr; - u64 start_time, end_time; - long long mem_begin, mem_during = 0; - - VERBOSE_PERFOUT_STRING("kfree_perf_thread task started"); - set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids)); - set_user_nice(current, MAX_NICE); - - start_time = ktime_get_mono_fast_ns(); - - if (atomic_inc_return(&n_kfree_perf_thread_started) >= kfree_nrealthreads) { - if (gp_exp) - b_rcu_gp_test_started = cur_ops->exp_completed() / 2; - else - b_rcu_gp_test_started = cur_ops->get_gp_seq(); - } - - do { - if (!mem_during) { - mem_during = mem_begin = si_mem_available(); - } else if (loop % (kfree_loops / 4) == 0) { - mem_during = (mem_during + si_mem_available()) / 2; - } - - for (i = 0; i < kfree_alloc_num; i++) { - alloc_ptr = kmalloc(kfree_mult * sizeof(struct kfree_obj), GFP_KERNEL); - if (!alloc_ptr) - return -ENOMEM; - - kfree_rcu(alloc_ptr, rh); - } - - cond_resched(); - } while (!torture_must_stop() && ++loop < kfree_loops); - - if (atomic_inc_return(&n_kfree_perf_thread_ended) >= kfree_nrealthreads) { - end_time = ktime_get_mono_fast_ns(); - - if (gp_exp) - b_rcu_gp_test_finished = cur_ops->exp_completed() / 2; - else - b_rcu_gp_test_finished = cur_ops->get_gp_seq(); - - pr_alert("Total time taken by all kfree'ers: %llu ns, loops: %d, batches: %ld, memory footprint: %lldMB\n", - (unsigned long long)(end_time - start_time), kfree_loops, - rcuperf_seq_diff(b_rcu_gp_test_finished, b_rcu_gp_test_started), - (mem_begin - mem_during) >> (20 - PAGE_SHIFT)); - - if (shutdown) { - smp_mb(); /* Assign before wake. */ - wake_up(&shutdown_wq); - } - } - - torture_kthread_stopping("kfree_perf_thread"); - return 0; -} - -static void -kfree_perf_cleanup(void) -{ - int i; - - if (torture_cleanup_begin()) - return; - - if (kfree_reader_tasks) { - for (i = 0; i < kfree_nrealthreads; i++) - torture_stop_kthread(kfree_perf_thread, - kfree_reader_tasks[i]); - kfree(kfree_reader_tasks); - } - - torture_cleanup_end(); -} - -/* - * shutdown kthread. Just waits to be awakened, then shuts down system. - */ -static int -kfree_perf_shutdown(void *arg) -{ - wait_event(shutdown_wq, - atomic_read(&n_kfree_perf_thread_ended) >= kfree_nrealthreads); - - smp_mb(); /* Wake before output. */ - - kfree_perf_cleanup(); - kernel_power_off(); - return -EINVAL; -} - -static int __init -kfree_perf_init(void) -{ - long i; - int firsterr = 0; - - kfree_nrealthreads = compute_real(kfree_nthreads); - /* Start up the kthreads. 
*/ - if (shutdown) { - init_waitqueue_head(&shutdown_wq); - firsterr = torture_create_kthread(kfree_perf_shutdown, NULL, - shutdown_task); - if (firsterr) - goto unwind; - schedule_timeout_uninterruptible(1); - } - - pr_alert("kfree object size=%zu\n", kfree_mult * sizeof(struct kfree_obj)); - - kfree_reader_tasks = kcalloc(kfree_nrealthreads, sizeof(kfree_reader_tasks[0]), - GFP_KERNEL); - if (kfree_reader_tasks == NULL) { - firsterr = -ENOMEM; - goto unwind; - } - - for (i = 0; i < kfree_nrealthreads; i++) { - firsterr = torture_create_kthread(kfree_perf_thread, (void *)i, - kfree_reader_tasks[i]); - if (firsterr) - goto unwind; - } - - while (atomic_read(&n_kfree_perf_thread_started) < kfree_nrealthreads) - schedule_timeout_uninterruptible(1); - - torture_init_end(); - return 0; - -unwind: - torture_init_end(); - kfree_perf_cleanup(); - return firsterr; -} - -static int __init -rcu_perf_init(void) -{ - long i; - int firsterr = 0; - static struct rcu_perf_ops *perf_ops[] = { - &rcu_ops, &srcu_ops, &srcud_ops, &tasks_ops, - }; - - if (!torture_init_begin(perf_type, verbose)) - return -EBUSY; - - /* Process args and tell the world that the perf'er is on the job. */ - for (i = 0; i < ARRAY_SIZE(perf_ops); i++) { - cur_ops = perf_ops[i]; - if (strcmp(perf_type, cur_ops->name) == 0) - break; - } - if (i == ARRAY_SIZE(perf_ops)) { - pr_alert("rcu-perf: invalid perf type: \"%s\"\n", perf_type); - pr_alert("rcu-perf types:"); - for (i = 0; i < ARRAY_SIZE(perf_ops); i++) - pr_cont(" %s", perf_ops[i]->name); - pr_cont("\n"); - WARN_ON(!IS_MODULE(CONFIG_RCU_PERF_TEST)); - firsterr = -EINVAL; - cur_ops = NULL; - goto unwind; - } - if (cur_ops->init) - cur_ops->init(); - - if (kfree_rcu_test) - return kfree_perf_init(); - - nrealwriters = compute_real(nwriters); - nrealreaders = compute_real(nreaders); - atomic_set(&n_rcu_perf_reader_started, 0); - atomic_set(&n_rcu_perf_writer_started, 0); - atomic_set(&n_rcu_perf_writer_finished, 0); - rcu_perf_print_module_parms(cur_ops, "Start of test"); - - /* Start up the kthreads. 
*/ - - if (shutdown) { - init_waitqueue_head(&shutdown_wq); - firsterr = torture_create_kthread(rcu_perf_shutdown, NULL, - shutdown_task); - if (firsterr) - goto unwind; - schedule_timeout_uninterruptible(1); - } - reader_tasks = kcalloc(nrealreaders, sizeof(reader_tasks[0]), - GFP_KERNEL); - if (reader_tasks == NULL) { - VERBOSE_PERFOUT_ERRSTRING("out of memory"); - firsterr = -ENOMEM; - goto unwind; - } - for (i = 0; i < nrealreaders; i++) { - firsterr = torture_create_kthread(rcu_perf_reader, (void *)i, - reader_tasks[i]); - if (firsterr) - goto unwind; - } - while (atomic_read(&n_rcu_perf_reader_started) < nrealreaders) - schedule_timeout_uninterruptible(1); - writer_tasks = kcalloc(nrealwriters, sizeof(reader_tasks[0]), - GFP_KERNEL); - writer_durations = kcalloc(nrealwriters, sizeof(*writer_durations), - GFP_KERNEL); - writer_n_durations = - kcalloc(nrealwriters, sizeof(*writer_n_durations), - GFP_KERNEL); - if (!writer_tasks || !writer_durations || !writer_n_durations) { - VERBOSE_PERFOUT_ERRSTRING("out of memory"); - firsterr = -ENOMEM; - goto unwind; - } - for (i = 0; i < nrealwriters; i++) { - writer_durations[i] = - kcalloc(MAX_MEAS, sizeof(*writer_durations[i]), - GFP_KERNEL); - if (!writer_durations[i]) { - firsterr = -ENOMEM; - goto unwind; - } - firsterr = torture_create_kthread(rcu_perf_writer, (void *)i, - writer_tasks[i]); - if (firsterr) - goto unwind; - } - torture_init_end(); - return 0; - -unwind: - torture_init_end(); - rcu_perf_cleanup(); - return firsterr; -} - -module_init(rcu_perf_init); -module_exit(rcu_perf_cleanup); diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c new file mode 100644 index 000000000000..2819b95479af --- /dev/null +++ b/kernel/rcu/rcuscale.c @@ -0,0 +1,853 @@ +// SPDX-License-Identifier: GPL-2.0+ +/* + * Read-Copy Update module-based scalability-test facility + * + * Copyright (C) IBM Corporation, 2015 + * + * Authors: Paul E. McKenney + */ + +#define pr_fmt(fmt) fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "rcu.h" + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Paul E. McKenney "); + +#define SCALE_FLAG "-scale:" +#define SCALEOUT_STRING(s) \ + pr_alert("%s" SCALE_FLAG " %s\n", scale_type, s) +#define VERBOSE_SCALEOUT_STRING(s) \ + do { if (verbose) pr_alert("%s" SCALE_FLAG " %s\n", scale_type, s); } while (0) +#define VERBOSE_SCALEOUT_ERRSTRING(s) \ + do { if (verbose) pr_alert("%s" SCALE_FLAG "!!! %s\n", scale_type, s); } while (0) + +/* + * The intended use cases for the nreaders and nwriters module parameters + * are as follows: + * + * 1. Specify only the nr_cpus kernel boot parameter. This will + * set both nreaders and nwriters to the value specified by + * nr_cpus for a mixed reader/writer test. + * + * 2. Specify the nr_cpus kernel boot parameter, but set + * rcuscale.nreaders to zero. This will set nwriters to the + * value specified by nr_cpus for an update-only test. + * + * 3. Specify the nr_cpus kernel boot parameter, but set + * rcuscale.nwriters to zero. This will set nreaders to the + * value specified by nr_cpus for a read-only test. + * + * Various other use cases may of course be specified. + * + * Note that this test's readers are intended only as a test load for + * the writers. 
The reader scalability statistics will be overly + * pessimistic due to the per-critical-section interrupt disabling, + * test-end checks, and the pair of calls through pointers. + */ + +#ifdef MODULE +# define RCUSCALE_SHUTDOWN 0 +#else +# define RCUSCALE_SHUTDOWN 1 +#endif + +torture_param(bool, gp_async, false, "Use asynchronous GP wait primitives"); +torture_param(int, gp_async_max, 1000, "Max # outstanding waits per reader"); +torture_param(bool, gp_exp, false, "Use expedited GP wait primitives"); +torture_param(int, holdoff, 10, "Holdoff time before test start (s)"); +torture_param(int, nreaders, -1, "Number of RCU reader threads"); +torture_param(int, nwriters, -1, "Number of RCU updater threads"); +torture_param(bool, shutdown, RCUSCALE_SHUTDOWN, + "Shutdown at end of scalability tests."); +torture_param(int, verbose, 1, "Enable verbose debugging printk()s"); +torture_param(int, writer_holdoff, 0, "Holdoff (us) between GPs, zero to disable"); +torture_param(int, kfree_rcu_test, 0, "Do we run a kfree_rcu() scale test?"); +torture_param(int, kfree_mult, 1, "Multiple of kfree_obj size to allocate."); + +static char *scale_type = "rcu"; +module_param(scale_type, charp, 0444); +MODULE_PARM_DESC(scale_type, "Type of RCU to scalability-test (rcu, srcu, ...)"); + +static int nrealreaders; +static int nrealwriters; +static struct task_struct **writer_tasks; +static struct task_struct **reader_tasks; +static struct task_struct *shutdown_task; + +static u64 **writer_durations; +static int *writer_n_durations; +static atomic_t n_rcu_scale_reader_started; +static atomic_t n_rcu_scale_writer_started; +static atomic_t n_rcu_scale_writer_finished; +static wait_queue_head_t shutdown_wq; +static u64 t_rcu_scale_writer_started; +static u64 t_rcu_scale_writer_finished; +static unsigned long b_rcu_gp_test_started; +static unsigned long b_rcu_gp_test_finished; +static DEFINE_PER_CPU(atomic_t, n_async_inflight); + +#define MAX_MEAS 10000 +#define MIN_MEAS 100 + +/* + * Operations vector for selecting different types of tests. + */ + +struct rcu_scale_ops { + int ptype; + void (*init)(void); + void (*cleanup)(void); + int (*readlock)(void); + void (*readunlock)(int idx); + unsigned long (*get_gp_seq)(void); + unsigned long (*gp_diff)(unsigned long new, unsigned long old); + unsigned long (*exp_completed)(void); + void (*async)(struct rcu_head *head, rcu_callback_t func); + void (*gp_barrier)(void); + void (*sync)(void); + void (*exp_sync)(void); + const char *name; +}; + +static struct rcu_scale_ops *cur_ops; + +/* + * Definitions for rcu scalability testing. + */ + +static int rcu_scale_read_lock(void) __acquires(RCU) +{ + rcu_read_lock(); + return 0; +} + +static void rcu_scale_read_unlock(int idx) __releases(RCU) +{ + rcu_read_unlock(); +} + +static unsigned long __maybe_unused rcu_no_completed(void) +{ + return 0; +} + +static void rcu_sync_scale_init(void) +{ +} + +static struct rcu_scale_ops rcu_ops = { + .ptype = RCU_FLAVOR, + .init = rcu_sync_scale_init, + .readlock = rcu_scale_read_lock, + .readunlock = rcu_scale_read_unlock, + .get_gp_seq = rcu_get_gp_seq, + .gp_diff = rcu_seq_diff, + .exp_completed = rcu_exp_batches_completed, + .async = call_rcu, + .gp_barrier = rcu_barrier, + .sync = synchronize_rcu, + .exp_sync = synchronize_rcu_expedited, + .name = "rcu" +}; + +/* + * Definitions for srcu scalability testing. 
+ */ + +DEFINE_STATIC_SRCU(srcu_ctl_scale); +static struct srcu_struct *srcu_ctlp = &srcu_ctl_scale; + +static int srcu_scale_read_lock(void) __acquires(srcu_ctlp) +{ + return srcu_read_lock(srcu_ctlp); +} + +static void srcu_scale_read_unlock(int idx) __releases(srcu_ctlp) +{ + srcu_read_unlock(srcu_ctlp, idx); +} + +static unsigned long srcu_scale_completed(void) +{ + return srcu_batches_completed(srcu_ctlp); +} + +static void srcu_call_rcu(struct rcu_head *head, rcu_callback_t func) +{ + call_srcu(srcu_ctlp, head, func); +} + +static void srcu_rcu_barrier(void) +{ + srcu_barrier(srcu_ctlp); +} + +static void srcu_scale_synchronize(void) +{ + synchronize_srcu(srcu_ctlp); +} + +static void srcu_scale_synchronize_expedited(void) +{ + synchronize_srcu_expedited(srcu_ctlp); +} + +static struct rcu_scale_ops srcu_ops = { + .ptype = SRCU_FLAVOR, + .init = rcu_sync_scale_init, + .readlock = srcu_scale_read_lock, + .readunlock = srcu_scale_read_unlock, + .get_gp_seq = srcu_scale_completed, + .gp_diff = rcu_seq_diff, + .exp_completed = srcu_scale_completed, + .async = srcu_call_rcu, + .gp_barrier = srcu_rcu_barrier, + .sync = srcu_scale_synchronize, + .exp_sync = srcu_scale_synchronize_expedited, + .name = "srcu" +}; + +static struct srcu_struct srcud; + +static void srcu_sync_scale_init(void) +{ + srcu_ctlp = &srcud; + init_srcu_struct(srcu_ctlp); +} + +static void srcu_sync_scale_cleanup(void) +{ + cleanup_srcu_struct(srcu_ctlp); +} + +static struct rcu_scale_ops srcud_ops = { + .ptype = SRCU_FLAVOR, + .init = srcu_sync_scale_init, + .cleanup = srcu_sync_scale_cleanup, + .readlock = srcu_scale_read_lock, + .readunlock = srcu_scale_read_unlock, + .get_gp_seq = srcu_scale_completed, + .gp_diff = rcu_seq_diff, + .exp_completed = srcu_scale_completed, + .async = srcu_call_rcu, + .gp_barrier = srcu_rcu_barrier, + .sync = srcu_scale_synchronize, + .exp_sync = srcu_scale_synchronize_expedited, + .name = "srcud" +}; + +/* + * Definitions for RCU-tasks scalability testing. + */ + +static int tasks_scale_read_lock(void) +{ + return 0; +} + +static void tasks_scale_read_unlock(int idx) +{ +} + +static struct rcu_scale_ops tasks_ops = { + .ptype = RCU_TASKS_FLAVOR, + .init = rcu_sync_scale_init, + .readlock = tasks_scale_read_lock, + .readunlock = tasks_scale_read_unlock, + .get_gp_seq = rcu_no_completed, + .gp_diff = rcu_seq_diff, + .async = call_rcu_tasks, + .gp_barrier = rcu_barrier_tasks, + .sync = synchronize_rcu_tasks, + .exp_sync = synchronize_rcu_tasks, + .name = "tasks" +}; + +static unsigned long rcuscale_seq_diff(unsigned long new, unsigned long old) +{ + if (!cur_ops->gp_diff) + return new - old; + return cur_ops->gp_diff(new, old); +} + +/* + * If scalability tests complete, wait for shutdown to commence. + */ +static void rcu_scale_wait_shutdown(void) +{ + cond_resched_tasks_rcu_qs(); + if (atomic_read(&n_rcu_scale_writer_finished) < nrealwriters) + return; + while (!torture_must_stop()) + schedule_timeout_uninterruptible(1); +} + +/* + * RCU scalability reader kthread. Repeatedly does empty RCU read-side + * critical section, minimizing update-side interference. However, the + * point of this test is not to evaluate reader scalability, but instead + * to serve as a test load for update-side scalability testing. 
+ */ +static int +rcu_scale_reader(void *arg) +{ + unsigned long flags; + int idx; + long me = (long)arg; + + VERBOSE_SCALEOUT_STRING("rcu_scale_reader task started"); + set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids)); + set_user_nice(current, MAX_NICE); + atomic_inc(&n_rcu_scale_reader_started); + + do { + local_irq_save(flags); + idx = cur_ops->readlock(); + cur_ops->readunlock(idx); + local_irq_restore(flags); + rcu_scale_wait_shutdown(); + } while (!torture_must_stop()); + torture_kthread_stopping("rcu_scale_reader"); + return 0; +} + +/* + * Callback function for asynchronous grace periods from rcu_scale_writer(). + */ +static void rcu_scale_async_cb(struct rcu_head *rhp) +{ + atomic_dec(this_cpu_ptr(&n_async_inflight)); + kfree(rhp); +} + +/* + * RCU scale writer kthread. Repeatedly does a grace period. + */ +static int +rcu_scale_writer(void *arg) +{ + int i = 0; + int i_max; + long me = (long)arg; + struct rcu_head *rhp = NULL; + bool started = false, done = false, alldone = false; + u64 t; + u64 *wdp; + u64 *wdpp = writer_durations[me]; + + VERBOSE_SCALEOUT_STRING("rcu_scale_writer task started"); + WARN_ON(!wdpp); + set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids)); + sched_set_fifo_low(current); + + if (holdoff) + schedule_timeout_uninterruptible(holdoff * HZ); + + /* + * Wait until rcu_end_inkernel_boot() is called for normal GP tests + * so that RCU is not always expedited for normal GP tests. + * The system_state test is approximate, but works well in practice. + */ + while (!gp_exp && system_state != SYSTEM_RUNNING) + schedule_timeout_uninterruptible(1); + + t = ktime_get_mono_fast_ns(); + if (atomic_inc_return(&n_rcu_scale_writer_started) >= nrealwriters) { + t_rcu_scale_writer_started = t; + if (gp_exp) { + b_rcu_gp_test_started = + cur_ops->exp_completed() / 2; + } else { + b_rcu_gp_test_started = cur_ops->get_gp_seq(); + } + } + + do { + if (writer_holdoff) + udelay(writer_holdoff); + wdp = &wdpp[i]; + *wdp = ktime_get_mono_fast_ns(); + if (gp_async) { +retry: + if (!rhp) + rhp = kmalloc(sizeof(*rhp), GFP_KERNEL); + if (rhp && atomic_read(this_cpu_ptr(&n_async_inflight)) < gp_async_max) { + atomic_inc(this_cpu_ptr(&n_async_inflight)); + cur_ops->async(rhp, rcu_scale_async_cb); + rhp = NULL; + } else if (!kthread_should_stop()) { + cur_ops->gp_barrier(); + goto retry; + } else { + kfree(rhp); /* Because we are stopping. */ + } + } else if (gp_exp) { + cur_ops->exp_sync(); + } else { + cur_ops->sync(); + } + t = ktime_get_mono_fast_ns(); + *wdp = t - *wdp; + i_max = i; + if (!started && + atomic_read(&n_rcu_scale_writer_started) >= nrealwriters) + started = true; + if (!done && i >= MIN_MEAS) { + done = true; + sched_set_normal(current, 0); + pr_alert("%s%s rcu_scale_writer %ld has %d measurements\n", + scale_type, SCALE_FLAG, me, MIN_MEAS); + if (atomic_inc_return(&n_rcu_scale_writer_finished) >= + nrealwriters) { + schedule_timeout_interruptible(10); + rcu_ftrace_dump(DUMP_ALL); + SCALEOUT_STRING("Test complete"); + t_rcu_scale_writer_finished = t; + if (gp_exp) { + b_rcu_gp_test_finished = + cur_ops->exp_completed() / 2; + } else { + b_rcu_gp_test_finished = + cur_ops->get_gp_seq(); + } + if (shutdown) { + smp_mb(); /* Assign before wake. 
*/ + wake_up(&shutdown_wq); + } + } + } + if (done && !alldone && + atomic_read(&n_rcu_scale_writer_finished) >= nrealwriters) + alldone = true; + if (started && !alldone && i < MAX_MEAS - 1) + i++; + rcu_scale_wait_shutdown(); + } while (!torture_must_stop()); + if (gp_async) { + cur_ops->gp_barrier(); + } + writer_n_durations[me] = i_max; + torture_kthread_stopping("rcu_scale_writer"); + return 0; +} + +static void +rcu_scale_print_module_parms(struct rcu_scale_ops *cur_ops, const char *tag) +{ + pr_alert("%s" SCALE_FLAG + "--- %s: nreaders=%d nwriters=%d verbose=%d shutdown=%d\n", + scale_type, tag, nrealreaders, nrealwriters, verbose, shutdown); +} + +static void +rcu_scale_cleanup(void) +{ + int i; + int j; + int ngps = 0; + u64 *wdp; + u64 *wdpp; + + /* + * Would like warning at start, but everything is expedited + * during the mid-boot phase, so have to wait till the end. + */ + if (rcu_gp_is_expedited() && !rcu_gp_is_normal() && !gp_exp) + VERBOSE_SCALEOUT_ERRSTRING("All grace periods expedited, no normal ones to measure!"); + if (rcu_gp_is_normal() && gp_exp) + VERBOSE_SCALEOUT_ERRSTRING("All grace periods normal, no expedited ones to measure!"); + if (gp_exp && gp_async) + VERBOSE_SCALEOUT_ERRSTRING("No expedited async GPs, so went with async!"); + + if (torture_cleanup_begin()) + return; + if (!cur_ops) { + torture_cleanup_end(); + return; + } + + if (reader_tasks) { + for (i = 0; i < nrealreaders; i++) + torture_stop_kthread(rcu_scale_reader, + reader_tasks[i]); + kfree(reader_tasks); + } + + if (writer_tasks) { + for (i = 0; i < nrealwriters; i++) { + torture_stop_kthread(rcu_scale_writer, + writer_tasks[i]); + if (!writer_n_durations) + continue; + j = writer_n_durations[i]; + pr_alert("%s%s writer %d gps: %d\n", + scale_type, SCALE_FLAG, i, j); + ngps += j; + } + pr_alert("%s%s start: %llu end: %llu duration: %llu gps: %d batches: %ld\n", + scale_type, SCALE_FLAG, + t_rcu_scale_writer_started, t_rcu_scale_writer_finished, + t_rcu_scale_writer_finished - + t_rcu_scale_writer_started, + ngps, + rcuscale_seq_diff(b_rcu_gp_test_finished, + b_rcu_gp_test_started)); + for (i = 0; i < nrealwriters; i++) { + if (!writer_durations) + break; + if (!writer_n_durations) + continue; + wdpp = writer_durations[i]; + if (!wdpp) + continue; + for (j = 0; j <= writer_n_durations[i]; j++) { + wdp = &wdpp[j]; + pr_alert("%s%s %4d writer-duration: %5d %llu\n", + scale_type, SCALE_FLAG, + i, j, *wdp); + if (j % 100 == 0) + schedule_timeout_uninterruptible(1); + } + kfree(writer_durations[i]); + } + kfree(writer_tasks); + kfree(writer_durations); + kfree(writer_n_durations); + } + + /* Do torture-type-specific cleanup operations. */ + if (cur_ops->cleanup != NULL) + cur_ops->cleanup(); + + torture_cleanup_end(); +} + +/* + * Return the number if non-negative. If -1, the number of CPUs. + * If less than -1, that much less than the number of CPUs, but + * at least one. + */ +static int compute_real(int n) +{ + int nr; + + if (n >= 0) { + nr = n; + } else { + nr = num_online_cpus() + 1 + n; + if (nr <= 0) + nr = 1; + } + return nr; +} + +/* + * RCU scalability shutdown kthread. Just waits to be awakened, then shuts + * down system. + */ +static int +rcu_scale_shutdown(void *arg) +{ + wait_event(shutdown_wq, + atomic_read(&n_rcu_scale_writer_finished) >= nrealwriters); + smp_mb(); /* Wake before output. 
*/ + rcu_scale_cleanup(); + kernel_power_off(); + return -EINVAL; +} + +/* + * kfree_rcu() scalability tests: Start a kfree_rcu() loop on all CPUs for number + * of iterations and measure total time and number of GP for all iterations to complete. + */ + +torture_param(int, kfree_nthreads, -1, "Number of threads running loops of kfree_rcu()."); +torture_param(int, kfree_alloc_num, 8000, "Number of allocations and frees done in an iteration."); +torture_param(int, kfree_loops, 10, "Number of loops doing kfree_alloc_num allocations and frees."); + +static struct task_struct **kfree_reader_tasks; +static int kfree_nrealthreads; +static atomic_t n_kfree_scale_thread_started; +static atomic_t n_kfree_scale_thread_ended; + +struct kfree_obj { + char kfree_obj[8]; + struct rcu_head rh; +}; + +static int +kfree_scale_thread(void *arg) +{ + int i, loop = 0; + long me = (long)arg; + struct kfree_obj *alloc_ptr; + u64 start_time, end_time; + long long mem_begin, mem_during = 0; + + VERBOSE_SCALEOUT_STRING("kfree_scale_thread task started"); + set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids)); + set_user_nice(current, MAX_NICE); + + start_time = ktime_get_mono_fast_ns(); + + if (atomic_inc_return(&n_kfree_scale_thread_started) >= kfree_nrealthreads) { + if (gp_exp) + b_rcu_gp_test_started = cur_ops->exp_completed() / 2; + else + b_rcu_gp_test_started = cur_ops->get_gp_seq(); + } + + do { + if (!mem_during) { + mem_during = mem_begin = si_mem_available(); + } else if (loop % (kfree_loops / 4) == 0) { + mem_during = (mem_during + si_mem_available()) / 2; + } + + for (i = 0; i < kfree_alloc_num; i++) { + alloc_ptr = kmalloc(kfree_mult * sizeof(struct kfree_obj), GFP_KERNEL); + if (!alloc_ptr) + return -ENOMEM; + + kfree_rcu(alloc_ptr, rh); + } + + cond_resched(); + } while (!torture_must_stop() && ++loop < kfree_loops); + + if (atomic_inc_return(&n_kfree_scale_thread_ended) >= kfree_nrealthreads) { + end_time = ktime_get_mono_fast_ns(); + + if (gp_exp) + b_rcu_gp_test_finished = cur_ops->exp_completed() / 2; + else + b_rcu_gp_test_finished = cur_ops->get_gp_seq(); + + pr_alert("Total time taken by all kfree'ers: %llu ns, loops: %d, batches: %ld, memory footprint: %lldMB\n", + (unsigned long long)(end_time - start_time), kfree_loops, + rcuscale_seq_diff(b_rcu_gp_test_finished, b_rcu_gp_test_started), + (mem_begin - mem_during) >> (20 - PAGE_SHIFT)); + + if (shutdown) { + smp_mb(); /* Assign before wake. */ + wake_up(&shutdown_wq); + } + } + + torture_kthread_stopping("kfree_scale_thread"); + return 0; +} + +static void +kfree_scale_cleanup(void) +{ + int i; + + if (torture_cleanup_begin()) + return; + + if (kfree_reader_tasks) { + for (i = 0; i < kfree_nrealthreads; i++) + torture_stop_kthread(kfree_scale_thread, + kfree_reader_tasks[i]); + kfree(kfree_reader_tasks); + } + + torture_cleanup_end(); +} + +/* + * shutdown kthread. Just waits to be awakened, then shuts down system. + */ +static int +kfree_scale_shutdown(void *arg) +{ + wait_event(shutdown_wq, + atomic_read(&n_kfree_scale_thread_ended) >= kfree_nrealthreads); + + smp_mb(); /* Wake before output. */ + + kfree_scale_cleanup(); + kernel_power_off(); + return -EINVAL; +} + +static int __init +kfree_scale_init(void) +{ + long i; + int firsterr = 0; + + kfree_nrealthreads = compute_real(kfree_nthreads); + /* Start up the kthreads. 
*/ + if (shutdown) { + init_waitqueue_head(&shutdown_wq); + firsterr = torture_create_kthread(kfree_scale_shutdown, NULL, + shutdown_task); + if (firsterr) + goto unwind; + schedule_timeout_uninterruptible(1); + } + + pr_alert("kfree object size=%zu\n", kfree_mult * sizeof(struct kfree_obj)); + + kfree_reader_tasks = kcalloc(kfree_nrealthreads, sizeof(kfree_reader_tasks[0]), + GFP_KERNEL); + if (kfree_reader_tasks == NULL) { + firsterr = -ENOMEM; + goto unwind; + } + + for (i = 0; i < kfree_nrealthreads; i++) { + firsterr = torture_create_kthread(kfree_scale_thread, (void *)i, + kfree_reader_tasks[i]); + if (firsterr) + goto unwind; + } + + while (atomic_read(&n_kfree_scale_thread_started) < kfree_nrealthreads) + schedule_timeout_uninterruptible(1); + + torture_init_end(); + return 0; + +unwind: + torture_init_end(); + kfree_scale_cleanup(); + return firsterr; +} + +static int __init +rcu_scale_init(void) +{ + long i; + int firsterr = 0; + static struct rcu_scale_ops *scale_ops[] = { + &rcu_ops, &srcu_ops, &srcud_ops, &tasks_ops, + }; + + if (!torture_init_begin(scale_type, verbose)) + return -EBUSY; + + /* Process args and announce that the scalability'er is on the job. */ + for (i = 0; i < ARRAY_SIZE(scale_ops); i++) { + cur_ops = scale_ops[i]; + if (strcmp(scale_type, cur_ops->name) == 0) + break; + } + if (i == ARRAY_SIZE(scale_ops)) { + pr_alert("rcu-scale: invalid scale type: \"%s\"\n", scale_type); + pr_alert("rcu-scale types:"); + for (i = 0; i < ARRAY_SIZE(scale_ops); i++) + pr_cont(" %s", scale_ops[i]->name); + pr_cont("\n"); + WARN_ON(!IS_MODULE(CONFIG_RCU_SCALE_TEST)); + firsterr = -EINVAL; + cur_ops = NULL; + goto unwind; + } + if (cur_ops->init) + cur_ops->init(); + + if (kfree_rcu_test) + return kfree_scale_init(); + + nrealwriters = compute_real(nwriters); + nrealreaders = compute_real(nreaders); + atomic_set(&n_rcu_scale_reader_started, 0); + atomic_set(&n_rcu_scale_writer_started, 0); + atomic_set(&n_rcu_scale_writer_finished, 0); + rcu_scale_print_module_parms(cur_ops, "Start of test"); + + /* Start up the kthreads. 
*/ + + if (shutdown) { + init_waitqueue_head(&shutdown_wq); + firsterr = torture_create_kthread(rcu_scale_shutdown, NULL, + shutdown_task); + if (firsterr) + goto unwind; + schedule_timeout_uninterruptible(1); + } + reader_tasks = kcalloc(nrealreaders, sizeof(reader_tasks[0]), + GFP_KERNEL); + if (reader_tasks == NULL) { + VERBOSE_SCALEOUT_ERRSTRING("out of memory"); + firsterr = -ENOMEM; + goto unwind; + } + for (i = 0; i < nrealreaders; i++) { + firsterr = torture_create_kthread(rcu_scale_reader, (void *)i, + reader_tasks[i]); + if (firsterr) + goto unwind; + } + while (atomic_read(&n_rcu_scale_reader_started) < nrealreaders) + schedule_timeout_uninterruptible(1); + writer_tasks = kcalloc(nrealwriters, sizeof(reader_tasks[0]), + GFP_KERNEL); + writer_durations = kcalloc(nrealwriters, sizeof(*writer_durations), + GFP_KERNEL); + writer_n_durations = + kcalloc(nrealwriters, sizeof(*writer_n_durations), + GFP_KERNEL); + if (!writer_tasks || !writer_durations || !writer_n_durations) { + VERBOSE_SCALEOUT_ERRSTRING("out of memory"); + firsterr = -ENOMEM; + goto unwind; + } + for (i = 0; i < nrealwriters; i++) { + writer_durations[i] = + kcalloc(MAX_MEAS, sizeof(*writer_durations[i]), + GFP_KERNEL); + if (!writer_durations[i]) { + firsterr = -ENOMEM; + goto unwind; + } + firsterr = torture_create_kthread(rcu_scale_writer, (void *)i, + writer_tasks[i]); + if (firsterr) + goto unwind; + } + torture_init_end(); + return 0; + +unwind: + torture_init_end(); + rcu_scale_cleanup(); + return firsterr; +} + +module_init(rcu_scale_init); +module_exit(rcu_scale_cleanup); diff --git a/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcuperf-ftrace.sh b/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcuperf-ftrace.sh deleted file mode 100755 index 7d3c2be66c64..000000000000 --- a/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcuperf-ftrace.sh +++ /dev/null @@ -1,109 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0+ -# -# Analyze a given results directory for rcuperf performance measurements, -# looking for ftrace data. Exits with 0 if data was found, analyzed, and -# printed. Intended to be invoked from kvm-recheck-rcuperf.sh after -# argument checking. -# -# Usage: kvm-recheck-rcuperf-ftrace.sh resdir -# -# Copyright (C) IBM Corporation, 2016 -# -# Authors: Paul E. McKenney - -i="$1" -. functions.sh - -if test "`grep -c 'rcu_exp_grace_period.*start' < $i/console.log`" -lt 100 -then - exit 10 -fi - -sed -e 's/^\[[^]]*]//' < $i/console.log | -grep 'us : rcu_exp_grace_period' | -sed -e 's/us : / : /' | -tr -d '\015' | -awk ' -$8 == "start" { - if (startseq != "") - nlost++; - starttask = $1; - starttime = $3; - startseq = $7; - seqtask[startseq] = starttask; -} - -$8 == "end" { - if (startseq == $7) { - curgpdur = $3 - starttime; - gptimes[++n] = curgpdur; - gptaskcnt[starttask]++; - sum += curgpdur; - if (curgpdur > 1000) - print "Long GP " starttime "us to " $3 "us (" curgpdur "us)"; - startseq = ""; - } else { - # Lost a message or some such, reset. - startseq = ""; - nlost++; - } -} - -$8 == "done" && seqtask[$7] != $1 { - piggybackcnt[$1]++; -} - -END { - newNR = asort(gptimes); - if (newNR <= 0) { - print "No ftrace records found???" 
- exit 10; - } - pct50 = int(newNR * 50 / 100); - if (pct50 < 1) - pct50 = 1; - pct90 = int(newNR * 90 / 100); - if (pct90 < 1) - pct90 = 1; - pct99 = int(newNR * 99 / 100); - if (pct99 < 1) - pct99 = 1; - div = 10 ** int(log(gptimes[pct90]) / log(10) + .5) / 100; - print "Histogram bucket size: " div; - last = gptimes[1] - 10; - count = 0; - for (i = 1; i <= newNR; i++) { - current = div * int(gptimes[i] / div); - if (last == current) { - count++; - } else { - if (count > 0) - print last, count; - count = 1; - last = current; - } - } - if (count > 0) - print last, count; - print "Distribution of grace periods across tasks:"; - for (i in gptaskcnt) { - print "\t" i, gptaskcnt[i]; - nbatches += gptaskcnt[i]; - } - ngps = nbatches; - print "Distribution of piggybacking across tasks:"; - for (i in piggybackcnt) { - print "\t" i, piggybackcnt[i]; - ngps += piggybackcnt[i]; - } - print "Average grace-period duration: " sum / newNR " microseconds"; - print "Minimum grace-period duration: " gptimes[1]; - print "50th percentile grace-period duration: " gptimes[pct50]; - print "90th percentile grace-period duration: " gptimes[pct90]; - print "99th percentile grace-period duration: " gptimes[pct99]; - print "Maximum grace-period duration: " gptimes[newNR]; - print "Grace periods: " ngps + 0 " Batches: " nbatches + 0 " Ratio: " ngps / nbatches " Lost: " nlost + 0; - print "Computed from ftrace data."; -}' -exit 0 diff --git a/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcuperf.sh b/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcuperf.sh deleted file mode 100755 index db0375a57f28..000000000000 --- a/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcuperf.sh +++ /dev/null @@ -1,83 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0+ -# -# Analyze a given results directory for rcuperf performance measurements. -# -# Usage: kvm-recheck-rcuperf.sh resdir -# -# Copyright (C) IBM Corporation, 2016 -# -# Authors: Paul E. McKenney - -i="$1" -if test -d "$i" -a -r "$i" -then - : -else - echo Unreadable results directory: $i - exit 1 -fi -PATH=`pwd`/tools/testing/selftests/rcutorture/bin:$PATH; export PATH -. functions.sh - -if kvm-recheck-rcuperf-ftrace.sh $i -then - # ftrace data was successfully analyzed, call it good! - exit 0 -fi - -configfile=`echo $i | sed -e 's/^.*\///'` - -sed -e 's/^\[[^]]*]//' < $i/console.log | -awk ' -/-perf: .* gps: .* batches:/ { - ngps = $9; - nbatches = $11; -} - -/-perf: .*writer-duration/ { - gptimes[++n] = $5 / 1000.; - sum += $5 / 1000.; -} - -END { - newNR = asort(gptimes); - if (newNR <= 0) { - print "No rcuperf records found???" 
- exit; - } - pct50 = int(newNR * 50 / 100); - if (pct50 < 1) - pct50 = 1; - pct90 = int(newNR * 90 / 100); - if (pct90 < 1) - pct90 = 1; - pct99 = int(newNR * 99 / 100); - if (pct99 < 1) - pct99 = 1; - div = 10 ** int(log(gptimes[pct90]) / log(10) + .5) / 100; - print "Histogram bucket size: " div; - last = gptimes[1] - 10; - count = 0; - for (i = 1; i <= newNR; i++) { - current = div * int(gptimes[i] / div); - if (last == current) { - count++; - } else { - if (count > 0) - print last, count; - count = 1; - last = current; - } - } - if (count > 0) - print last, count; - print "Average grace-period duration: " sum / newNR " microseconds"; - print "Minimum grace-period duration: " gptimes[1]; - print "50th percentile grace-period duration: " gptimes[pct50]; - print "90th percentile grace-period duration: " gptimes[pct90]; - print "99th percentile grace-period duration: " gptimes[pct99]; - print "Maximum grace-period duration: " gptimes[newNR]; - print "Grace periods: " ngps + 0 " Batches: " nbatches + 0 " Ratio: " ngps / nbatches; - print "Computed from rcuperf printk output."; -}' diff --git a/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcuscale-ftrace.sh b/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcuscale-ftrace.sh new file mode 100755 index 000000000000..d4bec538086d --- /dev/null +++ b/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcuscale-ftrace.sh @@ -0,0 +1,109 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0+ +# +# Analyze a given results directory for rcuscale performance measurements, +# looking for ftrace data. Exits with 0 if data was found, analyzed, and +# printed. Intended to be invoked from kvm-recheck-rcuscale.sh after +# argument checking. +# +# Usage: kvm-recheck-rcuscale-ftrace.sh resdir +# +# Copyright (C) IBM Corporation, 2016 +# +# Authors: Paul E. McKenney + +i="$1" +. functions.sh + +if test "`grep -c 'rcu_exp_grace_period.*start' < $i/console.log`" -lt 100 +then + exit 10 +fi + +sed -e 's/^\[[^]]*]//' < $i/console.log | +grep 'us : rcu_exp_grace_period' | +sed -e 's/us : / : /' | +tr -d '\015' | +awk ' +$8 == "start" { + if (startseq != "") + nlost++; + starttask = $1; + starttime = $3; + startseq = $7; + seqtask[startseq] = starttask; +} + +$8 == "end" { + if (startseq == $7) { + curgpdur = $3 - starttime; + gptimes[++n] = curgpdur; + gptaskcnt[starttask]++; + sum += curgpdur; + if (curgpdur > 1000) + print "Long GP " starttime "us to " $3 "us (" curgpdur "us)"; + startseq = ""; + } else { + # Lost a message or some such, reset. + startseq = ""; + nlost++; + } +} + +$8 == "done" && seqtask[$7] != $1 { + piggybackcnt[$1]++; +} + +END { + newNR = asort(gptimes); + if (newNR <= 0) { + print "No ftrace records found???" 
+ exit 10; + } + pct50 = int(newNR * 50 / 100); + if (pct50 < 1) + pct50 = 1; + pct90 = int(newNR * 90 / 100); + if (pct90 < 1) + pct90 = 1; + pct99 = int(newNR * 99 / 100); + if (pct99 < 1) + pct99 = 1; + div = 10 ** int(log(gptimes[pct90]) / log(10) + .5) / 100; + print "Histogram bucket size: " div; + last = gptimes[1] - 10; + count = 0; + for (i = 1; i <= newNR; i++) { + current = div * int(gptimes[i] / div); + if (last == current) { + count++; + } else { + if (count > 0) + print last, count; + count = 1; + last = current; + } + } + if (count > 0) + print last, count; + print "Distribution of grace periods across tasks:"; + for (i in gptaskcnt) { + print "\t" i, gptaskcnt[i]; + nbatches += gptaskcnt[i]; + } + ngps = nbatches; + print "Distribution of piggybacking across tasks:"; + for (i in piggybackcnt) { + print "\t" i, piggybackcnt[i]; + ngps += piggybackcnt[i]; + } + print "Average grace-period duration: " sum / newNR " microseconds"; + print "Minimum grace-period duration: " gptimes[1]; + print "50th percentile grace-period duration: " gptimes[pct50]; + print "90th percentile grace-period duration: " gptimes[pct90]; + print "99th percentile grace-period duration: " gptimes[pct99]; + print "Maximum grace-period duration: " gptimes[newNR]; + print "Grace periods: " ngps + 0 " Batches: " nbatches + 0 " Ratio: " ngps / nbatches " Lost: " nlost + 0; + print "Computed from ftrace data."; +}' +exit 0 diff --git a/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcuscale.sh b/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcuscale.sh new file mode 100755 index 000000000000..aa745152a525 --- /dev/null +++ b/tools/testing/selftests/rcutorture/bin/kvm-recheck-rcuscale.sh @@ -0,0 +1,83 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0+ +# +# Analyze a given results directory for rcuscale scalability measurements. +# +# Usage: kvm-recheck-rcuscale.sh resdir +# +# Copyright (C) IBM Corporation, 2016 +# +# Authors: Paul E. McKenney + +i="$1" +if test -d "$i" -a -r "$i" +then + : +else + echo Unreadable results directory: $i + exit 1 +fi +PATH=`pwd`/tools/testing/selftests/rcutorture/bin:$PATH; export PATH +. functions.sh + +if kvm-recheck-rcuscale-ftrace.sh $i +then + # ftrace data was successfully analyzed, call it good! + exit 0 +fi + +configfile=`echo $i | sed -e 's/^.*\///'` + +sed -e 's/^\[[^]]*]//' < $i/console.log | +awk ' +/-scale: .* gps: .* batches:/ { + ngps = $9; + nbatches = $11; +} + +/-scale: .*writer-duration/ { + gptimes[++n] = $5 / 1000.; + sum += $5 / 1000.; +} + +END { + newNR = asort(gptimes); + if (newNR <= 0) { + print "No rcuscale records found???" 
+ exit; + } + pct50 = int(newNR * 50 / 100); + if (pct50 < 1) + pct50 = 1; + pct90 = int(newNR * 90 / 100); + if (pct90 < 1) + pct90 = 1; + pct99 = int(newNR * 99 / 100); + if (pct99 < 1) + pct99 = 1; + div = 10 ** int(log(gptimes[pct90]) / log(10) + .5) / 100; + print "Histogram bucket size: " div; + last = gptimes[1] - 10; + count = 0; + for (i = 1; i <= newNR; i++) { + current = div * int(gptimes[i] / div); + if (last == current) { + count++; + } else { + if (count > 0) + print last, count; + count = 1; + last = current; + } + } + if (count > 0) + print last, count; + print "Average grace-period duration: " sum / newNR " microseconds"; + print "Minimum grace-period duration: " gptimes[1]; + print "50th percentile grace-period duration: " gptimes[pct50]; + print "90th percentile grace-period duration: " gptimes[pct90]; + print "99th percentile grace-period duration: " gptimes[pct99]; + print "Maximum grace-period duration: " gptimes[newNR]; + print "Grace periods: " ngps + 0 " Batches: " nbatches + 0 " Ratio: " ngps / nbatches; + print "Computed from rcuscale printk output."; +}' diff --git a/tools/testing/selftests/rcutorture/bin/kvm.sh b/tools/testing/selftests/rcutorture/bin/kvm.sh index 44dfdd9be67e..0489c198a72a 100755 --- a/tools/testing/selftests/rcutorture/bin/kvm.sh +++ b/tools/testing/selftests/rcutorture/bin/kvm.sh @@ -65,7 +65,7 @@ usage () { echo " --qemu-args qemu-arguments" echo " --qemu-cmd qemu-system-..." echo " --results absolute-pathname" - echo " --torture rcu" + echo " --torture lock|rcu|rcuscale|refscale|scf" echo " --trust-make" exit 1 } @@ -184,13 +184,13 @@ do shift ;; --torture) - checkarg --torture "(suite name)" "$#" "$2" '^\(lock\|rcu\|rcuperf\|refscale\|scf\)$' '^--' + checkarg --torture "(suite name)" "$#" "$2" '^\(lock\|rcu\|rcuscale\|refscale\|scf\)$' '^--' TORTURE_SUITE=$2 shift - if test "$TORTURE_SUITE" = rcuperf || test "$TORTURE_SUITE" = refscale + if test "$TORTURE_SUITE" = rcuscale || test "$TORTURE_SUITE" = refscale then # If you really want jitter for refscale or - # rcuperf, specify it after specifying the rcuperf + # rcuscale, specify it after specifying the rcuscale # or the refscale. (But why jitter in these cases?) jitter=0 fi diff --git a/tools/testing/selftests/rcutorture/bin/parse-console.sh b/tools/testing/selftests/rcutorture/bin/parse-console.sh index 4e081a25761e..e03338091a06 100755 --- a/tools/testing/selftests/rcutorture/bin/parse-console.sh +++ b/tools/testing/selftests/rcutorture/bin/parse-console.sh @@ -33,8 +33,8 @@ then fi cat /dev/null > $file.diags -# Check for proper termination, except for rcuperf and refscale. -if test "$TORTURE_SUITE" != rcuperf && test "$TORTURE_SUITE" != refscale +# Check for proper termination, except for rcuscale and refscale. 
+if test "$TORTURE_SUITE" != rcuscale && test "$TORTURE_SUITE" != refscale then # check for abject failure diff --git a/tools/testing/selftests/rcutorture/configs/rcuperf/CFLIST b/tools/testing/selftests/rcutorture/configs/rcuperf/CFLIST deleted file mode 100644 index c9f56cf20775..000000000000 --- a/tools/testing/selftests/rcutorture/configs/rcuperf/CFLIST +++ /dev/null @@ -1 +0,0 @@ -TREE diff --git a/tools/testing/selftests/rcutorture/configs/rcuperf/CFcommon b/tools/testing/selftests/rcutorture/configs/rcuperf/CFcommon deleted file mode 100644 index a09816b8c0f3..000000000000 --- a/tools/testing/selftests/rcutorture/configs/rcuperf/CFcommon +++ /dev/null @@ -1,2 +0,0 @@ -CONFIG_RCU_PERF_TEST=y -CONFIG_PRINTK_TIME=y diff --git a/tools/testing/selftests/rcutorture/configs/rcuperf/TINY b/tools/testing/selftests/rcutorture/configs/rcuperf/TINY deleted file mode 100644 index fb05ef5279b4..000000000000 --- a/tools/testing/selftests/rcutorture/configs/rcuperf/TINY +++ /dev/null @@ -1,16 +0,0 @@ -CONFIG_SMP=n -CONFIG_PREEMPT_NONE=y -CONFIG_PREEMPT_VOLUNTARY=n -CONFIG_PREEMPT=n -#CHECK#CONFIG_TINY_RCU=y -CONFIG_HZ_PERIODIC=n -CONFIG_NO_HZ_IDLE=y -CONFIG_NO_HZ_FULL=n -CONFIG_RCU_FAST_NO_HZ=n -CONFIG_RCU_NOCB_CPU=n -CONFIG_DEBUG_LOCK_ALLOC=n -CONFIG_PROVE_LOCKING=n -CONFIG_RCU_BOOST=n -CONFIG_DEBUG_OBJECTS_RCU_HEAD=n -CONFIG_RCU_EXPERT=y -CONFIG_RCU_TRACE=y diff --git a/tools/testing/selftests/rcutorture/configs/rcuperf/TREE b/tools/testing/selftests/rcutorture/configs/rcuperf/TREE deleted file mode 100644 index 721cfda76ab2..000000000000 --- a/tools/testing/selftests/rcutorture/configs/rcuperf/TREE +++ /dev/null @@ -1,19 +0,0 @@ -CONFIG_SMP=y -CONFIG_PREEMPT_NONE=n -CONFIG_PREEMPT_VOLUNTARY=n -CONFIG_PREEMPT=y -#CHECK#CONFIG_PREEMPT_RCU=y -CONFIG_HZ_PERIODIC=n -CONFIG_NO_HZ_IDLE=y -CONFIG_NO_HZ_FULL=n -CONFIG_RCU_FAST_NO_HZ=n -CONFIG_HOTPLUG_CPU=n -CONFIG_SUSPEND=n -CONFIG_HIBERNATION=n -CONFIG_RCU_NOCB_CPU=n -CONFIG_DEBUG_LOCK_ALLOC=n -CONFIG_PROVE_LOCKING=n -CONFIG_RCU_BOOST=n -CONFIG_DEBUG_OBJECTS_RCU_HEAD=n -CONFIG_RCU_EXPERT=y -CONFIG_RCU_TRACE=y diff --git a/tools/testing/selftests/rcutorture/configs/rcuperf/TREE54 b/tools/testing/selftests/rcutorture/configs/rcuperf/TREE54 deleted file mode 100644 index 7629f5dd73b2..000000000000 --- a/tools/testing/selftests/rcutorture/configs/rcuperf/TREE54 +++ /dev/null @@ -1,22 +0,0 @@ -CONFIG_SMP=y -CONFIG_NR_CPUS=54 -CONFIG_PREEMPT_NONE=n -CONFIG_PREEMPT_VOLUNTARY=n -CONFIG_PREEMPT=y -#CHECK#CONFIG_PREEMPT_RCU=y -CONFIG_HZ_PERIODIC=n -CONFIG_NO_HZ_IDLE=y -CONFIG_NO_HZ_FULL=n -CONFIG_RCU_FAST_NO_HZ=n -CONFIG_HOTPLUG_CPU=n -CONFIG_SUSPEND=n -CONFIG_HIBERNATION=n -CONFIG_RCU_FANOUT=3 -CONFIG_RCU_FANOUT_LEAF=2 -CONFIG_RCU_NOCB_CPU=n -CONFIG_DEBUG_LOCK_ALLOC=n -CONFIG_PROVE_LOCKING=n -CONFIG_RCU_BOOST=n -CONFIG_DEBUG_OBJECTS_RCU_HEAD=n -CONFIG_RCU_EXPERT=y -CONFIG_RCU_TRACE=y diff --git a/tools/testing/selftests/rcutorture/configs/rcuperf/ver_functions.sh b/tools/testing/selftests/rcutorture/configs/rcuperf/ver_functions.sh deleted file mode 100644 index 777d5b0c190f..000000000000 --- a/tools/testing/selftests/rcutorture/configs/rcuperf/ver_functions.sh +++ /dev/null @@ -1,16 +0,0 @@ -#!/bin/bash -# SPDX-License-Identifier: GPL-2.0+ -# -# Torture-suite-dependent shell functions for the rest of the scripts. -# -# Copyright (C) IBM Corporation, 2015 -# -# Authors: Paul E. McKenney - -# per_version_boot_params bootparam-string config-file seconds -# -# Adds per-version torture-module parameters to kernels supporting them. 
-per_version_boot_params () { - echo $1 rcuperf.shutdown=1 \ - rcuperf.verbose=1 -} diff --git a/tools/testing/selftests/rcutorture/configs/rcuscale/CFLIST b/tools/testing/selftests/rcutorture/configs/rcuscale/CFLIST new file mode 100644 index 000000000000..c9f56cf20775 --- /dev/null +++ b/tools/testing/selftests/rcutorture/configs/rcuscale/CFLIST @@ -0,0 +1 @@ +TREE diff --git a/tools/testing/selftests/rcutorture/configs/rcuscale/CFcommon b/tools/testing/selftests/rcutorture/configs/rcuscale/CFcommon new file mode 100644 index 000000000000..87caa0e932c7 --- /dev/null +++ b/tools/testing/selftests/rcutorture/configs/rcuscale/CFcommon @@ -0,0 +1,2 @@ +CONFIG_RCU_SCALE_TEST=y +CONFIG_PRINTK_TIME=y diff --git a/tools/testing/selftests/rcutorture/configs/rcuscale/TINY b/tools/testing/selftests/rcutorture/configs/rcuscale/TINY new file mode 100644 index 000000000000..fb05ef5279b4 --- /dev/null +++ b/tools/testing/selftests/rcutorture/configs/rcuscale/TINY @@ -0,0 +1,16 @@ +CONFIG_SMP=n +CONFIG_PREEMPT_NONE=y +CONFIG_PREEMPT_VOLUNTARY=n +CONFIG_PREEMPT=n +#CHECK#CONFIG_TINY_RCU=y +CONFIG_HZ_PERIODIC=n +CONFIG_NO_HZ_IDLE=y +CONFIG_NO_HZ_FULL=n +CONFIG_RCU_FAST_NO_HZ=n +CONFIG_RCU_NOCB_CPU=n +CONFIG_DEBUG_LOCK_ALLOC=n +CONFIG_PROVE_LOCKING=n +CONFIG_RCU_BOOST=n +CONFIG_DEBUG_OBJECTS_RCU_HEAD=n +CONFIG_RCU_EXPERT=y +CONFIG_RCU_TRACE=y diff --git a/tools/testing/selftests/rcutorture/configs/rcuscale/TREE b/tools/testing/selftests/rcutorture/configs/rcuscale/TREE new file mode 100644 index 000000000000..721cfda76ab2 --- /dev/null +++ b/tools/testing/selftests/rcutorture/configs/rcuscale/TREE @@ -0,0 +1,19 @@ +CONFIG_SMP=y +CONFIG_PREEMPT_NONE=n +CONFIG_PREEMPT_VOLUNTARY=n +CONFIG_PREEMPT=y +#CHECK#CONFIG_PREEMPT_RCU=y +CONFIG_HZ_PERIODIC=n +CONFIG_NO_HZ_IDLE=y +CONFIG_NO_HZ_FULL=n +CONFIG_RCU_FAST_NO_HZ=n +CONFIG_HOTPLUG_CPU=n +CONFIG_SUSPEND=n +CONFIG_HIBERNATION=n +CONFIG_RCU_NOCB_CPU=n +CONFIG_DEBUG_LOCK_ALLOC=n +CONFIG_PROVE_LOCKING=n +CONFIG_RCU_BOOST=n +CONFIG_DEBUG_OBJECTS_RCU_HEAD=n +CONFIG_RCU_EXPERT=y +CONFIG_RCU_TRACE=y diff --git a/tools/testing/selftests/rcutorture/configs/rcuscale/TREE54 b/tools/testing/selftests/rcutorture/configs/rcuscale/TREE54 new file mode 100644 index 000000000000..7629f5dd73b2 --- /dev/null +++ b/tools/testing/selftests/rcutorture/configs/rcuscale/TREE54 @@ -0,0 +1,22 @@ +CONFIG_SMP=y +CONFIG_NR_CPUS=54 +CONFIG_PREEMPT_NONE=n +CONFIG_PREEMPT_VOLUNTARY=n +CONFIG_PREEMPT=y +#CHECK#CONFIG_PREEMPT_RCU=y +CONFIG_HZ_PERIODIC=n +CONFIG_NO_HZ_IDLE=y +CONFIG_NO_HZ_FULL=n +CONFIG_RCU_FAST_NO_HZ=n +CONFIG_HOTPLUG_CPU=n +CONFIG_SUSPEND=n +CONFIG_HIBERNATION=n +CONFIG_RCU_FANOUT=3 +CONFIG_RCU_FANOUT_LEAF=2 +CONFIG_RCU_NOCB_CPU=n +CONFIG_DEBUG_LOCK_ALLOC=n +CONFIG_PROVE_LOCKING=n +CONFIG_RCU_BOOST=n +CONFIG_DEBUG_OBJECTS_RCU_HEAD=n +CONFIG_RCU_EXPERT=y +CONFIG_RCU_TRACE=y diff --git a/tools/testing/selftests/rcutorture/configs/rcuscale/ver_functions.sh b/tools/testing/selftests/rcutorture/configs/rcuscale/ver_functions.sh new file mode 100644 index 000000000000..0333e9b18522 --- /dev/null +++ b/tools/testing/selftests/rcutorture/configs/rcuscale/ver_functions.sh @@ -0,0 +1,16 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0+ +# +# Torture-suite-dependent shell functions for the rest of the scripts. +# +# Copyright (C) IBM Corporation, 2015 +# +# Authors: Paul E. McKenney + +# per_version_boot_params bootparam-string config-file seconds +# +# Adds per-version torture-module parameters to kernels supporting them. 
+per_version_boot_params () { + echo $1 rcuscale.shutdown=1 \ + rcuscale.verbose=1 +} -- cgit v1.2.3 From 8cbd0e38a9f2de38e8991c5c1c6f9024b2731d17 Mon Sep 17 00:00:00 2001 From: Paul E. McKenney Date: Wed, 5 Aug 2020 15:51:20 -0700 Subject: rcu: Add Kconfig option for strict RCU grace periods People running automated tests have asked for a way to make RCU minimize grace-period duration in order to increase the probability of KASAN detecting a pointer being improperly leaked from an RCU read-side critical section, for example, like this: rcu_read_lock(); p = rcu_dereference(gp); do_something_with(p); // OK rcu_read_unlock(); do_something_else_with(p); // BUG!!! The rcupdate.rcu_expedited boot parameter is a start in this direction, given that it makes calls to synchronize_rcu() instead invoke the faster (and more wasteful) synchronize_rcu_expedited(). However, this does nothing to shorten RCU grace periods that are instead initiated by call_rcu(), and RCU pointer-leak bugs can involve call_rcu() just as surely as they can synchronize_rcu(). This commit therefore adds a RCU_STRICT_GRACE_PERIOD Kconfig option that will be used to shorten normal (non-expedited) RCU grace periods. This commit also dumps out a message when this option is in effect. Later commits will actually shorten grace periods. Reported-by Jann Horn Signed-off-by: Paul E. McKenney --- kernel/rcu/Kconfig.debug | 15 +++++++++++++++ kernel/rcu/tree_plugin.h | 2 ++ 2 files changed, 17 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/Kconfig.debug b/kernel/rcu/Kconfig.debug index 3cf6132a4bb9..cab5a4bebe9c 100644 --- a/kernel/rcu/Kconfig.debug +++ b/kernel/rcu/Kconfig.debug @@ -114,4 +114,19 @@ config RCU_EQS_DEBUG Say N here if you need ultimate kernel/user switch latencies Say Y if you are unsure +config RCU_STRICT_GRACE_PERIOD + bool "Provide debug RCU implementation with short grace periods" + depends on DEBUG_KERNEL && RCU_EXPERT + default n + select PREEMPT_COUNT if PREEMPT=n + help + Select this option to build an RCU variant that is strict about + grace periods, making them as short as it can. This limits + scalability, destroys real-time response, degrades battery + lifetime and kills performance. Don't try this on large + machines, as in systems with more than about 10 or 20 CPUs. + But in conjunction with tools like KASAN, it can be helpful + when looking for certain types of RCU usage bugs, for example, + too-short RCU read-side critical sections. + endmenu # "RCU Debugging" diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 982fc5be5269..44cf77db7cae 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -36,6 +36,8 @@ static void __init rcu_bootup_announce_oddness(void) pr_info("\tRCU dyntick-idle grace-period acceleration is enabled.\n"); if (IS_ENABLED(CONFIG_PROVE_RCU)) pr_info("\tRCU lockdep checking is enabled.\n"); + if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD)) + pr_info("\tRCU strict (and thus non-scalable) grace periods enabled.\n"); if (RCU_NUM_LVLS >= 4) pr_info("\tFour(or more)-level hierarchy is enabled.\n"); if (RCU_FANOUT_LEAF != 16) -- cgit v1.2.3 From dc1269186bed3afc5a2018527516be84fe55d3e0 Mon Sep 17 00:00:00 2001 From: Paul E. McKenney Date: Wed, 5 Aug 2020 16:52:17 -0700 Subject: rcu: Reduce leaf fanout for strict RCU grace periods Because strict RCU grace periods will complete more quickly, they will experience greater lock contention on each leaf rcu_node structure's ->lock. 
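(By way of illustration, and not part of this patch: CONFIG_RCU_FANOUT_LEAF is the number of CPUs handled by each leaf rcu_node structure, so it also bounds how many CPUs can contend on any one leaf ->lock. The helper below is a made-up sketch of that arithmetic for a hypothetical 12-CPU system.)

        /*
         * Illustration only: how many leaf rcu_node structures are needed,
         * and therefore how many CPUs share each leaf ->lock, for a given
         * leaf fanout.
         */
        static inline int leaf_nodes_needed(int nr_cpus, int fanout_leaf)
        {
                return (nr_cpus + fanout_leaf - 1) / fanout_leaf;
        }

        /*
         * For a hypothetical 12-CPU system:
         *   leaf_nodes_needed(12, 16) == 1  (all 12 CPUs contend on one leaf ->lock)
         *   leaf_nodes_needed(12,  3) == 4  (at most 3 CPUs per leaf ->lock)
         *   leaf_nodes_needed(12,  2) == 6  (at most 2 CPUs per leaf ->lock)
         */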
This commit therefore reduces the leaf fanout in order to reduce this lock contention. Note that this also has the effect of reducing the number of CPUs supported to 16 in the case of CONFIG_RCU_FANOUT_LEAF=2 or 81 in the case of CONFIG_RCU_FANOUT_LEAF=3. However, greater numbers of CPUs are probably a bad idea when using CONFIG_RCU_STRICT_GRACE_PERIOD=y. Those wishing to live dangerously are free to edit their kernel/rcu/Kconfig files accordingly. Reported-by Jann Horn Signed-off-by: Paul E. McKenney --- kernel/rcu/Kconfig | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig index 0ebe15a84985..b71e21f73c40 100644 --- a/kernel/rcu/Kconfig +++ b/kernel/rcu/Kconfig @@ -135,10 +135,12 @@ config RCU_FANOUT config RCU_FANOUT_LEAF int "Tree-based hierarchical RCU leaf-level fanout value" - range 2 64 if 64BIT - range 2 32 if !64BIT + range 2 64 if 64BIT && !RCU_STRICT_GRACE_PERIOD + range 2 32 if !64BIT && !RCU_STRICT_GRACE_PERIOD + range 2 3 if RCU_STRICT_GRACE_PERIOD depends on TREE_RCU && RCU_EXPERT - default 16 + default 16 if !RCU_STRICT_GRACE_PERIOD + default 2 if RCU_STRICT_GRACE_PERIOD help This option controls the leaf-level fanout of hierarchical implementations of RCU, and allows trading off cache misses -- cgit v1.2.3 From aecd34b9765de3b58c98a1d75b982fc64becd1e9 Mon Sep 17 00:00:00 2001 From: Paul E. McKenney Date: Wed, 5 Aug 2020 17:25:23 -0700 Subject: rcu: Restrict default jiffies_till_first_fqs for strict RCU GPs If there are idle CPUs, RCU's grace-period kthread will wait several jiffies before even thinking about polling them. This promotes efficiency, which is normally a good thing, but when the kernel has been built with CONFIG_RCU_STRICT_GRACE_PERIOD=y, we care more about short grace periods. This commit therefore restricts the default jiffies_till_first_fqs value to zero in kernels built with CONFIG_RCU_STRICT_GRACE_PERIOD=y, which causes RCU's grace-period kthread to poll for idle CPUs immediately after starting a grace period. Reported-by Jann Horn Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 8ce77d9ac716..85511590fc38 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -485,7 +485,7 @@ module_param(qhimark, long, 0444); module_param(qlowmark, long, 0444); module_param(qovld, long, 0444); -static ulong jiffies_till_first_fqs = ULONG_MAX; +static ulong jiffies_till_first_fqs = IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) ? 0 : ULONG_MAX; static ulong jiffies_till_next_fqs = ULONG_MAX; static bool rcu_kick_kthreads; static int rcu_divisor = 7; -- cgit v1.2.3 From 29fc5f93320cb447f83baedfe103ed784cadb073 Mon Sep 17 00:00:00 2001 From: Paul E. McKenney Date: Thu, 6 Aug 2020 06:39:30 -0700 Subject: rcu: Force DEFAULT_RCU_BLIMIT to 1000 for strict RCU GPs The value of DEFAULT_RCU_BLIMIT is normally set to 10, the idea being to avoid needless response-time degradation due to RCU callback invocation. However, when CONFIG_RCU_STRICT_GRACE_PERIOD=y it is better to avoid throttling callback execution in order to better detect pointer leaks from RCU read-side critical sections. This commit therefore sets the value of DEFAULT_RCU_BLIMIT to 1000 in kernels built with CONFIG_RCU_STRICT_GRACE_PERIOD=y. Reported-by Jann Horn Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tree.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 85511590fc38..443685704f5e 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -468,17 +468,18 @@ static int rcu_is_cpu_rrupt_from_idle(void) return __this_cpu_read(rcu_data.dynticks_nesting) == 0; } -#define DEFAULT_RCU_BLIMIT 10 /* Maximum callbacks per rcu_do_batch ... */ -#define DEFAULT_MAX_RCU_BLIMIT 10000 /* ... even during callback flood. */ +#define DEFAULT_RCU_BLIMIT (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) ? 1000 : 10) + // Maximum callbacks per rcu_do_batch ... +#define DEFAULT_MAX_RCU_BLIMIT 10000 // ... even during callback flood. static long blimit = DEFAULT_RCU_BLIMIT; -#define DEFAULT_RCU_QHIMARK 10000 /* If this many pending, ignore blimit. */ +#define DEFAULT_RCU_QHIMARK 10000 // If this many pending, ignore blimit. static long qhimark = DEFAULT_RCU_QHIMARK; -#define DEFAULT_RCU_QLOMARK 100 /* Once only this many pending, use blimit. */ +#define DEFAULT_RCU_QLOMARK 100 // Once only this many pending, use blimit. static long qlowmark = DEFAULT_RCU_QLOMARK; #define DEFAULT_RCU_QOVLD_MULT 2 #define DEFAULT_RCU_QOVLD (DEFAULT_RCU_QOVLD_MULT * DEFAULT_RCU_QHIMARK) -static long qovld = DEFAULT_RCU_QOVLD; /* If this many pending, hammer QS. */ -static long qovld_calc = -1; /* No pre-initialization lock acquisitions! */ +static long qovld = DEFAULT_RCU_QOVLD; // If this many pending, hammer QS. +static long qovld_calc = -1; // No pre-initialization lock acquisitions! module_param(blimit, long, 0444); module_param(qhimark, long, 0444); -- cgit v1.2.3 From f19920e412fdeed1e15691bcee5b40e18b8e96ff Mon Sep 17 00:00:00 2001 From: Paul E. McKenney Date: Thu, 6 Aug 2020 09:40:18 -0700 Subject: rcu: Always set .need_qs from __rcu_read_lock() for strict GPs The ->rcu_read_unlock_special.b.need_qs field in the task_struct structure indicates that the RCU core needs a quiscent state from the corresponding task. The __rcu_read_unlock() function checks this (via an eventual call to rcu_preempt_deferred_qs_irqrestore()), and if set reports a quiscent state immediately upon exit from the outermost RCU read-side critical section. Currently, this flag is only set when the scheduling-clock interrupt decides that the current RCU grace period is too old, as in about one full second too old. But if the kernel has been built with CONFIG_RCU_STRICT_GRACE_PERIOD=y, we clearly do not want to wait that long. This commit therefore sets the .need_qs field immediately at the start of the RCU read-side critical section from within __rcu_read_lock() in order to unconditionally enlist help from __rcu_read_unlock(). But note the additional check for rcu_state.gp_kthread, which prevents attempts to awaken RCU's grace-period kthread during early boot before there is a scheduler. Leaving off this check results in early boot hangs. So early that there is no console output. Thus, this additional check fails until such time as RCU's grace-period kthread has been created, avoiding these empty-console hangs. Reported-by Jann Horn Signed-off-by: Paul E. 
McKenney --- kernel/rcu/tree_plugin.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 44cf77db7cae..668bbd2be807 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -376,6 +376,8 @@ void __rcu_read_lock(void) rcu_preempt_read_enter(); if (IS_ENABLED(CONFIG_PROVE_LOCKING)) WARN_ON_ONCE(rcu_preempt_depth() > RCU_NEST_PMAX); + if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) && rcu_state.gp_kthread) + WRITE_ONCE(current->rcu_read_unlock_special.b.need_qs, true); barrier(); /* critical section after entry code. */ } EXPORT_SYMBOL_GPL(__rcu_read_lock); -- cgit v1.2.3 From 44bad5b3cca2d452d17ef82841b20b42a2cf11a0 Mon Sep 17 00:00:00 2001 From: Paul E. McKenney Date: Thu, 6 Aug 2020 15:12:50 -0700 Subject: rcu: Do full report for .need_qs for strict GPs The rcu_preempt_deferred_qs_irqrestore() function is invoked at the end of an RCU read-side critical section (for example, directly from rcu_read_unlock()) and, if .need_qs is set, invokes rcu_qs() to report the new quiescent state. This works, except that rcu_qs() only updates per-CPU state, leaving reporting of the actual quiescent state to a later call to rcu_report_qs_rdp(), for example from within a later RCU_SOFTIRQ instance. Although this approach is exactly what you want if you are more concerned about efficiency than about short grace periods, in CONFIG_RCU_STRICT_GRACE_PERIOD=y kernels, short grace periods are the name of the game. This commit therefore makes rcu_preempt_deferred_qs_irqrestore() directly invoke rcu_report_qs_rdp() in CONFIG_RCU_STRICT_GRACE_PERIOD=y, thus shortening grace periods. Historical note: To the best of my knowledge, causing rcu_read_unlock() to directly report a quiescent state first appeared in Jim Houston's and Joe Korty's JRCU. This is the second instance of a Linux-kernel RCU feature being inspired by JRCU, the first being RCU callback offloading (as in the RCU_NOCB_CPU Kconfig option). Reported-by Jann Horn Signed-off-by: Paul E. McKenney --- kernel/rcu/tree_plugin.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 668bbd2be807..dfdb9020f136 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -459,8 +459,12 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags) return; } t->rcu_read_unlock_special.s = 0; - if (special.b.need_qs) - rcu_qs(); + if (special.b.need_qs) { + if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD)) + rcu_report_qs_rdp(rdp->cpu, rdp); + else + rcu_qs(); + } /* * Respond to a request by an expedited grace period for a -- cgit v1.2.3 From 1a2f5d57a33f7b9189b6b3e997eb858301482d79 Mon Sep 17 00:00:00 2001 From: Paul E. McKenney Date: Thu, 6 Aug 2020 16:35:08 -0700 Subject: rcu: Attempt QS when CPU discovers GP for strict GPs A given CPU normally notes a new grace period during one RCU_SOFTIRQ, but avoids reporting the corresponding quiescent state until some later RCU_SOFTIRQ. This leisurly approach improves efficiency by increasing the number of update requests served by each grace period, but is not what is needed for kernels built with CONFIG_RCU_STRICT_GRACE_PERIOD=y. This commit therefore adds a new rcu_strict_gp_check_qs() function which, in CONFIG_RCU_STRICT_GRACE_PERIOD=y kernels, simply enters and immediately exist an RCU read-side critical section. 
If the CPU is in a quiescent state, the rcu_read_unlock() will attempt to report an immediate quiescent state. This rcu_strict_gp_check_qs() function is invoked from note_gp_changes(), so that a CPU just noticing a new grace period might immediately report a quiescent state for that grace period. Reported-by Jann Horn Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 443685704f5e..36a860c4648b 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1574,6 +1574,19 @@ static void __maybe_unused rcu_advance_cbs_nowake(struct rcu_node *rnp, raw_spin_unlock_rcu_node(rnp); } +/* + * In CONFIG_RCU_STRICT_GRACE_PERIOD=y kernels, attempt to generate a + * quiescent state. This is intended to be invoked when the CPU notices + * a new grace period. + */ +static void rcu_strict_gp_check_qs(void) +{ + if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD)) { + rcu_read_lock(); + rcu_read_unlock(); + } +} + /* * Update CPU-local rcu_data state to record the beginnings and ends of * grace periods. The caller must hold the ->lock of the leaf rcu_node @@ -1644,6 +1657,7 @@ static void note_gp_changes(struct rcu_data *rdp) } needwake = __note_gp_changes(rnp, rdp); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + rcu_strict_gp_check_qs(); if (needwake) rcu_gp_kthread_wake(); } -- cgit v1.2.3 From 933ada2c3310aa88807e65c8d498b74a2159a9a2 Mon Sep 17 00:00:00 2001 From: Paul E. McKenney Date: Thu, 6 Aug 2020 19:21:48 -0700 Subject: rcu: IPI all CPUs at GP start for strict GPs Currently, each CPU discovers the beginning of a given grace period on its own time, which is again good for efficiency but bad for fast grace periods. This commit therefore uses on_each_cpu() to IPI each CPU after grace-period initialization in order to inform each CPU of the new grace period in a timely manner, but only in kernels build with CONFIG_RCU_STRICT_GRACE_PERIOD=y. Reported-by Jann Horn Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 36a860c4648b..88f4fa639964 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1695,6 +1695,15 @@ static void rcu_gp_torture_wait(void) } } +/* + * Handler for on_each_cpu() to invoke the target CPU's RCU core + * processing. + */ +static void rcu_strict_gp_boundary(void *unused) +{ + invoke_rcu_core(); +} + /* * Initialize a new grace period. Return false if no grace period required. */ @@ -1823,6 +1832,10 @@ static bool rcu_gp_init(void) WRITE_ONCE(rcu_state.gp_activity, jiffies); } + // If strict, make all CPUs aware of new grace period. + if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD)) + on_each_cpu(rcu_strict_gp_boundary, NULL, 0); + return true; } -- cgit v1.2.3 From 4e025f52a1e0e8ff4e303fa0a80e2061ccfa27d6 Mon Sep 17 00:00:00 2001 From: Paul E. McKenney Date: Thu, 6 Aug 2020 19:42:47 -0700 Subject: rcu: IPI all CPUs at GP end for strict GPs Currently, each CPU discovers the end of a given grace period on its own time, which is again good for efficiency but bad for fast grace periods, given that it is things like kfree() within the RCU callbacks that will cause trouble for pointers leaked from RCU read-side critical sections. 
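Concretely, expanding on the reader-side example in the Kconfig commit above (struct foo, gp, and the do_something*() helpers below are placeholders for the sketch, not kernel APIs): the sooner every CPU learns that the grace period has ended, the sooner the kfree() queued by kfree_rcu() runs, and the sooner KASAN can flag the leaked pointer.

        #include <linux/rcupdate.h>
        #include <linux/slab.h>
        #include <linux/spinlock.h>

        struct foo {
                int a;
                struct rcu_head rh;
        };

        static struct foo __rcu *gp;            /* Hypothetical RCU-protected pointer. */
        static DEFINE_SPINLOCK(foo_lock);       /* Serializes updaters. */

        static void do_something_with(struct foo *p) { READ_ONCE(p->a); }
        static void do_something_else_with(struct foo *p) { READ_ONCE(p->a); }

        /* Updater: publish a new version, then free the old one after a GP. */
        static void update_foo(struct foo *newp)
        {
                struct foo *oldp;

                spin_lock(&foo_lock);
                oldp = rcu_replace_pointer(gp, newp, lockdep_is_held(&foo_lock));
                spin_unlock(&foo_lock);
                if (oldp)
                        kfree_rcu(oldp, rh);    /* kfree() runs after the grace period. */
        }

        /* Buggy reader: leaks p past the end of its read-side critical section. */
        static void buggy_reader(void)
        {
                struct foo *p;

                rcu_read_lock();
                p = rcu_dereference(gp);
                do_something_with(p);           /* OK. */
                rcu_read_unlock();
                do_something_else_with(p);      /* BUG: p may already be freed. */
        }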
This commit therefore uses on_each_cpu() to IPI each CPU after grace-period cleanup in order to inform each CPU of the end of the old grace period in a timely manner, but only in kernels build with CONFIG_RCU_STRICT_GRACE_PERIOD=y. Reported-by Jann Horn Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 88f4fa639964..4bbedfc0f79b 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2052,6 +2052,10 @@ static void rcu_gp_cleanup(void) rcu_state.gp_flags & RCU_GP_FLAG_INIT); } raw_spin_unlock_irq_rcu_node(rnp); + + // If strict, make all CPUs aware of the end of the old grace period. + if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD)) + on_each_cpu(rcu_strict_gp_boundary, NULL, 0); } /* -- cgit v1.2.3 From 3d29aaf1ef992b5b4612fe32b9e6f517f7bba904 Mon Sep 17 00:00:00 2001 From: Paul E. McKenney Date: Fri, 7 Aug 2020 13:44:10 -0700 Subject: rcu: Provide optional RCU-reader exit delay for strict GPs The goal of this series is to increase the probability of tools like KASAN detecting that an RCU-protected pointer was used outside of its RCU read-side critical section. Thus far, the approach has been to make grace periods and callback processing happen faster. Another approach is to delay the pointer leaker. This commit therefore allows a delay to be applied to exit from RCU read-side critical sections. This slowdown is specified by a new rcutree.rcu_unlock_delay kernel boot parameter that specifies this delay in microseconds, defaulting to zero. Reported-by Jann Horn Signed-off-by: Paul E. McKenney --- Documentation/admin-guide/kernel-parameters.txt | 9 +++++++++ kernel/rcu/tree_plugin.h | 12 ++++++++++-- 2 files changed, 19 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index bdc1f33fd3d1..cb9062440dda 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -4152,6 +4152,15 @@ This wake_up() will be accompanied by a WARN_ONCE() splat and an ftrace_dump(). + rcutree.rcu_unlock_delay= [KNL] + In CONFIG_RCU_STRICT_GRACE_PERIOD=y kernels, + this specifies an rcu_read_unlock()-time delay + in microseconds. This defaults to zero. + Larger delays increase the probability of + catching RCU pointer leaks, that is, buggy use + of RCU-protected pointers after the relevant + rcu_read_unlock() has completed. + rcutree.sysrq_rcu= [KNL] Commandeer a sysrq key to dump out Tree RCU's rcu_node tree with an eye towards determining diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index dfdb9020f136..3f3a4ffd4df2 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -430,6 +430,12 @@ static bool rcu_preempt_has_tasks(struct rcu_node *rnp) return !list_empty(&rnp->blkd_tasks); } +// Add delay to rcu_read_unlock() for strict grace periods. +static int rcu_unlock_delay; +#ifdef CONFIG_RCU_STRICT_GRACE_PERIOD +module_param(rcu_unlock_delay, int, 0444); +#endif + /* * Report deferred quiescent states. 
The deferral time can * be quite short, for example, in the case of the call from @@ -460,10 +466,12 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags) } t->rcu_read_unlock_special.s = 0; if (special.b.need_qs) { - if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD)) + if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD)) { rcu_report_qs_rdp(rdp->cpu, rdp); - else + udelay(rcu_unlock_delay); + } else { rcu_qs(); + } } /* -- cgit v1.2.3 From a657f2617010ae237db5693f875968c28e8f732f Mon Sep 17 00:00:00 2001 From: Paul E. McKenney Date: Sat, 8 Aug 2020 07:56:31 -0700 Subject: rcu: Execute RCU reader shortly after rcu_core for strict GPs A kernel built with CONFIG_RCU_STRICT_GRACE_PERIOD=y needs a quiescent state to appear very shortly after a CPU has noticed a new grace period. Placing an RCU reader immediately after this point is ineffective because this normally happens in softirq context, which acts as a big RCU reader. This commit therefore introduces a new per-CPU work_struct, which is used at the end of rcu_core() processing to schedule an RCU read-side critical section from within a clean environment. Reported-by Jann Horn Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 13 +++++++++++++ kernel/rcu/tree.h | 1 + 2 files changed, 14 insertions(+) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 4bbedfc0f79b..31995b3f0ed9 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2646,6 +2646,14 @@ void rcu_force_quiescent_state(void) } EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); +// Workqueue handler for an RCU reader for kernels enforcing struct RCU +// grace periods. +static void strict_work_handler(struct work_struct *work) +{ + rcu_read_lock(); + rcu_read_unlock(); +} + /* Perform RCU core processing work for the current CPU. */ static __latent_entropy void rcu_core(void) { @@ -2690,6 +2698,10 @@ static __latent_entropy void rcu_core(void) /* Do any needed deferred wakeups of rcuo kthreads. */ do_nocb_deferred_wakeup(rdp); trace_rcu_utilization(TPS("End RCU core")); + + // If strict GPs, schedule an RCU reader in a clean environment. + if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD)) + queue_work_on(rdp->cpu, rcu_gp_wq, &rdp->strict_work); } static void rcu_core_si(struct softirq_action *h) @@ -3887,6 +3899,7 @@ rcu_boot_init_percpu_data(int cpu) /* Set up local state, ensuring consistent view of global state. */ rdp->grpmask = leaf_node_cpu_bit(rdp->mynode, cpu); + INIT_WORK(&rdp->strict_work, strict_work_handler); WARN_ON_ONCE(rdp->dynticks_nesting != 1); WARN_ON_ONCE(rcu_dynticks_in_eqs(rcu_dynticks_snap(rdp))); rdp->rcu_ofl_gp_seq = rcu_state.gp_seq; diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index c96ae351688b..5831ac0b254f 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -164,6 +164,7 @@ struct rcu_data { /* period it is aware of. */ struct irq_work defer_qs_iw; /* Obtain later scheduler attention. */ bool defer_qs_iw_pending; /* Scheduler attention pending? */ + struct work_struct strict_work; /* Schedule readers for strict GPs. */ /* 2) batch handling */ struct rcu_segcblist cblist; /* Segmented callback list, with */ -- cgit v1.2.3 From aa40c138cc8f36e2f5c721fd1bdb823a1ef1a237 Mon Sep 17 00:00:00 2001 From: Paul E. McKenney Date: Mon, 10 Aug 2020 09:58:03 -0700 Subject: rcu: Report QS for outermost PREEMPT=n rcu_read_unlock() for strict GPs The CONFIG_PREEMPT=n instance of rcu_read_unlock is even more aggressively than that of CONFIG_PREEMPT=y in deferring reporting quiescent states to the RCU core. 
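(For reference, reconstructed from the unchanged context lines in the diff below: before this patch, the !CONFIG_PREEMPT_RCU unlock path is nothing more than a preempt_enable(), so the unlock itself never reports anything to RCU, and the quiescent state is noticed only later, for example from the scheduling-clock interrupt or at a context switch.)

        /* Pre-patch !CONFIG_PREEMPT_RCU form, per the diff context below. */
        static inline void __rcu_read_unlock(void)
        {
                preempt_enable();       /* No quiescent-state report here. */
        }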
This is just what is wanted in normal use because it reduces overhead, but the resulting delay is not what is wanted for kernels built with CONFIG_RCU_STRICT_GRACE_PERIOD=y. This commit therefore adds an rcu_read_unlock_strict() function that checks for exceptional conditions, and reports the newly started quiescent state if it is safe to do so, also doing a spin-delay if requested via rcutree.rcu_unlock_delay. This commit also adds a call to rcu_read_unlock_strict() from the CONFIG_PREEMPT=n instance of __rcu_read_unlock(). [ paulmck: Fixed bug located by kernel test robot ] Reported-by Jann Horn Signed-off-by: Paul E. McKenney --- include/linux/rcupdate.h | 7 +++++++ kernel/rcu/tree.c | 6 ++++++ kernel/rcu/tree_plugin.h | 24 ++++++++++++++++++------ 3 files changed, 31 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index d15d46db61f7..522529a13786 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -55,6 +55,12 @@ void __rcu_read_unlock(void); #else /* #ifdef CONFIG_PREEMPT_RCU */ +#ifdef CONFIG_TINY_RCU +#define rcu_read_unlock_strict() do { } while (0) +#else +void rcu_read_unlock_strict(void); +#endif + static inline void __rcu_read_lock(void) { preempt_disable(); @@ -63,6 +69,7 @@ static inline void __rcu_read_lock(void) static inline void __rcu_read_unlock(void) { preempt_enable(); + rcu_read_unlock_strict(); } static inline int rcu_preempt_depth(void) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 31995b3f0ed9..a295cadf7c2f 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -178,6 +178,12 @@ module_param(gp_init_delay, int, 0444); static int gp_cleanup_delay; module_param(gp_cleanup_delay, int, 0444); +// Add delay to rcu_read_unlock() for strict grace periods. +static int rcu_unlock_delay; +#ifdef CONFIG_RCU_STRICT_GRACE_PERIOD +module_param(rcu_unlock_delay, int, 0444); +#endif + /* * This rcu parameter is runtime-read-only. It reflects * a minimum allowed number of objects which can be cached diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 3f3a4ffd4df2..25a676dff5de 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -430,12 +430,6 @@ static bool rcu_preempt_has_tasks(struct rcu_node *rnp) return !list_empty(&rnp->blkd_tasks); } -// Add delay to rcu_read_unlock() for strict grace periods. -static int rcu_unlock_delay; -#ifdef CONFIG_RCU_STRICT_GRACE_PERIOD -module_param(rcu_unlock_delay, int, 0444); -#endif - /* * Report deferred quiescent states. The deferral time can * be quite short, for example, in the case of the call from @@ -784,6 +778,24 @@ dump_blkd_tasks(struct rcu_node *rnp, int ncheck) #else /* #ifdef CONFIG_PREEMPT_RCU */ +/* + * If strict grace periods are enabled, and if the calling + * __rcu_read_unlock() marks the beginning of a quiescent state, immediately + * report that quiescent state and, if requested, spin for a bit. + */ +void rcu_read_unlock_strict(void) +{ + struct rcu_data *rdp; + + if (!IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) || + irqs_disabled() || preempt_count() || !rcu_state.gp_kthread) + return; + rdp = this_cpu_ptr(&rcu_data); + rcu_report_qs_rdp(rdp->cpu, rdp); + udelay(rcu_unlock_delay); +} +EXPORT_SYMBOL_GPL(rcu_read_unlock_strict); + /* * Tell them what RCU they are running. */ -- cgit v1.2.3 From cfeac3977ab4b6222a01f79997739d2367a8cc94 Mon Sep 17 00:00:00 2001 From: Paul E. 
McKenney Date: Thu, 20 Aug 2020 11:26:14 -0700 Subject: rcu: Remove unused "cpu" parameter from rcu_report_qs_rdp() The "cpu" parameter to rcu_report_qs_rdp() is not used, with rdp->cpu being used instead. Furthermore, every call to rcu_report_qs_rdp() invokes it on rdp->cpu. This commit therefore removes this unused "cpu" parameter and converts a check of rdp->cpu against smp_processor_id() to a WARN_ON_ONCE(). Reported-by: Jann Horn Signed-off-by: Paul E. McKenney --- kernel/rcu/tree.c | 8 ++++---- kernel/rcu/tree_plugin.h | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index a295cadf7c2f..c6127651efc6 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2240,7 +2240,7 @@ rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) * structure. This must be called from the specified CPU. */ static void -rcu_report_qs_rdp(int cpu, struct rcu_data *rdp) +rcu_report_qs_rdp(struct rcu_data *rdp) { unsigned long flags; unsigned long mask; @@ -2249,6 +2249,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_data *rdp) rcu_segcblist_is_offloaded(&rdp->cblist); struct rcu_node *rnp; + WARN_ON_ONCE(rdp->cpu != smp_processor_id()); rnp = rdp->mynode; raw_spin_lock_irqsave_rcu_node(rnp, flags); if (rdp->cpu_no_qs.b.norm || rdp->gp_seq != rnp->gp_seq || @@ -2265,8 +2266,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_data *rdp) return; } mask = rdp->grpmask; - if (rdp->cpu == smp_processor_id()) - rdp->core_needs_qs = false; + rdp->core_needs_qs = false; if ((rnp->qsmask & mask) == 0) { raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } else { @@ -2315,7 +2315,7 @@ rcu_check_quiescent_state(struct rcu_data *rdp) * Tell RCU we are done (but rcu_report_qs_rdp() will be the * judge of that). */ - rcu_report_qs_rdp(rdp->cpu, rdp); + rcu_report_qs_rdp(rdp); } /* diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 25a676dff5de..ca31be019f55 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -461,7 +461,7 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags) t->rcu_read_unlock_special.s = 0; if (special.b.need_qs) { if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD)) { - rcu_report_qs_rdp(rdp->cpu, rdp); + rcu_report_qs_rdp(rdp); udelay(rcu_unlock_delay); } else { rcu_qs(); @@ -791,7 +791,7 @@ void rcu_read_unlock_strict(void) irqs_disabled() || preempt_count() || !rcu_state.gp_kthread) return; rdp = this_cpu_ptr(&rcu_data); - rcu_report_qs_rdp(rdp->cpu, rdp); + rcu_report_qs_rdp(rdp); udelay(rcu_unlock_delay); } EXPORT_SYMBOL_GPL(rcu_read_unlock_strict); -- cgit v1.2.3 From 83224afd11d71e0d6effb86fe1ab5725d5415251 Mon Sep 17 00:00:00 2001 From: Paul E. McKenney Date: Wed, 17 Jun 2020 13:22:17 -0700 Subject: rcutorture: Remove KCSAN stubs KCSAN is now in mainline, so this commit removes the stubs for the data_race(), ASSERT_EXCLUSIVE_WRITER(), and ASSERT_EXCLUSIVE_ACCESS() macros. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 13 ------------- 1 file changed, 13 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index f453bf8d2f1e..db3786133644 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -52,19 +52,6 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Paul E.
McKenney and Josh Triplett "); -#ifndef data_race -#define data_race(expr) \ - ({ \ - expr; \ - }) -#endif -#ifndef ASSERT_EXCLUSIVE_WRITER -#define ASSERT_EXCLUSIVE_WRITER(var) do { } while (0) -#endif -#ifndef ASSERT_EXCLUSIVE_ACCESS -#define ASSERT_EXCLUSIVE_ACCESS(var) do { } while (0) -#endif - /* Bits for ->extendables field, extendables param, and related definitions. */ #define RCUTORTURE_RDR_SHIFT 8 /* Put SRCU index in upper bits. */ #define RCUTORTURE_RDR_MASK ((1 << RCUTORTURE_RDR_SHIFT) - 1) -- cgit v1.2.3 From 959954df0ca7da2111c3fb67a666681798d15b9d Mon Sep 17 00:00:00 2001 From: Joel Fernandes (Google) Date: Thu, 18 Jun 2020 16:29:55 -0400 Subject: rcutorture: Output number of elapsed grace periods This commit adds code to print the grace-period number at the start of the test along with both the grace-period number and the number of elapsed grace periods at the end of the test. Note that variants of RCU without the notion of a grace-period number (for example, Tiny RCU) just print zeroes. [ paulmck: Adjust commit log. ] Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index db3786133644..c8206ff6007f 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -172,6 +172,7 @@ static long n_barrier_successes; /* did rcu_barrier test succeed? */ static unsigned long n_read_exits; static struct list_head rcu_torture_removed; static unsigned long shutdown_jiffies; +static unsigned long start_gp_seq; static int rcu_torture_writer_state; #define RTWS_FIXED_DELAY 0 @@ -2469,8 +2470,9 @@ rcu_torture_cleanup(void) rcutorture_get_gp_data(cur_ops->ttype, &flags, &gp_seq); srcutorture_get_gp_data(cur_ops->ttype, srcu_ctlp, &flags, &gp_seq); - pr_alert("%s: End-test grace-period state: g%lu f%#x\n", - cur_ops->name, gp_seq, flags); + pr_alert("%s: End-test grace-period state: g%ld f%#x total-gps=%ld\n", + cur_ops->name, (long)gp_seq, flags, + rcutorture_seq_diff(gp_seq, start_gp_seq)); torture_stop_kthread(rcu_torture_stats, stats_task); torture_stop_kthread(rcu_torture_fqs, fqs_task); if (rcu_torture_can_boost()) @@ -2594,6 +2596,8 @@ rcu_torture_init(void) long i; int cpu; int firsterr = 0; + int flags = 0; + unsigned long gp_seq = 0; static struct rcu_torture_ops *torture_ops[] = { &rcu_ops, &rcu_busted_ops, &srcu_ops, &srcud_ops, &busted_srcud_ops, &tasks_ops, &tasks_rude_ops, @@ -2636,6 +2640,11 @@ rcu_torture_init(void) nrealreaders = 1; } rcu_torture_print_module_parms(cur_ops, "Start of test"); + rcutorture_get_gp_data(cur_ops->ttype, &flags, &gp_seq); + srcutorture_get_gp_data(cur_ops->ttype, srcu_ctlp, &flags, &gp_seq); + start_gp_seq = gp_seq; + pr_alert("%s: Start-test grace-period state: g%ld f%#x\n", + cur_ops->name, (long)gp_seq, flags); /* Set up the freelist. */ -- cgit v1.2.3 From d49bed9abc3454bd123cbe974ecbeae119701b92 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Fri, 3 Jul 2020 13:05:27 +0800 Subject: locktorture: Make function torture_percpu_rwsem_init() static The sparse tool complains as follows: kernel/locking/locktorture.c:569:6: warning: symbol 'torture_percpu_rwsem_init' was not declared. Should it be static? And this function is not used outside of locktorture.c, so this commit marks it static. Signed-off-by: Wei Yongjun Signed-off-by: Paul E.
McKenney --- kernel/locking/locktorture.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index 9cfa5e89cff7..62d215b2e39f 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -566,7 +566,7 @@ static struct lock_torture_ops rwsem_lock_ops = { #include static struct percpu_rw_semaphore pcpu_rwsem; -void torture_percpu_rwsem_init(void) +static void torture_percpu_rwsem_init(void) { BUG_ON(percpu_init_rwsem(&pcpu_rwsem)); } -- cgit v1.2.3 From c8fa63714763b7795a3f5fb7ed6d000763e6dccc Mon Sep 17 00:00:00 2001 From: Paul E. McKenney Date: Sun, 19 Jul 2020 14:40:31 -0700 Subject: rcutorture: Properly set rcu_fwds for OOM handling The conversion of rcu_fwds to dynamic allocation failed to actually allocate the required structure. This commit therefore allocates it, frees it, and updates rcu_fwds accordingly. While in the area, it abstracts the cleanup actions into rcu_torture_fwd_prog_cleanup(). Fixes: 5155be9994e5 ("rcutorture: Dynamically allocate rcu_fwds structure") Reported-by: kernel test robot Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index c8206ff6007f..7942be453a14 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -2148,9 +2148,20 @@ static int __init rcu_torture_fwd_prog_init(void) return -ENOMEM; spin_lock_init(&rfp->rcu_fwd_lock); rfp->rcu_fwd_cb_tail = &rfp->rcu_fwd_cb_head; + rcu_fwds = rfp; return torture_create_kthread(rcu_torture_fwd_prog, rfp, fwd_prog_task); } +static void rcu_torture_fwd_prog_cleanup(void) +{ + struct rcu_fwd *rfp; + + torture_stop_kthread(rcu_torture_fwd_prog, fwd_prog_task); + rfp = rcu_fwds; + rcu_fwds = NULL; + kfree(rfp); +} + /* Callback function for RCU barrier testing. */ static void rcu_torture_barrier_cbf(struct rcu_head *rcu) { @@ -2448,7 +2459,7 @@ rcu_torture_cleanup(void) show_rcu_gp_kthreads(); rcu_torture_read_exit_cleanup(); rcu_torture_barrier_cleanup(); - torture_stop_kthread(rcu_torture_fwd_prog, fwd_prog_task); + rcu_torture_fwd_prog_cleanup(); torture_stop_kthread(rcu_torture_stall, stall_task); torture_stop_kthread(rcu_torture_writer, writer_task); -- cgit v1.2.3 From 57f602022e82ee8fa6476d0e16ddbaf3eb86b245 Mon Sep 17 00:00:00 2001 From: Paul E. McKenney Date: Mon, 20 Jul 2020 08:34:07 -0700 Subject: rcutorture: Properly synchronize with OOM notifier The current rcutorture forward-progress code assumes that it is the only cause of out-of-memory (OOM) events. For script-based rcutorture testing, this assumption is in fact correct. However, testing based on modprobe/rmmod might well encounter external OOM events, which could happen at any time. This commit therefore properly synchronizes the interaction between rcutorture's forward-progress testing and its OOM notifier by adding a global mutex. Signed-off-by: Paul E. 
McKenney --- kernel/rcu/rcutorture.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 7942be453a14..2b3f04e0af03 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1796,6 +1796,7 @@ struct rcu_fwd { unsigned long rcu_launder_gp_seq_start; }; +static DEFINE_MUTEX(rcu_fwd_mutex); static struct rcu_fwd *rcu_fwds; static bool rcu_fwd_emergency_stop; @@ -2062,8 +2063,14 @@ static void rcu_torture_fwd_prog_cr(struct rcu_fwd *rfp) static int rcutorture_oom_notify(struct notifier_block *self, unsigned long notused, void *nfreed) { - struct rcu_fwd *rfp = rcu_fwds; + struct rcu_fwd *rfp; + mutex_lock(&rcu_fwd_mutex); + rfp = rcu_fwds; + if (!rfp) { + mutex_unlock(&rcu_fwd_mutex); + return NOTIFY_OK; + } WARN(1, "%s invoked upon OOM during forward-progress testing.\n", __func__); rcu_torture_fwd_cb_hist(rfp); @@ -2081,6 +2088,7 @@ static int rcutorture_oom_notify(struct notifier_block *self, smp_mb(); /* Frees before return to avoid redoing OOM. */ (*(unsigned long *)nfreed)++; /* Forward progress CBs freed! */ pr_info("%s returning after OOM processing.\n", __func__); + mutex_unlock(&rcu_fwd_mutex); return NOTIFY_OK; } @@ -2148,7 +2156,9 @@ static int __init rcu_torture_fwd_prog_init(void) return -ENOMEM; spin_lock_init(&rfp->rcu_fwd_lock); rfp->rcu_fwd_cb_tail = &rfp->rcu_fwd_cb_head; + mutex_lock(&rcu_fwd_mutex); rcu_fwds = rfp; + mutex_unlock(&rcu_fwd_mutex); return torture_create_kthread(rcu_torture_fwd_prog, rfp, fwd_prog_task); } @@ -2158,7 +2168,9 @@ static void rcu_torture_fwd_prog_cleanup(void) torture_stop_kthread(rcu_torture_fwd_prog, fwd_prog_task); rfp = rcu_fwds; + mutex_lock(&rcu_fwd_mutex); rcu_fwds = NULL; + mutex_unlock(&rcu_fwd_mutex); kfree(rfp); } -- cgit v1.2.3 From 58db5785b0d76be4582a32a7900acce88e691d36 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 16 Jul 2020 15:38:56 +0100 Subject: refperf: Avoid null pointer dereference when buf fails to allocate Currently in the unlikely event that buf fails to be allocated it is dereferenced a few times. Use the errexit flag to determine if buf should be written to to avoid the null pointer dereferences. Addresses-Coverity: ("Dereference after null check") Fixes: f518f154ecef ("refperf: Dynamically allocate experiment-summary output buffer") Signed-off-by: Colin Ian King Signed-off-by: Paul E. McKenney --- kernel/rcu/refscale.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c index d9291f883b54..952595c678b3 100644 --- a/kernel/rcu/refscale.c +++ b/kernel/rcu/refscale.c @@ -546,9 +546,11 @@ static int main_func(void *arg) // Print the average of all experiments SCALEOUT("END OF TEST. Calculating average duration per loop (nanoseconds)...\n"); - buf[0] = 0; - strcat(buf, "\n"); - strcat(buf, "Runs\tTime(ns)\n"); + if (!errexit) { + buf[0] = 0; + strcat(buf, "\n"); + strcat(buf, "Runs\tTime(ns)\n"); + } for (exp = 0; exp < nruns; exp++) { u64 avg; -- cgit v1.2.3 From 299c7d94f635ab93ffb0468aec6b6e2176ec5cbf Mon Sep 17 00:00:00 2001 From: Paul E. McKenney Date: Wed, 22 Jul 2020 10:45:12 -0700 Subject: rcutorture: Hoist OOM registry up one level Currently, registering and unregistering the OOM notifier is done right before and after the test, respectively. 
This will not work well for multi-threaded tests, so this commit hoists this registering and unregistering up into the rcu_torture_fwd_prog_init() and rcu_torture_fwd_prog_cleanup() functions. Signed-off-by: Paul E. McKenney --- kernel/rcu/rcutorture.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 2b3f04e0af03..983f82fccb18 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -2110,13 +2110,11 @@ static int rcu_torture_fwd_prog(void *args) do { schedule_timeout_interruptible(fwd_progress_holdoff * HZ); WRITE_ONCE(rcu_fwd_emergency_stop, false); - register_oom_notifier(&rcutorture_oom_nb); if (!IS_ENABLED(CONFIG_TINY_RCU) || rcu_inkernel_boot_has_ended()) rcu_torture_fwd_prog_nr(rfp, &tested, &tested_tries); if (rcu_inkernel_boot_has_ended()) rcu_torture_fwd_prog_cr(rfp); - unregister_oom_notifier(&rcutorture_oom_nb); /* Avoid slow periods, better to test when busy. */ stutter_wait("rcu_torture_fwd_prog"); @@ -2159,6 +2157,7 @@ static int __init rcu_torture_fwd_prog_init(void) mutex_lock(&rcu_fwd_mutex); rcu_fwds = rfp; mutex_unlock(&rcu_fwd_mutex); + register_oom_notifier(&rcutorture_oom_nb); return torture_create_kthread(rcu_torture_fwd_prog, rfp, fwd_prog_task); } @@ -2171,6 +2170,7 @@ static void rcu_torture_fwd_prog_cleanup(void) mutex_lock(&rcu_fwd_mutex); rcu_fwds = NULL; mutex_unlock(&rcu_fwd_mutex); + unregister_oom_notifier(&rcutorture_oom_nb); kfree(rfp); } -- cgit v1.2.3 From d685514260e21aabd65a9aa8be045766bdaa0549 Mon Sep 17 00:00:00 2001 From: Paul E. McKenney Date: Tue, 11 Aug 2020 10:33:39 -0700 Subject: rcutorture: Allow pointer leaks to test diagnostic code This commit adds an rcutorture.leakpointer module parameter that intentionally leaks an RCU-protected pointer out of the RCU read-side critical section and checks to see if the corresponding grace period has elapsed, emitting a WARN_ON_ONCE() if so. This module parameter can be used to test facilities like CONFIG_RCU_STRICT_GRACE_PERIOD that end grace periods quickly. While in the area, also document rcutorture.irqreader, which was previously left out. Reported-by Jann Horn Signed-off-by: Paul E. McKenney --- Documentation/admin-guide/kernel-parameters.txt | 12 ++++++++++++ kernel/rcu/rcutorture.c | 4 ++++ 2 files changed, 16 insertions(+) (limited to 'kernel') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index bdc1f33fd3d1..6d984f153669 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -4269,6 +4269,18 @@ are zero, rcutorture acts as if is interpreted they are all non-zero. + rcutorture.irqreader= [KNL] + Run RCU readers from irq handlers, or, more + accurately, from a timer handler. Not all RCU + flavors take kindly to this sort of thing. + + rcutorture.leakpointer= [KNL] + Leak an RCU-protected pointer out of the reader. + This can of course result in splats, and is + intended to test the ability of things like + CONFIG_RCU_STRICT_GRACE_PERIOD=y to detect + such leaks. + rcutorture.n_barrier_cbs= [KNL] Set callbacks/threads for rcu_barrier() testing. 
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 983f82fccb18..916ea4f66e4b 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -87,6 +87,7 @@ torture_param(bool, gp_normal, false, "Use normal (non-expedited) GP wait primitives"); torture_param(bool, gp_sync, false, "Use synchronous GP wait primitives"); torture_param(int, irqreader, 1, "Allow RCU readers from irq handlers"); +torture_param(int, leakpointer, 0, "Leak pointer dereferences from readers"); torture_param(int, n_barrier_cbs, 0, "# of callbacks/kthreads for barrier testing"); torture_param(int, nfakewriters, 4, "Number of RCU fake writer threads"); @@ -1401,6 +1402,9 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp) preempt_enable(); rcutorture_one_extend(&readstate, 0, trsp, rtrsp); WARN_ON_ONCE(readstate & RCUTORTURE_RDR_MASK); + // This next splat is expected behavior if leakpointer, especially + // for CONFIG_RCU_STRICT_GRACE_PERIOD=y kernels. + WARN_ON_ONCE(leakpointer && READ_ONCE(p->rtort_pipe_count) > 1); /* If error or close call, record the sequence of reader protections. */ if ((pipe_count > 1 || completed > 1) && !xchg(&err_segs_recorded, 1)) { -- cgit v1.2.3 From 53922270d21de707a1a0ffaf1e07644e77fcb8db Mon Sep 17 00:00:00 2001 From: Joel Fernandes (Google) Date: Thu, 18 Jun 2020 16:29:49 -0400 Subject: rcu/segcblist: Prevent useless GP start if no CBs to accelerate The rcu_segcblist_accelerate() function returns true iff it is necessary to request another grace period. A tracing session showed that this function unnecessarily requests grace periods. For example, consider the following sequence of events: 1. Callbacks are queued only on the NEXT segment of CPU A's callback list. 2. CPU A runs RCU_SOFTIRQ, accelerating these callbacks from NEXT to WAIT. 3. Thus rcu_segcblist_accelerate() returns true, requesting grace period N. 4. RCU's grace-period kthread wakes up on CPU B and starts grace period N. 5. CPU A notices the new grace period and invokes RCU_SOFTIRQ. 6. CPU A's RCU_SOFTIRQ again invokes rcu_segcblist_accelerate(), but there are no new callbacks. However, rcu_segcblist_accelerate() nevertheless (uselessly) requests a new grace period N+1. This extra grace period results in additional lock contention and also additional wakeups, all for no good reason. This commit therefore adds a check to rcu_segcblist_accelerate() that prevents the return of true when there are no new callbacks.
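For readers following the NEXT/WAIT terminology above, a simplified view of the segmented callback list may help; the sketch below closely follows include/linux/rcu_segcblist.h, but the length counters and flags are elided:

	/* Simplified view of one CPU's segmented callback list. */
	#define RCU_DONE_TAIL		0	/* CBs whose grace period has ended. */
	#define RCU_WAIT_TAIL		1	/* CBs waiting for the current GP. */
	#define RCU_NEXT_READY_TAIL	2	/* CBs waiting for the next GP. */
	#define RCU_NEXT_TAIL		3	/* CBs not yet associated with a GP. */
	#define RCU_CBLIST_NSEGS	4

	struct rcu_segcblist {
		struct rcu_head *head;				/* One singly linked list of callbacks... */
		struct rcu_head **tails[RCU_CBLIST_NSEGS];	/* ...split into segments by tail pointers. */
		unsigned long gp_seq[RCU_CBLIST_NSEGS];		/* Grace period each segment is waiting for. */
		/* Length counters and flags elided. */
	};

Acceleration moves the boundaries between segments (and updates the corresponding gp_seq[] entries) rather than copying callbacks, which is why an empty NEXT segment means there is nothing new to associate with a future grace period.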
This change reduces the number of grace periods (GPs) and wakeups in each of eleven five-second rcutorture runs as follows:
+----+-------------------+-------------------+
| #  | Number of GPs     | Number of Wakeups |
+====+=========+=========+=========+=========+
| 1  | With    | Without | With    | Without |
+----+---------+---------+---------+---------+
| 2  | 75      | 89      | 113     | 119     |
+----+---------+---------+---------+---------+
| 3  | 62      | 91      | 105     | 123     |
+----+---------+---------+---------+---------+
| 4  | 60      | 79      | 98      | 110     |
+----+---------+---------+---------+---------+
| 5  | 63      | 79      | 99      | 112     |
+----+---------+---------+---------+---------+
| 6  | 57      | 89      | 96      | 123     |
+----+---------+---------+---------+---------+
| 7  | 64      | 85      | 97      | 118     |
+----+---------+---------+---------+---------+
| 8  | 58      | 83      | 98      | 113     |
+----+---------+---------+---------+---------+
| 9  | 57      | 77      | 89      | 104     |
+----+---------+---------+---------+---------+
| 10 | 66      | 82      | 98      | 119     |
+----+---------+---------+---------+---------+
| 11 | 52      | 82      | 83      | 117     |
+----+---------+---------+---------+---------+
The reduction in the number of wakeups ranges from 5% to 40%. Cc: urezki@gmail.com [ paulmck: Rework commit log and comment. ] Signed-off-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney --- kernel/rcu/rcu_segcblist.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c index 9a0f66133b4b..2d2a6b6b9dfb 100644 --- a/kernel/rcu/rcu_segcblist.c +++ b/kernel/rcu/rcu_segcblist.c @@ -475,8 +475,16 @@ bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, unsigned long seq) * Also advance to the oldest segment of callbacks whose * ->gp_seq[] completion is at or after that passed in via "seq", * skipping any empty segments. + * + * Note that segment "i" (and any lower-numbered segments + * containing older callbacks) will be unaffected, and their + * grace-period numbers remain unchanged. For example, if i == + * WAIT_TAIL, then neither WAIT_TAIL nor DONE_TAIL will be touched. + * Instead, the CBs in NEXT_TAIL will be merged with those in + * NEXT_READY_TAIL and the grace-period number of NEXT_READY_TAIL + * would be updated. NEXT_TAIL would then be empty. */ - if (++i >= RCU_NEXT_TAIL) + if (rcu_segcblist_restempty(rsclp, i) || ++i >= RCU_NEXT_TAIL) return false; /* -- cgit v1.2.3 From 70060b8770d34f83e9fa4c3526db013dd2773611 Mon Sep 17 00:00:00 2001 From: Zqiang Date: Fri, 14 Aug 2020 14:45:57 +0800 Subject: rcu: Shrink each possible cpu krcp CPUs can go offline shortly after kfree_call_rcu() has been invoked, which can leave memory stranded until those CPUs come back online. This commit therefore drains the krcp of each CPU, not just the ones that happen to be online. Acked-by: Joel Fernandes Signed-off-by: Zqiang Signed-off-by: Paul E.
McKenney --- kernel/rcu/tree.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 232362293678..92450642c4d8 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3450,7 +3450,7 @@ kfree_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc) unsigned long count = 0; /* Snapshot count of all CPUs */ - for_each_online_cpu(cpu) { + for_each_possible_cpu(cpu) { struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); count += READ_ONCE(krcp->count); @@ -3465,7 +3465,7 @@ kfree_rcu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) int cpu, freed = 0; unsigned long flags; - for_each_online_cpu(cpu) { + for_each_possible_cpu(cpu) { int count; struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); @@ -3498,7 +3498,7 @@ void __init kfree_rcu_scheduler_running(void) int cpu; unsigned long flags; - for_each_online_cpu(cpu) { + for_each_possible_cpu(cpu) { struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu); raw_spin_lock_irqsave(&krcp->lock, flags); -- cgit v1.2.3 From e48c15b796d412ede883bb2ef7779b2a142f7962 Mon Sep 17 00:00:00 2001 From: Paul E. McKenney Date: Mon, 29 Jun 2020 17:21:32 -0700 Subject: smp: Add source and destination CPUs to __call_single_data This commit adds a destination CPU to __call_single_data, and is inspired by an earlier commit by Peter Zijlstra. This version adds #ifdef to permit use by 32-bit systems and supplies the destination CPU for all smp_call_function*() requests, not just smp_call_function_single(). If need be, 32-bit systems could be accommodated by shrinking the flags field to 16 bits (the atomic_t variant is currently unused) and by providing only eight bits for CPU on such systems. It is not clear that the addition of the fields to __call_single_node is really needed. [ paulmck: Apply Boqun Feng feedback on 32-bit builds. ] Link: https://lore.kernel.org/lkml/20200615164048.GC2531@hirez.programming.kicks-ass.net/ Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Sebastian Andrzej Siewior Cc: Frederic Weisbecker Signed-off-by: Paul E.
McKenney --- include/linux/smp.h | 3 +++ include/linux/smp_types.h | 3 +++ kernel/smp.c | 6 ++++++ 3 files changed, 12 insertions(+) (limited to 'kernel') diff --git a/include/linux/smp.h b/include/linux/smp.h index 80d557ef8a11..9f13966d3d92 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -26,6 +26,9 @@ struct __call_single_data { struct { struct llist_node llist; unsigned int flags; +#ifdef CONFIG_64BIT + u16 src, dst; +#endif }; }; smp_call_func_t func; diff --git a/include/linux/smp_types.h b/include/linux/smp_types.h index 364b3ae3e41d..2e8461af8df6 100644 --- a/include/linux/smp_types.h +++ b/include/linux/smp_types.h @@ -61,6 +61,9 @@ struct __call_single_node { unsigned int u_flags; atomic_t a_flags; }; +#ifdef CONFIG_64BIT + u16 src, dst; +#endif }; #endif /* __LINUX_SMP_TYPES_H */ diff --git a/kernel/smp.c b/kernel/smp.c index d0ae8eb6bf8b..865a876f83ce 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -375,6 +375,9 @@ int smp_call_function_single(int cpu, smp_call_func_t func, void *info, csd->func = func; csd->info = info; +#ifdef CONFIG_64BIT + csd->dst = cpu; +#endif err = generic_exec_single(cpu, csd); @@ -540,6 +543,9 @@ static void smp_call_function_many_cond(const struct cpumask *mask, csd->flags |= CSD_TYPE_SYNC; csd->func = func; csd->info = info; +#ifdef CONFIG_64BIT + csd->dst = cpu; +#endif if (llist_add(&csd->llist, &per_cpu(call_single_queue, cpu))) __cpumask_set_cpu(cpu, cfd->cpumask_ipi); } -- cgit v1.2.3 From 35feb60474bf4f7fa7840e14fc7fd344996b919d Mon Sep 17 00:00:00 2001 From: Paul E. McKenney Date: Tue, 30 Jun 2020 13:22:54 -0700 Subject: kernel/smp: Provide CSD lock timeout diagnostics This commit causes csd_lock_wait() to emit diagnostics when a CPU fails to respond quickly enough to one of the smp_call_function() family of function calls. These diagnostics are enabled by a new CSD_LOCK_WAIT_DEBUG Kconfig option that depends on DEBUG_KERNEL. This commit was inspired by an earlier patch by Josef Bacik. [ paulmck: Fix for syzbot+0f719294463916a3fc0e@syzkaller.appspotmail.com ] [ paulmck: Fix KASAN use-after-free issue reported by Qian Cai. ] [ paulmck: Fix botched nr_cpu_ids comparison per Dan Carpenter. ] [ paulmck: Apply Peter Zijlstra feedback. ] Link: https://lore.kernel.org/lkml/00000000000042f21905a991ecea@google.com Link: https://lore.kernel.org/lkml/0000000000002ef21705a9933cf3@google.com Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Sebastian Andrzej Siewior Signed-off-by: Paul E. McKenney --- kernel/smp.c | 132 +++++++++++++++++++++++++++++++++++++++++++++++++++++- lib/Kconfig.debug | 11 +++++ 2 files changed, 141 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/smp.c b/kernel/smp.c index 865a876f83ce..c5d31885bd30 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -20,6 +20,9 @@ #include #include #include +#include +#include +#include #include "smpboot.h" #include "sched/smp.h" @@ -96,6 +99,103 @@ void __init call_function_init(void) smpcfd_prepare_cpu(smp_processor_id()); } +#ifdef CONFIG_CSD_LOCK_WAIT_DEBUG + +static DEFINE_PER_CPU(call_single_data_t *, cur_csd); +static DEFINE_PER_CPU(smp_call_func_t, cur_csd_func); +static DEFINE_PER_CPU(void *, cur_csd_info); + +#define CSD_LOCK_TIMEOUT (5ULL * NSEC_PER_SEC) +atomic_t csd_bug_count = ATOMIC_INIT(0); + +/* Record current CSD work for current CPU, NULL to erase. */ +static void csd_lock_record(call_single_data_t *csd) +{ + if (!csd) { + smp_mb(); /* NULL cur_csd after unlock. 
*/ + __this_cpu_write(cur_csd, NULL); + return; + } + __this_cpu_write(cur_csd_func, csd->func); + __this_cpu_write(cur_csd_info, csd->info); + smp_wmb(); /* func and info before csd. */ + __this_cpu_write(cur_csd, csd); + smp_mb(); /* Update cur_csd before function call. */ + /* Or before unlock, as the case may be. */ +} + +static __always_inline int csd_lock_wait_getcpu(call_single_data_t *csd) +{ + unsigned int csd_type; + + csd_type = CSD_TYPE(csd); + if (csd_type == CSD_TYPE_ASYNC || csd_type == CSD_TYPE_SYNC) + return csd->dst; /* Other CSD_TYPE_ values might not have ->dst. */ + return -1; +} + +/* + * Complain if too much time spent waiting. Note that only + * the CSD_TYPE_SYNC/ASYNC types provide the destination CPU, + * so waiting on other types gets much less information. + */ +static __always_inline bool csd_lock_wait_toolong(call_single_data_t *csd, u64 ts0, u64 *ts1, int *bug_id) +{ + int cpu = -1; + int cpux; + bool firsttime; + u64 ts2, ts_delta; + call_single_data_t *cpu_cur_csd; + unsigned int flags = READ_ONCE(csd->flags); + + if (!(flags & CSD_FLAG_LOCK)) { + if (!unlikely(*bug_id)) + return true; + cpu = csd_lock_wait_getcpu(csd); + pr_alert("csd: CSD lock (#%d) got unstuck on CPU#%02d, CPU#%02d released the lock.\n", + *bug_id, raw_smp_processor_id(), cpu); + return true; + } + + ts2 = sched_clock(); + ts_delta = ts2 - *ts1; + if (likely(ts_delta <= CSD_LOCK_TIMEOUT)) + return false; + + firsttime = !*bug_id; + if (firsttime) + *bug_id = atomic_inc_return(&csd_bug_count); + cpu = csd_lock_wait_getcpu(csd); + if (WARN_ONCE(cpu < 0 || cpu >= nr_cpu_ids, "%s: cpu = %d\n", __func__, cpu)) + cpux = 0; + else + cpux = cpu; + cpu_cur_csd = smp_load_acquire(&per_cpu(cur_csd, cpux)); /* Before func and info. */ + pr_alert("csd: %s non-responsive CSD lock (#%d) on CPU#%d, waiting %llu ns for CPU#%02d %pS(%ps).\n", + firsttime ? "Detected" : "Continued", *bug_id, raw_smp_processor_id(), ts2 - ts0, + cpu, csd->func, csd->info); + if (cpu_cur_csd && csd != cpu_cur_csd) { + pr_alert("\tcsd: CSD lock (#%d) handling prior %pS(%ps) request.\n", + *bug_id, READ_ONCE(per_cpu(cur_csd_func, cpux)), + READ_ONCE(per_cpu(cur_csd_info, cpux))); + } else { + pr_alert("\tcsd: CSD lock (#%d) %s.\n", + *bug_id, !cpu_cur_csd ? "unresponsive" : "handling this request"); + } + if (cpu >= 0) { + if (!trigger_single_cpu_backtrace(cpu)) + dump_cpu_task(cpu); + if (!cpu_cur_csd) { + pr_alert("csd: Re-sending CSD lock (#%d) IPI from CPU#%02d to CPU#%02d\n", *bug_id, raw_smp_processor_id(), cpu); + arch_send_call_function_single_ipi(cpu); + } + } + dump_stack(); + *ts1 = ts2; + + return false; +} + /* * csd_lock/csd_unlock used to serialize access to per-cpu csd resources * @@ -103,10 +203,30 @@ void __init call_function_init(void) * previous function call. For multi-cpu calls its even more interesting * as we'll have to ensure no other cpu is observing our csd. 
*/ +static __always_inline void csd_lock_wait(call_single_data_t *csd) +{ + int bug_id = 0; + u64 ts0, ts1; + + ts1 = ts0 = sched_clock(); + for (;;) { + if (csd_lock_wait_toolong(csd, ts0, &ts1, &bug_id)) + break; + cpu_relax(); + } + smp_acquire__after_ctrl_dep(); +} + +#else +static void csd_lock_record(call_single_data_t *csd) +{ +} + static __always_inline void csd_lock_wait(call_single_data_t *csd) { smp_cond_load_acquire(&csd->flags, !(VAL & CSD_FLAG_LOCK)); } +#endif static __always_inline void csd_lock(call_single_data_t *csd) { @@ -166,9 +286,11 @@ static int generic_exec_single(int cpu, call_single_data_t *csd) * We can unlock early even for the synchronous on-stack case, * since we're doing this from the same CPU.. */ + csd_lock_record(csd); csd_unlock(csd); local_irq_save(flags); func(info); + csd_lock_record(NULL); local_irq_restore(flags); return 0; } @@ -268,8 +390,10 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline) entry = &csd_next->llist; } + csd_lock_record(csd); func(info); csd_unlock(csd); + csd_lock_record(NULL); } else { prev = &csd->llist; } @@ -296,8 +420,10 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline) smp_call_func_t func = csd->func; void *info = csd->info; + csd_lock_record(csd); csd_unlock(csd); func(info); + csd_lock_record(NULL); } else if (type == CSD_TYPE_IRQ_WORK) { irq_work_single(csd); } @@ -375,7 +501,8 @@ int smp_call_function_single(int cpu, smp_call_func_t func, void *info, csd->func = func; csd->info = info; -#ifdef CONFIG_64BIT +#ifdef CONFIG_CSD_LOCK_WAIT_DEBUG + csd->src = smp_processor_id(); csd->dst = cpu; #endif @@ -543,7 +670,8 @@ static void smp_call_function_many_cond(const struct cpumask *mask, csd->flags |= CSD_TYPE_SYNC; csd->func = func; csd->info = info; -#ifdef CONFIG_64BIT +#ifdef CONFIG_CSD_LOCK_WAIT_DEBUG + csd->src = smp_processor_id(); csd->dst = cpu; #endif if (llist_add(&csd->llist, &per_cpu(call_single_queue, cpu))) diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index e068c3c7189a..86a35fdfe021 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1367,6 +1367,17 @@ config WW_MUTEX_SELFTEST Say M if you want these self tests to build as a module. Say N if you are unsure. +config CSD_LOCK_WAIT_DEBUG + bool "Debugging for csd_lock_wait(), called from smp_call_function*()" + depends on DEBUG_KERNEL + depends on 64BIT + default n + help + This option enables debug prints when CPUs are slow to respond + to the smp_call_function*() IPI wrappers. These debug prints + include the IPI handler function currently executing (if any) + and relevant stack traces. + endmenu # lock debugging config TRACE_IRQFLAGS -- cgit v1.2.3 From 2b722160f1a7929f38dfb648c7bbb45f96e65a5b Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Mon, 6 Jul 2020 21:49:41 +0800 Subject: smp: Make symbol 'csd_bug_count' static The sparse tool complains as follows: kernel/smp.c:107:10: warning: symbol 'csd_bug_count' was not declared. Should it be static? Because variable is not used outside of smp.c, this commit marks it static. Reported-by: Hulk Robot Signed-off-by: Wei Yongjun Signed-off-by: Paul E. 
McKenney Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Sebastian Andrzej Siewior --- kernel/smp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/smp.c b/kernel/smp.c index c5d31885bd30..b25383d16e8e 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -106,7 +106,7 @@ static DEFINE_PER_CPU(smp_call_func_t, cur_csd_func); static DEFINE_PER_CPU(void *, cur_csd_info); #define CSD_LOCK_TIMEOUT (5ULL * NSEC_PER_SEC) -atomic_t csd_bug_count = ATOMIC_INIT(0); +static atomic_t csd_bug_count = ATOMIC_INIT(0); /* Record current CSD work for current CPU, NULL to erase. */ static void csd_lock_record(call_single_data_t *csd) -- cgit v1.2.3
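Stepping back from the diffs, the csd_lock_wait() changes in the three smp commits above share one pattern: spin on a flag, but periodically emit a diagnostic instead of hanging silently once the wait exceeds a threshold. The standalone user-space sketch below is only an analog of that pattern (it shares no code with kernel/smp.c and substitutes CLOCK_MONOTONIC for sched_clock()), but it shows the shape of the loop:

	/* User-space analog of the "complain if waiting too long" pattern. */
	#include <pthread.h>
	#include <stdatomic.h>
	#include <stdbool.h>
	#include <stdint.h>
	#include <stdio.h>
	#include <time.h>
	#include <unistd.h>

	#define WAIT_TIMEOUT_NS (5ULL * 1000 * 1000 * 1000)	/* 5 seconds, like CSD_LOCK_TIMEOUT. */

	static uint64_t now_ns(void)
	{
		struct timespec ts;

		clock_gettime(CLOCK_MONOTONIC, &ts);
		return (uint64_t)ts.tv_sec * 1000000000ULL + ts.tv_nsec;
	}

	/* Wait for *flag to clear, complaining each time WAIT_TIMEOUT_NS elapses. */
	static void wait_with_diagnostics(atomic_bool *flag)
	{
		uint64_t ts0 = now_ns(), ts1 = ts0;
		int complaints = 0;

		while (atomic_load_explicit(flag, memory_order_acquire)) {
			uint64_t ts2 = now_ns();

			if (ts2 - ts1 > WAIT_TIMEOUT_NS) {
				fprintf(stderr, "still waiting after %llu ns (complaint #%d)\n",
					(unsigned long long)(ts2 - ts0), ++complaints);
				ts1 = ts2;	/* Re-arm so a stuck wait complains periodically. */
			}
		}
	}

	static atomic_bool busy = true;

	static void *worker(void *arg)
	{
		sleep(12);	/* Simulate a slow responder; expect complaints near 5s and 10s. */
		atomic_store_explicit(&busy, false, memory_order_release);
		return NULL;
	}

	int main(void)
	{
		pthread_t tid;

		pthread_create(&tid, NULL, worker, NULL);
		wait_with_diagnostics(&busy);
		pthread_join(tid, NULL);
		puts("done");
		return 0;
	}

As in csd_lock_wait_toolong(), the key design choice is re-arming the timestamp after each complaint, so a persistently stuck wait produces periodic progress reports rather than a single message.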