From 1887aa07b6765d345dd79f26017aa2d15d49d7af Mon Sep 17 00:00:00 2001
From: Martin Schwidefsky
Date: Fri, 22 Sep 2017 14:17:41 +0200
Subject: s390/topology: add detection of dedicated vs shared CPUs

The topology information returned by STSI 15.x.x contains a flag that
indicates whether the CPUs of a topology-list are dedicated or shared.
Make this information available if the machine provides topology
information.

Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/kernel/smp.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

(limited to 'arch/s390/kernel/smp.c')

diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c
index 1cee6753d47a..b9fc9b00d845 100644
--- a/arch/s390/kernel/smp.c
+++ b/arch/s390/kernel/smp.c
@@ -800,6 +800,8 @@ void __init smp_detect_cpus(void)
  */
 static void smp_start_secondary(void *cpuvoid)
 {
+	int cpu = smp_processor_id();
+
 	S390_lowcore.last_update_clock = get_tod_clock();
 	S390_lowcore.restart_stack = (unsigned long) restart_stack;
 	S390_lowcore.restart_fn = (unsigned long) do_restart;
@@ -813,8 +815,12 @@ static void smp_start_secondary(void *cpuvoid)
 	init_cpu_timer();
 	vtime_init();
 	pfault_init();
-	notify_cpu_starting(smp_processor_id());
-	set_cpu_online(smp_processor_id(), true);
+	notify_cpu_starting(cpu);
+	if (topology_cpu_dedicated(cpu))
+		set_cpu_flag(CIF_DEDICATED_CPU);
+	else
+		clear_cpu_flag(CIF_DEDICATED_CPU);
+	set_cpu_online(cpu, true);
 	inc_irq_stat(CPU_RST);
 	local_irq_enable();
 	cpu_startup_entry(CPUHP_AP_ONLINE_IDLE);
-- 
cgit v1.2.3


From b96f7d881ad94203e997cd2aa7112d4a06d121ef Mon Sep 17 00:00:00 2001
From: Martin Schwidefsky
Date: Fri, 24 Mar 2017 17:25:02 +0100
Subject: s390/spinlock: introduce spinlock wait queueing

The queued spinlock code for s390 follows the principles of the common
code qspinlock implementation but with a few notable differences.

The format of the spinlock_t locking word differs, s390 needs to store
the logical CPU number of the lock holder in the spinlock_t to be able
to use the diagnose 9c directed yield hypervisor call.

The inline code sequences for spin_lock and spin_unlock are nice and
short. The inline portion of a spin_lock now typically looks like this:

	lhi	%r0,0			# 0 indicates an empty lock
	l	%r1,0x3a0		# CPU number + 1 from lowcore
	cs	%r0,%r1,<some_lock>	# lock operation
	jnz	call_wait		# on failure call wait function
locked:
	...
call_wait:
	la	%r2,<some_lock>
	brasl	%r14,arch_spin_lock_wait
	j	locked

A spin_unlock is as simple as before:

	lhi	%r0,0
	sth	%r0,2(%r2)		# unlock operation

After a CPU has queued itself it must not enable interrupts again. The
arch_spin_lock_flags() variant therefore calls arch_spin_lock_wait() and
the arch_spin_lock_wait_flags wait function is removed.

To improve performance the code implements opportunistic lock stealing.
If the wait function finds a spinlock_t that indicates that the lock is
free but there are queued waiters, the CPU may steal the lock up to three
times without queueing itself. Lock stealing updates the steal counter
in the lock word to prevent more than 3 steals. The counter is reset at
the time the CPU next in the queue successfully takes the lock.

While the queued spinlocks improve performance in a system with dedicated
CPUs, in a virtualized environment with continuously overcommitted CPUs
the queued spinlocks can have a negative effect on performance. This
is due to the fact that a queued CPU that is preempted by the hypervisor
will block the queue at some point even without holding the lock. With
the classic spinlock it does not matter if a CPU that waits for the lock
is preempted. Therefore use the queued spinlock code only if the system
runs with dedicated CPUs and fall back to classic spinlocks when running
with shared CPUs.

Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/include/asm/lowcore.h  |   5 +-
 arch/s390/include/asm/spinlock.h |  18 ++--
 arch/s390/kernel/setup.c         |   2 +
 arch/s390/kernel/smp.c           |   4 +
 arch/s390/lib/spinlock.c         | 194 +++++++++++++++++++++++++++++++++------
 5 files changed, 180 insertions(+), 43 deletions(-)

(limited to 'arch/s390/kernel/smp.c')

diff --git a/arch/s390/include/asm/lowcore.h b/arch/s390/include/asm/lowcore.h
index a6870ea6ea8b..62943af36ac6 100644
--- a/arch/s390/include/asm/lowcore.h
+++ b/arch/s390/include/asm/lowcore.h
@@ -133,8 +133,9 @@ struct lowcore {
 	__u8	pad_0x03b4[0x03b8-0x03b4];	/* 0x03b4 */
 	__u64	gmap;				/* 0x03b8 */
 	__u32	spinlock_lockval;		/* 0x03c0 */
-	__u32	fpu_flags;			/* 0x03c4 */
-	__u8	pad_0x03c8[0x0400-0x03c8];	/* 0x03c8 */
+	__u32	spinlock_index;			/* 0x03c4 */
+	__u32	fpu_flags;			/* 0x03c8 */
+	__u8	pad_0x03cc[0x0400-0x03cc];	/* 0x03cc */
 
 	/* Per cpu primary space access list */
 	__u32	paste[16];			/* 0x0400 */
diff --git a/arch/s390/include/asm/spinlock.h b/arch/s390/include/asm/spinlock.h
index 6727cc30d59b..2da4a6d13f54 100644
--- a/arch/s390/include/asm/spinlock.h
+++ b/arch/s390/include/asm/spinlock.h
@@ -36,15 +36,11 @@ bool arch_vcpu_is_preempted(int cpu);
  */
 
 void arch_lock_relax(int cpu);
+void arch_spin_relax(arch_spinlock_t *lock);
 
 void arch_spin_lock_wait(arch_spinlock_t *);
 int arch_spin_trylock_retry(arch_spinlock_t *);
-void arch_spin_lock_wait_flags(arch_spinlock_t *, unsigned long flags);
-
-static inline void arch_spin_relax(arch_spinlock_t *lock)
-{
-	arch_lock_relax(lock->lock);
-}
+void arch_spin_lock_setup(int cpu);
 
 static inline u32 arch_spin_lockval(int cpu)
 {
@@ -64,8 +60,7 @@ static inline int arch_spin_is_locked(arch_spinlock_t *lp)
 static inline int arch_spin_trylock_once(arch_spinlock_t *lp)
 {
 	barrier();
-	return likely(arch_spin_value_unlocked(*lp) &&
-		      __atomic_cmpxchg_bool(&lp->lock, 0, SPINLOCK_LOCKVAL));
+	return likely(__atomic_cmpxchg_bool(&lp->lock, 0, SPINLOCK_LOCKVAL));
 }
 
 static inline void arch_spin_lock(arch_spinlock_t *lp)
@@ -78,7 +73,7 @@ static inline void arch_spin_lock_flags(arch_spinlock_t *lp,
 					unsigned long flags)
 {
 	if (!arch_spin_trylock_once(lp))
-		arch_spin_lock_wait_flags(lp, flags);
+		arch_spin_lock_wait(lp);
 }
 
 static inline int arch_spin_trylock(arch_spinlock_t *lp)
@@ -95,8 +90,9 @@ static inline void arch_spin_unlock(arch_spinlock_t *lp)
 #ifdef CONFIG_HAVE_MARCH_ZEC12_FEATURES
 		"	.long	0xb2fa0070\n"	/* NIAI 7 */
 #endif
-		"	st	%1,%0\n"
-		: "=Q" (lp->lock) : "d" (0) : "cc", "memory");
+		"	sth	%1,%0\n"
+		: "=Q" (((unsigned short *) &lp->lock)[1])
+		: "d" (0) : "cc", "memory");
 }
 
 /*
diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c
index 164a1e16b53e..b2c9af9b88d5 100644
--- a/arch/s390/kernel/setup.c
+++ b/arch/s390/kernel/setup.c
@@ -380,6 +380,8 @@ static void __init setup_lowcore(void)
 
 #ifdef CONFIG_SMP
 	lc->spinlock_lockval = arch_spin_lockval(0);
+	lc->spinlock_index = 0;
+	arch_spin_lock_setup(0);
 #endif
 
 	set_prefix((u32)(unsigned long) lc);
diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c
index b9fc9b00d845..cc04b74fbb84 100644
--- a/arch/s390/kernel/smp.c
+++ b/arch/s390/kernel/smp.c
@@ -226,6 +226,7 @@ static int pcpu_alloc_lowcore(struct pcpu *pcpu, int cpu)
 	lc->mcesad = mcesa_origin | mcesa_bits;
 	lc->cpu_nr = cpu;
 	lc->spinlock_lockval = arch_spin_lockval(cpu);
+	lc->spinlock_index = 0;
 	if (vdso_alloc_per_cpu(lc))
 		goto out;
 	lowcore_ptr[cpu] = lc;
@@ -273,6 +274,7 @@ static void pcpu_prepare_secondary(struct pcpu *pcpu, int cpu)
 	cpumask_set_cpu(cpu, mm_cpumask(&init_mm));
 	lc->cpu_nr = cpu;
 	lc->spinlock_lockval = arch_spin_lockval(cpu);
+	lc->spinlock_index = 0;
 	lc->percpu_offset = __per_cpu_offset[cpu];
 	lc->kernel_asce = S390_lowcore.kernel_asce;
 	lc->machine_flags = S390_lowcore.machine_flags;
@@ -281,6 +283,7 @@ static void pcpu_prepare_secondary(struct pcpu *pcpu, int cpu)
 	save_access_regs((unsigned int *) lc->access_regs_save_area);
 	memcpy(lc->stfle_fac_list, S390_lowcore.stfle_fac_list,
 	       MAX_FACILITY_BIT/8);
+	arch_spin_lock_setup(cpu);
 }
 
 static void pcpu_attach_task(struct pcpu *pcpu, struct task_struct *tsk)
@@ -967,6 +970,7 @@ void __init smp_setup_processor_id(void)
 	pcpu_devices[0].address = stap();
 	S390_lowcore.cpu_nr = 0;
 	S390_lowcore.spinlock_lockval = arch_spin_lockval(0);
+	S390_lowcore.spinlock_index = 0;
 }
 
 /*
diff --git a/arch/s390/lib/spinlock.c b/arch/s390/lib/spinlock.c
index ee73bcca7e6f..6747134227cd 100644
--- a/arch/s390/lib/spinlock.c
+++ b/arch/s390/lib/spinlock.c
@@ -8,8 +8,10 @@
 #include <linux/types.h>
 #include <linux/export.h>
 #include <linux/spinlock.h>
+#include <linux/jiffies.h>
 #include <linux/init.h>
 #include <linux/smp.h>
+#include <linux/percpu.h>
 #include <asm/io.h>
 
 int spin_retry = -1;
@@ -32,6 +34,40 @@ static int __init spin_retry_setup(char *str)
 }
 __setup("spin_retry=", spin_retry_setup);
 
+struct spin_wait {
+	struct spin_wait *next, *prev;
+	int node_id;
+} __aligned(32);
+
+static DEFINE_PER_CPU_ALIGNED(struct spin_wait, spin_wait[4]);
+
+#define _Q_LOCK_CPU_OFFSET	0
+#define _Q_LOCK_STEAL_OFFSET	16
+#define _Q_TAIL_IDX_OFFSET	18
+#define _Q_TAIL_CPU_OFFSET	20
+
+#define _Q_LOCK_CPU_MASK	0x0000ffff
+#define _Q_LOCK_STEAL_ADD	0x00010000
+#define _Q_LOCK_STEAL_MASK	0x00030000
+#define _Q_TAIL_IDX_MASK	0x000c0000
+#define _Q_TAIL_CPU_MASK	0xfff00000
+
+#define _Q_LOCK_MASK		(_Q_LOCK_CPU_MASK | _Q_LOCK_STEAL_MASK)
+#define _Q_TAIL_MASK		(_Q_TAIL_IDX_MASK | _Q_TAIL_CPU_MASK)
+
+void arch_spin_lock_setup(int cpu)
+{
+	struct spin_wait *node;
+	int ix;
+
+	node = per_cpu_ptr(&spin_wait[0], cpu);
+	for (ix = 0; ix < 4; ix++, node++) {
+		memset(node, 0, sizeof(*node));
+		node->node_id = ((cpu + 1) << _Q_TAIL_CPU_OFFSET) +
+			(ix << _Q_TAIL_IDX_OFFSET);
+	}
+}
+
 static inline int arch_load_niai4(int *lock)
 {
 	int owner;
@@ -60,75 +96,160 @@ static inline int arch_cmpxchg_niai8(int *lock, int old, int new)
 	return expected == old;
 }
 
-void arch_spin_lock_wait(arch_spinlock_t *lp)
+static inline struct spin_wait *arch_spin_decode_tail(int lock)
 {
-	int cpu = SPINLOCK_LOCKVAL;
-	int owner, count;
+	int ix, cpu;
+
+	ix = (lock & _Q_TAIL_IDX_MASK) >> _Q_TAIL_IDX_OFFSET;
+	cpu = (lock & _Q_TAIL_CPU_MASK) >> _Q_TAIL_CPU_OFFSET;
+	return per_cpu_ptr(&spin_wait[ix], cpu - 1);
+}
+
+static inline int arch_spin_yield_target(int lock, struct spin_wait *node)
+{
+	if (lock & _Q_LOCK_CPU_MASK)
+		return lock & _Q_LOCK_CPU_MASK;
+	if (node == NULL || node->prev == NULL)
+		return 0;	/* 0 -> no target cpu */
+	while (node->prev)
+		node = node->prev;
+	return node->node_id >> _Q_TAIL_CPU_OFFSET;
+}
+
+static inline void arch_spin_lock_queued(arch_spinlock_t *lp)
+{
+	struct spin_wait *node, *next;
+	int lockval, ix, node_id, tail_id, old, new, owner, count;
+
+	ix = S390_lowcore.spinlock_index++;
+	barrier();
+	lockval = SPINLOCK_LOCKVAL;	/* cpu + 1 */
+	node = this_cpu_ptr(&spin_wait[ix]);
+	node->prev = node->next = NULL;
+	node_id = node->node_id;
+
+	/* Enqueue the node for this CPU in the spinlock wait queue */
+	while (1) {
+		old = READ_ONCE(lp->lock);
+		if ((old & _Q_LOCK_CPU_MASK) == 0 &&
+		    (old & _Q_LOCK_STEAL_MASK) != _Q_LOCK_STEAL_MASK) {
+			/*
+			 * The lock is free but there may be waiters.
+			 * With no waiters simply take the lock, if there
+			 * are waiters try to steal the lock. The lock may
+			 * be stolen three times before the next queued
+			 * waiter will get the lock.
+			 */
+			new = (old ? (old + _Q_LOCK_STEAL_ADD) : 0) | lockval;
+			if (__atomic_cmpxchg_bool(&lp->lock, old, new))
+				/* Got the lock */
+				goto out;
+			/* lock passing in progress */
+			continue;
+		}
+		/* Make the node of this CPU the new tail. */
+		new = node_id | (old & _Q_LOCK_MASK);
+		if (__atomic_cmpxchg_bool(&lp->lock, old, new))
+			break;
+	}
+	/* Set the 'next' pointer of the tail node in the queue */
+	tail_id = old & _Q_TAIL_MASK;
+	if (tail_id != 0) {
+		node->prev = arch_spin_decode_tail(tail_id);
+		WRITE_ONCE(node->prev->next, node);
+	}
 
 	/* Pass the virtual CPU to the lock holder if it is not running */
-	owner = arch_load_niai4(&lp->lock);
+	owner = arch_spin_yield_target(old, node);
 	if (owner && arch_vcpu_is_preempted(owner - 1))
 		smp_yield_cpu(owner - 1);
 
+	/* Spin on the CPU local node->prev pointer */
+	if (tail_id != 0) {
+		count = spin_retry;
+		while (READ_ONCE(node->prev) != NULL) {
+			if (count-- >= 0)
+				continue;
+			count = spin_retry;
+			/* Query running state of lock holder again. */
+			owner = arch_spin_yield_target(old, node);
+			if (owner && arch_vcpu_is_preempted(owner - 1))
+				smp_yield_cpu(owner - 1);
+		}
+	}
+
+	/* Spin on the lock value in the spinlock_t */
 	count = spin_retry;
 	while (1) {
-		owner = arch_load_niai4(&lp->lock);
-		/* Try to get the lock if it is free. */
+		old = READ_ONCE(lp->lock);
+		owner = old & _Q_LOCK_CPU_MASK;
 		if (!owner) {
-			if (arch_cmpxchg_niai8(&lp->lock, 0, cpu))
-				return;
+			tail_id = old & _Q_TAIL_MASK;
+			new = ((tail_id != node_id) ? tail_id : 0) | lockval;
+			if (__atomic_cmpxchg_bool(&lp->lock, old, new))
+				/* Got the lock */
+				break;
 			continue;
 		}
 		if (count-- >= 0)
 			continue;
 		count = spin_retry;
-		/*
-		 * For multiple layers of hypervisors, e.g. z/VM + LPAR
-		 * yield the CPU unconditionally. For LPAR rely on the
-		 * sense running status.
-		 */
 		if (!MACHINE_IS_LPAR || arch_vcpu_is_preempted(owner - 1))
 			smp_yield_cpu(owner - 1);
 	}
+
+	/* Pass lock_spin job to next CPU in the queue */
+	if (node_id && tail_id != node_id) {
+		/* Wait until the next CPU has set up the 'next' pointer */
+		while ((next = READ_ONCE(node->next)) == NULL)
+			;
+		next->prev = NULL;
+	}
+
+ out:
+	S390_lowcore.spinlock_index--;
 }
-EXPORT_SYMBOL(arch_spin_lock_wait);
 
-void arch_spin_lock_wait_flags(arch_spinlock_t *lp, unsigned long flags)
+static inline void arch_spin_lock_classic(arch_spinlock_t *lp)
 {
-	int cpu = SPINLOCK_LOCKVAL;
-	int owner, count;
+	int lockval, old, new, owner, count;
 
-	local_irq_restore(flags);
+	lockval = SPINLOCK_LOCKVAL;	/* cpu + 1 */
 
 	/* Pass the virtual CPU to the lock holder if it is not running */
-	owner = arch_load_niai4(&lp->lock);
+	owner = arch_spin_yield_target(ACCESS_ONCE(lp->lock), NULL);
 	if (owner && arch_vcpu_is_preempted(owner - 1))
 		smp_yield_cpu(owner - 1);
 
 	count = spin_retry;
 	while (1) {
-		owner = arch_load_niai4(&lp->lock);
+		old = arch_load_niai4(&lp->lock);
+		owner = old & _Q_LOCK_CPU_MASK;
 		/* Try to get the lock if it is free. */
 		if (!owner) {
-			local_irq_disable();
-			if (arch_cmpxchg_niai8(&lp->lock, 0, cpu))
-				return;
-			local_irq_restore(flags);
+			new = (old & _Q_TAIL_MASK) | lockval;
+			if (arch_cmpxchg_niai8(&lp->lock, old, new))
+				/* Got the lock */
+			       return;
 			continue;
 		}
 		if (count-- >= 0)
 			continue;
 		count = spin_retry;
-		/*
-		 * For multiple layers of hypervisors, e.g. z/VM + LPAR
-		 * yield the CPU unconditionally. For LPAR rely on the
-		 * sense running status.
-		 */
 		if (!MACHINE_IS_LPAR || arch_vcpu_is_preempted(owner - 1))
 			smp_yield_cpu(owner - 1);
 	}
 }
-EXPORT_SYMBOL(arch_spin_lock_wait_flags);
+
+void arch_spin_lock_wait(arch_spinlock_t *lp)
+{
+	/* Queue on dedicated CPUs, else use classic spinlocks + niai */
+	if (test_cpu_flag(CIF_DEDICATED_CPU))
+		arch_spin_lock_queued(lp);
+	else
+		arch_spin_lock_classic(lp);
+}
+EXPORT_SYMBOL(arch_spin_lock_wait);
 
 int arch_spin_trylock_retry(arch_spinlock_t *lp)
 {
@@ -270,3 +391,16 @@ void arch_lock_relax(int cpu)
 	smp_yield_cpu(cpu - 1);
 }
 EXPORT_SYMBOL(arch_lock_relax);
+
+void arch_spin_relax(arch_spinlock_t *lp)
+{
+	int cpu;
+
+	cpu = READ_ONCE(lp->lock) & _Q_LOCK_CPU_MASK;
+	if (!cpu)
+		return;
+	if (MACHINE_IS_LPAR && !arch_vcpu_is_preempted(cpu - 1))
+		return;
+	smp_yield_cpu(cpu - 1);
+}
+EXPORT_SYMBOL(arch_spin_relax);
-- 
cgit v1.2.3


From 00a8f886dbdaeea1d93543d5311ddf3a2680bf2b Mon Sep 17 00:00:00 2001
From: Martin Schwidefsky
Date: Fri, 15 Sep 2017 16:24:31 +0200
Subject: s390/nmi: use smp_emergency_stop instead of smp_send_stop

The smp_send_stop() function can be called from s390_handle_damage
while DAT is off. This happens if a machine check indicates that
kernel gprs or control registers can not be restored. The function
smp_send_stop reenables DAT via __load_psw_mask. That should work
for the case of lost kernel gprs and the system will do the expected
stop of all CPUs. But if control registers are lost, in particular
CR13 with the home space ASCE, interesting secondary crashes may
occur.

Make smp_emergency_stop callable from nmi.c and remove the cpumask
argument. Replace the smp_send_stop call with smp_emergency_stop in
the s390_handle_damage function.

In addition add notrace and NOKPROBE_SYMBOL annotations for all
functions required for the emergency shutdown.

Reviewed-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/include/asm/smp.h |  5 +++++
 arch/s390/kernel/nmi.c      |  9 +++++++--
 arch/s390/kernel/smp.c      | 30 +++++++++++++++++-------------
 3 files changed, 29 insertions(+), 15 deletions(-)

(limited to 'arch/s390/kernel/smp.c')

diff --git a/arch/s390/include/asm/smp.h b/arch/s390/include/asm/smp.h
index 3deb134587b7..3470274a985c 100644
--- a/arch/s390/include/asm/smp.h
+++ b/arch/s390/include/asm/smp.h
@@ -27,6 +27,7 @@ extern void arch_send_call_function_ipi_mask(const struct cpumask *mask);
 
 extern void smp_call_online_cpu(void (*func)(void *), void *);
 extern void smp_call_ipl_cpu(void (*func)(void *), void *);
+extern void smp_emergency_stop(void);
 
 extern int smp_find_processor_id(u16 address);
 extern int smp_store_status(int cpu);
@@ -52,6 +53,10 @@ static inline void smp_call_online_cpu(void (*func)(void *), void *data)
 	func(data);
 }
 
+static inline void smp_emergency_stop(void)
+{
+}
+
 static inline int smp_find_processor_id(u16 address) { return 0; }
 static inline int smp_store_status(int cpu) { return 0; }
 static inline int smp_vcpu_scheduled(int cpu) { return 1; }
diff --git a/arch/s390/kernel/nmi.c b/arch/s390/kernel/nmi.c
index 31d03a84126c..15e28eefe7e9 100644
--- a/arch/s390/kernel/nmi.c
+++ b/arch/s390/kernel/nmi.c
@@ -12,6 +12,7 @@
 #include <linux/init.h>
 #include <linux/errno.h>
 #include <linux/hardirq.h>
+#include <linux/kprobes.h>
 #include <linux/time.h>
 #include <linux/module.h>
 #include <linux/sched/signal.h>
@@ -38,12 +39,13 @@ struct mcck_struct {
 
 static DEFINE_PER_CPU(struct mcck_struct, cpu_mcck);
 
-static void s390_handle_damage(void)
+static notrace void s390_handle_damage(void)
 {
-	smp_send_stop();
+	smp_emergency_stop();
 	disabled_wait((unsigned long) __builtin_return_address(0));
 	while (1);
 }
+NOKPROBE_SYMBOL(s390_handle_damage);
 
 /*
  * Main machine check handler function. Will be called with interrupts enabled
@@ -275,6 +277,7 @@ static int notrace s390_validate_registers(union mci mci, int umode)
 
 	return kill_task;
 }
+NOKPROBE_SYMBOL(s390_validate_registers);
 
 /*
  * Backup the guest's machine check info to its description block
@@ -300,6 +303,7 @@ static void notrace s390_backup_mcck_info(struct pt_regs *regs)
 	mcck_backup->failing_storage_address
 			= S390_lowcore.failing_storage_address;
 }
+NOKPROBE_SYMBOL(s390_backup_mcck_info);
 
 #define MAX_IPD_COUNT	29
 #define MAX_IPD_TIME	(5 * 60 * USEC_PER_SEC) /* 5 minutes */
@@ -443,6 +447,7 @@ void notrace s390_do_machine_check(struct pt_regs *regs)
 	clear_cpu_flag(CIF_MCCK_GUEST);
 	nmi_exit();
 }
+NOKPROBE_SYMBOL(s390_do_machine_check);
 
 static int __init machine_check_init(void)
 {
diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c
index cc04b74fbb84..2dba3e88a972 100644
--- a/arch/s390/kernel/smp.c
+++ b/arch/s390/kernel/smp.c
@@ -36,6 +36,7 @@
 #include <linux/sched/task_stack.h>
 #include <linux/crash_dump.h>
 #include <linux/memblock.h>
+#include <linux/kprobes.h>
 #include <asm/asm-offsets.h>
 #include <asm/diag.h>
 #include <asm/switch_to.h>
@@ -422,13 +423,17 @@ void smp_yield_cpu(int cpu)
  * Send cpus emergency shutdown signal. This gives the cpus the
  * opportunity to complete outstanding interrupts.
  */
-static void smp_emergency_stop(cpumask_t *cpumask)
+void notrace smp_emergency_stop(void)
 {
+	cpumask_t cpumask;
 	u64 end;
 	int cpu;
 
+	cpumask_copy(&cpumask, cpu_online_mask);
+	cpumask_clear_cpu(smp_processor_id(), &cpumask);
+
 	end = get_tod_clock() + (1000000UL << 12);
-	for_each_cpu(cpu, cpumask) {
+	for_each_cpu(cpu, &cpumask) {
 		struct pcpu *pcpu = pcpu_devices + cpu;
 		set_bit(ec_stop_cpu, &pcpu->ec_mask);
 		while (__pcpu_sigp(pcpu->address, SIGP_EMERGENCY_SIGNAL,
@@ -437,21 +442,21 @@ static void smp_emergency_stop(cpumask_t *cpumask)
 			cpu_relax();
 	}
 	while (get_tod_clock() < end) {
-		for_each_cpu(cpu, cpumask)
+		for_each_cpu(cpu, &cpumask)
 			if (pcpu_stopped(pcpu_devices + cpu))
-				cpumask_clear_cpu(cpu, cpumask);
-		if (cpumask_empty(cpumask))
+				cpumask_clear_cpu(cpu, &cpumask);
+		if (cpumask_empty(&cpumask))
 			break;
 		cpu_relax();
 	}
 }
+NOKPROBE_SYMBOL(smp_emergency_stop);
 
 /*
  * Stop all cpus but the current one.
  */
 void smp_send_stop(void)
 {
-	cpumask_t cpumask;
 	int cpu;
 
 	/* Disable all interrupts/machine checks */
@@ -459,17 +464,16 @@ void smp_send_stop(void)
 	trace_hardirqs_off();
 
 	debug_set_critical();
-	cpumask_copy(&cpumask, cpu_online_mask);
-	cpumask_clear_cpu(smp_processor_id(), &cpumask);
 
 	if (oops_in_progress)
-		smp_emergency_stop(&cpumask);
+		smp_emergency_stop();
 
 	/* stop all processors */
-	for_each_cpu(cpu, &cpumask) {
-		struct pcpu *pcpu = pcpu_devices + cpu;
-		pcpu_sigp_retry(pcpu, SIGP_STOP, 0);
-		while (!pcpu_stopped(pcpu))
+	for_each_online_cpu(cpu) {
+		if (cpu == smp_processor_id())
+			continue;
+		pcpu_sigp_retry(pcpu_devices + cpu, SIGP_STOP, 0);
+		while (!pcpu_stopped(pcpu_devices + cpu))
 			cpu_relax();
 	}
 }
-- 
cgit v1.2.3


From 6c81511ca1f52a0bbe921b2b98e34319a4ca59ed Mon Sep 17 00:00:00 2001
From: Martin Schwidefsky
Date: Thu, 12 Oct 2017 13:24:47 +0200
Subject: s390/nmi: allocation of the extended save area

The machine check extended save area is needed to store the vector
registers and the guarded storage control block when a CPU is
interrupted by a machine check.

Move the slab cache allocation of the full save area to nmi.c.
For early boot, use a static __initdata block.

Reviewed-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
---
 arch/s390/include/asm/nmi.h | 10 ++++--
 arch/s390/kernel/nmi.c      | 82 +++++++++++++++++++++++++++++++++++++++++++++
 arch/s390/kernel/setup.c    | 11 ++----
 arch/s390/kernel/smp.c      | 43 ++++--------------------
 4 files changed, 99 insertions(+), 47 deletions(-)

(limited to 'arch/s390/kernel/smp.c')

diff --git a/arch/s390/include/asm/nmi.h b/arch/s390/include/asm/nmi.h
index 77a7d9445e33..ed41c424448d 100644
--- a/arch/s390/include/asm/nmi.h
+++ b/arch/s390/include/asm/nmi.h
@@ -80,6 +80,8 @@ union mci {
 
 #define MCESA_ORIGIN_MASK	(~0x3ffUL)
 #define MCESA_LC_MASK		(0xfUL)
+#define MCESA_MIN_SIZE		(1024)
+#define MCESA_MAX_SIZE		(2048)
 
 struct mcesa {
 	u8 vector_save_area[1024];
@@ -88,8 +90,12 @@ struct mcesa {
 
 struct pt_regs;
 
-extern void s390_handle_mcck(void);
-extern void s390_do_machine_check(struct pt_regs *regs);
+void nmi_alloc_boot_cpu(struct lowcore *lc);
+int nmi_alloc_per_cpu(struct lowcore *lc);
+void nmi_free_per_cpu(struct lowcore *lc);
+
+void s390_handle_mcck(void);
+void s390_do_machine_check(struct pt_regs *regs);
 
 #endif /* __ASSEMBLY__ */
 #endif /* _ASM_S390_NMI_H */
diff --git a/arch/s390/kernel/nmi.c b/arch/s390/kernel/nmi.c
index eb3e702cee30..7f6779695a43 100644
--- a/arch/s390/kernel/nmi.c
+++ b/arch/s390/kernel/nmi.c
@@ -12,7 +12,9 @@
 #include <linux/init.h>
 #include <linux/errno.h>
 #include <linux/hardirq.h>
+#include <linux/log2.h>
 #include <linux/kprobes.h>
+#include <linux/slab.h>
 #include <linux/time.h>
 #include <linux/module.h>
 #include <linux/sched/signal.h>
@@ -38,6 +40,86 @@ struct mcck_struct {
 };
 
 static DEFINE_PER_CPU(struct mcck_struct, cpu_mcck);
+static struct kmem_cache *mcesa_cache;
+static unsigned long mcesa_origin_lc;
+
+static inline int nmi_needs_mcesa(void)
+{
+	return MACHINE_HAS_VX || MACHINE_HAS_GS;
+}
+
+static inline unsigned long nmi_get_mcesa_size(void)
+{
+	if (MACHINE_HAS_GS)
+		return MCESA_MAX_SIZE;
+	return MCESA_MIN_SIZE;
+}
+
+/*
+ * The initial machine check extended save area for the boot CPU.
+ * It will be replaced by nmi_init() with an allocated structure.
+ * The structure is required for machine checks happening early in
+ * the boot process.
+ */
+static struct mcesa boot_mcesa __initdata __aligned(MCESA_MAX_SIZE);
+
+void __init nmi_alloc_boot_cpu(struct lowcore *lc)
+{
+	if (!nmi_needs_mcesa())
+		return;
+	lc->mcesad = (unsigned long) &boot_mcesa;
+	if (MACHINE_HAS_GS)
+		lc->mcesad |= ilog2(MCESA_MAX_SIZE);
+}
+
+static int __init nmi_init(void)
+{
+	unsigned long origin, cr0, size;
+
+	if (!nmi_needs_mcesa())
+		return 0;
+	size = nmi_get_mcesa_size();
+	if (size > MCESA_MIN_SIZE)
+		mcesa_origin_lc = ilog2(size);
+	/* create slab cache for the machine-check-extended-save-areas */
+	mcesa_cache = kmem_cache_create("nmi_save_areas", size, size, 0, NULL);
+	if (!mcesa_cache)
+		panic("Couldn't create nmi save area cache");
+	origin = (unsigned long) kmem_cache_alloc(mcesa_cache, GFP_KERNEL);
+	if (!origin)
+		panic("Couldn't allocate nmi save area");
+	/* The pointer is stored with mcesa_origin_lc ORed in */
+	kmemleak_not_leak((void *) origin);
+	__ctl_store(cr0, 0, 0);
+	__ctl_clear_bit(0, 28); /* disable lowcore protection */
+	/* Replace boot_mcesa on the boot CPU */
+	S390_lowcore.mcesad = origin | mcesa_origin_lc;
+	__ctl_load(cr0, 0, 0);
+	return 0;
+}
+early_initcall(nmi_init);
+
+int nmi_alloc_per_cpu(struct lowcore *lc)
+{
+	unsigned long origin;
+
+	if (!nmi_needs_mcesa())
+		return 0;
+	origin = (unsigned long) kmem_cache_alloc(mcesa_cache, GFP_KERNEL);
+	if (!origin)
+		return -ENOMEM;
+	/* The pointer is stored with mcesa_origin_lc ORed in */
+	kmemleak_not_leak((void *) origin);
+	lc->mcesad = origin | mcesa_origin_lc;
+	return 0;
+}
+
+void nmi_free_per_cpu(struct lowcore *lc)
+{
+	if (!nmi_needs_mcesa())
+		return;
+	kmem_cache_free(mcesa_cache, (void *)(lc->mcesad & MCESA_ORIGIN_MASK));
+}
 
 static notrace void s390_handle_damage(void)
 {
diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c
index bf139f9e120e..b0943ef8cc31 100644
--- a/arch/s390/kernel/setup.c
+++ b/arch/s390/kernel/setup.c
@@ -55,6 +55,7 @@
 #include <asm/mmu_context.h>
 #include <asm/cpcmd.h>
 #include <asm/lowcore.h>
+#include <asm/nmi.h>
 #include <asm/irq.h>
 #include <asm/page.h>
 #include <asm/ptrace.h>
@@ -340,15 +341,7 @@ static void __init setup_lowcore(void)
 	lc->stfl_fac_list = S390_lowcore.stfl_fac_list;
 	memcpy(lc->stfle_fac_list, S390_lowcore.stfle_fac_list,
 	       MAX_FACILITY_BIT/8);
-	if (MACHINE_HAS_VX || MACHINE_HAS_GS) {
-		unsigned long bits, size;
-
-		bits = MACHINE_HAS_GS ? 11 : 10;
-		size = 1UL << bits;
-		lc->mcesad = (__u64) memblock_virt_alloc(size, size);
-		if (MACHINE_HAS_GS)
-			lc->mcesad |= bits;
-	}
+	nmi_alloc_boot_cpu(lc);
 	vdso_alloc_boot_cpu(lc);
 	lc->sync_enter_timer = S390_lowcore.sync_enter_timer;
 	lc->async_enter_timer = S390_lowcore.async_enter_timer;
diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c
index 2dba3e88a972..6d17ff46b749 100644
--- a/arch/s390/kernel/smp.c
+++ b/arch/s390/kernel/smp.c
@@ -81,8 +81,6 @@ struct pcpu {
 static u8 boot_core_type;
 static struct pcpu pcpu_devices[NR_CPUS];
 
-static struct kmem_cache *pcpu_mcesa_cache;
-
 unsigned int smp_cpu_mt_shift;
 EXPORT_SYMBOL(smp_cpu_mt_shift);
 
@@ -193,10 +191,8 @@ static void pcpu_ec_call(struct pcpu *pcpu, int ec_bit)
 static int pcpu_alloc_lowcore(struct pcpu *pcpu, int cpu)
 {
 	unsigned long async_stack, panic_stack;
-	unsigned long mcesa_origin, mcesa_bits;
 	struct lowcore *lc;
 
-	mcesa_origin = mcesa_bits = 0;
 	if (pcpu != &pcpu_devices[0]) {
 		pcpu->lowcore =	(struct lowcore *)
 			__get_free_pages(GFP_KERNEL | GFP_DMA, LC_ORDER);
@@ -204,40 +200,30 @@ static int pcpu_alloc_lowcore(struct pcpu *pcpu, int cpu)
 		panic_stack = __get_free_page(GFP_KERNEL);
 		if (!pcpu->lowcore || !panic_stack || !async_stack)
 			goto out;
-		if (MACHINE_HAS_VX || MACHINE_HAS_GS) {
-			mcesa_origin = (unsigned long)
-				kmem_cache_alloc(pcpu_mcesa_cache, GFP_KERNEL);
-			if (!mcesa_origin)
-				goto out;
-			/* The pointer is stored with mcesa_bits ORed in */
-			kmemleak_not_leak((void *) mcesa_origin);
-			mcesa_bits = MACHINE_HAS_GS ? 11 : 0;
-		}
 	} else {
 		async_stack = pcpu->lowcore->async_stack - ASYNC_FRAME_OFFSET;
 		panic_stack = pcpu->lowcore->panic_stack - PANIC_FRAME_OFFSET;
-		mcesa_origin = pcpu->lowcore->mcesad & MCESA_ORIGIN_MASK;
-		mcesa_bits = pcpu->lowcore->mcesad & MCESA_LC_MASK;
 	}
 	lc = pcpu->lowcore;
 	memcpy(lc, &S390_lowcore, 512);
 	memset((char *) lc + 512, 0, sizeof(*lc) - 512);
 	lc->async_stack = async_stack + ASYNC_FRAME_OFFSET;
 	lc->panic_stack = panic_stack + PANIC_FRAME_OFFSET;
-	lc->mcesad = mcesa_origin | mcesa_bits;
 	lc->cpu_nr = cpu;
 	lc->spinlock_lockval = arch_spin_lockval(cpu);
 	lc->spinlock_index = 0;
-	if (vdso_alloc_per_cpu(lc))
+	if (nmi_alloc_per_cpu(lc))
 		goto out;
+	if (vdso_alloc_per_cpu(lc))
+		goto out_mcesa;
 	lowcore_ptr[cpu] = lc;
 	pcpu_sigp_retry(pcpu, SIGP_SET_PREFIX, (u32)(unsigned long) lc);
 	return 0;
+
+out_mcesa:
+	nmi_free_per_cpu(lc);
 out:
 	if (pcpu != &pcpu_devices[0]) {
-		if (mcesa_origin)
-			kmem_cache_free(pcpu_mcesa_cache,
-					(void *) mcesa_origin);
 		free_page(panic_stack);
 		free_pages(async_stack, ASYNC_ORDER);
 		free_pages((unsigned long) pcpu->lowcore, LC_ORDER);
@@ -249,17 +235,12 @@ out:
 
 static void pcpu_free_lowcore(struct pcpu *pcpu)
 {
-	unsigned long mcesa_origin;
-
 	pcpu_sigp_retry(pcpu, SIGP_SET_PREFIX, 0);
 	lowcore_ptr[pcpu - pcpu_devices] = NULL;
 	vdso_free_per_cpu(pcpu->lowcore);
+	nmi_free_per_cpu(pcpu->lowcore);
 	if (pcpu == &pcpu_devices[0])
 		return;
-	if (MACHINE_HAS_VX || MACHINE_HAS_GS) {
-		mcesa_origin = pcpu->lowcore->mcesad & MCESA_ORIGIN_MASK;
-		kmem_cache_free(pcpu_mcesa_cache, (void *) mcesa_origin);
-	}
 	free_page(pcpu->lowcore->panic_stack-PANIC_FRAME_OFFSET);
 	free_pages(pcpu->lowcore->async_stack-ASYNC_FRAME_OFFSET, ASYNC_ORDER);
 	free_pages((unsigned long) pcpu->lowcore, LC_ORDER);
@@ -936,22 +917,12 @@ void __init smp_fill_possible_mask(void)
 
 void __init smp_prepare_cpus(unsigned int max_cpus)
 {
-	unsigned long size;
-
 	/* request the 0x1201 emergency signal external interrupt */
 	if (register_external_irq(EXT_IRQ_EMERGENCY_SIG, do_ext_call_interrupt))
 		panic("Couldn't request external interrupt 0x1201");
 	/* request the 0x1202 external call external interrupt */
 	if (register_external_irq(EXT_IRQ_EXTERNAL_CALL, do_ext_call_interrupt))
 		panic("Couldn't request external interrupt 0x1202");
-	/* create slab cache for the machine-check-extended-save-areas */
-	if (MACHINE_HAS_VX || MACHINE_HAS_GS) {
-		size = 1UL << (MACHINE_HAS_GS ? 11 : 10);
-		pcpu_mcesa_cache = kmem_cache_create("nmi_save_areas",
-						     size, size, 0, NULL);
-		if (!pcpu_mcesa_cache)
-			panic("Couldn't create nmi save area cache");
-	}
 }
 
 void __init smp_prepare_boot_cpu(void)
-- 
cgit v1.2.3