29 files changed, 1607 insertions, 391 deletions
diff --git a/arch/arm/include/uapi/asm/kvm.h b/arch/arm/include/uapi/asm/kvm.h
index 2499867dd0d8..df3f60cb1168 100644
--- a/arch/arm/include/uapi/asm/kvm.h
+++ b/arch/arm/include/uapi/asm/kvm.h
@@ -195,8 +195,14 @@ struct kvm_arch_memory_slot {
 #define KVM_ARM_IRQ_CPU_IRQ		0
 #define KVM_ARM_IRQ_CPU_FIQ		1
 
-/* Highest supported SPI, from VGIC_NR_IRQS */
+/*
+ * This used to hold the highest supported SPI, but it is now obsolete
+ * and only here to provide source code level compatibility with older
+ * userland. The highest SPI number can be set via KVM_DEV_ARM_VGIC_GRP_NR_IRQS.
+ */
+#ifndef __KERNEL__
 #define KVM_ARM_IRQ_GIC_MAX		127
+#endif
 
 /* One single KVM irqchip, ie. the VGIC */
 #define KVM_NR_IRQCHIPS          1
diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
index 6f536451ab78..d9631ecddd56 100644
--- a/arch/arm/kvm/arm.c
+++ b/arch/arm/kvm/arm.c
@@ -671,8 +671,7 @@ int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_level,
 		if (!irqchip_in_kernel(kvm))
 			return -ENXIO;
 
-		if (irq_num < VGIC_NR_PRIVATE_IRQS ||
-		    irq_num > KVM_ARM_IRQ_GIC_MAX)
+		if (irq_num < VGIC_NR_PRIVATE_IRQS)
 			return -EINVAL;
 
 		return kvm_vgic_inject_irq(kvm, 0, irq_num, level);
diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h
index c154c0b7eb60..d26832022127 100644
--- a/arch/arm64/include/uapi/asm/kvm.h
+++ b/arch/arm64/include/uapi/asm/kvm.h
@@ -188,8 +188,14 @@ struct kvm_arch_memory_slot {
 #define KVM_ARM_IRQ_CPU_IRQ		0
 #define KVM_ARM_IRQ_CPU_FIQ		1
 
-/* Highest supported SPI, from VGIC_NR_IRQS */
+/*
+ * This used to hold the highest supported SPI, but it is now obsolete
+ * and only here to provide source code level compatibility with older
+ * userland. The highest SPI number can be set via KVM_DEV_ARM_VGIC_GRP_NR_IRQS.
+ */
+#ifndef __KERNEL__
 #define KVM_ARM_IRQ_GIC_MAX		127
+#endif
 
 /* One single KVM irqchip, ie. the VGIC */
 #define KVM_NR_IRQCHIPS          1
diff --git a/arch/powerpc/include/asm/archrandom.h b/arch/powerpc/include/asm/archrandom.h
index bde531103638..0cc6eedc4780 100644
--- a/arch/powerpc/include/asm/archrandom.h
+++ b/arch/powerpc/include/asm/archrandom.h
@@ -30,8 +30,6 @@ static inline int arch_has_random(void)
 	return !!ppc_md.get_random_long;
 }
 
-int powernv_get_random_long(unsigned long *v);
-
 static inline int arch_get_random_seed_long(unsigned long *v)
 {
 	return 0;
@@ -47,4 +45,13 @@ static inline int arch_has_random_seed(void)
 
 #endif /* CONFIG_ARCH_RANDOM */
 
+#ifdef CONFIG_PPC_POWERNV
+int powernv_hwrng_present(void);
+int powernv_get_random_long(unsigned long *v);
+int powernv_get_random_real_mode(unsigned long *v);
+#else
+static inline int powernv_hwrng_present(void) { return 0; }
+static inline int powernv_get_random_real_mode(unsigned long *v) { return 0; }
+#endif
+
 #endif /* _ASM_POWERPC_ARCHRANDOM_H */
diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h
index 993090422690..b91e74a817d8 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -288,6 +288,9 @@ static inline bool kvmppc_supports_magic_page(struct kvm_vcpu *vcpu)
 	return !is_kvmppc_hv_enabled(vcpu->kvm);
 }
 
+extern int kvmppc_h_logical_ci_load(struct kvm_vcpu *vcpu);
+extern int kvmppc_h_logical_ci_store(struct kvm_vcpu *vcpu);
+
 /* Magic register values loaded into r3 and r4 before the 'sc' assembly
  * instruction for the OSI hypercalls */
 #define OSI_SC_MAGIC_R3			0x113724FA
diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h
index 14619a59ec09..7ae407941be2 100644
--- a/arch/powerpc/include/asm/kvm_book3s_64.h
+++ b/arch/powerpc/include/asm/kvm_book3s_64.h
@@ -85,6 +85,20 @@ static inline long try_lock_hpte(__be64 *hpte, unsigned long bits)
 	return old == 0;
 }
 
+static inline void unlock_hpte(__be64 *hpte, unsigned long hpte_v)
+{
+	hpte_v &= ~HPTE_V_HVLOCK;	/* word 0 is written back unlocked */
+	asm volatile(PPC_RELEASE_BARRIER "" : : : "memory");
+	hpte[0] = cpu_to_be64(hpte_v);
+}
+
+/* As unlock_hpte(), but with no release barrier before the unlocking store */
+static inline void __unlock_hpte(__be64 *hpte, unsigned long hpte_v)
+{
+	hpte_v &= ~HPTE_V_HVLOCK;
+	hpte[0] = cpu_to_be64(hpte_v);
+}
+
 static inline int __hpte_actual_psize(unsigned int lp, int psize)
 {
 	int i, shift;
@@ -424,6 +438,10 @@ static inline struct kvm_memslots *kvm_memslots_raw(struct kvm *kvm)
 	return rcu_dereference_raw_notrace(kvm->memslots);
 }
 
+extern void kvmppc_mmu_debugfs_init(struct kvm *kvm);
+
+extern void kvmhv_rm_send_ipi(int cpu);
+
 #endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */
 
 #endif /* __ASM_KVM_BOOK3S_64_H__ */
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index c610961720c7..a193a13cf08b 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -227,10 +227,8 @@ struct kvm_arch {
 	unsigned long host_sdr1;
 	int tlbie_lock;
 	unsigned long lpcr;
-	unsigned long rmor;
-	struct kvm_rma_info *rma;
 	unsigned long vrma_slb_v;
-	int rma_setup_done;
+	int hpte_setup_done;
 	u32 hpt_order;
 	atomic_t vcpus_running;
 	u32 online_vcores;
@@ -239,6 +237,8 @@ struct kvm_arch {
 	atomic_t hpte_mod_interest;
 	cpumask_t need_tlb_flush;
 	int hpt_cma_alloc;
+	struct dentry *debugfs_dir;
+	struct dentry *htab_dentry;
 #endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */
 #ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE
 	struct mutex hpt_mutex;
@@ -263,18 +263,15 @@ struct kvm_arch {
 
 /*
  * Struct for a virtual core.
- * Note: entry_exit_count combines an entry count in the bottom 8 bits
- * and an exit count in the next 8 bits.  This is so that we can
- * atomically increment the entry count iff the exit count is 0
- * without taking the lock.
+ * Note: entry_exit_map combines a bitmap of threads that have entered
+ * in the bottom 8 bits and a bitmap of threads that have exited in the
+ * next 8 bits.  This is so that we can atomically set the entry bit
+ * iff the exit map is 0 without taking a lock.
  */
 struct kvmppc_vcore {
 	int n_runnable;
-	int n_busy;
 	int num_threads;
-	int entry_exit_count;
-	int n_woken;
-	int nap_count;
+	int entry_exit_map;
 	int napping_threads;
 	int first_vcpuid;
 	u16 pcpu;
@@ -299,13 +296,14 @@ struct kvmppc_vcore {
 	ulong conferring_threads;
 };
 
-#define VCORE_ENTRY_COUNT(vc)	((vc)->entry_exit_count & 0xff)
-#define VCORE_EXIT_COUNT(vc)	((vc)->entry_exit_count >> 8)
+#define VCORE_ENTRY_MAP(vc)	((vc)->entry_exit_map & 0xff)
+#define VCORE_EXIT_MAP(vc)	((vc)->entry_exit_map >> 8)
+#define VCORE_IS_EXITING(vc)	(VCORE_EXIT_MAP(vc) != 0)
 
 /* Values for vcore_state */
 #define VCORE_INACTIVE	0
 #define VCORE_SLEEPING	1
-#define VCORE_STARTING	2
+#define VCORE_PREEMPT	2
 #define VCORE_RUNNING	3
 #define VCORE_EXITING	4
 
@@ -368,6 +366,14 @@ struct kvmppc_slb {
 	u8 base_page_size;	/* MMU_PAGE_xxx */
 };
 
+/* Struct used to accumulate timing information in HV real mode code */
+struct kvmhv_tb_accumulator {
+	u64	seqcount;	/* readers retry while odd; also count * 2 */
+	u64	tb_total;	/* total time in timebase ticks */
+	u64	tb_min;		/* min time */
+	u64	tb_max;		/* max time */
+};
+
 # ifdef CONFIG_PPC_FSL_BOOK3E
 #define KVMPPC_BOOKE_IAC_NUM	2
 #define KVMPPC_BOOKE_DAC_NUM	2
@@ -656,6 +662,19 @@ struct kvm_vcpu_arch {
 
 	u32 emul_inst;
 #endif
+
+#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
+	struct kvmhv_tb_accumulator *cur_activity;	/* What we're timing */
+	u64	cur_tb_start;			/* when it started */
+	struct kvmhv_tb_accumulator rm_entry;	/* real-mode entry code */
+	struct kvmhv_tb_accumulator rm_intr;	/* real-mode intr handling */
+	struct kvmhv_tb_accumulator rm_exit;	/* real-mode exit code */
+	struct kvmhv_tb_accumulator guest_time;	/* guest execution */
+	struct kvmhv_tb_accumulator cede_time;	/* time napping inside guest */
+
+	struct dentry *debugfs_dir;
+	struct dentry *debugfs_timings;
+#endif /* CONFIG_KVM_BOOK3S_HV_EXIT_TIMING */
 };
 
 #define VCPU_FPR(vcpu, i)	(vcpu)->arch.fp.fpr[i][TS_FPROFFSET]
diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index 46bf652c9169..b8475daad884 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -302,6 +302,8 @@ static inline bool is_kvmppc_hv_enabled(struct kvm *kvm)
 	return kvm->arch.kvm_ops == kvmppc_hv_ops;
 }
 
+extern int kvmppc_hwrng_present(void);
+
 /*
  * Cuts out inst bits with ordering according to spec.
  * That means the leftmost bit is zero. All given bits are included.
diff --git a/arch/powerpc/include/asm/time.h b/arch/powerpc/include/asm/time.h
index 03cbada59d3a..10fc784a2ad4 100644
--- a/arch/powerpc/include/asm/time.h
+++ b/arch/powerpc/include/asm/time.h
@@ -211,5 +211,8 @@ extern void secondary_cpu_time_init(void);
 
 DECLARE_PER_CPU(u64, decrementers_next_tb);
 
+/* Convert timebase ticks to nanoseconds */
+unsigned long long tb_to_ns(unsigned long long tb_ticks);
+
 #endif /* __KERNEL__ */
 #endif /* __POWERPC_TIME_H */
diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c
index 4717859fdd04..0034b6b3556a 100644
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -37,6 +37,7 @@
 #include <asm/thread_info.h>
 #include <asm/rtas.h>
 #include <asm/vdso_datapage.h>
+#include <asm/dbell.h>
 #ifdef CONFIG_PPC64
 #include <asm/paca.h>
 #include <asm/lppaca.h>
@@ -459,6 +460,19 @@ int main(void)
 	DEFINE(VCPU_SPRG2, offsetof(struct kvm_vcpu, arch.shregs.sprg2));
 	DEFINE(VCPU_SPRG3, offsetof(struct kvm_vcpu, arch.shregs.sprg3));
 #endif
+#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
+	DEFINE(VCPU_TB_RMENTRY, offsetof(struct kvm_vcpu, arch.rm_entry));
+	DEFINE(VCPU_TB_RMINTR, offsetof(struct kvm_vcpu, arch.rm_intr));
+	DEFINE(VCPU_TB_RMEXIT, offsetof(struct kvm_vcpu, arch.rm_exit));
+	DEFINE(VCPU_TB_GUEST, offsetof(struct kvm_vcpu, arch.guest_time));
+	DEFINE(VCPU_TB_CEDE, offsetof(struct kvm_vcpu, arch.cede_time));
+	DEFINE(VCPU_CUR_ACTIVITY, offsetof(struct kvm_vcpu, arch.cur_activity));
+	DEFINE(VCPU_ACTIVITY_START, offsetof(struct kvm_vcpu, arch.cur_tb_start));
+	DEFINE(TAS_SEQCOUNT, offsetof(struct kvmhv_tb_accumulator, seqcount));
+	DEFINE(TAS_TOTAL, offsetof(struct kvmhv_tb_accumulator, tb_total));
+	DEFINE(TAS_MIN, offsetof(struct kvmhv_tb_accumulator, tb_min));
+	DEFINE(TAS_MAX, offsetof(struct kvmhv_tb_accumulator, tb_max));
+#endif
 	DEFINE(VCPU_SHARED_SPRG3, offsetof(struct kvm_vcpu_arch_shared, sprg3));
 	DEFINE(VCPU_SHARED_SPRG4, offsetof(struct kvm_vcpu_arch_shared, sprg4));
 	DEFINE(VCPU_SHARED_SPRG5, offsetof(struct kvm_vcpu_arch_shared, sprg5));
@@ -492,7 +506,6 @@ int main(void)
 	DEFINE(KVM_NEED_FLUSH, offsetof(struct kvm, arch.need_tlb_flush.bits));
 	DEFINE(KVM_ENABLED_HCALLS, offsetof(struct kvm, arch.enabled_hcalls));
 	DEFINE(KVM_LPCR, offsetof(struct kvm, arch.lpcr));
-	DEFINE(KVM_RMOR, offsetof(struct kvm, arch.rmor));
 	DEFINE(KVM_VRMA_SLB_V, offsetof(struct kvm, arch.vrma_slb_v));
 	DEFINE(VCPU_DSISR, offsetof(struct kvm_vcpu, arch.shregs.dsisr));
 	DEFINE(VCPU_DAR, offsetof(struct kvm_vcpu, arch.shregs.dar));
@@ -550,8 +563,7 @@ int main(void)
 	DEFINE(VCPU_ACOP, offsetof(struct kvm_vcpu, arch.acop));
 	DEFINE(VCPU_WORT, offsetof(struct kvm_vcpu, arch.wort));
 	DEFINE(VCPU_SHADOW_SRR1, offsetof(struct kvm_vcpu, arch.shadow_srr1));
-	DEFINE(VCORE_ENTRY_EXIT, offsetof(struct kvmppc_vcore, entry_exit_count));
-	DEFINE(VCORE_NAP_COUNT, offsetof(struct kvmppc_vcore, nap_count));
+	DEFINE(VCORE_ENTRY_EXIT, offsetof(struct kvmppc_vcore, entry_exit_map));
 	DEFINE(VCORE_IN_GUEST, offsetof(struct kvmppc_vcore, in_guest));
 	DEFINE(VCORE_NAPPING_THREADS, offsetof(struct kvmppc_vcore, napping_threads));
 	DEFINE(VCORE_KVM, offsetof(struct kvmppc_vcore, kvm));
@@ -748,5 +760,7 @@ int main(void)
 			offsetof(struct paca_struct, subcore_sibling_mask));
 #endif
 
+	DEFINE(PPC_DBELL_SERVER, PPC_DBELL_SERVER);
+
 	return 0;
 }
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index 2d7b33fab953..56f44848b044 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -608,6 +608,12 @@ void arch_suspend_enable_irqs(void)
 }
 #endif
 
+unsigned long long tb_to_ns(unsigned long long ticks)
+{
+	return mulhdu(ticks, tb_to_ns_scale) << tb_to_ns_shift;
+}
+EXPORT_SYMBOL_GPL(tb_to_ns);
+
 /*
  * Scheduler clock - returns current time in nanosec units.
  *
diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig
index 11850f310fb4..2963e4dd0b80 100644
--- a/arch/powerpc/kvm/Kconfig
+++ b/arch/powerpc/kvm/Kconfig
@@ -110,6 +110,20 @@ config KVM_BOOK3S_64_PR
 	  processor, including emulating 32-bit processors on a 64-bit
 	  host.
 
+config KVM_BOOK3S_HV_EXIT_TIMING
+	bool "Detailed timing for hypervisor real-mode code"
+	depends on KVM_BOOK3S_HV_POSSIBLE && DEBUG_FS
+	---help---
+	  Calculate time taken for each vcpu in the real-mode guest entry,
+	  exit, and interrupt handling code, plus time spent in the guest
+	  and in nap mode due to idle (cede) while other threads are still
+	  in the guest.  The total, minimum and maximum times in nanoseconds
+	  together with the number of executions are reported in debugfs in
+	  kvm/vm#/vcpu#/timings.  The overhead is of the order of 30 - 40
+	  ns per exit on POWER8.
+
+	  If unsure, say N.
+
 config KVM_BOOKE_HV
 	bool
 
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index cfbcdc654201..453a8a47a467 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -821,6 +821,106 @@ void kvmppc_core_destroy_vm(struct kvm *kvm)
 #endif
 }
 
+/*
+ * H_LOGICAL_CI_LOAD: read gpr4 bytes from address gpr5 on the KVM MMIO bus.
+ *
+ * The size must be 1, 2, 4 or 8.  The bytes read are interpreted as a
+ * big-endian value, which is returned to the guest in gpr4.
+ *
+ * Returns H_SUCCESS, or H_TOO_HARD if the size is unsupported or the bus
+ * read fails.
+ */
+int kvmppc_h_logical_ci_load(struct kvm_vcpu *vcpu)
+{
+	unsigned long size = kvmppc_get_gpr(vcpu, 4);
+	unsigned long addr = kvmppc_get_gpr(vcpu, 5);
+	u64 buf;
+	int srcu_idx;
+	int ret;
+
+	if (!is_power_of_2(size) || (size > sizeof(buf)))
+		return H_TOO_HARD;
+
+	/* kvm_io_bus_read() requires kvm->srcu to be held for reading */
+	srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
+	ret = kvm_io_bus_read(vcpu, KVM_MMIO_BUS, addr, size, &buf);
+	srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx);
+	if (ret != 0)
+		return H_TOO_HARD;
+
+	switch (size) {
+	case 1:
+		kvmppc_set_gpr(vcpu, 4, *(u8 *)&buf);
+		break;
+
+	case 2:
+		kvmppc_set_gpr(vcpu, 4, be16_to_cpu(*(__be16 *)&buf));
+		break;
+
+	case 4:
+		kvmppc_set_gpr(vcpu, 4, be32_to_cpu(*(__be32 *)&buf));
+		break;
+
+	case 8:
+		kvmppc_set_gpr(vcpu, 4, be64_to_cpu(*(__be64 *)&buf));
+		break;
+
+	default:
+		BUG();
+	}
+
+	return H_SUCCESS;
+}
+EXPORT_SYMBOL_GPL(kvmppc_h_logical_ci_load);
+
+/*
+ * H_LOGICAL_CI_STORE: write the low gpr4 bytes of gpr6 to address gpr5 on
+ * the KVM MMIO bus, big-endian.
+ *
+ * Returns H_SUCCESS, or H_TOO_HARD if the size is not 1, 2, 4 or 8 or the
+ * bus write fails.
+ */
+int kvmppc_h_logical_ci_store(struct kvm_vcpu *vcpu)
+{
+	unsigned long size = kvmppc_get_gpr(vcpu, 4);
+	unsigned long addr = kvmppc_get_gpr(vcpu, 5);
+	unsigned long val = kvmppc_get_gpr(vcpu, 6);
+	u64 buf;
+	int srcu_idx;
+	int ret;
+
+	switch (size) {
+	case 1:
+		*(u8 *)&buf = val;
+		break;
+
+	case 2:
+		*(__be16 *)&buf = cpu_to_be16(val);
+		break;
+
+	case 4:
+		*(__be32 *)&buf = cpu_to_be32(val);
+		break;
+
+	case 8:
+		*(__be64 *)&buf = cpu_to_be64(val);
+		break;
+
+	default:
+		return H_TOO_HARD;
+	}
+
+	/* kvm_io_bus_write() requires kvm->srcu to be held for reading */
+	srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
+	ret = kvm_io_bus_write(vcpu, KVM_MMIO_BUS, addr, size, &buf);
+	srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx);
+	if (ret != 0)
+		return H_TOO_HARD;
+
+	return H_SUCCESS;
+}
+EXPORT_SYMBOL_GPL(kvmppc_h_logical_ci_store);
+
 int kvmppc_core_check_processor_compat(void)
 {
 	/*
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index 534acb3c6c3d..d6fe30835c58 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -27,6 +27,7 @@
 #include <linux/srcu.h>
 #include <linux/anon_inodes.h>
 #include <linux/file.h>
+#include <linux/debugfs.h>
 
 #include <asm/tlbflush.h>
 #include <asm/kvm_ppc.h>
@@ -116,12 +117,12 @@ long kvmppc_alloc_reset_hpt(struct kvm *kvm, u32 *htab_orderp)
 	long order;
 
 	mutex_lock(&kvm->lock);
-	if (kvm->arch.rma_setup_done) {
-		kvm->arch.rma_setup_done = 0;
-		/* order rma_setup_done vs. vcpus_running */
+	if (kvm->arch.hpte_setup_done) {
+		kvm->arch.hpte_setup_done = 0;
+		/* order hpte_setup_done vs. vcpus_running */
 		smp_mb();
 		if (atomic_read(&kvm->arch.vcpus_running)) {
-			kvm->arch.rma_setup_done = 1;
+			kvm->arch.hpte_setup_done = 1;
 			goto out;
 		}
 	}
@@ -338,9 +339,7 @@ static int kvmppc_mmu_book3s_64_hv_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
 	v = be64_to_cpu(hptep[0]) & ~HPTE_V_HVLOCK;
 	gr = kvm->arch.revmap[index].guest_rpte;
 
-	/* Unlock the HPTE */
-	asm volatile("lwsync" : : : "memory");
-	hptep[0] = cpu_to_be64(v);
+	unlock_hpte(hptep, v);
 	preempt_enable();
 
 	gpte->eaddr = eaddr;
@@ -469,8 +468,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
 	hpte[0] = be64_to_cpu(hptep[0]) & ~HPTE_V_HVLOCK;
 	hpte[1] = be64_to_cpu(hptep[1]);
 	hpte[2] = r = rev->guest_rpte;
-	asm volatile("lwsync" : : : "memory");
-	hptep[0] = cpu_to_be64(hpte[0]);
+	unlock_hpte(hptep, hpte[0]);
 	preempt_enable();
 
 	if (hpte[0] != vcpu->arch.pgfault_hpte[0] ||
@@ -621,7 +619,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
 
 	hptep[1] = cpu_to_be64(r);
 	eieio();
-	hptep[0] = cpu_to_be64(hpte[0]);
+	__unlock_hpte(hptep, hpte[0]);
 	asm volatile("ptesync" : : : "memory");
 	preempt_enable();
 	if (page && hpte_is_writable(r))
@@ -642,7 +640,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
 	return ret;
 
  out_unlock:
-	hptep[0] &= ~cpu_to_be64(HPTE_V_HVLOCK);
+	__unlock_hpte(hptep, be64_to_cpu(hptep[0]));
 	preempt_enable();
 	goto out_put;
 }
@@ -771,7 +769,7 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
 			}
 		}
 		unlock_rmap(rmapp);
-		hptep[0] &= ~cpu_to_be64(HPTE_V_HVLOCK);
+		__unlock_hpte(hptep, be64_to_cpu(hptep[0]));
 	}
 	return 0;
 }
@@ -857,7 +855,7 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
 			}
 			ret = 1;
 		}
-		hptep[0] &= ~cpu_to_be64(HPTE_V_HVLOCK);
+		__unlock_hpte(hptep, be64_to_cpu(hptep[0]));
 	} while ((i = j) != head);
 
 	unlock_rmap(rmapp);
@@ -974,8 +972,7 @@ static int kvm_test_clear_dirty_npages(struct kvm *kvm, unsigned long *rmapp)
 
 		/* Now check and modify the HPTE */
 		if (!(hptep[0] & cpu_to_be64(HPTE_V_VALID))) {
-			/* unlock and continue */
-			hptep[0] &= ~cpu_to_be64(HPTE_V_HVLOCK);
+			__unlock_hpte(hptep, be64_to_cpu(hptep[0]));
 			continue;
 		}
 
@@ -996,9 +993,9 @@ static int kvm_test_clear_dirty_npages(struct kvm *kvm, unsigned long *rmapp)
 				npages_dirty = n;
 			eieio();
 		}
-		v &= ~(HPTE_V_ABSENT | HPTE_V_HVLOCK);
+		v &= ~HPTE_V_ABSENT;
 		v |= HPTE_V_VALID;
-		hptep[0] = cpu_to_be64(v);
+		__unlock_hpte(hptep, v);
 	} while ((i = j) != head);
 
 	unlock_rmap(rmapp);
@@ -1218,8 +1215,7 @@ static long record_hpte(unsigned long flags, __be64 *hptp,
 			r &= ~HPTE_GR_MODIFIED;
 			revp->guest_rpte = r;
 		}
-		asm volatile(PPC_RELEASE_BARRIER "" : : : "memory");
-		hptp[0] &= ~cpu_to_be64(HPTE_V_HVLOCK);
+		unlock_hpte(hptp, be64_to_cpu(hptp[0]));
 		preempt_enable();
 		if (!(valid == want_valid && (first_pass || dirty)))
 			ok = 0;
@@ -1339,20 +1335,20 @@ static ssize_t kvm_htab_write(struct file *file, const char __user *buf,
 	unsigned long tmp[2];
 	ssize_t nb;
 	long int err, ret;
-	int rma_setup;
+	int hpte_setup;
 
 	if (!access_ok(VERIFY_READ, buf, count))
 		return -EFAULT;
 
 	/* lock out vcpus from running while we're doing this */
 	mutex_lock(&kvm->lock);
-	rma_setup = kvm->arch.rma_setup_done;
-	if (rma_setup) {
-		kvm->arch.rma_setup_done = 0;	/* temporarily */
-		/* order rma_setup_done vs. vcpus_running */
+	hpte_setup = kvm->arch.hpte_setup_done;
+	if (hpte_setup) {
+		kvm->arch.hpte_setup_done = 0;	/* temporarily */
+		/* order hpte_setup_done vs. vcpus_running */
 		smp_mb();
 		if (atomic_read(&kvm->arch.vcpus_running)) {
-			kvm->arch.rma_setup_done = 1;
+			kvm->arch.hpte_setup_done = 1;
 			mutex_unlock(&kvm->lock);
 			return -EBUSY;
 		}
@@ -1405,7 +1401,7 @@ static ssize_t kvm_htab_write(struct file *file, const char __user *buf,
 				       "r=%lx\n", ret, i, v, r);
 				goto out;
 			}
-			if (!rma_setup && is_vrma_hpte(v)) {
+			if (!hpte_setup && is_vrma_hpte(v)) {
 				unsigned long psize = hpte_base_page_size(v, r);
 				unsigned long senc = slb_pgsize_encoding(psize);
 				unsigned long lpcr;
@@ -1414,7 +1410,7 @@ static ssize_t kvm_htab_write(struct file *file, const char __user *buf,
 					(VRMA_VSID << SLB_VSID_SHIFT_1T);
 				lpcr = senc << (LPCR_VRMASD_SH - 4);
 				kvmppc_update_lpcr(kvm, lpcr, LPCR_VRMASD);
-				rma_setup = 1;
+				hpte_setup = 1;
 			}
 			++i;
 			hptp += 2;
@@ -1430,9 +1426,9 @@ static ssize_t kvm_htab_write(struct file *file, const char __user *buf,
 	}
 
  out:
-	/* Order HPTE updates vs. rma_setup_done */
+	/* Order HPTE updates vs. hpte_setup_done */
 	smp_wmb();
-	kvm->arch.rma_setup_done = rma_setup;
+	kvm->arch.hpte_setup_done = hpte_setup;
 	mutex_unlock(&kvm->lock);
 
 	if (err)
@@ -1495,6 +1491,141 @@ int kvm_vm_ioctl_get_htab_fd(struct kvm *kvm, struct kvm_get_htab_fd *ghf)
 	return ret;
 }
 
+struct debugfs_htab_state {
+	struct kvm	*kvm;
+	struct mutex	mutex;
+	unsigned long	hpt_index;
+	int		chars_left;
+	int		buf_index;
+	char		buf[64];
+};
+
+static int debugfs_htab_open(struct inode *inode, struct file *file)
+{
+	struct kvm *kvm = inode->i_private;
+	struct debugfs_htab_state *p;
+
+	p = kzalloc(sizeof(*p), GFP_KERNEL);
+	if (!p)
+		return -ENOMEM;
+
+	kvm_get_kvm(kvm);
+	p->kvm = kvm;
+	mutex_init(&p->mutex);
+	file->private_data = p;
+
+	return nonseekable_open(inode, file);
+}
+
+static int debugfs_htab_release(struct inode *inode, struct file *file)
+{
+	struct debugfs_htab_state *p = file->private_data;
+
+	kvm_put_kvm(p->kvm);
+	kfree(p);
+	return 0;
+}
+
+static ssize_t debugfs_htab_read(struct file *file, char __user *buf,
+				 size_t len, loff_t *ppos)
+{
+	struct debugfs_htab_state *p = file->private_data;
+	ssize_t ret, r;
+	unsigned long i, n;
+	unsigned long v, hr, gr;
+	struct kvm *kvm;
+	__be64 *hptp;
+
+	ret = mutex_lock_interruptible(&p->mutex);
+	if (ret)
+		return ret;
+
+	if (p->chars_left) {
+		n = p->chars_left;
+		if (n > len)
+			n = len;
+		r = copy_to_user(buf, p->buf + p->buf_index, n);
+		n -= r;
+		p->chars_left -= n;
+		p->buf_index += n;
+		buf += n;
+		len -= n;
+		ret = n;
+		if (r) {
+			if (!n)
+				ret = -EFAULT;
+			goto out;
+		}
+	}
+
+	kvm = p->kvm;
+	i = p->hpt_index;
+	hptp = (__be64 *)(kvm->arch.hpt_virt + (i * HPTE_SIZE));
+	for (; len != 0 && i < kvm->arch.hpt_npte; ++i, hptp += 2) {
+		if (!(be64_to_cpu(hptp[0]) & (HPTE_V_VALID | HPTE_V_ABSENT)))
+			continue;
+
+		/* lock the HPTE so it's stable and read it */
+		preempt_disable();
+		while (!try_lock_hpte(hptp, HPTE_V_HVLOCK))
+			cpu_relax();
+		v = be64_to_cpu(hptp[0]) & ~HPTE_V_HVLOCK;
+		hr = be64_to_cpu(hptp[1]);
+		gr = kvm->arch.revmap[i].guest_rpte;
+		unlock_hpte(hptp, v);
+		preempt_enable();
+
+		if (!(v & (HPTE_V_VALID | HPTE_V_ABSENT)))
+			continue;
+
+		n = scnprintf(p->buf, sizeof(p->buf),
+			      "%6lx %.16lx %.16lx %.16lx\n",
+			      i, v, hr, gr);
+		p->chars_left = n;
+		if (n > len)
+			n = len;
+		r = copy_to_user(buf, p->buf, n);
+		n -= r;
+		p->chars_left -= n;
+		p->buf_index = n;
+		buf += n;
+		len -= n;
+		ret += n;
+		if (r) {
+			if (!ret)
+				ret = -EFAULT;
+			goto out;
+		}
+	}
+	p->hpt_index = i;
+
+ out:
+	mutex_unlock(&p->mutex);
+	return ret;
+}
+
+static ssize_t debugfs_htab_write(struct file *file, const char __user *buf,
+				  size_t len, loff_t *ppos)
+{
+	return -EACCES;	/* the htab debugfs file is read-only */
+}
+
+static const struct file_operations debugfs_htab_fops = {
+	.owner	 = THIS_MODULE,
+	.open	 = debugfs_htab_open,
+	.release = debugfs_htab_release,
+	.read	 = debugfs_htab_read,
+	.write	 = debugfs_htab_write,
+	.llseek	 = generic_file_llseek,
+};
+
+void kvmppc_mmu_debugfs_init(struct kvm *kvm)
+{
+	kvm->arch.htab_dentry = debugfs_create_file("htab", 0400,
+						    kvm->arch.debugfs_dir, kvm,
+						    &debugfs_htab_fops);
+}
+
 void kvmppc_mmu_book3s_hv_init(struct kvm_vcpu *vcpu)
 {
 	struct kvmppc_mmu *mmu = &vcpu->arch.mmu;
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index de747563d29d..48d3c5d2ecc9 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -32,6 +32,7 @@
 #include <linux/page-flags.h>
 #include <linux/srcu.h>
 #include <linux/miscdevice.h>
+#include <linux/debugfs.h>
 
 #include <asm/reg.h>
 #include <asm/cputable.h>
@@ -50,6 +51,7 @@
 #include <asm/hvcall.h>
 #include <asm/switch_to.h>
 #include <asm/smp.h>
+#include <asm/dbell.h>
 #include <linux/gfp.h>
 #include <linux/vmalloc.h>
 #include <linux/highmem.h>
@@ -83,9 +85,35 @@ static DECLARE_BITMAP(default_enabled_hcalls, MAX_HCALL_OPCODE/4 + 1);
 static void kvmppc_end_cede(struct kvm_vcpu *vcpu);
 static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu);
 
+static bool kvmppc_ipi_thread(int cpu)
+{
+	/* On POWER8 for IPIs to threads in the same core, use msgsnd */
+	if (cpu_has_feature(CPU_FTR_ARCH_207S)) {
+		preempt_disable();
+		if (cpu_first_thread_sibling(cpu) ==
+		    cpu_first_thread_sibling(smp_processor_id())) {
+			unsigned long msg = PPC_DBELL_TYPE(PPC_DBELL_SERVER);
+			msg |= cpu_thread_in_core(cpu);
+			smp_mb();
+			__asm__ __volatile__ (PPC_MSGSND(%0) : : "r" (msg));
+			preempt_enable();
+			return true;
+		}
+		preempt_enable();
+	}
+
+#if defined(CONFIG_PPC_ICP_NATIVE) && defined(CONFIG_SMP)
+	if (cpu >= 0 && cpu < nr_cpu_ids && paca[cpu].kvm_hstate.xics_phys) {
+		xics_wake_cpu(cpu);
+		return true;
+	}
+#endif
+
+	return false;
+}
+
 static void kvmppc_fast_vcpu_kick_hv(struct kvm_vcpu *vcpu)
 {
-	int me;
 	int cpu = vcpu->cpu;
 	wait_queue_head_t *wqp;
 
@@ -95,20 +123,12 @@ static void kvmppc_fast_vcpu_kick_hv(struct kvm_vcpu *vcpu)
 		++vcpu->stat.halt_wakeup;
 	}
 
-	me = get_cpu();
+	if (kvmppc_ipi_thread(cpu + vcpu->arch.ptid))
+		return;
 
 	/* CPU points to the first thread of the core */
-	if (cpu != me && cpu >= 0 && cpu < nr_cpu_ids) {
-#ifdef CONFIG_PPC_ICP_NATIVE
-		int real_cpu = cpu + vcpu->arch.ptid;
-		if (paca[real_cpu].kvm_hstate.xics_phys)
-			xics_wake_cpu(real_cpu);
-		else
-#endif
-		if (cpu_online(cpu))
-			smp_send_reschedule(cpu);
-	}
-	put_cpu();
+	if (cpu >= 0 && cpu < nr_cpu_ids && cpu_online(cpu))
+		smp_send_reschedule(cpu);
 }
 
 /*
@@ -706,6 +726,16 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
 
 		/* Send the error out to userspace via KVM_RUN */
 		return rc;
+	case H_LOGICAL_CI_LOAD:
+		ret = kvmppc_h_logical_ci_load(vcpu);
+		if (ret == H_TOO_HARD)
+			return RESUME_HOST;
+		break;
+	case H_LOGICAL_CI_STORE:
+		ret = kvmppc_h_logical_ci_store(vcpu);
+		if (ret == H_TOO_HARD)
+			return RESUME_HOST;
+		break;
 	case H_SET_MODE:
 		ret = kvmppc_h_set_mode(vcpu, kvmppc_get_gpr(vcpu, 4),
 					kvmppc_get_gpr(vcpu, 5),
@@ -740,6 +770,8 @@ static int kvmppc_hcall_impl_hv(unsigned long cmd)
 	case H_CONFER:
 	case H_REGISTER_VPA:
 	case H_SET_MODE:
+	case H_LOGICAL_CI_LOAD:
+	case H_LOGICAL_CI_STORE:
 #ifdef CONFIG_KVM_XICS
 	case H_XIRR:
 	case H_CPPR:
@@ -1410,6 +1442,154 @@ static struct kvmppc_vcore *kvmppc_vcore_create(struct kvm *kvm, int core)
 	return vcore;
 }
 
+#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
+static struct debugfs_timings_element {
+	const char *name;	/* label printed in the timings file */
+	size_t offset;		/* accumulator's offset in struct kvm_vcpu */
+} timings[] = {
+	{"rm_entry",	offsetof(struct kvm_vcpu, arch.rm_entry)},
+	{"rm_intr",	offsetof(struct kvm_vcpu, arch.rm_intr)},
+	{"rm_exit",	offsetof(struct kvm_vcpu, arch.rm_exit)},
+	{"guest",	offsetof(struct kvm_vcpu, arch.guest_time)},
+	{"cede",	offsetof(struct kvm_vcpu, arch.cede_time)},
+};
+
+#define N_TIMINGS	(ARRAY_SIZE(timings))
+
+struct debugfs_timings_state {
+	struct kvm_vcpu	*vcpu;
+	unsigned int	buflen;
+	char		buf[N_TIMINGS * 100];
+};
+
+static int debugfs_timings_open(struct inode *inode, struct file *file)
+{
+	struct kvm_vcpu *vcpu = inode->i_private;
+	struct debugfs_timings_state *p;
+
+	p = kzalloc(sizeof(*p), GFP_KERNEL);
+	if (!p)
+		return -ENOMEM;
+
+	kvm_get_kvm(vcpu->kvm);
+	p->vcpu = vcpu;
+	file->private_data = p;
+
+	return nonseekable_open(inode, file);
+}
+
+static int debugfs_timings_release(struct inode *inode, struct file *file)
+{
+	struct debugfs_timings_state *p = file->private_data;
+
+	kvm_put_kvm(p->vcpu->kvm);
+	kfree(p);
+	return 0;
+}
+
+static ssize_t debugfs_timings_read(struct file *file, char __user *buf,
+				    size_t len, loff_t *ppos)
+{
+	struct debugfs_timings_state *p = file->private_data;
+	struct kvm_vcpu *vcpu = p->vcpu;
+	char *s, *buf_end;
+	struct kvmhv_tb_accumulator tb;
+	u64 count;
+	loff_t pos;
+	ssize_t n;
+	int i, loops;
+	bool ok;
+
+	if (!p->buflen) {
+		s = p->buf;
+		buf_end = s + sizeof(p->buf);
+		for (i = 0; i < N_TIMINGS; ++i) {
+			struct kvmhv_tb_accumulator *acc;
+
+			acc = (struct kvmhv_tb_accumulator *)
+				((unsigned long)vcpu + timings[i].offset);
+			ok = false;
+			for (loops = 0; loops < 1000; ++loops) {
+				count = acc->seqcount;
+				if (!(count & 1)) {
+					smp_rmb();
+					tb = *acc;
+					smp_rmb();
+					if (count == acc->seqcount) {
+						ok = true;
+						break;
+					}
+				}
+				udelay(1);
+			}
+			if (!ok)
+				snprintf(s, buf_end - s, "%s: stuck\n",
+					timings[i].name);
+			else
+				snprintf(s, buf_end - s,
+					"%s: %llu %llu %llu %llu\n",
+					timings[i].name, count / 2,
+					tb_to_ns(tb.tb_total),
+					tb_to_ns(tb.tb_min),
+					tb_to_ns(tb.tb_max));
+			s += strlen(s);
+		}
+		p->buflen = s - p->buf;
+	}
+
+	pos = *ppos;
+	if (pos >= p->buflen)
+		return 0;
+	if (len > p->buflen - pos)
+		len = p->buflen - pos;
+	n = copy_to_user(buf, p->buf + pos, len);
+	if (n) {
+		if (n == len)
+			return -EFAULT;
+		len -= n;
+	}
+	*ppos = pos + len;
+	return len;
+}
+
+static ssize_t debugfs_timings_write(struct file *file, const char __user *buf,
+				     size_t len, loff_t *ppos)
+{
+	return -EACCES;
+}
+
+static const struct file_operations debugfs_timings_ops = {
+	.owner	 = THIS_MODULE,
+	.open	 = debugfs_timings_open,
+	.release = debugfs_timings_release,
+	.read	 = debugfs_timings_read,
+	.write	 = debugfs_timings_write,
+	.llseek	 = generic_file_llseek,
+};
+
+/* Create a debugfs directory for the vcpu */
+static void debugfs_vcpu_init(struct kvm_vcpu *vcpu, unsigned int id)
+{
+	char buf[16];
+	struct kvm *kvm = vcpu->kvm;
+
+	snprintf(buf, sizeof(buf), "vcpu%u", id);
+	if (IS_ERR_OR_NULL(kvm->arch.debugfs_dir))
+		return;
+	vcpu->arch.debugfs_dir = debugfs_create_dir(buf, kvm->arch.debugfs_dir);
+	if (IS_ERR_OR_NULL(vcpu->arch.debugfs_dir))
+		return;
+	vcpu->arch.debugfs_timings =
+		debugfs_create_file("timings", 0444, vcpu->arch.debugfs_dir,
+				    vcpu, &debugfs_timings_ops);
+}
+
+#else /* CONFIG_KVM_BOOK3S_HV_EXIT_TIMING */
+static void debugfs_vcpu_init(struct kvm_vcpu *vcpu, unsigned int id)
+{
+}
+#endif /* CONFIG_KVM_BOOK3S_HV_EXIT_TIMING */
+
 static struct kvm_vcpu *kvmppc_core_vcpu_create_hv(struct kvm *kvm,
 						   unsigned int id)
 {
@@ -1479,6 +1659,8 @@ static struct kvm_vcpu *kvmppc_core_vcpu_create_hv(struct kvm *kvm,
 	vcpu->arch.cpu_type = KVM_CPU_3S_64;
 	kvmppc_sanity_check(vcpu);
 
+	debugfs_vcpu_init(vcpu, id);
+
 	return vcpu;
 
 free_vcpu:
@@ -1566,8 +1748,10 @@ static int kvmppc_grab_hwthread(int cpu)
 	tpaca = &paca[cpu];
 
 	/* Ensure the thread won't go into the kernel if it wakes */
-	tpaca->kvm_hstate.hwthread_req = 1;
 	tpaca->kvm_hstate.kvm_vcpu = NULL;
+	tpaca->kvm_hstate.napping = 0;
+	smp_wmb();
+	tpaca->kvm_hstate.hwthread_req = 1;
 
 	/*
 	 * If the thread is already executing in the kernel (e.g. handling
@@ -1610,35 +1794,41 @@ static void kvmppc_start_thread(struct kvm_vcpu *vcpu)
 	}
 	cpu = vc->pcpu + vcpu->arch.ptid;
 	tpaca = &paca[cpu];
-	tpaca->kvm_hstate.kvm_vcpu = vcpu;
 	tpaca->kvm_hstate.kvm_vcore = vc;
 	tpaca->kvm_hstate.ptid = vcpu->arch.ptid;
 	vcpu->cpu = vc->pcpu;
+	/* Order stores to hstate.kvm_vcore etc. before store to kvm_vcpu */
 	smp_wmb();
-#if defined(CONFIG_PPC_ICP_NATIVE) && defined(CONFIG_SMP)
-	if (cpu != smp_processor_id()) {
-		xics_wake_cpu(cpu);
-		if (vcpu->arch.ptid)
-			++vc->n_woken;
-	}
-#endif
+	tpaca->kvm_hstate.kvm_vcpu = vcpu;
+	if (cpu != smp_processor_id())
+		kvmppc_ipi_thread(cpu);
 }
 
-static void kvmppc_wait_for_nap(struct kvmppc_vcore *vc)
+static void kvmppc_wait_for_nap(void)
 {
-	int i;
+	int cpu = smp_processor_id();
+	int i, loops;
 
-	HMT_low();
-	i = 0;
-	while (vc->nap_count < vc->n_woken) {
-		if (++i >= 1000000) {
-			pr_err("kvmppc_wait_for_nap timeout %d %d\n",
-			       vc->nap_count, vc->n_woken);
-			break;
+	for (loops = 0; loops < 1000000; ++loops) {
+		/*
+		 * Check if all threads are finished.
+		 * We set the vcpu pointer when starting a thread
+		 * and the thread clears it when finished, so we look
+		 * for any threads that still have a non-NULL vcpu ptr.
+		 */
+		for (i = 1; i < threads_per_subcore; ++i)
+			if (paca[cpu + i].kvm_hstate.kvm_vcpu)
+				break;
+		if (i == threads_per_subcore) {
+			HMT_medium();
+			return;
 		}
-		cpu_relax();
+		HMT_low();
 	}
 	HMT_medium();
+	for (i = 1; i < threads_per_subcore; ++i)
+		if (paca[cpu + i].kvm_hstate.kvm_vcpu)
+			pr_err("KVM: CPU %d seems to be stuck\n", cpu + i);
 }
 
 /*
@@ -1700,63 +1890,103 @@ static void kvmppc_start_restoring_l2_cache(const struct kvmppc_vcore *vc)
 	mtspr(SPRN_MPPR, mpp_addr | PPC_MPPR_FETCH_WHOLE_TABLE);
 }
 
+static void prepare_threads(struct kvmppc_vcore *vc)
+{
+	struct kvm_vcpu *vcpu, *vnext;
+
+	list_for_each_entry_safe(vcpu, vnext, &vc->runnable_threads,
+				 arch.run_list) {
+		if (signal_pending(vcpu->arch.run_task))
+			vcpu->arch.ret = -EINTR;
+		else if (vcpu->arch.vpa.update_pending ||
+			 vcpu->arch.slb_shadow.update_pending ||
+			 vcpu->arch.dtl.update_pending)
+			vcpu->arch.ret = RESUME_GUEST;
+		else
+			continue;
+		kvmppc_remove_runnable(vc, vcpu);
+		wake_up(&vcpu->arch.cpu_run);
+	}
+}
+
+static void post_guest_process(struct kvmppc_vcore *vc)
+{
+	u64 now;
+	long ret;
+	struct kvm_vcpu *vcpu, *vnext;
+
+	now = get_tb();
+	list_for_each_entry_safe(vcpu, vnext, &vc->runnable_threads,
+				 arch.run_list) {
+		/* cancel pending dec exception if dec is positive */
+		if (now < vcpu->arch.dec_expires &&
+		    kvmppc_core_pending_dec(vcpu))
+			kvmppc_core_dequeue_dec(vcpu);
+
+		trace_kvm_guest_exit(vcpu);
+
+		ret = RESUME_GUEST;
+		if (vcpu->arch.trap)
+			ret = kvmppc_handle_exit_hv(vcpu->arch.kvm_run, vcpu,
+						    vcpu->arch.run_task);
+
+		vcpu->arch.ret = ret;
+		vcpu->arch.trap = 0;
+
+		if (vcpu->arch.ceded) {
+			if (!is_kvmppc_resume_guest(ret))
+				kvmppc_end_cede(vcpu);
+			else
+				kvmppc_set_timer(vcpu);
+		}
+		if (!is_kvmppc_resume_guest(vcpu->arch.ret)) {
+			kvmppc_remove_runnable(vc, vcpu);
+			wake_up(&vcpu->arch.cpu_run);
+		}
+	}
+}
+
 /*
  * Run a set of guest threads on a physical core.
  * Called with vc->lock held.
  */
-static void kvmppc_run_core(struct kvmppc_vcore *vc)
+static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
 {
-	struct kvm_vcpu *vcpu, *vnext;
-	long ret;
-	u64 now;
-	int i, need_vpa_update;
+	struct kvm_vcpu *vcpu, *vnext;
+	int i;
 	int srcu_idx;
-	struct kvm_vcpu *vcpus_to_update[threads_per_core];
 
-	/* don't start if any threads have a signal pending */
-	need_vpa_update = 0;
-	list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) {
-		if (signal_pending(vcpu->arch.run_task))
-			return;
-		if (vcpu->arch.vpa.update_pending ||
-		    vcpu->arch.slb_shadow.update_pending ||
-		    vcpu->arch.dtl.update_pending)
-			vcpus_to_update[need_vpa_update++] = vcpu;
-	}
+	/*
+	 * Remove from the list any threads that have a signal pending
+	 * or need a VPA update done
+	 */
+	prepare_threads(vc);
+
+	/* if the runner is no longer runnable, let the caller pick a new one */
+	if (vc->runner->arch.state != KVMPPC_VCPU_RUNNABLE)
+		return;
 
 	/*
-	 * Initialize *vc, in particular vc->vcore_state, so we can
-	 * drop the vcore lock if necessary.
+	 * Initialize *vc.
 	 */
-	vc->n_woken = 0;
-	vc->nap_count = 0;
-	vc->entry_exit_count = 0;
+	vc->entry_exit_map = 0;
 	vc->preempt_tb = TB_NIL;
-	vc->vcore_state = VCORE_STARTING;
 	vc->in_guest = 0;
 	vc->napping_threads = 0;
 	vc->conferring_threads = 0;
 
 	/*
-	 * Updating any of the vpas requires calling kvmppc_pin_guest_page,
-	 * which can't be called with any spinlocks held.
-	 */
-	if (need_vpa_update) {
-		spin_unlock(&vc->lock);
-		for (i = 0; i < need_vpa_update; ++i)
-			kvmppc_update_vpas(vcpus_to_update[i]);
-		spin_lock(&vc->lock);
-	}
-
-	/*
 	 * Make sure we are running on primary threads, and that secondary
 	 * threads are offline.  Also check if the number of threads in this
 	 * guest are greater than the current system threads per guest.
 	 */
 	if ((threads_per_core > 1) &&
 	    ((vc->num_threads > threads_per_subcore) || !on_primary_thread())) {
-		list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list)
+		list_for_each_entry_safe(vcpu, vnext, &vc->runnable_threads, arch.run_list) {
 			vcpu->arch.ret = -EBUSY;
+			kvmppc_remove_runnable(vc, vcpu);
+			wake_up(&vcpu->arch.cpu_run);
+		}
 		goto out;
 	}
 
@@ -1797,8 +2027,7 @@ static void kvmppc_run_core(struct kvmppc_vcore *vc)
 	list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list)
 		vcpu->cpu = -1;
 	/* wait for secondary threads to finish writing their state to memory */
-	if (vc->nap_count < vc->n_woken)
-		kvmppc_wait_for_nap(vc);
+	kvmppc_wait_for_nap();
 	for (i = 0; i < threads_per_subcore; ++i)
 		kvmppc_release_hwthread(vc->pcpu + i);
 	/* prevent other vcpu threads from doing kvmppc_start_thread() now */
@@ -1812,44 +2041,12 @@ static void kvmppc_run_core(struct kvmppc_vcore *vc)
 	kvm_guest_exit();
 
 	preempt_enable();
-	cond_resched();
 
 	spin_lock(&vc->lock);
-	now = get_tb();
-	list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) {
-		/* cancel pending dec exception if dec is positive */
-		if (now < vcpu->arch.dec_expires &&
-		    kvmppc_core_pending_dec(vcpu))
-			kvmppc_core_dequeue_dec(vcpu);
-
-		trace_kvm_guest_exit(vcpu);
-
-		ret = RESUME_GUEST;
-		if (vcpu->arch.trap)
-			ret = kvmppc_handle_exit_hv(vcpu->arch.kvm_run, vcpu,
-						    vcpu->arch.run_task);
-
-		vcpu->arch.ret = ret;
-		vcpu->arch.trap = 0;
-
-		if (vcpu->arch.ceded) {
-			if (!is_kvmppc_resume_guest(ret))
-				kvmppc_end_cede(vcpu);
-			else
-				kvmppc_set_timer(vcpu);
-		}
-	}
+	post_guest_process(vc);
 
  out:
 	vc->vcore_state = VCORE_INACTIVE;
-	list_for_each_entry_safe(vcpu, vnext, &vc->runnable_threads,
-				 arch.run_list) {
-		if (!is_kvmppc_resume_guest(vcpu->arch.ret)) {
-			kvmppc_remove_runnable(vc, vcpu);
-			wake_up(&vcpu->arch.cpu_run);
-		}
-	}
-
 	trace_kvmppc_run_core(vc, 1);
 }
 
@@ -1939,8 +2136,7 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 	 * this thread straight away and have it join in.
 	 */
 	if (!signal_pending(current)) {
-		if (vc->vcore_state == VCORE_RUNNING &&
-		    VCORE_EXIT_COUNT(vc) == 0) {
+		if (vc->vcore_state == VCORE_RUNNING && !VCORE_IS_EXITING(vc)) {
 			kvmppc_create_dtl_entry(vcpu, vc);
 			kvmppc_start_thread(vcpu);
 			trace_kvm_guest_enter(vcpu);
@@ -1971,7 +2167,6 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 		}
 		if (!vc->n_runnable || vcpu->arch.state != KVMPPC_VCPU_RUNNABLE)
 			break;
-		vc->runner = vcpu;
 		n_ceded = 0;
 		list_for_each_entry(v, &vc->runnable_threads, arch.run_list) {
 			if (!v->arch.pending_exceptions)
@@ -1979,10 +2174,17 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 			else
 				v->arch.ceded = 0;
 		}
-		if (n_ceded == vc->n_runnable)
+		vc->runner = vcpu;
+		if (n_ceded == vc->n_runnable) {
 			kvmppc_vcore_blocked(vc);
-		else
+		} else if (should_resched()) {
+			vc->vcore_state = VCORE_PREEMPT;
+			/* Let something else run */
+			cond_resched_lock(&vc->lock);
+			vc->vcore_state = VCORE_INACTIVE;
+		} else {
 			kvmppc_run_core(vc);
+		}
 		vc->runner = NULL;
 	}
 
@@ -2032,11 +2234,11 @@ static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu)
 	}
 
 	atomic_inc(&vcpu->kvm->arch.vcpus_running);
-	/* Order vcpus_running vs. rma_setup_done, see kvmppc_alloc_reset_hpt */
+	/* Order vcpus_running vs. hpte_setup_done, see kvmppc_alloc_reset_hpt */
 	smp_mb();
 
 	/* On the first time here, set up HTAB and VRMA */
-	if (!vcpu->kvm->arch.rma_setup_done) {
+	if (!vcpu->kvm->arch.hpte_setup_done) {
 		r = kvmppc_hv_setup_htab_rma(vcpu);
 		if (r)
 			goto out;
@@ -2238,7 +2440,7 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu)
 	int srcu_idx;
 
 	mutex_lock(&kvm->lock);
-	if (kvm->arch.rma_setup_done)
+	if (kvm->arch.hpte_setup_done)
 		goto out;	/* another vcpu beat us to it */
 
 	/* Allocate hashed page table (if not done already) and reset it */
@@ -2289,9 +2491,9 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu)
 
 	kvmppc_update_lpcr(kvm, lpcr, LPCR_VRMASD);
 
-	/* Order updates to kvm->arch.lpcr etc. vs. rma_setup_done */
+	/* Order updates to kvm->arch.lpcr etc. vs. hpte_setup_done */
 	smp_wmb();
-	kvm->arch.rma_setup_done = 1;
+	kvm->arch.hpte_setup_done = 1;
 	err = 0;
  out_srcu:
 	srcu_read_unlock(&kvm->srcu, srcu_idx);
@@ -2307,6 +2509,7 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu)
 static int kvmppc_core_init_vm_hv(struct kvm *kvm)
 {
 	unsigned long lpcr, lpid;
+	char buf[32];
 
 	/* Allocate the guest's logical partition ID */
 
@@ -2347,6 +2550,14 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm)
 	 */
 	kvm_hv_vm_activated();
 
+	/*
+	 * Create a debugfs directory for the VM
+	 */
+	snprintf(buf, sizeof(buf), "vm%d", current->pid);
+	kvm->arch.debugfs_dir = debugfs_create_dir(buf, kvm_debugfs_dir);
+	if (!IS_ERR_OR_NULL(kvm->arch.debugfs_dir))
+		kvmppc_mmu_debugfs_init(kvm);
+
 	return 0;
 }
 
@@ -2367,6 +2578,8 @@ static void kvmppc_free_vcores(struct kvm *kvm)
 
 static void kvmppc_core_destroy_vm_hv(struct kvm *kvm)
 {
+	debugfs_remove_recursive(kvm->arch.debugfs_dir);
+
 	kvm_hv_vm_deactivated();
 
 	kvmppc_free_vcores(kvm);
diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c
index 1f083ff8a61a..ed2589d4593f 100644
--- a/arch/powerpc/kvm/book3s_hv_builtin.c
+++ b/arch/powerpc/kvm/book3s_hv_builtin.c
@@ -21,6 +21,10 @@
 #include <asm/cputable.h>
 #include <asm/kvm_ppc.h>
 #include <asm/kvm_book3s.h>
+#include <asm/archrandom.h>
+#include <asm/xics.h>
+#include <asm/dbell.h>
+#include <asm/cputhreads.h>
 
 #define KVM_CMA_CHUNK_ORDER	18
 
@@ -114,11 +118,11 @@ long int kvmppc_rm_h_confer(struct kvm_vcpu *vcpu, int target,
 	int rv = H_SUCCESS; /* => don't yield */
 
 	set_bit(vcpu->arch.ptid, &vc->conferring_threads);
-	while ((get_tb() < stop) && (VCORE_EXIT_COUNT(vc) == 0)) {
-		threads_running = VCORE_ENTRY_COUNT(vc);
-		threads_ceded = hweight32(vc->napping_threads);
-		threads_conferring = hweight32(vc->conferring_threads);
-		if (threads_ceded + threads_conferring >= threads_running) {
+	while ((get_tb() < stop) && !VCORE_IS_EXITING(vc)) {
+		threads_running = VCORE_ENTRY_MAP(vc);
+		threads_ceded = vc->napping_threads;
+		threads_conferring = vc->conferring_threads;
+		if ((threads_ceded | threads_conferring) == threads_running) {
 			rv = H_TOO_HARD; /* => do yield */
 			break;
 		}
@@ -169,3 +173,89 @@ int kvmppc_hcall_impl_hv_realmode(unsigned long cmd)
 	return 0;
 }
 EXPORT_SYMBOL_GPL(kvmppc_hcall_impl_hv_realmode);
+
+int kvmppc_hwrng_present(void)
+{
+	return powernv_hwrng_present();
+}
+EXPORT_SYMBOL_GPL(kvmppc_hwrng_present);
+
+long kvmppc_h_random(struct kvm_vcpu *vcpu)
+{
+	if (powernv_get_random_real_mode(&vcpu->arch.gpr[4]))
+		return H_SUCCESS;
+
+	return H_HARDWARE;
+}
+
+static inline void rm_writeb(unsigned long paddr, u8 val)
+{
+	__asm__ __volatile__("stbcix %0,0,%1"
+		: : "r" (val), "r" (paddr) : "memory");
+}
+
+/*
+ * Send an interrupt or message to another CPU.
+ * This can only be called in real mode.
+ * The caller needs to include any barrier needed to order writes
+ * to memory vs. the IPI/message.
+ */
+void kvmhv_rm_send_ipi(int cpu)
+{
+	unsigned long xics_phys;
+
+	/* On POWER8 for IPIs to threads in the same core, use msgsnd */
+	if (cpu_has_feature(CPU_FTR_ARCH_207S) &&
+	    cpu_first_thread_sibling(cpu) ==
+	    cpu_first_thread_sibling(raw_smp_processor_id())) {
+		unsigned long msg = PPC_DBELL_TYPE(PPC_DBELL_SERVER);
+		msg |= cpu_thread_in_core(cpu);
+		__asm__ __volatile__ (PPC_MSGSND(%0) : : "r" (msg));
+		return;
+	}
+
+	/* Else poke the target with an IPI */
+	xics_phys = paca[cpu].kvm_hstate.xics_phys;
+	rm_writeb(xics_phys + XICS_MFRR, IPI_PRIORITY);
+}
+
+/*
+ * The following functions are called from the assembly code
+ * in book3s_hv_rmhandlers.S.
+ */
+static void kvmhv_interrupt_vcore(struct kvmppc_vcore *vc, int active)
+{
+	int cpu = vc->pcpu;
+
+	/* Order setting of exit map vs. msgsnd/IPI */
+	smp_mb();
+	for (; active; active >>= 1, ++cpu)
+		if (active & 1)
+			kvmhv_rm_send_ipi(cpu);
+}
+
+void kvmhv_commence_exit(int trap)
+{
+	struct kvmppc_vcore *vc = local_paca->kvm_hstate.kvm_vcore;
+	int ptid = local_paca->kvm_hstate.ptid;
+	int me, ee;
+
+	/* Set our bit in the threads-exiting-guest map in the 0xff00
+	   bits of vcore->entry_exit_map */
+	me = 0x100 << ptid;
+	do {
+		ee = vc->entry_exit_map;
+	} while (cmpxchg(&vc->entry_exit_map, ee, ee | me) != ee);
+
+	/* Are we the first here? */
+	if ((ee >> 8) != 0)
+		return;
+
+	/*
+	 * Trigger the other threads in this vcore to exit the guest.
+	 * If this is a hypervisor decrementer interrupt then they
+	 * will be already on their way out of the guest.
+	 */
+	if (trap != BOOK3S_INTERRUPT_HV_DECREMENTER)
+		kvmhv_interrupt_vcore(vc, ee & ~(1 << ptid));
+}
diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
index 625407e4d3b0..f6bf0b1de6d7 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
@@ -150,12 +150,6 @@ static pte_t lookup_linux_pte_and_update(pgd_t *pgdir, unsigned long hva,
 	return kvmppc_read_update_linux_pte(ptep, writing, hugepage_shift);
 }
 
-static inline void unlock_hpte(__be64 *hpte, unsigned long hpte_v)
-{
-	asm volatile(PPC_RELEASE_BARRIER "" : : : "memory");
-	hpte[0] = cpu_to_be64(hpte_v);
-}
-
 long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
 		       long pte_index, unsigned long pteh, unsigned long ptel,
 		       pgd_t *pgdir, bool realmode, unsigned long *pte_idx_ret)
@@ -271,10 +265,10 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
 				u64 pte;
 				while (!try_lock_hpte(hpte, HPTE_V_HVLOCK))
 					cpu_relax();
-				pte = be64_to_cpu(*hpte);
+				pte = be64_to_cpu(hpte[0]);
 				if (!(pte & (HPTE_V_VALID | HPTE_V_ABSENT)))
 					break;
-				*hpte &= ~cpu_to_be64(HPTE_V_HVLOCK);
+				__unlock_hpte(hpte, pte);
 				hpte += 2;
 			}
 			if (i == 8)
@@ -290,9 +284,9 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
 
 			while (!try_lock_hpte(hpte, HPTE_V_HVLOCK))
 				cpu_relax();
-			pte = be64_to_cpu(*hpte);
+			pte = be64_to_cpu(hpte[0]);
 			if (pte & (HPTE_V_VALID | HPTE_V_ABSENT)) {
-				*hpte &= ~cpu_to_be64(HPTE_V_HVLOCK);
+				__unlock_hpte(hpte, pte);
 				return H_PTEG_FULL;
 			}
 		}
@@ -331,7 +325,7 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
 
 	/* Write the first HPTE dword, unlocking the HPTE and making it valid */
 	eieio();
-	hpte[0] = cpu_to_be64(pteh);
+	__unlock_hpte(hpte, pteh);
 	asm volatile("ptesync" : : : "memory");
 
 	*pte_idx_ret = pte_index;
@@ -412,7 +406,7 @@ long kvmppc_do_h_remove(struct kvm *kvm, unsigned long flags,
 	if ((pte & (HPTE_V_ABSENT | HPTE_V_VALID)) == 0 ||
 	    ((flags & H_AVPN) && (pte & ~0x7fUL) != avpn) ||
 	    ((flags & H_ANDCOND) && (pte & avpn) != 0)) {
-		hpte[0] &= ~cpu_to_be64(HPTE_V_HVLOCK);
+		__unlock_hpte(hpte, pte);
 		return H_NOT_FOUND;
 	}
 
@@ -548,7 +542,7 @@ long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu)
 				be64_to_cpu(hp[0]), be64_to_cpu(hp[1]));
 			rcbits = rev->guest_rpte & (HPTE_R_R|HPTE_R_C);
 			args[j] |= rcbits << (56 - 5);
-			hp[0] = 0;
+			__unlock_hpte(hp, 0);
 		}
 	}
 
@@ -574,7 +568,7 @@ long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags,
 	pte = be64_to_cpu(hpte[0]);
 	if ((pte & (HPTE_V_ABSENT | HPTE_V_VALID)) == 0 ||
 	    ((flags & H_AVPN) && (pte & ~0x7fUL) != avpn)) {
-		hpte[0] &= ~cpu_to_be64(HPTE_V_HVLOCK);
+		__unlock_hpte(hpte, pte);
 		return H_NOT_FOUND;
 	}
 
@@ -755,8 +749,7 @@ long kvmppc_hv_find_lock_hpte(struct kvm *kvm, gva_t eaddr, unsigned long slb_v,
 				/* Return with the HPTE still locked */
 				return (hash << 3) + (i >> 1);
 
-			/* Unlock and move on */
-			hpte[i] = cpu_to_be64(v);
+			__unlock_hpte(&hpte[i], v);
 		}
 
 		if (val & HPTE_V_SECONDARY)
diff --git a/arch/powerpc/kvm/book3s_hv_rm_xics.c b/arch/powerpc/kvm/book3s_hv_rm_xics.c
index 7c22997de906..00e45b6d4f24 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_xics.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_xics.c
@@ -23,17 +23,37 @@
 
 #define DEBUG_PASSUP
 
-static inline void rm_writeb(unsigned long paddr, u8 val)
+static void icp_rm_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
+			    u32 new_irq);
+
+/* -- ICS routines -- */
+static void ics_rm_check_resend(struct kvmppc_xics *xics,
+				struct kvmppc_ics *ics, struct kvmppc_icp *icp)
 {
-	__asm__ __volatile__("sync; stbcix %0,0,%1"
-		: : "r" (val), "r" (paddr) : "memory");
+	int i;
+
+	arch_spin_lock(&ics->lock);
+
+	for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) {
+		struct ics_irq_state *state = &ics->irq_state[i];
+
+		if (!state->resend)
+			continue;
+
+		arch_spin_unlock(&ics->lock);
+		icp_rm_deliver_irq(xics, icp, state->number);
+		arch_spin_lock(&ics->lock);
+	}
+
+	arch_spin_unlock(&ics->lock);
 }
 
+/* -- ICP routines -- */
+
 static void icp_rm_set_vcpu_irq(struct kvm_vcpu *vcpu,
 				struct kvm_vcpu *this_vcpu)
 {
 	struct kvmppc_icp *this_icp = this_vcpu->arch.icp;
-	unsigned long xics_phys;
 	int cpu;
 
 	/* Mark the target VCPU as having an interrupt pending */
@@ -56,9 +76,8 @@ static void icp_rm_set_vcpu_irq(struct kvm_vcpu *vcpu,
 	/* In SMT cpu will always point to thread 0, we adjust it */
 	cpu += vcpu->arch.ptid;
 
-	/* Not too hard, then poke the target */
-	xics_phys = paca[cpu].kvm_hstate.xics_phys;
-	rm_writeb(xics_phys + XICS_MFRR, IPI_PRIORITY);
+	smp_mb();
+	kvmhv_rm_send_ipi(cpu);
 }
 
 static void icp_rm_clr_vcpu_irq(struct kvm_vcpu *vcpu)
@@ -116,6 +135,180 @@ static inline int check_too_hard(struct kvmppc_xics *xics,
 	return (xics->real_mode_dbg || icp->rm_action) ? H_TOO_HARD : H_SUCCESS;
 }
 
+static void icp_rm_check_resend(struct kvmppc_xics *xics,
+			     struct kvmppc_icp *icp)
+{
+	u32 icsid;
+
+	/* Order this load with the test for need_resend in the caller */
+	smp_rmb();
+	for_each_set_bit(icsid, icp->resend_map, xics->max_icsid + 1) {
+		struct kvmppc_ics *ics = xics->ics[icsid];
+
+		if (!test_and_clear_bit(icsid, icp->resend_map))
+			continue;
+		if (!ics)
+			continue;
+		ics_rm_check_resend(xics, ics, icp);
+	}
+}
+
+static bool icp_rm_try_to_deliver(struct kvmppc_icp *icp, u32 irq, u8 priority,
+			       u32 *reject)
+{
+	union kvmppc_icp_state old_state, new_state;
+	bool success;
+
+	do {
+		old_state = new_state = READ_ONCE(icp->state);
+
+		*reject = 0;
+
+		/* See if we can deliver */
+		success = new_state.cppr > priority &&
+			new_state.mfrr > priority &&
+			new_state.pending_pri > priority;
+
+		/*
+		 * If we can, check for a rejection and perform the
+		 * delivery
+		 */
+		if (success) {
+			*reject = new_state.xisr;
+			new_state.xisr = irq;
+			new_state.pending_pri = priority;
+		} else {
+			/*
+			 * If we failed to deliver we set need_resend
+			 * so a subsequent CPPR state change causes us
+			 * to try a new delivery.
+			 */
+			new_state.need_resend = true;
+		}
+
+	} while (!icp_rm_try_update(icp, old_state, new_state));
+
+	return success;
+}
+
+static void icp_rm_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
+			    u32 new_irq)
+{
+	struct ics_irq_state *state;
+	struct kvmppc_ics *ics;
+	u32 reject;
+	u16 src;
+
+	/*
+	 * This is used both for initial delivery of an interrupt and
+	 * for subsequent rejection.
+	 *
+	 * Rejection can be racy vs. resends. We have evaluated the
+	 * rejection in an atomic ICP transaction which is now complete,
+	 * so potentially the ICP can already accept the interrupt again.
+	 *
+	 * So we need to retry the delivery. Essentially the reject path
+	 * boils down to a failed delivery. Always.
+	 *
+	 * Now the interrupt could also have moved to a different target,
+	 * thus we may need to re-do the ICP lookup as well
+	 */
+
+ again:
+	/* Get the ICS state and lock it */
+	ics = kvmppc_xics_find_ics(xics, new_irq, &src);
+	if (!ics) {
+		/* Unsafe increment, but this does not need to be accurate */
+		xics->err_noics++;
+		return;
+	}
+	state = &ics->irq_state[src];
+
+	/* Get a lock on the ICS */
+	arch_spin_lock(&ics->lock);
+
+	/* Get our server */
+	if (!icp || state->server != icp->server_num) {
+		icp = kvmppc_xics_find_server(xics->kvm, state->server);
+		if (!icp) {
+			/* Unsafe increment again */
+			xics->err_noicp++;
+			goto out;
+		}
+	}
+
+	/* Clear the resend bit of that interrupt */
+	state->resend = 0;
+
+	/*
+	 * If masked, bail out
+	 *
+	 * Note: PAPR doesn't mention anything about masked pending
+	 * when doing a resend, only when doing a delivery.
+	 *
+	 * However that would have the effect of losing a masked
+	 * interrupt that was rejected and isn't consistent with
+	 * the whole masked_pending business which is about not
+	 * losing interrupts that occur while masked.
+	 *
+	 * I don't differentiate normal deliveries and resends, this
+	 * implementation will differ from PAPR and not lose such
+	 * interrupts.
+	 */
+	if (state->priority == MASKED) {
+		state->masked_pending = 1;
+		goto out;
+	}
+
+	/*
+	 * Try the delivery, this will set the need_resend flag
+	 * in the ICP as part of the atomic transaction if the
+	 * delivery is not possible.
+	 *
+	 * Note that if successful, the new delivery might have itself
+	 * rejected an interrupt that was "delivered" before we took the
+	 * ics spin lock.
+	 *
+	 * In this case we do the whole sequence all over again for the
+	 * new guy. We cannot assume that the rejected interrupt is less
+	 * favored than the new one, and thus doesn't need to be delivered,
+	 * because by the time we exit icp_rm_try_to_deliver() the target
+	 * processor may well have already consumed & completed it, and thus
+	 * the rejected interrupt might actually be already acceptable.
+	 */
+	if (icp_rm_try_to_deliver(icp, new_irq, state->priority, &reject)) {
+		/*
+		 * Delivery was successful, did we reject somebody else ?
+		 */
+		if (reject && reject != XICS_IPI) {
+			arch_spin_unlock(&ics->lock);
+			new_irq = reject;
+			goto again;
+		}
+	} else {
+		/*
+		 * We failed to deliver the interrupt, so we need to set the
+		 * resend map bit and mark the ICS state as needing a resend.
+		 */
+		set_bit(ics->icsid, icp->resend_map);
+		state->resend = 1;
+
+		/*
+		 * If the need_resend flag got cleared in the ICP some time
+		 * between icp_rm_try_to_deliver() atomic update and now, then
+		 * we know it might have missed the resend_map bit. So we
+		 * retry
+		 */
+		smp_mb();
+		if (!icp->state.need_resend) {
+			arch_spin_unlock(&ics->lock);
+			goto again;
+		}
+	}
+ out:
+	arch_spin_unlock(&ics->lock);
+}
+
 static void icp_rm_down_cppr(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
 			     u8 new_cppr)
 {
@@ -184,8 +377,8 @@ static void icp_rm_down_cppr(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
 	 * separately here as well.
 	 */
 	if (resend) {
-		icp->rm_action |= XICS_RM_CHECK_RESEND;
-		icp->rm_resend_icp = icp;
+		icp->n_check_resend++;
+		icp_rm_check_resend(xics, icp);
 	}
 }
 
@@ -300,16 +493,16 @@ int kvmppc_rm_h_ipi(struct kvm_vcpu *vcpu, unsigned long server,
 		}
 	} while (!icp_rm_try_update(icp, old_state, new_state));
 
-	/* Pass rejects to virtual mode */
+	/* Handle reject in real mode */
 	if (reject && reject != XICS_IPI) {
-		this_icp->rm_action |= XICS_RM_REJECT;
-		this_icp->rm_reject = reject;
+		this_icp->n_reject++;
+		icp_rm_deliver_irq(xics, icp, reject);
 	}
 
-	/* Pass resends to virtual mode */
+	/* Handle resends in real mode */
 	if (resend) {
-		this_icp->rm_action |= XICS_RM_CHECK_RESEND;
-		this_icp->rm_resend_icp = icp;
+		this_icp->n_check_resend++;
+		icp_rm_check_resend(xics, icp);
 	}
 
 	return check_too_hard(xics, this_icp);
@@ -365,10 +558,13 @@ int kvmppc_rm_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr)
 
 	} while (!icp_rm_try_update(icp, old_state, new_state));
 
-	/* Pass rejects to virtual mode */
+	/*
+	 * Check for rejects. They are handled by doing a new delivery
+	 * attempt (see comments in icp_rm_deliver_irq).
+	 */
 	if (reject && reject != XICS_IPI) {
-		icp->rm_action |= XICS_RM_REJECT;
-		icp->rm_reject = reject;
+		icp->n_reject++;
+		icp_rm_deliver_irq(xics, icp, reject);
 	}
  bail:
 	return check_too_hard(xics, icp);
@@ -416,10 +612,10 @@ int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr)
 		goto bail;
 	state = &ics->irq_state[src];
 
-	/* Still asserted, resend it, we make it look like a reject */
+	/* Still asserted, resend it */
 	if (state->asserted) {
-		icp->rm_action |= XICS_RM_REJECT;
-		icp->rm_reject = irq;
+		icp->n_reject++;
+		icp_rm_deliver_irq(xics, icp, irq);
 	}
 
 	if (!hlist_empty(&vcpu->kvm->irq_ack_notifier_list)) {
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index 6cbf1630cb70..4d70df26c402 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -172,6 +172,22 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
 
 kvmppc_primary_no_guest:
 	/* We handle this much like a ceded vcpu */
+	/* put the HDEC into the DEC, since HDEC interrupts don't wake us */
+	mfspr	r3, SPRN_HDEC
+	mtspr	SPRN_DEC, r3
+	/*
+	 * Make sure the primary has finished the MMU switch.
+	 * We should never get here on a secondary thread, but
+	 * check it for robustness' sake.
+	 */
+	ld	r5, HSTATE_KVM_VCORE(r13)
+65:	lbz	r0, VCORE_IN_GUEST(r5)
+	cmpwi	r0, 0
+	beq	65b
+	/* Set LPCR. */
+	ld	r8,VCORE_LPCR(r5)
+	mtspr	SPRN_LPCR,r8
+	isync
 	/* set our bit in napping_threads */
 	ld	r5, HSTATE_KVM_VCORE(r13)
 	lbz	r7, HSTATE_PTID(r13)
@@ -182,7 +198,7 @@ kvmppc_primary_no_guest:
 	or	r3, r3, r0
 	stwcx.	r3, 0, r6
 	bne	1b
-	/* order napping_threads update vs testing entry_exit_count */
+	/* order napping_threads update vs testing entry_exit_map */
 	isync
 	li	r12, 0
 	lwz	r7, VCORE_ENTRY_EXIT(r5)
@@ -191,6 +207,7 @@ kvmppc_primary_no_guest:
 	li	r3, NAPPING_NOVCPU
 	stb	r3, HSTATE_NAPPING(r13)
 
+	li	r3, 0		/* Don't wake on privileged (OS) doorbell */
 	b	kvm_do_nap
 
 kvm_novcpu_wakeup:
@@ -202,7 +219,7 @@ kvm_novcpu_wakeup:
 
 	/* check the wake reason */
 	bl	kvmppc_check_wake_reason
-	
+
 	/* see if any other thread is already exiting */
 	lwz	r0, VCORE_ENTRY_EXIT(r5)
 	cmpwi	r0, 0x100
@@ -222,13 +239,37 @@ kvm_novcpu_wakeup:
 	cmpdi	r3, 0
 	bge	kvm_novcpu_exit
 
+	/* See if our timeslice has expired (HDEC is negative) */
+	mfspr	r0, SPRN_HDEC
+	li	r12, BOOK3S_INTERRUPT_HV_DECREMENTER
+	cmpwi	r0, 0
+	blt	kvm_novcpu_exit
+
 	/* Got an IPI but other vcpus aren't yet exiting, must be a latecomer */
 	ld	r4, HSTATE_KVM_VCPU(r13)
 	cmpdi	r4, 0
-	bne	kvmppc_got_guest
+	beq	kvmppc_primary_no_guest
+
+#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
+	addi	r3, r4, VCPU_TB_RMENTRY
+	bl	kvmhv_start_timing
+#endif
+	b	kvmppc_got_guest
 
 kvm_novcpu_exit:
-	b	hdec_soon
+#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
+	ld	r4, HSTATE_KVM_VCPU(r13)
+	cmpdi	r4, 0
+	beq	13f
+	addi	r3, r4, VCPU_TB_RMEXIT
+	bl	kvmhv_accumulate_time
+#endif
+13:	mr	r3, r12
+	stw	r12, 112-4(r1)
+	bl	kvmhv_commence_exit
+	nop
+	lwz	r12, 112-4(r1)
+	b	kvmhv_switch_to_host
 
 /*
  * We come in here when wakened from nap mode.
@@ -239,9 +280,9 @@ kvm_novcpu_exit:
 kvm_start_guest:
 
 	/* Set runlatch bit the minute you wake up from nap */
-	mfspr	r1, SPRN_CTRLF
-	ori 	r1, r1, 1
-	mtspr	SPRN_CTRLT, r1
+	mfspr	r0, SPRN_CTRLF
+	ori 	r0, r0, 1
+	mtspr	SPRN_CTRLT, r0
 
 	ld	r2,PACATOC(r13)
 
@@ -286,26 +327,21 @@ kvm_secondary_got_guest:
 	ld	r6, PACA_DSCR(r13)
 	std	r6, HSTATE_DSCR(r13)
 
+	/* Order load of vcore, ptid etc. after load of vcpu */
+	lwsync
 	bl	kvmppc_hv_entry
 
 	/* Back from the guest, go back to nap */
 	/* Clear our vcpu pointer so we don't come back in early */
 	li	r0, 0
-	std	r0, HSTATE_KVM_VCPU(r13)
 	/*
-	 * Make sure we clear HSTATE_KVM_VCPU(r13) before incrementing
-	 * the nap_count, because once the increment to nap_count is
-	 * visible we could be given another vcpu.
+	 * Once we clear HSTATE_KVM_VCPU(r13), the code in
+	 * kvmppc_run_core() is going to assume that all our vcpu
+	 * state is visible in memory.  This lwsync makes sure
+	 * that that is true.
 	 */
 	lwsync
-
-	/* increment the nap count and then go to nap mode */
-	ld	r4, HSTATE_KVM_VCORE(r13)
-	addi	r4, r4, VCORE_NAP_COUNT
-51:	lwarx	r3, 0, r4
-	addi	r3, r3, 1
-	stwcx.	r3, 0, r4
-	bne	51b
+	std	r0, HSTATE_KVM_VCPU(r13)
 
 /*
  * At this point we have finished executing in the guest.
@@ -376,6 +412,14 @@ kvmppc_hv_entry:
 	li	r6, KVM_GUEST_MODE_HOST_HV
 	stb	r6, HSTATE_IN_GUEST(r13)
 
+#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
+	/* Store initial timestamp */
+	cmpdi	r4, 0
+	beq	1f
+	addi	r3, r4, VCPU_TB_RMENTRY
+	bl	kvmhv_start_timing
+1:
+#endif
 	/* Clear out SLB */
 	li	r6,0
 	slbmte	r6,r6
@@ -387,21 +431,23 @@ kvmppc_hv_entry:
 	 * We don't have to lock against concurrent tlbies,
 	 * but we do have to coordinate across hardware threads.
 	 */
-	/* Increment entry count iff exit count is zero. */
-	ld	r5,HSTATE_KVM_VCORE(r13)
-	addi	r9,r5,VCORE_ENTRY_EXIT
-21:	lwarx	r3,0,r9
-	cmpwi	r3,0x100		/* any threads starting to exit? */
+	/* Set bit in entry map iff exit map is zero. */
+	ld	r5, HSTATE_KVM_VCORE(r13)
+	li	r7, 1
+	lbz	r6, HSTATE_PTID(r13)
+	sld	r7, r7, r6
+	addi	r9, r5, VCORE_ENTRY_EXIT
+21:	lwarx	r3, 0, r9
+	cmpwi	r3, 0x100		/* any threads starting to exit? */
 	bge	secondary_too_late	/* if so we're too late to the party */
-	addi	r3,r3,1
-	stwcx.	r3,0,r9
+	or	r3, r3, r7
+	stwcx.	r3, 0, r9
 	bne	21b
 
 	/* Primary thread switches to guest partition. */
 	ld	r9,VCORE_KVM(r5)	/* pointer to struct kvm */
-	lbz	r6,HSTATE_PTID(r13)
 	cmpwi	r6,0
-	bne	20f
+	bne	10f
 	ld	r6,KVM_SDR1(r9)
 	lwz	r7,KVM_LPID(r9)
 	li	r0,LPID_RSVD		/* switch to reserved LPID */
@@ -472,28 +518,9 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
 
 	li	r0,1
 	stb	r0,VCORE_IN_GUEST(r5)	/* signal secondaries to continue */
-	b	10f
-
-	/* Secondary threads wait for primary to have done partition switch */
-20:	lbz	r0,VCORE_IN_GUEST(r5)
-	cmpwi	r0,0
-	beq	20b
-
-	/* Set LPCR and RMOR. */
-10:	ld	r8,VCORE_LPCR(r5)
-	mtspr	SPRN_LPCR,r8
-	ld	r8,KVM_RMOR(r9)
-	mtspr	SPRN_RMOR,r8
-	isync
-
-	/* Check if HDEC expires soon */
-	mfspr	r3,SPRN_HDEC
-	cmpwi	r3,512		/* 1 microsecond */
-	li	r12,BOOK3S_INTERRUPT_HV_DECREMENTER
-	blt	hdec_soon
 
 	/* Do we have a guest vcpu to run? */
-	cmpdi	r4, 0
+10:	cmpdi	r4, 0
 	beq	kvmppc_primary_no_guest
 kvmppc_got_guest:
 
@@ -818,6 +845,30 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
 	clrrdi	r6,r6,1
 	mtspr	SPRN_CTRLT,r6
 4:
+	/* Secondary threads wait for primary to have done partition switch */
+	ld	r5, HSTATE_KVM_VCORE(r13)
+	lbz	r6, HSTATE_PTID(r13)
+	cmpwi	r6, 0
+	beq	21f
+	lbz	r0, VCORE_IN_GUEST(r5)
+	cmpwi	r0, 0
+	bne	21f
+	HMT_LOW
+20:	lbz	r0, VCORE_IN_GUEST(r5)
+	cmpwi	r0, 0
+	beq	20b
+	HMT_MEDIUM
+21:
+	/* Set LPCR. */
+	ld	r8,VCORE_LPCR(r5)
+	mtspr	SPRN_LPCR,r8
+	isync
+
+	/* Check if HDEC expires soon */
+	mfspr	r3, SPRN_HDEC
+	cmpwi	r3, 512		/* 1 microsecond */
+	blt	hdec_soon
+
 	ld	r6, VCPU_CTR(r4)
 	lwz	r7, VCPU_XER(r4)
 
@@ -880,6 +931,12 @@ fast_guest_return:
 	li	r9, KVM_GUEST_MODE_GUEST_HV
 	stb	r9, HSTATE_IN_GUEST(r13)
 
+#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
+	/* Accumulate timing */
+	addi	r3, r4, VCPU_TB_GUEST
+	bl	kvmhv_accumulate_time
+#endif
+
 	/* Enter guest */
 
 BEGIN_FTR_SECTION
@@ -917,6 +974,27 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
 	hrfid
 	b	.
 
+secondary_too_late:		/* entered when the entry/exit word shows threads already exiting */
+	li	r12, 0		/* trap value 0 */
+	cmpdi	r4, 0		/* r4 = vcpu pointer; zero means no vcpu */
+	beq	11f		/* no vcpu: nothing to record */
+	stw	r12, VCPU_TRAP(r4)
+#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
+	addi	r3, r4, VCPU_TB_RMEXIT	/* r3 = accumulator to switch to */
+	bl	kvmhv_accumulate_time
+#endif
+11:	b	kvmhv_switch_to_host
+
+hdec_soon:			/* entered when HDEC is below 512 just before guest entry */
+	li	r12, BOOK3S_INTERRUPT_HV_DECREMENTER
+	stw	r12, VCPU_TRAP(r4)	/* record the trap as an HDEC interrupt */
+	mr	r9, r4		/* guest_exit_cont takes the vcpu in r9 */
+#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
+	addi	r3, r4, VCPU_TB_RMEXIT	/* r3 = accumulator to switch to */
+	bl	kvmhv_accumulate_time
+#endif
+	b	guest_exit_cont
+
 /******************************************************************************
  *                                                                            *
  *                               Exit code                                    *
@@ -1002,6 +1080,16 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
 
 	stw	r12,VCPU_TRAP(r9)
 
+#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
+	addi	r3, r9, VCPU_TB_RMINTR
+	mr	r4, r9
+	bl	kvmhv_accumulate_time
+	ld	r5, VCPU_GPR(R5)(r9)
+	ld	r6, VCPU_GPR(R6)(r9)
+	ld	r7, VCPU_GPR(R7)(r9)
+	ld	r8, VCPU_GPR(R8)(r9)
+#endif
+
 	/* Save HEIR (HV emulation assist reg) in emul_inst
 	   if this is an HEI (HV emulation interrupt, e40) */
 	li	r3,KVM_INST_FETCH_FAILED
@@ -1028,34 +1116,38 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
 	bne	2f
 	mfspr	r3,SPRN_HDEC
 	cmpwi	r3,0
-	bge	ignore_hdec
+	mr	r4,r9
+	bge	fast_guest_return
 2:
 	/* See if this is an hcall we can handle in real mode */
 	cmpwi	r12,BOOK3S_INTERRUPT_SYSCALL
 	beq	hcall_try_real_mode
 
+	/* Hypervisor doorbell - exit only if host IPI flag set */
+	cmpwi	r12, BOOK3S_INTERRUPT_H_DOORBELL
+	bne	3f
+	lbz	r0, HSTATE_HOST_IPI(r13)
+	cmpwi	r0, 0		/* lbz does not set cr0; test the flag explicitly */
+	beq	4f		/* flag clear: go check whether another thread is exiting */
+	b	guest_exit_cont
+3:
 	/* External interrupt ? */
 	cmpwi	r12, BOOK3S_INTERRUPT_EXTERNAL
-	bne+	ext_interrupt_to_host
+	bne+	guest_exit_cont
 
 	/* External interrupt, first check for host_ipi. If this is
 	 * set, we know the host wants us out so let's do it now
 	 */
 	bl	kvmppc_read_intr
 	cmpdi	r3, 0
-	bgt	ext_interrupt_to_host
+	bgt	guest_exit_cont
 
 	/* Check if any CPU is heading out to the host, if so head out too */
-	ld	r5, HSTATE_KVM_VCORE(r13)
+4:	ld	r5, HSTATE_KVM_VCORE(r13)
 	lwz	r0, VCORE_ENTRY_EXIT(r5)
 	cmpwi	r0, 0x100
-	bge	ext_interrupt_to_host
-
-	/* Return to guest after delivering any pending interrupt */
 	mr	r4, r9
-	b	deliver_guest_interrupt
-
-ext_interrupt_to_host:
+	blt	deliver_guest_interrupt
 
 guest_exit_cont:		/* r9 = vcpu, r12 = trap, r13 = paca */
 	/* Save more register state  */
@@ -1065,7 +1156,7 @@ guest_exit_cont:		/* r9 = vcpu, r12 = trap, r13 = paca */
 	stw	r7, VCPU_DSISR(r9)
 	/* don't overwrite fault_dar/fault_dsisr if HDSI */
 	cmpwi	r12,BOOK3S_INTERRUPT_H_DATA_STORAGE
-	beq	6f
+	beq	mc_cont
 	std	r6, VCPU_FAULT_DAR(r9)
 	stw	r7, VCPU_FAULT_DSISR(r9)
 
@@ -1073,9 +1164,20 @@ guest_exit_cont:		/* r9 = vcpu, r12 = trap, r13 = paca */
 	cmpwi	r12, BOOK3S_INTERRUPT_MACHINE_CHECK
 	beq	machine_check_realmode
 mc_cont:
+#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
+	addi	r3, r9, VCPU_TB_RMEXIT
+	mr	r4, r9
+	bl	kvmhv_accumulate_time
+#endif
+
+	/* Increment exit count, poke other threads to exit */
+	bl	kvmhv_commence_exit
+	nop
+	ld	r9, HSTATE_KVM_VCPU(r13)
+	lwz	r12, VCPU_TRAP(r9)
 
 	/* Save guest CTRL register, set runlatch to 1 */
-6:	mfspr	r6,SPRN_CTRLF
+	mfspr	r6,SPRN_CTRLF
 	stw	r6,VCPU_CTRL(r9)
 	andi.	r0,r6,1
 	bne	4f
@@ -1417,68 +1519,14 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
 	slbia
 	ptesync
 
-hdec_soon:			/* r12 = trap, r13 = paca */
 	/*
 	 * POWER7/POWER8 guest -> host partition switch code.
 	 * We don't have to lock against tlbies but we do
 	 * have to coordinate the hardware threads.
 	 */
-	/* Increment the threads-exiting-guest count in the 0xff00
-	   bits of vcore->entry_exit_count */
-	ld	r5,HSTATE_KVM_VCORE(r13)
-	addi	r6,r5,VCORE_ENTRY_EXIT
-41:	lwarx	r3,0,r6
-	addi	r0,r3,0x100
-	stwcx.	r0,0,r6
-	bne	41b
-	isync		/* order stwcx. vs. reading napping_threads */
-
-	/*
-	 * At this point we have an interrupt that we have to pass
-	 * up to the kernel or qemu; we can't handle it in real mode.
-	 * Thus we have to do a partition switch, so we have to
-	 * collect the other threads, if we are the first thread
-	 * to take an interrupt.  To do this, we set the HDEC to 0,
-	 * which causes an HDEC interrupt in all threads within 2ns
-	 * because the HDEC register is shared between all 4 threads.
-	 * However, we don't need to bother if this is an HDEC
-	 * interrupt, since the other threads will already be on their
-	 * way here in that case.
-	 */
-	cmpwi	r3,0x100	/* Are we the first here? */
-	bge	43f
-	cmpwi	r12,BOOK3S_INTERRUPT_HV_DECREMENTER
-	beq	40f
-	li	r0,0
-	mtspr	SPRN_HDEC,r0
-40:
-	/*
-	 * Send an IPI to any napping threads, since an HDEC interrupt
-	 * doesn't wake CPUs up from nap.
-	 */
-	lwz	r3,VCORE_NAPPING_THREADS(r5)
-	lbz	r4,HSTATE_PTID(r13)
-	li	r0,1
-	sld	r0,r0,r4
-	andc.	r3,r3,r0		/* no sense IPI'ing ourselves */
-	beq	43f
-	/* Order entry/exit update vs. IPIs */
-	sync
-	mulli	r4,r4,PACA_SIZE		/* get paca for thread 0 */
-	subf	r6,r4,r13
-42:	andi.	r0,r3,1
-	beq	44f
-	ld	r8,HSTATE_XICS_PHYS(r6)	/* get thread's XICS reg addr */
-	li	r0,IPI_PRIORITY
-	li	r7,XICS_MFRR
-	stbcix	r0,r7,r8		/* trigger the IPI */
-44:	srdi.	r3,r3,1
-	addi	r6,r6,PACA_SIZE
-	bne	42b
-
-secondary_too_late:
+kvmhv_switch_to_host:
 	/* Secondary threads wait for primary to do partition switch */
-43:	ld	r5,HSTATE_KVM_VCORE(r13)
+	ld	r5,HSTATE_KVM_VCORE(r13)
 	ld	r4,VCORE_KVM(r5)	/* pointer to struct kvm */
 	lbz	r3,HSTATE_PTID(r13)
 	cmpwi	r3,0
@@ -1562,6 +1610,15 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
 1:	addi	r8,r8,16
 	.endr
 
+#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
+	/* Finish timing, if we have a vcpu */
+	ld	r4, HSTATE_KVM_VCPU(r13)
+	cmpdi	r4, 0
+	li	r3, 0
+	beq	2f
+	bl	kvmhv_accumulate_time
+2:
+#endif
 	/* Unset guest mode */
 	li	r0, KVM_GUEST_MODE_NONE
 	stb	r0, HSTATE_IN_GUEST(r13)
@@ -1696,8 +1753,10 @@ kvmppc_hisi:
  * Returns to the guest if we handle it, or continues on up to
  * the kernel if we can't (i.e. if we don't have a handler for
  * it, or if the handler returns H_TOO_HARD).
+ *
+ * r5 - r8 contain hcall args,
+ * r9 = vcpu, r10 = pc, r11 = msr, r12 = trap, r13 = paca
  */
-	.globl	hcall_try_real_mode
 hcall_try_real_mode:
 	ld	r3,VCPU_GPR(R3)(r9)
 	andi.	r0,r11,MSR_PR
@@ -1839,13 +1898,124 @@ hcall_real_table:
 	.long	0		/* 0x12c */
 	.long	0		/* 0x130 */
 	.long	DOTSYM(kvmppc_h_set_xdabr) - hcall_real_table
+	.long	0		/* 0x138 */
+	.long	0		/* 0x13c */
+	.long	0		/* 0x140 */
+	.long	0		/* 0x144 */
+	.long	0		/* 0x148 */
+	.long	0		/* 0x14c */
+	.long	0		/* 0x150 */
+	.long	0		/* 0x154 */
+	.long	0		/* 0x158 */
+	.long	0		/* 0x15c */
+	.long	0		/* 0x160 */
+	.long	0		/* 0x164 */
+	.long	0		/* 0x168 */
+	.long	0		/* 0x16c */
+	.long	0		/* 0x170 */
+	.long	0		/* 0x174 */
+	.long	0		/* 0x178 */
+	.long	0		/* 0x17c */
+	.long	0		/* 0x180 */
+	.long	0		/* 0x184 */
+	.long	0		/* 0x188 */
+	.long	0		/* 0x18c */
+	.long	0		/* 0x190 */
+	.long	0		/* 0x194 */
+	.long	0		/* 0x198 */
+	.long	0		/* 0x19c */
+	.long	0		/* 0x1a0 */
+	.long	0		/* 0x1a4 */
+	.long	0		/* 0x1a8 */
+	.long	0		/* 0x1ac */
+	.long	0		/* 0x1b0 */
+	.long	0		/* 0x1b4 */
+	.long	0		/* 0x1b8 */
+	.long	0		/* 0x1bc */
+	.long	0		/* 0x1c0 */
+	.long	0		/* 0x1c4 */
+	.long	0		/* 0x1c8 */
+	.long	0		/* 0x1cc */
+	.long	0		/* 0x1d0 */
+	.long	0		/* 0x1d4 */
+	.long	0		/* 0x1d8 */
+	.long	0		/* 0x1dc */
+	.long	0		/* 0x1e0 */
+	.long	0		/* 0x1e4 */
+	.long	0		/* 0x1e8 */
+	.long	0		/* 0x1ec */
+	.long	0		/* 0x1f0 */
+	.long	0		/* 0x1f4 */
+	.long	0		/* 0x1f8 */
+	.long	0		/* 0x1fc */
+	.long	0		/* 0x200 */
+	.long	0		/* 0x204 */
+	.long	0		/* 0x208 */
+	.long	0		/* 0x20c */
+	.long	0		/* 0x210 */
+	.long	0		/* 0x214 */
+	.long	0		/* 0x218 */
+	.long	0		/* 0x21c */
+	.long	0		/* 0x220 */
+	.long	0		/* 0x224 */
+	.long	0		/* 0x228 */
+	.long	0		/* 0x22c */
+	.long	0		/* 0x230 */
+	.long	0		/* 0x234 */
+	.long	0		/* 0x238 */
+	.long	0		/* 0x23c */
+	.long	0		/* 0x240 */
+	.long	0		/* 0x244 */
+	.long	0		/* 0x248 */
+	.long	0		/* 0x24c */
+	.long	0		/* 0x250 */
+	.long	0		/* 0x254 */
+	.long	0		/* 0x258 */
+	.long	0		/* 0x25c */
+	.long	0		/* 0x260 */
+	.long	0		/* 0x264 */
+	.long	0		/* 0x268 */
+	.long	0		/* 0x26c */
+	.long	0		/* 0x270 */
+	.long	0		/* 0x274 */
+	.long	0		/* 0x278 */
+	.long	0		/* 0x27c */
+	.long	0		/* 0x280 */
+	.long	0		/* 0x284 */
+	.long	0		/* 0x288 */
+	.long	0		/* 0x28c */
+	.long	0		/* 0x290 */
+	.long	0		/* 0x294 */
+	.long	0		/* 0x298 */
+	.long	0		/* 0x29c */
+	.long	0		/* 0x2a0 */
+	.long	0		/* 0x2a4 */
+	.long	0		/* 0x2a8 */
+	.long	0		/* 0x2ac */
+	.long	0		/* 0x2b0 */
+	.long	0		/* 0x2b4 */
+	.long	0		/* 0x2b8 */
+	.long	0		/* 0x2bc */
+	.long	0		/* 0x2c0 */
+	.long	0		/* 0x2c4 */
+	.long	0		/* 0x2c8 */
+	.long	0		/* 0x2cc */
+	.long	0		/* 0x2d0 */
+	.long	0		/* 0x2d4 */
+	.long	0		/* 0x2d8 */
+	.long	0		/* 0x2dc */
+	.long	0		/* 0x2e0 */
+	.long	0		/* 0x2e4 */
+	.long	0		/* 0x2e8 */
+	.long	0		/* 0x2ec */
+	.long	0		/* 0x2f0 */
+	.long	0		/* 0x2f4 */
+	.long	0		/* 0x2f8 */
+	.long	0		/* 0x2fc */
+	.long	DOTSYM(kvmppc_h_random) - hcall_real_table
 	.globl	hcall_real_table_end
 hcall_real_table_end:
 
-ignore_hdec:
-	mr	r4,r9
-	b	fast_guest_return
-
 _GLOBAL(kvmppc_h_set_xdabr)
 	andi.	r0, r5, DABRX_USER | DABRX_KERNEL
 	beq	6f
@@ -1884,7 +2054,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
 	li	r3, 0
 	blr
 
-_GLOBAL(kvmppc_h_cede)
+_GLOBAL(kvmppc_h_cede)		/* r3 = vcpu pointer, r11 = msr, r13 = paca */
 	ori	r11,r11,MSR_EE
 	std	r11,VCPU_MSR(r3)
 	li	r0,1
@@ -1893,8 +2063,8 @@ _GLOBAL(kvmppc_h_cede)
 	lbz	r5,VCPU_PRODDED(r3)
 	cmpwi	r5,0
 	bne	kvm_cede_prodded
-	li	r0,0		/* set trap to 0 to say hcall is handled */
-	stw	r0,VCPU_TRAP(r3)
+	li	r12,0		/* set trap to 0 to say hcall is handled */
+	stw	r12,VCPU_TRAP(r3)
 	li	r0,H_SUCCESS
 	std	r0,VCPU_GPR(R3)(r3)
 
@@ -1912,12 +2082,11 @@ _GLOBAL(kvmppc_h_cede)
 	addi	r6,r5,VCORE_NAPPING_THREADS
 31:	lwarx	r4,0,r6
 	or	r4,r4,r0
-	PPC_POPCNTW(R7,R4)
-	cmpw	r7,r8
-	bge	kvm_cede_exit
+	cmpw	r4,r8
+	beq	kvm_cede_exit
 	stwcx.	r4,0,r6
 	bne	31b
-	/* order napping_threads update vs testing entry_exit_count */
+	/* order napping_threads update vs testing entry_exit_map */
 	isync
 	li	r0,NAPPING_CEDE
 	stb	r0,HSTATE_NAPPING(r13)
@@ -1955,21 +2124,52 @@ _GLOBAL(kvmppc_h_cede)
 	bl	kvmppc_save_fp
 
 	/*
+	 * Set DEC to the smaller of DEC and HDEC, so that we wake
+	 * no later than the end of our timeslice (HDEC interrupts
+	 * don't wake us from nap).
+	 */
+	mfspr	r3, SPRN_DEC
+	mfspr	r4, SPRN_HDEC
+	mftb	r5
+	cmpw	r3, r4
+	ble	67f
+	mtspr	SPRN_DEC, r4
+67:
+	/* save expiry time of guest decrementer */
+	extsw	r3, r3
+	add	r3, r3, r5
+	ld	r4, HSTATE_KVM_VCPU(r13)
+	ld	r5, HSTATE_KVM_VCORE(r13)
+	ld	r6, VCORE_TB_OFFSET(r5)
+	subf	r3, r6, r3	/* convert to host TB value */
+	std	r3, VCPU_DEC_EXPIRES(r4)
+
+#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
+	ld	r4, HSTATE_KVM_VCPU(r13)
+	addi	r3, r4, VCPU_TB_CEDE
+	bl	kvmhv_accumulate_time
+#endif
+
+	lis	r3, LPCR_PECEDP@h	/* Do wake on privileged doorbell */
+
+	/*
 	 * Take a nap until a decrementer or external or doobell interrupt
-	 * occurs, with PECE1, PECE0 and PECEDP set in LPCR. Also clear the
-	 * runlatch bit before napping.
+	 * occurs, with PECE1 and PECE0 set in LPCR.
+	 * On POWER8, set PECEDH, and if we are ceding, also set PECEDP.
+	 * Also clear the runlatch bit before napping.
 	 */
 kvm_do_nap:
-	mfspr	r2, SPRN_CTRLF
-	clrrdi	r2, r2, 1
-	mtspr	SPRN_CTRLT, r2
+	mfspr	r0, SPRN_CTRLF
+	clrrdi	r0, r0, 1
+	mtspr	SPRN_CTRLT, r0
 
 	li	r0,1
 	stb	r0,HSTATE_HWTHREAD_REQ(r13)
 	mfspr	r5,SPRN_LPCR
 	ori	r5,r5,LPCR_PECE0 | LPCR_PECE1
 BEGIN_FTR_SECTION
-	oris	r5,r5,LPCR_PECEDP@h
+	ori	r5, r5, LPCR_PECEDH
+	rlwimi	r5, r3, 0, LPCR_PECEDP
 END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
 	mtspr	SPRN_LPCR,r5
 	isync
@@ -1994,9 +2194,23 @@ kvm_end_cede:
 	/* Woken by external or decrementer interrupt */
 	ld	r1, HSTATE_HOST_R1(r13)
 
+#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
+	addi	r3, r4, VCPU_TB_RMINTR
+	bl	kvmhv_accumulate_time
+#endif
+
 	/* load up FP state */
 	bl	kvmppc_load_fp
 
+	/* Restore guest decrementer */
+	ld	r3, VCPU_DEC_EXPIRES(r4)
+	ld	r5, HSTATE_KVM_VCORE(r13)
+	ld	r6, VCORE_TB_OFFSET(r5)
+	add	r3, r3, r6	/* convert host TB to guest TB value */
+	mftb	r7
+	subf	r3, r7, r3
+	mtspr	SPRN_DEC, r3
+
 	/* Load NV GPRS */
 	ld	r14, VCPU_GPR(R14)(r4)
 	ld	r15, VCPU_GPR(R15)(r4)
@@ -2057,7 +2271,8 @@ kvm_cede_prodded:
 
 	/* we've ceded but we want to give control to the host */
 kvm_cede_exit:
-	b	hcall_real_fallback
+	ld	r9, HSTATE_KVM_VCPU(r13)
+	b	guest_exit_cont
 
 	/* Try to handle a machine check in real mode */
 machine_check_realmode:
@@ -2089,13 +2304,14 @@ machine_check_realmode:
 
 /*
  * Check the reason we woke from nap, and take appropriate action.
- * Returns:
+ * Returns (in r3):
  *	0 if nothing needs to be done
  *	1 if something happened that needs to be handled by the host
- *	-1 if there was a guest wakeup (IPI)
+ *	-1 if there was a guest wakeup (IPI or msgsnd)
  *
  * Also sets r12 to the interrupt vector for any interrupt that needs
  * to be handled now by the host (0x500 for external interrupt), or zero.
+ * Modifies r0, r6, r7, r8.
  */
 kvmppc_check_wake_reason:
 	mfspr	r6, SPRN_SRR1
@@ -2122,7 +2338,15 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
 
 	/* hypervisor doorbell */
 3:	li	r12, BOOK3S_INTERRUPT_H_DOORBELL
+	/* see if it's a host IPI */
 	li	r3, 1
+	lbz	r0, HSTATE_HOST_IPI(r13)
+	cmpwi	r0, 0
+	bnelr
+	/* if not, clear it and return -1 */
+	lis	r6, (PPC_DBELL_SERVER << (63-36))@h
+	PPC_MSGCLR(6)
+	li	r3, -1
 	blr
 
 /*
@@ -2131,6 +2355,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
  *	0 if no interrupt is pending
  *	1 if an interrupt is pending that needs to be handled by the host
  *	-1 if there was a guest wakeup IPI (which has now been cleared)
+ * Modifies r0, r6, r7, r8, returns value in r3.
  */
 kvmppc_read_intr:
 	/* see if a host IPI is pending */
@@ -2185,6 +2410,7 @@ kvmppc_read_intr:
 	bne-	43f
 
 	/* OK, it's an IPI for us */
+	li	r12, 0
 	li	r3, -1
 1:	blr
 
@@ -2314,3 +2540,62 @@ kvmppc_fix_pmao:
 	mtspr	SPRN_PMC6, r3
 	isync
 	blr
+
+#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
+/*
+ * Start timing an activity.  Modifies r5, r6.
+ * r3 = pointer to time accumulation struct, r4 = vcpu
+ */
+kvmhv_start_timing:
+	ld	r5, HSTATE_KVM_VCORE(r13)
+	lbz	r6, VCORE_IN_GUEST(r5)
+	cmpwi	r6, 0				/* r6 stays 0 (no offset) if not in guest */
+	beq	5f				/* if in guest, need to */
+	ld	r6, VCORE_TB_OFFSET(r5)		/* subtract timebase offset */
+5:	mftb	r5
+	subf	r5, r6, r5			/* r5 = timebase now, less offset */
+	std	r3, VCPU_CUR_ACTIVITY(r4)
+	std	r5, VCPU_ACTIVITY_START(r4)
+	blr
+
+/*
+ * Accumulate time to one activity and start another.  Modifies r3, r5-r8.
+ * r3 = pointer to new time accumulation struct, r4 = vcpu
+ */
+kvmhv_accumulate_time:
+	ld	r5, HSTATE_KVM_VCORE(r13)
+	lbz	r8, VCORE_IN_GUEST(r5)
+	cmpwi	r8, 0				/* r8 stays 0 (no offset) if not in guest */
+	beq	4f				/* if in guest, need to */
+	ld	r8, VCORE_TB_OFFSET(r5)		/* subtract timebase offset */
+4:	ld	r5, VCPU_CUR_ACTIVITY(r4)	/* r5 = activity being finished */
+	ld	r6, VCPU_ACTIVITY_START(r4)	/* r6 = when it started */
+	std	r3, VCPU_CUR_ACTIVITY(r4)	/* switch to the new activity */
+	mftb	r7
+	subf	r7, r8, r7			/* r7 = timebase now, less offset */
+	std	r7, VCPU_ACTIVITY_START(r4)
+	cmpdi	r5, 0
+	beqlr					/* no previous activity: done */
+	subf	r3, r6, r7			/* r3 = elapsed = now - start */
+	ld	r8, TAS_SEQCOUNT(r5)
+	cmpdi	r8, 0				/* cr0 is consumed by "beq 3f" below */
+	addi	r8, r8, 1
+	std	r8, TAS_SEQCOUNT(r5)		/* first seqcount bump, before the update */
+	lwsync
+	ld	r7, TAS_TOTAL(r5)
+	add	r7, r7, r3
+	std	r7, TAS_TOTAL(r5)		/* total += elapsed */
+	ld	r6, TAS_MIN(r5)
+	ld	r7, TAS_MAX(r5)
+	beq	3f				/* seqcount was 0: always store min */
+	cmpd	r3, r6
+	bge	1f
+3:	std	r3, TAS_MIN(r5)
+1:	cmpd	r3, r7
+	ble	2f
+	std	r3, TAS_MAX(r5)
+2:	lwsync
+	addi	r8, r8, 1
+	std	r8, TAS_SEQCOUNT(r5)		/* second bump, after the update */
+	blr
+#endif
diff --git a/arch/powerpc/kvm/book3s_pr_papr.c b/arch/powerpc/kvm/book3s_pr_papr.c
index ce3c893d509b..f2c75a1e0536 100644
--- a/arch/powerpc/kvm/book3s_pr_papr.c
+++ b/arch/powerpc/kvm/book3s_pr_papr.c
@@ -258,6 +258,28 @@ static int kvmppc_h_pr_put_tce(struct kvm_vcpu *vcpu)
 	return EMULATE_DONE;
 }
 
+/* H_LOGICAL_CI_LOAD for PR: run the common handler, put its result in GPR 3. */
+static int kvmppc_h_pr_logical_ci_load(struct kvm_vcpu *vcpu)
+{
+	long hret = kvmppc_h_logical_ci_load(vcpu);
+
+	if (hret == H_TOO_HARD)
+		return EMULATE_FAIL;	/* handler could not complete it */
+	kvmppc_set_gpr(vcpu, 3, hret);
+	return EMULATE_DONE;
+}
+
+/* H_LOGICAL_CI_STORE for PR: run the common handler, put its result in GPR 3. */
+static int kvmppc_h_pr_logical_ci_store(struct kvm_vcpu *vcpu)
+{
+	long hret = kvmppc_h_logical_ci_store(vcpu);
+
+	if (hret == H_TOO_HARD)
+		return EMULATE_FAIL;	/* handler could not complete it */
+	kvmppc_set_gpr(vcpu, 3, hret);
+	return EMULATE_DONE;
+}
+
 static int kvmppc_h_pr_xics_hcall(struct kvm_vcpu *vcpu, u32 cmd)
 {
 	long rc = kvmppc_xics_hcall(vcpu, cmd);
@@ -290,6 +312,10 @@ int kvmppc_h_pr(struct kvm_vcpu *vcpu, unsigned long cmd)
 		clear_bit(KVM_REQ_UNHALT, &vcpu->requests);
 		vcpu->stat.halt_wakeup++;
 		return EMULATE_DONE;
+	case H_LOGICAL_CI_LOAD:
+		return kvmppc_h_pr_logical_ci_load(vcpu);
+	case H_LOGICAL_CI_STORE:
+		return kvmppc_h_pr_logical_ci_store(vcpu);
 	case H_XIRR:
 	case H_CPPR:
 	case H_EOI:
@@ -323,6 +349,8 @@ int kvmppc_hcall_impl_pr(unsigned long cmd)
 	case H_BULK_REMOVE:
 	case H_PUT_TCE:
 	case H_CEDE:
+	case H_LOGICAL_CI_LOAD:
+	case H_LOGICAL_CI_STORE:
 #ifdef CONFIG_KVM_XICS
 	case H_XIRR:
 	case H_CPPR:
diff --git a/arch/powerpc/kvm/book3s_xics.c b/arch/powerpc/kvm/book3s_xics.c
index a4a8d9f0dcb7..8f3e6cc54d95 100644
--- a/arch/powerpc/kvm/book3s_xics.c
+++ b/arch/powerpc/kvm/book3s_xics.c
@@ -20,6 +20,7 @@
 #include <asm/xics.h>
 #include <asm/debug.h>
 #include <asm/time.h>
+#include <asm/spinlock.h>
 
 #include <linux/debugfs.h>
 #include <linux/seq_file.h>
@@ -39,7 +40,7 @@
  * LOCKING
  * =======
  *
- * Each ICS has a mutex protecting the information about the IRQ
+ * Each ICS has a spin lock protecting the information about the IRQ
  * sources and avoiding simultaneous deliveries if the same interrupt.
  *
  * ICP operations are done via a single compare & swap transaction
@@ -109,7 +110,10 @@ static void ics_check_resend(struct kvmppc_xics *xics, struct kvmppc_ics *ics,
 {
 	int i;
 
-	mutex_lock(&ics->lock);
+	unsigned long flags;
+
+	local_irq_save(flags);
+	arch_spin_lock(&ics->lock);
 
 	for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) {
 		struct ics_irq_state *state = &ics->irq_state[i];
@@ -120,12 +124,15 @@ static void ics_check_resend(struct kvmppc_xics *xics, struct kvmppc_ics *ics,
 		XICS_DBG("resend %#x prio %#x\n", state->number,
 			      state->priority);
 
-		mutex_unlock(&ics->lock);
+		arch_spin_unlock(&ics->lock);
+		local_irq_restore(flags);
 		icp_deliver_irq(xics, icp, state->number);
-		mutex_lock(&ics->lock);
+		local_irq_save(flags);
+		arch_spin_lock(&ics->lock);
 	}
 
-	mutex_unlock(&ics->lock);
+	arch_spin_unlock(&ics->lock);
+	local_irq_restore(flags);
 }
 
 static bool write_xive(struct kvmppc_xics *xics, struct kvmppc_ics *ics,
@@ -133,8 +140,10 @@ static bool write_xive(struct kvmppc_xics *xics, struct kvmppc_ics *ics,
 		       u32 server, u32 priority, u32 saved_priority)
 {
 	bool deliver;
+	unsigned long flags;
 
-	mutex_lock(&ics->lock);
+	local_irq_save(flags);
+	arch_spin_lock(&ics->lock);
 
 	state->server = server;
 	state->priority = priority;
@@ -145,7 +154,8 @@ static bool write_xive(struct kvmppc_xics *xics, struct kvmppc_ics *ics,
 		deliver = true;
 	}
 
-	mutex_unlock(&ics->lock);
+	arch_spin_unlock(&ics->lock);
+	local_irq_restore(flags);
 
 	return deliver;
 }
@@ -186,6 +196,7 @@ int kvmppc_xics_get_xive(struct kvm *kvm, u32 irq, u32 *server, u32 *priority)
 	struct kvmppc_ics *ics;
 	struct ics_irq_state *state;
 	u16 src;
+	unsigned long flags;
 
 	if (!xics)
 		return -ENODEV;
@@ -195,10 +206,12 @@ int kvmppc_xics_get_xive(struct kvm *kvm, u32 irq, u32 *server, u32 *priority)
 		return -EINVAL;
 	state = &ics->irq_state[src];
 
-	mutex_lock(&ics->lock);
+	local_irq_save(flags);
+	arch_spin_lock(&ics->lock);
 	*server = state->server;
 	*priority = state->priority;
-	mutex_unlock(&ics->lock);
+	arch_spin_unlock(&ics->lock);
+	local_irq_restore(flags);
 
 	return 0;
 }
@@ -365,6 +378,7 @@ static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
 	struct kvmppc_ics *ics;
 	u32 reject;
 	u16 src;
+	unsigned long flags;
 
 	/*
 	 * This is used both for initial delivery of an interrupt and
@@ -391,7 +405,8 @@ static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
 	state = &ics->irq_state[src];
 
 	/* Get a lock on the ICS */
-	mutex_lock(&ics->lock);
+	local_irq_save(flags);
+	arch_spin_lock(&ics->lock);
 
 	/* Get our server */
 	if (!icp || state->server != icp->server_num) {
@@ -434,7 +449,7 @@ static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
 	 *
 	 * Note that if successful, the new delivery might have itself
 	 * rejected an interrupt that was "delivered" before we took the
-	 * icp mutex.
+	 * ics spin lock.
 	 *
 	 * In this case we do the whole sequence all over again for the
 	 * new guy. We cannot assume that the rejected interrupt is less
@@ -448,7 +463,8 @@ static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
 		 * Delivery was successful, did we reject somebody else ?
 		 */
 		if (reject && reject != XICS_IPI) {
-			mutex_unlock(&ics->lock);
+			arch_spin_unlock(&ics->lock);
+			local_irq_restore(flags);
 			new_irq = reject;
 			goto again;
 		}
@@ -468,12 +484,14 @@ static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
 		 */
 		smp_mb();
 		if (!icp->state.need_resend) {
-			mutex_unlock(&ics->lock);
+			arch_spin_unlock(&ics->lock);
+			local_irq_restore(flags);
 			goto again;
 		}
 	}
  out:
-	mutex_unlock(&ics->lock);
+	arch_spin_unlock(&ics->lock);
+	local_irq_restore(flags);
 }
 
 static void icp_down_cppr(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
@@ -802,14 +820,22 @@ static noinline int kvmppc_xics_rm_complete(struct kvm_vcpu *vcpu, u32 hcall)
 	XICS_DBG("XICS_RM: H_%x completing, act: %x state: %lx tgt: %p\n",
 		 hcall, icp->rm_action, icp->rm_dbgstate.raw, icp->rm_dbgtgt);
 
-	if (icp->rm_action & XICS_RM_KICK_VCPU)
+	if (icp->rm_action & XICS_RM_KICK_VCPU) {
+		icp->n_rm_kick_vcpu++;
 		kvmppc_fast_vcpu_kick(icp->rm_kick_target);
-	if (icp->rm_action & XICS_RM_CHECK_RESEND)
+	}
+	if (icp->rm_action & XICS_RM_CHECK_RESEND) {
+		icp->n_rm_check_resend++;
 		icp_check_resend(xics, icp->rm_resend_icp);
-	if (icp->rm_action & XICS_RM_REJECT)
+	}
+	if (icp->rm_action & XICS_RM_REJECT) {
+		icp->n_rm_reject++;
 		icp_deliver_irq(xics, icp, icp->rm_reject);
-	if (icp->rm_action & XICS_RM_NOTIFY_EOI)
+	}
+	if (icp->rm_action & XICS_RM_NOTIFY_EOI) {
+		icp->n_rm_notify_eoi++;
 		kvm_notify_acked_irq(vcpu->kvm, 0, icp->rm_eoied_irq);
+	}
 
 	icp->rm_action = 0;
 
@@ -872,10 +898,21 @@ static int xics_debug_show(struct seq_file *m, void *private)
 	struct kvm *kvm = xics->kvm;
 	struct kvm_vcpu *vcpu;
 	int icsid, i;
+	unsigned long flags;
+	unsigned long t_rm_kick_vcpu, t_rm_check_resend;
+	unsigned long t_rm_reject, t_rm_notify_eoi;
+	unsigned long t_reject, t_check_resend;
 
 	if (!kvm)
 		return 0;
 
+	t_rm_kick_vcpu = 0;
+	t_rm_notify_eoi = 0;
+	t_rm_check_resend = 0;
+	t_rm_reject = 0;
+	t_check_resend = 0;
+	t_reject = 0;
+
 	seq_printf(m, "=========\nICP state\n=========\n");
 
 	kvm_for_each_vcpu(i, vcpu, kvm) {
@@ -890,8 +927,19 @@ static int xics_debug_show(struct seq_file *m, void *private)
 			   icp->server_num, state.xisr,
 			   state.pending_pri, state.cppr, state.mfrr,
 			   state.out_ee, state.need_resend);
+		t_rm_kick_vcpu += icp->n_rm_kick_vcpu;
+		t_rm_notify_eoi += icp->n_rm_notify_eoi;
+		t_rm_check_resend += icp->n_rm_check_resend;
+		t_rm_reject += icp->n_rm_reject;
+		t_check_resend += icp->n_check_resend;
+		t_reject += icp->n_reject;
 	}
 
+	seq_printf(m, "ICP Guest->Host totals: kick_vcpu=%lu check_resend=%lu reject=%lu notify_eoi=%lu\n",
+			t_rm_kick_vcpu, t_rm_check_resend,
+			t_rm_reject, t_rm_notify_eoi);
+	seq_printf(m, "ICP Real Mode totals: check_resend=%lu resend=%lu\n",
+			t_check_resend, t_reject);
 	for (icsid = 0; icsid <= KVMPPC_XICS_MAX_ICS_ID; icsid++) {
 		struct kvmppc_ics *ics = xics->ics[icsid];
 
@@ -901,7 +949,8 @@ static int xics_debug_show(struct seq_file *m, void *private)
 		seq_printf(m, "=========\nICS state for ICS 0x%x\n=========\n",
 			   icsid);
 
-		mutex_lock(&ics->lock);
+		local_irq_save(flags);
+		arch_spin_lock(&ics->lock);
 
 		for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) {
 			struct ics_irq_state *irq = &ics->irq_state[i];
@@ -912,7 +961,8 @@ static int xics_debug_show(struct seq_file *m, void *private)
 				   irq->resend, irq->masked_pending);
 
 		}
-		mutex_unlock(&ics->lock);
+		arch_spin_unlock(&ics->lock);
+		local_irq_restore(flags);
 	}
 	return 0;
 }
@@ -965,7 +1015,6 @@ static struct kvmppc_ics *kvmppc_xics_create_ics(struct kvm *kvm,
 	if (!ics)
 		goto out;
 
-	mutex_init(&ics->lock);
 	ics->icsid = icsid;
 
 	for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) {
@@ -1107,13 +1156,15 @@ static int xics_get_source(struct kvmppc_xics *xics, long irq, u64 addr)
 	u64 __user *ubufp = (u64 __user *) addr;
 	u16 idx;
 	u64 val, prio;
+	unsigned long flags;
 
 	ics = kvmppc_xics_find_ics(xics, irq, &idx);
 	if (!ics)
 		return -ENOENT;
 
 	irqp = &ics->irq_state[idx];
-	mutex_lock(&ics->lock);
+	local_irq_save(flags);
+	arch_spin_lock(&ics->lock);
 	ret = -ENOENT;
 	if (irqp->exists) {
 		val = irqp->server;
@@ -1129,7 +1180,8 @@ static int xics_get_source(struct kvmppc_xics *xics, long irq, u64 addr)
 			val |= KVM_XICS_PENDING;
 		ret = 0;
 	}
-	mutex_unlock(&ics->lock);
+	arch_spin_unlock(&ics->lock);
+	local_irq_restore(flags);
 
 	if (!ret && put_user(val, ubufp))
 		ret = -EFAULT;
@@ -1146,6 +1198,7 @@ static int xics_set_source(struct kvmppc_xics *xics, long irq, u64 addr)
 	u64 val;
 	u8 prio;
 	u32 server;
+	unsigned long flags;
 
 	if (irq < KVMPPC_XICS_FIRST_IRQ || irq >= KVMPPC_XICS_NR_IRQS)
 		return -ENOENT;
@@ -1166,7 +1219,8 @@ static int xics_set_source(struct kvmppc_xics *xics, long irq, u64 addr)
 	    kvmppc_xics_find_server(xics->kvm, server) == NULL)
 		return -EINVAL;
 
-	mutex_lock(&ics->lock);
+	local_irq_save(flags);
+	arch_spin_lock(&ics->lock);
 	irqp->server = server;
 	irqp->saved_priority = prio;
 	if (val & KVM_XICS_MASKED)
@@ -1178,7 +1232,8 @@ static int xics_set_source(struct kvmppc_xics *xics, long irq, u64 addr)
 	if ((val & KVM_XICS_PENDING) && (val & KVM_XICS_LEVEL_SENSITIVE))
 		irqp->asserted = 1;
 	irqp->exists = 1;
-	mutex_unlock(&ics->lock);
+	arch_spin_unlock(&ics->lock);
+	local_irq_restore(flags);
 
 	if (val & KVM_XICS_PENDING)
 		icp_deliver_irq(xics, NULL, irqp->number);
diff --git a/arch/powerpc/kvm/book3s_xics.h b/arch/powerpc/kvm/book3s_xics.h
index 73f0f2723c07..56ea44f9867f 100644
--- a/arch/powerpc/kvm/book3s_xics.h
+++ b/arch/powerpc/kvm/book3s_xics.h
@@ -78,13 +78,22 @@ struct kvmppc_icp {
 	u32  rm_reject;
 	u32  rm_eoied_irq;
 
+	/* Counters for each reason we exited real mode */
+	unsigned long n_rm_kick_vcpu;
+	unsigned long n_rm_check_resend;
+	unsigned long n_rm_reject;
+	unsigned long n_rm_notify_eoi;
+	/* Counters for handling ICP processing in real mode */
+	unsigned long n_check_resend;
+	unsigned long n_reject;
+
 	/* Debug stuff for real mode */
 	union kvmppc_icp_state rm_dbgstate;
 	struct kvm_vcpu *rm_dbgtgt;
 };
 
 struct kvmppc_ics {
-	struct mutex lock;
+	arch_spinlock_t lock;	/* protects irq_state[]; taken with local IRQs off */
 	u16 icsid;
 	struct ics_irq_state irq_state[KVMPPC_XICS_IRQ_PER_ICS];
 };
@@ -96,6 +105,8 @@ struct kvmppc_xics {
 	u32 max_icsid;
 	bool real_mode;
 	bool real_mode_dbg;
+	u32 err_noics;
+	u32 err_noicp;
 	struct kvmppc_ics *ics[KVMPPC_XICS_MAX_ICS_ID + 1];
 };
 
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 91bbc845ac66..ac3ddf115f3d 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -529,6 +529,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 	case KVM_CAP_PPC_RMA:
 		r = 0;
 		break;
+	case KVM_CAP_PPC_HWRNG:
+		r = kvmppc_hwrng_present();
+		break;
 #endif
 	case KVM_CAP_SYNC_MMU:
 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
diff --git a/arch/powerpc/platforms/powernv/rng.c b/arch/powerpc/platforms/powernv/rng.c
index 80db43944afe..6eb808ff637e 100644
--- a/arch/powerpc/platforms/powernv/rng.c
+++ b/arch/powerpc/platforms/powernv/rng.c
@@ -24,12 +24,22 @@
 
 struct powernv_rng {
 	void __iomem *regs;
+	void __iomem *regs_real;
 	unsigned long mask;
 };
 
 static DEFINE_PER_CPU(struct powernv_rng *, powernv_rng);
 
 
+/* Returns nonzero iff this CPU's powernv_rng per-cpu pointer is set. */
+int powernv_hwrng_present(void)
+{
+	struct powernv_rng *this_rng = get_cpu_var(powernv_rng);
+
+	put_cpu_var(this_rng);
+	return this_rng != NULL;
+}
+
 static unsigned long rng_whiten(struct powernv_rng *rng, unsigned long val)
 {
 	unsigned long parity;
@@ -46,6 +56,17 @@ static unsigned long rng_whiten(struct powernv_rng *rng, unsigned long val)
 	return val;
 }
 
+/* Read the RNG through regs_real, store the whitened value in *v, return 1. */
+int powernv_get_random_real_mode(unsigned long *v)
+{
+	struct powernv_rng *this_rng = raw_cpu_read(powernv_rng);
+
+	/* NOTE(review): this_rng is not NULL-checked -- presumably callers
+	 * test powernv_hwrng_present() first; confirm. */
+	*v = rng_whiten(this_rng, in_rm64(this_rng->regs_real));
+	return 1;
+}
+
 int powernv_get_random_long(unsigned long *v)
 {
 	struct powernv_rng *rng;
@@ -80,12 +101,20 @@ static __init void rng_init_per_cpu(struct powernv_rng *rng,
 static __init int rng_create(struct device_node *dn)
 {
 	struct powernv_rng *rng;
+	struct resource res;
 	unsigned long val;
 
 	rng = kzalloc(sizeof(*rng), GFP_KERNEL);
 	if (!rng)
 		return -ENOMEM;
 
+	if (of_address_to_resource(dn, 0, &res)) {
+		kfree(rng);
+		return -ENXIO;
+	}
+
+	rng->regs_real = (void __iomem *)res.start;
+
 	rng->regs = of_iomap(dn, 0);
 	if (!rng->regs) {
 		kfree(rng);
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index afa2bd750ffc..8cd8e7b288c5 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -110,7 +110,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
 /* upper facilities limit for kvm */
 unsigned long kvm_s390_fac_list_mask[] = {
 	0xffe6fffbfcfdfc40UL,
-	0x205c800000000000UL,
+	0x005c800000000000UL,
 };
 
 unsigned long kvm_s390_fac_list_mask_size(void)
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index d67206a7b99a..629af0f1c5c4 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -683,8 +683,7 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
 	unsigned long bitmap = 1;
 	struct kvm_lapic **dst;
 	int i;
-	bool ret = false;
-	bool x2apic_ipi = src && apic_x2apic_mode(src);
+	bool ret, x2apic_ipi;
 
 	*r = -1;
 
@@ -696,16 +695,18 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
 	if (irq->shorthand)
 		return false;
 
+	x2apic_ipi = src && apic_x2apic_mode(src);
 	if (irq->dest_id == (x2apic_ipi ? X2APIC_BROADCAST : APIC_BROADCAST))
 		return false;
 
+	ret = true;
 	rcu_read_lock();
 	map = rcu_dereference(kvm->arch.apic_map);
 
-	if (!map)
+	if (!map) {
+		ret = false;
 		goto out;
-
-	ret = true;
+	}
 
 	if (irq->dest_mode == APIC_DEST_PHYSICAL) {
 		if (irq->dest_id >= ARRAY_SIZE(map->phys_map))
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 146f295ee322..d43867c33bc4 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -4481,9 +4481,11 @@ static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm,
 		pfn = spte_to_pfn(*sptep);
 
 		/*
-		 * Only EPT supported for now; otherwise, one would need to
-		 * find out efficiently whether the guest page tables are
-		 * also using huge pages.
+		 * We cannot do huge page mapping for indirect shadow pages,
+		 * which are found on the last rmap (level = 1) when not using
+		 * tdp; such shadow pages are synced with the page table in
+		 * the guest, and the guest page table is using 4K page size
+		 * mapping if the indirect sp has level = 1.
 		 */
 		if (sp->role.direct &&
 			!kvm_is_reserved_pfn(pfn) &&
@@ -4504,19 +4506,12 @@ void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
 	bool flush = false;
 	unsigned long *rmapp;
 	unsigned long last_index, index;
-	gfn_t gfn_start, gfn_end;
 
 	spin_lock(&kvm->mmu_lock);
 
-	gfn_start = memslot->base_gfn;
-	gfn_end = memslot->base_gfn + memslot->npages - 1;
-
-	if (gfn_start >= gfn_end)
-		goto out;
-
 	rmapp = memslot->arch.rmap[0];
-	last_index = gfn_to_index(gfn_end, memslot->base_gfn,
-					PT_PAGE_TABLE_LEVEL);
+	last_index = gfn_to_index(memslot->base_gfn + memslot->npages - 1,
+				memslot->base_gfn, PT_PAGE_TABLE_LEVEL);
 
 	for (index = 0; index <= last_index; ++index, ++rmapp) {
 		if (*rmapp)
@@ -4534,7 +4529,6 @@ void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
 	if (flush)
 		kvm_flush_remote_tlbs(kvm);
 
-out:
 	spin_unlock(&kvm->mmu_lock);
 }
 
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index f5e8dce8046c..f7b61687bd79 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -3622,8 +3622,16 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
 
 static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 {
-	unsigned long hw_cr4 = cr4 | (to_vmx(vcpu)->rmode.vm86_active ?
-		    KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON);
+	/*
+	 * Pass through host's Machine Check Enable value to hw_cr4, which
+	 * is in force while we are in guest mode.  Do not let guests control
+	 * this bit, even if host CR4.MCE == 0.
+	 */
+	unsigned long hw_cr4 =
+		(cr4_read_shadow() & X86_CR4_MCE) |
+		(cr4 & ~X86_CR4_MCE) |
+		(to_vmx(vcpu)->rmode.vm86_active ?
+		 KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON);
 
 	if (cr4 & X86_CR4_VMXE) {
 		/*
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index e1a81267f3f6..ed31c31b2485 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5799,7 +5799,6 @@ int kvm_arch_init(void *opaque)
 	kvm_set_mmio_spte_mask();
 
 	kvm_x86_ops = ops;
-	kvm_init_msr_list();
 
 	kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
 			PT_DIRTY_MASK, PT64_NX_MASK, 0);
@@ -7253,7 +7252,14 @@ void kvm_arch_hardware_disable(void)
 
 int kvm_arch_hardware_setup(void)
 {
-	return kvm_x86_ops->hardware_setup();
+	int r;
+
+	r = kvm_x86_ops->hardware_setup();
+	if (r != 0)
+		return r;
+
+	kvm_init_msr_list();
+	return 0;
 }
 
 void kvm_arch_hardware_unsetup(void)