From 8e57c586c6a8333c266a6cf76d50e110f2954558 Mon Sep 17 00:00:00 2001
From: Markus Elfring
Date: Tue, 3 Feb 2015 12:40:54 +0100
Subject: perf/x86/intel/uncore: Delete an unnecessary check before
 pci_dev_put() call

The pci_dev_put() function tests whether its argument is NULL and then
returns immediately. Thus the test around the call is not needed.

This issue was detected by using the Coccinelle software.

Signed-off-by: Markus Elfring <elfring@users.sourceforge.net>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: http://lkml.kernel.org/r/54D0B59C.2060106@users.sourceforge.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c b/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c
index 21af6149edf2..12d9548457e7 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c
@@ -1132,8 +1132,7 @@ static int snbep_pci2phy_map_init(int devid)
 		}
 	}
 
-	if (ubox_dev)
-		pci_dev_put(ubox_dev);
+	pci_dev_put(ubox_dev);
 
 	return err ? pcibios_err_to_errno(err) : 0;
 }
-- 
cgit v1.2.3


From c796b205b88c775fd220c1a63390bac6a8cdda3f Mon Sep 17 00:00:00 2001
From: Aravind Gopalakrishnan
Date: Fri, 23 Jan 2015 12:19:35 -0600
Subject: perf/x86/amd/ibs: Convert force_ibs_eilvt_setup() to void

The caller of force_ibs_eilvt_setup() is ibs_eilvt_setup()
which does not care about the return values.

So mark it void and clean up the return statements.

Signed-off-by: Aravind Gopalakrishnan <aravind.gopalakrishnan@amd.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: <hpa@zytor.com>
Cc: <paulus@samba.org>
Cc: <tglx@linutronix.de>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: http://lkml.kernel.org/r/1422037175-20957-1-git-send-email-aravind.gopalakrishnan@amd.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event_amd_ibs.c | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event_amd_ibs.c b/arch/x86/kernel/cpu/perf_event_amd_ibs.c
index a61f5c6911da..989d3c215d2b 100644
--- a/arch/x86/kernel/cpu/perf_event_amd_ibs.c
+++ b/arch/x86/kernel/cpu/perf_event_amd_ibs.c
@@ -796,7 +796,7 @@ static int setup_ibs_ctl(int ibs_eilvt_off)
  * the IBS interrupt vector is handled by perf_ibs_cpu_notifier that
  * is using the new offset.
  */
-static int force_ibs_eilvt_setup(void)
+static void force_ibs_eilvt_setup(void)
 {
 	int offset;
 	int ret;
@@ -811,26 +811,24 @@ static int force_ibs_eilvt_setup(void)
 
 	if (offset == APIC_EILVT_NR_MAX) {
 		printk(KERN_DEBUG "No EILVT entry available\n");
-		return -EBUSY;
+		return;
 	}
 
 	ret = setup_ibs_ctl(offset);
 	if (ret)
 		goto out;
 
-	if (!ibs_eilvt_valid()) {
-		ret = -EFAULT;
+	if (!ibs_eilvt_valid())
 		goto out;
-	}
 
 	pr_info("IBS: LVT offset %d assigned\n", offset);
 
-	return 0;
+	return;
 out:
 	preempt_disable();
 	put_eilvt(offset);
 	preempt_enable();
-	return ret;
+	return;
 }
 
 static void ibs_eilvt_setup(void)
-- 
cgit v1.2.3


From 27ac905b8f88d28779b0661809286b5ba2817d37 Mon Sep 17 00:00:00 2001
From: Yan, Zheng
Date: Tue, 4 Nov 2014 21:55:57 -0500
Subject: perf/x86/intel: Reduce lbr_sel_map[] size

The index of lbr_sel_map is bit value of perf branch_sample_type.
PERF_SAMPLE_BRANCH_MAX is 1024 at present, so each lbr_sel_map uses
4096 bytes. By using bit shift as index, we can reduce lbr_sel_map
size to 40 bytes. This patch defines 'bit shift' for branch types,
and use 'bit shift' to define lbr_sel_maps.

Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
Signed-off-by: Kan Liang <kan.liang@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Stephane Eranian <eranian@google.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Cc: jolsa@redhat.com
Cc: linux-api@vger.kernel.org
Link: http://lkml.kernel.org/r/1415156173-10035-2-git-send-email-kan.liang@intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event.h           |  4 +++
 arch/x86/kernel/cpu/perf_event_intel_lbr.c | 54 ++++++++++++++----------------
 include/uapi/linux/perf_event.h            | 49 +++++++++++++++++++--------
 3 files changed, 64 insertions(+), 43 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index df525d2be1e8..0c45b22495dc 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -515,6 +515,10 @@ struct x86_pmu {
 	struct perf_guest_switch_msr *(*guest_get_msrs)(int *nr);
 };
 
+enum {
+	PERF_SAMPLE_BRANCH_SELECT_MAP_SIZE = PERF_SAMPLE_BRANCH_MAX_SHIFT,
+};
+
 #define x86_add_quirk(func_)						\
 do {									\
 	static struct x86_pmu_quirk __quirk __initdata = {		\
diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
index 58f1a94beaf0..8bc078f43a82 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
@@ -69,10 +69,6 @@ static enum {
 #define LBR_FROM_FLAG_IN_TX    (1ULL << 62)
 #define LBR_FROM_FLAG_ABORT    (1ULL << 61)
 
-#define for_each_branch_sample_type(x) \
-	for ((x) = PERF_SAMPLE_BRANCH_USER; \
-	     (x) < PERF_SAMPLE_BRANCH_MAX; (x) <<= 1)
-
 /*
  * x86control flow change classification
  * x86control flow changes include branches, interrupts, traps, faults
@@ -403,14 +399,14 @@ static int intel_pmu_setup_hw_lbr_filter(struct perf_event *event)
 {
 	struct hw_perf_event_extra *reg;
 	u64 br_type = event->attr.branch_sample_type;
-	u64 mask = 0, m;
-	u64 v;
+	u64 mask = 0, v;
+	int i;
 
-	for_each_branch_sample_type(m) {
-		if (!(br_type & m))
+	for (i = 0; i < PERF_SAMPLE_BRANCH_SELECT_MAP_SIZE; i++) {
+		if (!(br_type & (1ULL << i)))
 			continue;
 
-		v = x86_pmu.lbr_sel_map[m];
+		v = x86_pmu.lbr_sel_map[i];
 		if (v == LBR_NOT_SUPP)
 			return -EOPNOTSUPP;
 
@@ -678,35 +674,35 @@ intel_pmu_lbr_filter(struct cpu_hw_events *cpuc)
 /*
  * Map interface branch filters onto LBR filters
  */
-static const int nhm_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX] = {
-	[PERF_SAMPLE_BRANCH_ANY]	= LBR_ANY,
-	[PERF_SAMPLE_BRANCH_USER]	= LBR_USER,
-	[PERF_SAMPLE_BRANCH_KERNEL]	= LBR_KERNEL,
-	[PERF_SAMPLE_BRANCH_HV]		= LBR_IGN,
-	[PERF_SAMPLE_BRANCH_ANY_RETURN]	= LBR_RETURN | LBR_REL_JMP
-					| LBR_IND_JMP | LBR_FAR,
+static const int nhm_lbr_sel_map[PERF_SAMPLE_BRANCH_SELECT_MAP_SIZE] = {
+	[PERF_SAMPLE_BRANCH_ANY_SHIFT]		= LBR_ANY,
+	[PERF_SAMPLE_BRANCH_USER_SHIFT]		= LBR_USER,
+	[PERF_SAMPLE_BRANCH_KERNEL_SHIFT]	= LBR_KERNEL,
+	[PERF_SAMPLE_BRANCH_HV_SHIFT]		= LBR_IGN,
+	[PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT]	= LBR_RETURN | LBR_REL_JMP
+						| LBR_IND_JMP | LBR_FAR,
 	/*
 	 * NHM/WSM erratum: must include REL_JMP+IND_JMP to get CALL branches
 	 */
-	[PERF_SAMPLE_BRANCH_ANY_CALL] =
+	[PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT] =
 	 LBR_REL_CALL | LBR_IND_CALL | LBR_REL_JMP | LBR_IND_JMP | LBR_FAR,
 	/*
 	 * NHM/WSM erratum: must include IND_JMP to capture IND_CALL
 	 */
-	[PERF_SAMPLE_BRANCH_IND_CALL] = LBR_IND_CALL | LBR_IND_JMP,
-	[PERF_SAMPLE_BRANCH_COND]     = LBR_JCC,
+	[PERF_SAMPLE_BRANCH_IND_CALL_SHIFT] = LBR_IND_CALL | LBR_IND_JMP,
+	[PERF_SAMPLE_BRANCH_COND_SHIFT]     = LBR_JCC,
 };
 
-static const int snb_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX] = {
-	[PERF_SAMPLE_BRANCH_ANY]	= LBR_ANY,
-	[PERF_SAMPLE_BRANCH_USER]	= LBR_USER,
-	[PERF_SAMPLE_BRANCH_KERNEL]	= LBR_KERNEL,
-	[PERF_SAMPLE_BRANCH_HV]		= LBR_IGN,
-	[PERF_SAMPLE_BRANCH_ANY_RETURN]	= LBR_RETURN | LBR_FAR,
-	[PERF_SAMPLE_BRANCH_ANY_CALL]	= LBR_REL_CALL | LBR_IND_CALL
-					| LBR_FAR,
-	[PERF_SAMPLE_BRANCH_IND_CALL]	= LBR_IND_CALL,
-	[PERF_SAMPLE_BRANCH_COND]       = LBR_JCC,
+static const int snb_lbr_sel_map[PERF_SAMPLE_BRANCH_SELECT_MAP_SIZE] = {
+	[PERF_SAMPLE_BRANCH_ANY_SHIFT]		= LBR_ANY,
+	[PERF_SAMPLE_BRANCH_USER_SHIFT]		= LBR_USER,
+	[PERF_SAMPLE_BRANCH_KERNEL_SHIFT]	= LBR_KERNEL,
+	[PERF_SAMPLE_BRANCH_HV_SHIFT]		= LBR_IGN,
+	[PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT]	= LBR_RETURN | LBR_FAR,
+	[PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT]	= LBR_REL_CALL | LBR_IND_CALL
+						| LBR_FAR,
+	[PERF_SAMPLE_BRANCH_IND_CALL_SHIFT]	= LBR_IND_CALL,
+	[PERF_SAMPLE_BRANCH_COND_SHIFT]		= LBR_JCC,
 };
 
 /* core */
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 9b79abbd1ab8..e46b93279e3d 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -152,21 +152,42 @@ enum perf_event_sample_format {
  * The branch types can be combined, however BRANCH_ANY covers all types
  * of branches and therefore it supersedes all the other types.
  */
+enum perf_branch_sample_type_shift {
+	PERF_SAMPLE_BRANCH_USER_SHIFT		= 0, /* user branches */
+	PERF_SAMPLE_BRANCH_KERNEL_SHIFT		= 1, /* kernel branches */
+	PERF_SAMPLE_BRANCH_HV_SHIFT		= 2, /* hypervisor branches */
+
+	PERF_SAMPLE_BRANCH_ANY_SHIFT		= 3, /* any branch types */
+	PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT	= 4, /* any call branch */
+	PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT	= 5, /* any return branch */
+	PERF_SAMPLE_BRANCH_IND_CALL_SHIFT	= 6, /* indirect calls */
+	PERF_SAMPLE_BRANCH_ABORT_TX_SHIFT	= 7, /* transaction aborts */
+	PERF_SAMPLE_BRANCH_IN_TX_SHIFT		= 8, /* in transaction */
+	PERF_SAMPLE_BRANCH_NO_TX_SHIFT		= 9, /* not in transaction */
+	PERF_SAMPLE_BRANCH_COND_SHIFT		= 10, /* conditional branches */
+
+	PERF_SAMPLE_BRANCH_MAX_SHIFT		/* non-ABI */
+};
+
 enum perf_branch_sample_type {
-	PERF_SAMPLE_BRANCH_USER		= 1U << 0, /* user branches */
-	PERF_SAMPLE_BRANCH_KERNEL	= 1U << 1, /* kernel branches */
-	PERF_SAMPLE_BRANCH_HV		= 1U << 2, /* hypervisor branches */
-
-	PERF_SAMPLE_BRANCH_ANY		= 1U << 3, /* any branch types */
-	PERF_SAMPLE_BRANCH_ANY_CALL	= 1U << 4, /* any call branch */
-	PERF_SAMPLE_BRANCH_ANY_RETURN	= 1U << 5, /* any return branch */
-	PERF_SAMPLE_BRANCH_IND_CALL	= 1U << 6, /* indirect calls */
-	PERF_SAMPLE_BRANCH_ABORT_TX	= 1U << 7, /* transaction aborts */
-	PERF_SAMPLE_BRANCH_IN_TX	= 1U << 8, /* in transaction */
-	PERF_SAMPLE_BRANCH_NO_TX	= 1U << 9, /* not in transaction */
-	PERF_SAMPLE_BRANCH_COND		= 1U << 10, /* conditional branches */
-
-	PERF_SAMPLE_BRANCH_MAX		= 1U << 11, /* non-ABI */
+	PERF_SAMPLE_BRANCH_USER		= 1U << PERF_SAMPLE_BRANCH_USER_SHIFT,
+	PERF_SAMPLE_BRANCH_KERNEL	= 1U << PERF_SAMPLE_BRANCH_KERNEL_SHIFT,
+	PERF_SAMPLE_BRANCH_HV		= 1U << PERF_SAMPLE_BRANCH_HV_SHIFT,
+
+	PERF_SAMPLE_BRANCH_ANY		= 1U << PERF_SAMPLE_BRANCH_ANY_SHIFT,
+	PERF_SAMPLE_BRANCH_ANY_CALL	=
+				1U << PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT,
+	PERF_SAMPLE_BRANCH_ANY_RETURN	=
+				1U << PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT,
+	PERF_SAMPLE_BRANCH_IND_CALL	=
+				1U << PERF_SAMPLE_BRANCH_IND_CALL_SHIFT,
+	PERF_SAMPLE_BRANCH_ABORT_TX	=
+				1U << PERF_SAMPLE_BRANCH_ABORT_TX_SHIFT,
+	PERF_SAMPLE_BRANCH_IN_TX	= 1U << PERF_SAMPLE_BRANCH_IN_TX_SHIFT,
+	PERF_SAMPLE_BRANCH_NO_TX	= 1U << PERF_SAMPLE_BRANCH_NO_TX_SHIFT,
+	PERF_SAMPLE_BRANCH_COND		= 1U << PERF_SAMPLE_BRANCH_COND_SHIFT,
+
+	PERF_SAMPLE_BRANCH_MAX		= 1U << PERF_SAMPLE_BRANCH_MAX_SHIFT,
 };
 
 #define PERF_SAMPLE_BRANCH_PLM_ALL \
-- 
cgit v1.2.3


From ba532500c5651a4be4108acc64ed99a95cb005b3 Mon Sep 17 00:00:00 2001
From: Yan, Zheng
Date: Tue, 4 Nov 2014 21:55:58 -0500
Subject: perf: Introduce pmu context switch callback

The callback is invoked when process is scheduled in or out.
It provides mechanism for later patches to save/store the LBR
stack. For the schedule in case, the callback is invoked at
the same place that flush branch stack callback is invoked.
So it also can replace the flush branch stack callback. To
avoid unnecessary overhead, the callback is enabled only when
there are events use the LBR stack.

Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
Signed-off-by: Kan Liang <kan.liang@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Cc: eranian@google.com
Cc: jolsa@redhat.com
Link: http://lkml.kernel.org/r/1415156173-10035-3-git-send-email-kan.liang@intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event.c |  7 +++++
 arch/x86/kernel/cpu/perf_event.h |  2 ++
 include/linux/perf_event.h       |  9 +++++++
 kernel/events/core.c             | 57 ++++++++++++++++++++++++++++++++++++++++
 4 files changed, 75 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index b71a7f86d68a..0efbd6cc2966 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -1914,6 +1914,12 @@ static const struct attribute_group *x86_pmu_attr_groups[] = {
 	NULL,
 };
 
+static void x86_pmu_sched_task(struct perf_event_context *ctx, bool sched_in)
+{
+	if (x86_pmu.sched_task)
+		x86_pmu.sched_task(ctx, sched_in);
+}
+
 static void x86_pmu_flush_branch_stack(void)
 {
 	if (x86_pmu.flush_branch_stack)
@@ -1950,6 +1956,7 @@ static struct pmu pmu = {
 
 	.event_idx		= x86_pmu_event_idx,
 	.flush_branch_stack	= x86_pmu_flush_branch_stack,
+	.sched_task		= x86_pmu_sched_task,
 };
 
 void arch_perf_update_userpage(struct perf_event *event,
diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index 0c45b22495dc..211b54c0c00c 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -473,6 +473,8 @@ struct x86_pmu {
 
 	void		(*check_microcode)(void);
 	void		(*flush_branch_stack)(void);
+	void		(*sched_task)(struct perf_event_context *ctx,
+				      bool sched_in);
 
 	/*
 	 * Intel Arch Perfmon v2+
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 33262004c310..fbab6235d053 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -265,6 +265,13 @@ struct pmu {
 	 * flush branch stack on context-switches (needed in cpu-wide mode)
 	 */
 	void (*flush_branch_stack)	(void);
+
+	/*
+	 * context-switches callback
+	 */
+	void (*sched_task)		(struct perf_event_context *ctx,
+					bool sched_in);
+
 };
 
 /**
@@ -558,6 +565,8 @@ extern void perf_event_delayed_put(struct task_struct *task);
 extern void perf_event_print_debug(void);
 extern void perf_pmu_disable(struct pmu *pmu);
 extern void perf_pmu_enable(struct pmu *pmu);
+extern void perf_sched_cb_dec(struct pmu *pmu);
+extern void perf_sched_cb_inc(struct pmu *pmu);
 extern int perf_event_task_disable(void);
 extern int perf_event_task_enable(void);
 extern int perf_event_refresh(struct perf_event *event, int refresh);
diff --git a/kernel/events/core.c b/kernel/events/core.c
index fef45b4bb5f8..6c8b31b7efb6 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -154,6 +154,7 @@ enum event_type_t {
 struct static_key_deferred perf_sched_events __read_mostly;
 static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
 static DEFINE_PER_CPU(atomic_t, perf_branch_stack_events);
+static DEFINE_PER_CPU(int, perf_sched_cb_usages);
 
 static atomic_t nr_mmap_events __read_mostly;
 static atomic_t nr_comm_events __read_mostly;
@@ -2577,6 +2578,56 @@ unlock:
 	}
 }
 
+void perf_sched_cb_dec(struct pmu *pmu)
+{
+	this_cpu_dec(perf_sched_cb_usages);
+}
+
+void perf_sched_cb_inc(struct pmu *pmu)
+{
+	this_cpu_inc(perf_sched_cb_usages);
+}
+
+/*
+ * This function provides the context switch callback to the lower code
+ * layer. It is invoked ONLY when the context switch callback is enabled.
+ */
+static void perf_pmu_sched_task(struct task_struct *prev,
+				struct task_struct *next,
+				bool sched_in)
+{
+	struct perf_cpu_context *cpuctx;
+	struct pmu *pmu;
+	unsigned long flags;
+
+	if (prev == next)
+		return;
+
+	local_irq_save(flags);
+
+	rcu_read_lock();
+
+	list_for_each_entry_rcu(pmu, &pmus, entry) {
+		if (pmu->sched_task) {
+			cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+
+			perf_ctx_lock(cpuctx, cpuctx->task_ctx);
+
+			perf_pmu_disable(pmu);
+
+			pmu->sched_task(cpuctx->task_ctx, sched_in);
+
+			perf_pmu_enable(pmu);
+
+			perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
+		}
+	}
+
+	rcu_read_unlock();
+
+	local_irq_restore(flags);
+}
+
 #define for_each_task_context_nr(ctxn)					\
 	for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++)
 
@@ -2596,6 +2647,9 @@ void __perf_event_task_sched_out(struct task_struct *task,
 {
 	int ctxn;
 
+	if (__this_cpu_read(perf_sched_cb_usages))
+		perf_pmu_sched_task(task, next, false);
+
 	for_each_task_context_nr(ctxn)
 		perf_event_context_sched_out(task, ctxn, next);
 
@@ -2847,6 +2901,9 @@ void __perf_event_task_sched_in(struct task_struct *prev,
 	/* check for system-wide branch_stack events */
 	if (atomic_read(this_cpu_ptr(&perf_branch_stack_events)))
 		perf_branch_stack_sched_in(prev, task);
+
+	if (__this_cpu_read(perf_sched_cb_usages))
+		perf_pmu_sched_task(prev, task, true);
 }
 
 static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
-- 
cgit v1.2.3


From 2a0ad3b326a9024ba86dca4028499d31fa0c6c4d Mon Sep 17 00:00:00 2001
From: Yan, Zheng
Date: Tue, 4 Nov 2014 21:55:59 -0500
Subject: perf/x86/intel: Use context switch callback to flush LBR stack

Previous commit introduces context switch callback, its function
overlaps with the flush branch stack callback. So we can use the
context switch callback to flush LBR stack.

This patch adds code that uses the flush branch callback to
flush the LBR stack when task is being scheduled in. The callback
is enabled only when there are events use the LBR hardware. This
patch also removes all old flush branch stack code.

Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
Signed-off-by: Kan Liang <kan.liang@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Cc: eranian@google.com
Cc: jolsa@redhat.com
Link: http://lkml.kernel.org/r/1415156173-10035-4-git-send-email-kan.liang@intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event.c           |  7 ---
 arch/x86/kernel/cpu/perf_event.h           |  3 +-
 arch/x86/kernel/cpu/perf_event_intel.c     | 14 +-----
 arch/x86/kernel/cpu/perf_event_intel_lbr.c | 27 +++++++++++
 include/linux/perf_event.h                 |  1 -
 kernel/events/core.c                       | 77 ------------------------------
 6 files changed, 30 insertions(+), 99 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 0efbd6cc2966..6b1fd26a37cf 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -1920,12 +1920,6 @@ static void x86_pmu_sched_task(struct perf_event_context *ctx, bool sched_in)
 		x86_pmu.sched_task(ctx, sched_in);
 }
 
-static void x86_pmu_flush_branch_stack(void)
-{
-	if (x86_pmu.flush_branch_stack)
-		x86_pmu.flush_branch_stack();
-}
-
 void perf_check_microcode(void)
 {
 	if (x86_pmu.check_microcode)
@@ -1955,7 +1949,6 @@ static struct pmu pmu = {
 	.commit_txn		= x86_pmu_commit_txn,
 
 	.event_idx		= x86_pmu_event_idx,
-	.flush_branch_stack	= x86_pmu_flush_branch_stack,
 	.sched_task		= x86_pmu_sched_task,
 };
 
diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index 211b54c0c00c..949d0083a29e 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -472,7 +472,6 @@ struct x86_pmu {
 	void		(*cpu_dead)(int cpu);
 
 	void		(*check_microcode)(void);
-	void		(*flush_branch_stack)(void);
 	void		(*sched_task)(struct perf_event_context *ctx,
 				      bool sched_in);
 
@@ -733,6 +732,8 @@ void intel_pmu_pebs_disable_all(void);
 
 void intel_ds_init(void);
 
+void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in);
+
 void intel_pmu_lbr_reset(void);
 
 void intel_pmu_lbr_enable(struct perf_event *event);
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 498b6d967138..424fbf74dee7 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -2044,18 +2044,6 @@ static void intel_pmu_cpu_dying(int cpu)
 	fini_debug_store_on_cpu(cpu);
 }
 
-static void intel_pmu_flush_branch_stack(void)
-{
-	/*
-	 * Intel LBR does not tag entries with the
-	 * PID of the current task, then we need to
-	 * flush it on ctxsw
-	 * For now, we simply reset it
-	 */
-	if (x86_pmu.lbr_nr)
-		intel_pmu_lbr_reset();
-}
-
 PMU_FORMAT_ATTR(offcore_rsp, "config1:0-63");
 
 PMU_FORMAT_ATTR(ldlat, "config1:0-15");
@@ -2107,7 +2095,7 @@ static __initconst const struct x86_pmu intel_pmu = {
 	.cpu_starting		= intel_pmu_cpu_starting,
 	.cpu_dying		= intel_pmu_cpu_dying,
 	.guest_get_msrs		= intel_guest_get_msrs,
-	.flush_branch_stack	= intel_pmu_flush_branch_stack,
+	.sched_task		= intel_pmu_lbr_sched_task,
 };
 
 static __init void intel_clovertown_quirk(void)
diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
index 8bc078f43a82..c0e23c5f85bb 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
@@ -177,6 +177,31 @@ void intel_pmu_lbr_reset(void)
 		intel_pmu_lbr_reset_64();
 }
 
+void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in)
+{
+	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+
+	if (!x86_pmu.lbr_nr)
+		return;
+
+	/*
+	 * When sampling the branck stack in system-wide, it may be
+	 * necessary to flush the stack on context switch. This happens
+	 * when the branch stack does not tag its entries with the pid
+	 * of the current task. Otherwise it becomes impossible to
+	 * associate a branch entry with a task. This ambiguity is more
+	 * likely to appear when the branch stack supports priv level
+	 * filtering and the user sets it to monitor only at the user
+	 * level (which could be a useful measurement in system-wide
+	 * mode). In that case, the risk is high of having a branch
+	 * stack with branch from multiple tasks.
+ 	 */
+	if (sched_in) {
+		intel_pmu_lbr_reset();
+		cpuc->lbr_context = ctx;
+	}
+}
+
 void intel_pmu_lbr_enable(struct perf_event *event)
 {
 	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
@@ -195,6 +220,7 @@ void intel_pmu_lbr_enable(struct perf_event *event)
 	cpuc->br_sel = event->hw.branch_reg.reg;
 
 	cpuc->lbr_users++;
+	perf_sched_cb_inc(event->ctx->pmu);
 }
 
 void intel_pmu_lbr_disable(struct perf_event *event)
@@ -206,6 +232,7 @@ void intel_pmu_lbr_disable(struct perf_event *event)
 
 	cpuc->lbr_users--;
 	WARN_ON_ONCE(cpuc->lbr_users < 0);
+	perf_sched_cb_dec(event->ctx->pmu);
 
 	if (cpuc->enabled && !cpuc->lbr_users) {
 		__intel_pmu_lbr_disable();
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index fbab6235d053..c7007a564440 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -511,7 +511,6 @@ struct perf_event_context {
 	u64				generation;
 	int				pin_count;
 	int				nr_cgroups;	 /* cgroup evts */
-	int				nr_branch_stack; /* branch_stack evt */
 	struct rcu_head			rcu_head;
 
 	struct delayed_work		orphans_remove;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 6c8b31b7efb6..f563ce767f93 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -153,7 +153,6 @@ enum event_type_t {
  */
 struct static_key_deferred perf_sched_events __read_mostly;
 static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
-static DEFINE_PER_CPU(atomic_t, perf_branch_stack_events);
 static DEFINE_PER_CPU(int, perf_sched_cb_usages);
 
 static atomic_t nr_mmap_events __read_mostly;
@@ -1240,9 +1239,6 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
 	if (is_cgroup_event(event))
 		ctx->nr_cgroups++;
 
-	if (has_branch_stack(event))
-		ctx->nr_branch_stack++;
-
 	list_add_rcu(&event->event_entry, &ctx->event_list);
 	ctx->nr_events++;
 	if (event->attr.inherit_stat)
@@ -1409,9 +1405,6 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
 			cpuctx->cgrp = NULL;
 	}
 
-	if (has_branch_stack(event))
-		ctx->nr_branch_stack--;
-
 	ctx->nr_events--;
 	if (event->attr.inherit_stat)
 		ctx->nr_stat--;
@@ -2808,64 +2801,6 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
 	perf_ctx_unlock(cpuctx, ctx);
 }
 
-/*
- * When sampling the branck stack in system-wide, it may be necessary
- * to flush the stack on context switch. This happens when the branch
- * stack does not tag its entries with the pid of the current task.
- * Otherwise it becomes impossible to associate a branch entry with a
- * task. This ambiguity is more likely to appear when the branch stack
- * supports priv level filtering and the user sets it to monitor only
- * at the user level (which could be a useful measurement in system-wide
- * mode). In that case, the risk is high of having a branch stack with
- * branch from multiple tasks. Flushing may mean dropping the existing
- * entries or stashing them somewhere in the PMU specific code layer.
- *
- * This function provides the context switch callback to the lower code
- * layer. It is invoked ONLY when there is at least one system-wide context
- * with at least one active event using taken branch sampling.
- */
-static void perf_branch_stack_sched_in(struct task_struct *prev,
-				       struct task_struct *task)
-{
-	struct perf_cpu_context *cpuctx;
-	struct pmu *pmu;
-	unsigned long flags;
-
-	/* no need to flush branch stack if not changing task */
-	if (prev == task)
-		return;
-
-	local_irq_save(flags);
-
-	rcu_read_lock();
-
-	list_for_each_entry_rcu(pmu, &pmus, entry) {
-		cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
-
-		/*
-		 * check if the context has at least one
-		 * event using PERF_SAMPLE_BRANCH_STACK
-		 */
-		if (cpuctx->ctx.nr_branch_stack > 0
-		    && pmu->flush_branch_stack) {
-
-			perf_ctx_lock(cpuctx, cpuctx->task_ctx);
-
-			perf_pmu_disable(pmu);
-
-			pmu->flush_branch_stack();
-
-			perf_pmu_enable(pmu);
-
-			perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
-		}
-	}
-
-	rcu_read_unlock();
-
-	local_irq_restore(flags);
-}
-
 /*
  * Called from scheduler to add the events of the current task
  * with interrupts disabled.
@@ -2898,10 +2833,6 @@ void __perf_event_task_sched_in(struct task_struct *prev,
 	if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
 		perf_cgroup_sched_in(prev, task);
 
-	/* check for system-wide branch_stack events */
-	if (atomic_read(this_cpu_ptr(&perf_branch_stack_events)))
-		perf_branch_stack_sched_in(prev, task);
-
 	if (__this_cpu_read(perf_sched_cb_usages))
 		perf_pmu_sched_task(prev, task, true);
 }
@@ -3480,10 +3411,6 @@ static void unaccount_event_cpu(struct perf_event *event, int cpu)
 	if (event->parent)
 		return;
 
-	if (has_branch_stack(event)) {
-		if (!(event->attach_state & PERF_ATTACH_TASK))
-			atomic_dec(&per_cpu(perf_branch_stack_events, cpu));
-	}
 	if (is_cgroup_event(event))
 		atomic_dec(&per_cpu(perf_cgroup_events, cpu));
 }
@@ -7139,10 +7066,6 @@ static void account_event_cpu(struct perf_event *event, int cpu)
 	if (event->parent)
 		return;
 
-	if (has_branch_stack(event)) {
-		if (!(event->attach_state & PERF_ATTACH_TASK))
-			atomic_inc(&per_cpu(perf_branch_stack_events, cpu));
-	}
 	if (is_cgroup_event(event))
 		atomic_inc(&per_cpu(perf_cgroup_events, cpu));
 }
-- 
cgit v1.2.3


From e9d7f7cd97c45e2c612d3b38be05b4cfb27939ee Mon Sep 17 00:00:00 2001
From: Yan, Zheng
Date: Tue, 4 Nov 2014 21:56:00 -0500
Subject: perf/x86/intel: Add basic Haswell LBR call stack support

Haswell has a new feature that utilizes the existing LBR facility to
record call chains. To enable this feature, bits (JCC, NEAR_IND_JMP,
NEAR_REL_JMP, FAR_BRANCH, EN_CALLSTACK) in LBR_SELECT must be set to 1,
bits (NEAR_REL_CALL, NEAR-IND_CALL, NEAR_RET) must be cleared. Due to
a hardware bug of Haswell, this feature doesn't work well with
FREEZE_LBRS_ON_PMI.

When the call stack feature is enabled, the LBR stack will capture
unfiltered call data normally, but as return instructions are executed,
the last captured branch record is flushed from the on-chip registers
in a last-in first-out (LIFO) manner. Thus, branch information relative
to leaf functions will not be captured, while preserving the call stack
information of the main line execution path.

This patch defines a separate lbr_sel map for Haswell. The map contains
a new entry for the call stack feature.

Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
Signed-off-by: Kan Liang <kan.liang@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Cc: eranian@google.com
Cc: jolsa@redhat.com
Link: http://lkml.kernel.org/r/1415156173-10035-5-git-send-email-kan.liang@intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event.h           | 14 ++++-
 arch/x86/kernel/cpu/perf_event_intel.c     |  2 +-
 arch/x86/kernel/cpu/perf_event_intel_lbr.c | 91 ++++++++++++++++++++++--------
 3 files changed, 83 insertions(+), 24 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index 949d0083a29e..c9a62c5bca75 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -517,7 +517,11 @@ struct x86_pmu {
 };
 
 enum {
-	PERF_SAMPLE_BRANCH_SELECT_MAP_SIZE = PERF_SAMPLE_BRANCH_MAX_SHIFT,
+	PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT = PERF_SAMPLE_BRANCH_MAX_SHIFT,
+	PERF_SAMPLE_BRANCH_SELECT_MAP_SIZE,
+
+	PERF_SAMPLE_BRANCH_CALL_STACK =
+				1U << PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT,
 };
 
 #define x86_add_quirk(func_)						\
@@ -551,6 +555,12 @@ static struct perf_pmu_events_attr event_attr_##v = {			\
 
 extern struct x86_pmu x86_pmu __read_mostly;
 
+static inline bool x86_pmu_has_lbr_callstack(void)
+{
+	return  x86_pmu.lbr_sel_map &&
+		x86_pmu.lbr_sel_map[PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT] > 0;
+}
+
 DECLARE_PER_CPU(struct cpu_hw_events, cpu_hw_events);
 
 int x86_perf_event_set_period(struct perf_event *event);
@@ -754,6 +764,8 @@ void intel_pmu_lbr_init_atom(void);
 
 void intel_pmu_lbr_init_snb(void);
 
+void intel_pmu_lbr_init_hsw(void);
+
 int intel_pmu_setup_lbr_filter(struct perf_event *event);
 
 int p4_pmu_init(void);
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 424fbf74dee7..a485ba124476 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -2537,7 +2537,7 @@ __init int intel_pmu_init(void)
 		memcpy(hw_cache_event_ids, snb_hw_cache_event_ids, sizeof(hw_cache_event_ids));
 		memcpy(hw_cache_extra_regs, snb_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
 
-		intel_pmu_lbr_init_snb();
+		intel_pmu_lbr_init_hsw();
 
 		x86_pmu.event_constraints = intel_hsw_event_constraints;
 		x86_pmu.pebs_constraints = intel_hsw_pebs_event_constraints;
diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
index c0e23c5f85bb..ac8279e560a1 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
@@ -39,6 +39,7 @@ static enum {
 #define LBR_IND_JMP_BIT		6 /* do not capture indirect jumps */
 #define LBR_REL_JMP_BIT		7 /* do not capture relative jumps */
 #define LBR_FAR_BIT		8 /* do not capture far branches */
+#define LBR_CALL_STACK_BIT	9 /* enable call stack */
 
 #define LBR_KERNEL	(1 << LBR_KERNEL_BIT)
 #define LBR_USER	(1 << LBR_USER_BIT)
@@ -49,6 +50,7 @@ static enum {
 #define LBR_REL_JMP	(1 << LBR_REL_JMP_BIT)
 #define LBR_IND_JMP	(1 << LBR_IND_JMP_BIT)
 #define LBR_FAR		(1 << LBR_FAR_BIT)
+#define LBR_CALL_STACK	(1 << LBR_CALL_STACK_BIT)
 
 #define LBR_PLM (LBR_KERNEL | LBR_USER)
 
@@ -74,24 +76,25 @@ static enum {
  * x86control flow changes include branches, interrupts, traps, faults
  */
 enum {
-	X86_BR_NONE     = 0,      /* unknown */
-
-	X86_BR_USER     = 1 << 0, /* branch target is user */
-	X86_BR_KERNEL   = 1 << 1, /* branch target is kernel */
-
-	X86_BR_CALL     = 1 << 2, /* call */
-	X86_BR_RET      = 1 << 3, /* return */
-	X86_BR_SYSCALL  = 1 << 4, /* syscall */
-	X86_BR_SYSRET   = 1 << 5, /* syscall return */
-	X86_BR_INT      = 1 << 6, /* sw interrupt */
-	X86_BR_IRET     = 1 << 7, /* return from interrupt */
-	X86_BR_JCC      = 1 << 8, /* conditional */
-	X86_BR_JMP      = 1 << 9, /* jump */
-	X86_BR_IRQ      = 1 << 10,/* hw interrupt or trap or fault */
-	X86_BR_IND_CALL = 1 << 11,/* indirect calls */
-	X86_BR_ABORT    = 1 << 12,/* transaction abort */
-	X86_BR_IN_TX    = 1 << 13,/* in transaction */
-	X86_BR_NO_TX    = 1 << 14,/* not in transaction */
+	X86_BR_NONE		= 0,      /* unknown */
+
+	X86_BR_USER		= 1 << 0, /* branch target is user */
+	X86_BR_KERNEL		= 1 << 1, /* branch target is kernel */
+
+	X86_BR_CALL		= 1 << 2, /* call */
+	X86_BR_RET		= 1 << 3, /* return */
+	X86_BR_SYSCALL		= 1 << 4, /* syscall */
+	X86_BR_SYSRET		= 1 << 5, /* syscall return */
+	X86_BR_INT		= 1 << 6, /* sw interrupt */
+	X86_BR_IRET		= 1 << 7, /* return from interrupt */
+	X86_BR_JCC		= 1 << 8, /* conditional */
+	X86_BR_JMP		= 1 << 9, /* jump */
+	X86_BR_IRQ		= 1 << 10,/* hw interrupt or trap or fault */
+	X86_BR_IND_CALL		= 1 << 11,/* indirect calls */
+	X86_BR_ABORT		= 1 << 12,/* transaction abort */
+	X86_BR_IN_TX		= 1 << 13,/* in transaction */
+	X86_BR_NO_TX		= 1 << 14,/* not in transaction */
+	X86_BR_CALL_STACK	= 1 << 15,/* call stack */
 };
 
 #define X86_BR_PLM (X86_BR_USER | X86_BR_KERNEL)
@@ -373,7 +376,7 @@ void intel_pmu_lbr_read(void)
  * - in case there is no HW filter
  * - in case the HW filter has errata or limitations
  */
-static void intel_pmu_setup_sw_lbr_filter(struct perf_event *event)
+static int intel_pmu_setup_sw_lbr_filter(struct perf_event *event)
 {
 	u64 br_type = event->attr.branch_sample_type;
 	int mask = 0;
@@ -410,11 +413,21 @@ static void intel_pmu_setup_sw_lbr_filter(struct perf_event *event)
 	if (br_type & PERF_SAMPLE_BRANCH_COND)
 		mask |= X86_BR_JCC;
 
+	if (br_type & PERF_SAMPLE_BRANCH_CALL_STACK) {
+		if (!x86_pmu_has_lbr_callstack())
+			return -EOPNOTSUPP;
+		if (mask & ~(X86_BR_USER | X86_BR_KERNEL))
+			return -EINVAL;
+		mask |= X86_BR_CALL | X86_BR_IND_CALL | X86_BR_RET |
+			X86_BR_CALL_STACK;
+	}
+
 	/*
 	 * stash actual user request into reg, it may
 	 * be used by fixup code for some CPU
 	 */
 	event->hw.branch_reg.reg = mask;
+	return 0;
 }
 
 /*
@@ -443,8 +456,12 @@ static int intel_pmu_setup_hw_lbr_filter(struct perf_event *event)
 	reg = &event->hw.branch_reg;
 	reg->idx = EXTRA_REG_LBR;
 
-	/* LBR_SELECT operates in suppress mode so invert mask */
-	reg->config = ~mask & x86_pmu.lbr_sel_mask;
+	/*
+	 * The first 9 bits (LBR_SEL_MASK) in LBR_SELECT operate
+	 * in suppress mode. So LBR_SELECT should be set to
+	 * (~mask & LBR_SEL_MASK) | (mask & ~LBR_SEL_MASK)
+	 */
+	reg->config = mask ^ x86_pmu.lbr_sel_mask;
 
 	return 0;
 }
@@ -462,7 +479,9 @@ int intel_pmu_setup_lbr_filter(struct perf_event *event)
 	/*
 	 * setup SW LBR filter
 	 */
-	intel_pmu_setup_sw_lbr_filter(event);
+	ret = intel_pmu_setup_sw_lbr_filter(event);
+	if (ret)
+		return ret;
 
 	/*
 	 * setup HW LBR filter, if any
@@ -732,6 +751,20 @@ static const int snb_lbr_sel_map[PERF_SAMPLE_BRANCH_SELECT_MAP_SIZE] = {
 	[PERF_SAMPLE_BRANCH_COND_SHIFT]		= LBR_JCC,
 };
 
+static const int hsw_lbr_sel_map[PERF_SAMPLE_BRANCH_SELECT_MAP_SIZE] = {
+	[PERF_SAMPLE_BRANCH_ANY_SHIFT]		= LBR_ANY,
+	[PERF_SAMPLE_BRANCH_USER_SHIFT]		= LBR_USER,
+	[PERF_SAMPLE_BRANCH_KERNEL_SHIFT]	= LBR_KERNEL,
+	[PERF_SAMPLE_BRANCH_HV_SHIFT]		= LBR_IGN,
+	[PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT]	= LBR_RETURN | LBR_FAR,
+	[PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT]	= LBR_REL_CALL | LBR_IND_CALL
+						| LBR_FAR,
+	[PERF_SAMPLE_BRANCH_IND_CALL_SHIFT]	= LBR_IND_CALL,
+	[PERF_SAMPLE_BRANCH_COND_SHIFT]		= LBR_JCC,
+	[PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT]	= LBR_REL_CALL | LBR_IND_CALL
+						| LBR_RETURN | LBR_CALL_STACK,
+};
+
 /* core */
 void __init intel_pmu_lbr_init_core(void)
 {
@@ -788,6 +821,20 @@ void __init intel_pmu_lbr_init_snb(void)
 	pr_cont("16-deep LBR, ");
 }
 
+/* haswell */
+void intel_pmu_lbr_init_hsw(void)
+{
+	x86_pmu.lbr_nr	 = 16;
+	x86_pmu.lbr_tos	 = MSR_LBR_TOS;
+	x86_pmu.lbr_from = MSR_LBR_NHM_FROM;
+	x86_pmu.lbr_to   = MSR_LBR_NHM_TO;
+
+	x86_pmu.lbr_sel_mask = LBR_SEL_MASK;
+	x86_pmu.lbr_sel_map  = hsw_lbr_sel_map;
+
+	pr_cont("16-deep LBR, ");
+}
+
 /* atom */
 void __init intel_pmu_lbr_init_atom(void)
 {
-- 
cgit v1.2.3


From e18bf526422769611e7248135e36a4cea0e4e38d Mon Sep 17 00:00:00 2001
From: Yan, Zheng
Date: Tue, 4 Nov 2014 21:56:03 -0500
Subject: perf/x86/intel: Allocate space for storing LBR stack

When the LBR call stack is enabled, it is necessary to save/restore
the LBR stack on context switch. We can use pmu specific data to
store LBR stack when task is scheduled out. This patch adds code
that allocates the pmu specific data.

Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
Signed-off-by: Kan Liang <kan.liang@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Stephane Eranian <eranian@google.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Cc: jolsa@redhat.com
Link: http://lkml.kernel.org/r/1415156173-10035-8-git-send-email-kan.liang@intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event.c | 4 ++++
 arch/x86/kernel/cpu/perf_event.h | 7 +++++++
 2 files changed, 11 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 6b1fd26a37cf..8ffd71ec2173 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -432,6 +432,9 @@ int x86_pmu_hw_config(struct perf_event *event)
 		}
 	}
 
+	if (event->attr.branch_sample_type & PERF_SAMPLE_BRANCH_CALL_STACK)
+		event->attach_state |= PERF_ATTACH_TASK_DATA;
+
 	/*
 	 * Generate PMC IRQs:
 	 * (keep 'enabled' bit clear for now)
@@ -1950,6 +1953,7 @@ static struct pmu pmu = {
 
 	.event_idx		= x86_pmu_event_idx,
 	.sched_task		= x86_pmu_sched_task,
+	.task_ctx_size          = sizeof(struct x86_perf_task_context),
 };
 
 void arch_perf_update_userpage(struct perf_event *event,
diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index c9a62c5bca75..69c26b396cf4 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -516,6 +516,13 @@ struct x86_pmu {
 	struct perf_guest_switch_msr *(*guest_get_msrs)(int *nr);
 };
 
+struct x86_perf_task_context {
+	u64 lbr_from[MAX_LBR_ENTRIES];
+	u64 lbr_to[MAX_LBR_ENTRIES];
+	int lbr_callstack_users;
+	int lbr_stack_state;
+};
+
 enum {
 	PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT = PERF_SAMPLE_BRANCH_MAX_SHIFT,
 	PERF_SAMPLE_BRANCH_SELECT_MAP_SIZE,
-- 
cgit v1.2.3


From 63f0c1d84196b712fe9de99a8514486ab416d517 Mon Sep 17 00:00:00 2001
From: Yan, Zheng
Date: Tue, 4 Nov 2014 21:56:04 -0500
Subject: perf/x86/intel: Track number of events that use the LBR callstack

When enabling/disabling an event, check if the event uses the LBR
callstack feature, adjust the LBR callstack usage count accordingly.
Later patch will use the usage count to decide if LBR stack should
be saved/restored.

Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
Signed-off-by: Kan Liang <kan.liang@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: eranian@google.com
Cc: jolsa@redhat.com
Link: http://lkml.kernel.org/r/1415156173-10035-9-git-send-email-kan.liang@intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event_intel_lbr.c | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
index ac8279e560a1..ac8e54200934 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
@@ -205,9 +205,15 @@ void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in)
 	}
 }
 
+static inline bool branch_user_callstack(unsigned br_sel)
+{
+	return (br_sel & X86_BR_USER) && (br_sel & X86_BR_CALL_STACK);
+}
+
 void intel_pmu_lbr_enable(struct perf_event *event)
 {
 	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+	struct x86_perf_task_context *task_ctx;
 
 	if (!x86_pmu.lbr_nr)
 		return;
@@ -222,6 +228,12 @@ void intel_pmu_lbr_enable(struct perf_event *event)
 	}
 	cpuc->br_sel = event->hw.branch_reg.reg;
 
+	if (branch_user_callstack(cpuc->br_sel) && event->ctx &&
+					event->ctx->task_ctx_data) {
+		task_ctx = event->ctx->task_ctx_data;
+		task_ctx->lbr_callstack_users++;
+	}
+
 	cpuc->lbr_users++;
 	perf_sched_cb_inc(event->ctx->pmu);
 }
@@ -229,10 +241,17 @@ void intel_pmu_lbr_enable(struct perf_event *event)
 void intel_pmu_lbr_disable(struct perf_event *event)
 {
 	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+	struct x86_perf_task_context *task_ctx;
 
 	if (!x86_pmu.lbr_nr)
 		return;
 
+	if (branch_user_callstack(cpuc->br_sel) && event->ctx &&
+					event->ctx->task_ctx_data) {
+		task_ctx = event->ctx->task_ctx_data;
+		task_ctx->lbr_callstack_users--;
+	}
+
 	cpuc->lbr_users--;
 	WARN_ON_ONCE(cpuc->lbr_users < 0);
 	perf_sched_cb_dec(event->ctx->pmu);
-- 
cgit v1.2.3


From 76cb2c617f12a4dd53c0e899972813b805ad6cc2 Mon Sep 17 00:00:00 2001
From: Yan, Zheng
Date: Tue, 4 Nov 2014 21:56:05 -0500
Subject: perf/x86/intel: Save/restore LBR stack during context switch

When the LBR call stack is enabled, it is necessary to save/restore
the LBR stack on context switch. The solution is saving/restoring
the LBR stack to/from task's perf event context.

The LBR stack is saved/restored only when there are events that use
the LBR call stack. If no event uses LBR call stack, the LBR stack
is reset when task is scheduled in.

Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
Signed-off-by: Kan Liang <kan.liang@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: eranian@google.com
Cc: jolsa@redhat.com
Link: http://lkml.kernel.org/r/1415156173-10035-10-git-send-email-kan.liang@intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event_intel_lbr.c | 88 ++++++++++++++++++++++++++----
 1 file changed, 76 insertions(+), 12 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
index ac8e54200934..a8b3f236cf37 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
@@ -180,13 +180,89 @@ void intel_pmu_lbr_reset(void)
 		intel_pmu_lbr_reset_64();
 }
 
+/*
+ * TOS = most recently recorded branch
+ */
+static inline u64 intel_pmu_lbr_tos(void)
+{
+	u64 tos;
+
+	rdmsrl(x86_pmu.lbr_tos, tos);
+	return tos;
+}
+
+enum {
+	LBR_NONE,
+	LBR_VALID,
+};
+
+static void __intel_pmu_lbr_restore(struct x86_perf_task_context *task_ctx)
+{
+	int i;
+	unsigned lbr_idx, mask;
+	u64 tos;
+
+	if (task_ctx->lbr_callstack_users == 0 ||
+	    task_ctx->lbr_stack_state == LBR_NONE) {
+		intel_pmu_lbr_reset();
+		return;
+	}
+
+	mask = x86_pmu.lbr_nr - 1;
+	tos = intel_pmu_lbr_tos();
+	for (i = 0; i < x86_pmu.lbr_nr; i++) {
+		lbr_idx = (tos - i) & mask;
+		wrmsrl(x86_pmu.lbr_from + lbr_idx, task_ctx->lbr_from[i]);
+		wrmsrl(x86_pmu.lbr_to + lbr_idx, task_ctx->lbr_to[i]);
+	}
+	task_ctx->lbr_stack_state = LBR_NONE;
+}
+
+static void __intel_pmu_lbr_save(struct x86_perf_task_context *task_ctx)
+{
+	int i;
+	unsigned lbr_idx, mask;
+	u64 tos;
+
+	if (task_ctx->lbr_callstack_users == 0) {
+		task_ctx->lbr_stack_state = LBR_NONE;
+		return;
+	}
+
+	mask = x86_pmu.lbr_nr - 1;
+	tos = intel_pmu_lbr_tos();
+	for (i = 0; i < x86_pmu.lbr_nr; i++) {
+		lbr_idx = (tos - i) & mask;
+		rdmsrl(x86_pmu.lbr_from + lbr_idx, task_ctx->lbr_from[i]);
+		rdmsrl(x86_pmu.lbr_to + lbr_idx, task_ctx->lbr_to[i]);
+	}
+	task_ctx->lbr_stack_state = LBR_VALID;
+}
+
 void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in)
 {
 	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+	struct x86_perf_task_context *task_ctx;
 
 	if (!x86_pmu.lbr_nr)
 		return;
 
+	/*
+	 * If LBR callstack feature is enabled and the stack was saved when
+	 * the task was scheduled out, restore the stack. Otherwise flush
+	 * the LBR stack.
+	 */
+	task_ctx = ctx ? ctx->task_ctx_data : NULL;
+	if (task_ctx) {
+		if (sched_in) {
+			__intel_pmu_lbr_restore(task_ctx);
+			cpuc->lbr_context = ctx;
+		} else {
+			__intel_pmu_lbr_save(task_ctx);
+		}
+		return;
+	}
+
 	/*
 	 * When sampling the branck stack in system-wide, it may be
 	 * necessary to flush the stack on context switch. This happens
@@ -279,18 +355,6 @@ void intel_pmu_lbr_disable_all(void)
 		__intel_pmu_lbr_disable();
 }
 
-/*
- * TOS = most recently recorded branch
- */
-static inline u64 intel_pmu_lbr_tos(void)
-{
-	u64 tos;
-
-	rdmsrl(x86_pmu.lbr_tos, tos);
-
-	return tos;
-}
-
 static void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc)
 {
 	unsigned long mask = x86_pmu.lbr_nr - 1;
-- 
cgit v1.2.3


From a46a23000198d929391aa9dac8de68734efa2703 Mon Sep 17 00:00:00 2001
From: Yan, Zheng
Date: Tue, 4 Nov 2014 21:56:06 -0500
Subject: perf: Simplify the branch stack check

Use event->attr.branch_sample_type to replace
intel_pmu_needs_lbr_smpl() for avoiding duplicated code that
implicitly enables the LBR.

Currently, branch stack can be enabled by user explicitly requesting
branch sampling or implicit branch sampling to correct PEBS skid.

For user explicitly requested branch sampling, the branch_sample_type
is explicitly set by user. For PEBS case, the branch_sample_type is also
implicitly set to PERF_SAMPLE_BRANCH_ANY in x86_pmu_hw_config.

Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
Signed-off-by: Kan Liang <kan.liang@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: eranian@google.com
Cc: jolsa@redhat.com
Link: http://lkml.kernel.org/r/1415156173-10035-11-git-send-email-kan.liang@intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event_intel.c | 20 +++-----------------
 include/linux/perf_event.h             |  5 +++++
 kernel/events/core.c                   |  3 +++
 3 files changed, 11 insertions(+), 17 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index a485ba124476..9f1dd18fa395 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -1029,20 +1029,6 @@ static __initconst const u64 slm_hw_cache_event_ids
  },
 };
 
-static inline bool intel_pmu_needs_lbr_smpl(struct perf_event *event)
-{
-	/* user explicitly requested branch sampling */
-	if (has_branch_stack(event))
-		return true;
-
-	/* implicit branch sampling to correct PEBS skid */
-	if (x86_pmu.intel_cap.pebs_trap && event->attr.precise_ip > 1 &&
-	    x86_pmu.intel_cap.pebs_format < 2)
-		return true;
-
-	return false;
-}
-
 static void intel_pmu_disable_all(void)
 {
 	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
@@ -1207,7 +1193,7 @@ static void intel_pmu_disable_event(struct perf_event *event)
 	 * must disable before any actual event
 	 * because any event may be combined with LBR
 	 */
-	if (intel_pmu_needs_lbr_smpl(event))
+	if (needs_branch_stack(event))
 		intel_pmu_lbr_disable(event);
 
 	if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
@@ -1268,7 +1254,7 @@ static void intel_pmu_enable_event(struct perf_event *event)
 	 * must enabled before any actual event
 	 * because any event may be combined with LBR
 	 */
-	if (intel_pmu_needs_lbr_smpl(event))
+	if (needs_branch_stack(event))
 		intel_pmu_lbr_enable(event);
 
 	if (event->attr.exclude_host)
@@ -1747,7 +1733,7 @@ static int intel_pmu_hw_config(struct perf_event *event)
 	if (event->attr.precise_ip && x86_pmu.pebs_aliases)
 		x86_pmu.pebs_aliases(event);
 
-	if (intel_pmu_needs_lbr_smpl(event)) {
+	if (needs_branch_stack(event)) {
 		ret = intel_pmu_setup_lbr_filter(event);
 		if (ret)
 			return ret;
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 270cd0173e61..43cc158487e6 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -814,6 +814,11 @@ static inline bool has_branch_stack(struct perf_event *event)
 	return event->attr.sample_type & PERF_SAMPLE_BRANCH_STACK;
 }
 
+static inline bool needs_branch_stack(struct perf_event *event)
+{
+	return event->attr.branch_sample_type != 0;
+}
+
 extern int perf_output_begin(struct perf_output_handle *handle,
 			     struct perf_event *event, unsigned int size);
 extern void perf_output_end(struct perf_output_handle *handle);
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 84451c0debba..257eccf9afd4 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -7232,6 +7232,9 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
 	if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP))
 		goto err_ns;
 
+	if (!has_branch_stack(event))
+		event->attr.branch_sample_type = 0;
+
 	pmu = perf_init_event(event);
 	if (!pmu)
 		goto err_ns;
-- 
cgit v1.2.3


From 4b854900995194601d767fcd112307b21ed278b2 Mon Sep 17 00:00:00 2001
From: Yan, Zheng
Date: Tue, 4 Nov 2014 21:56:08 -0500
Subject: perf/x86/intel: Re-organize code that implicitly enables LBR/PEBS

Make later patch more readable, no logic change.

Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
Signed-off-by: Kan Liang <kan.liang@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: eranian@google.com
Cc: jolsa@redhat.com
Link: http://lkml.kernel.org/r/1415156173-10035-13-git-send-email-kan.liang@intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event.c | 59 ++++++++++++++++++++--------------------
 1 file changed, 29 insertions(+), 30 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 8ffd71ec2173..e0dab5ce61e9 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -399,36 +399,35 @@ int x86_pmu_hw_config(struct perf_event *event)
 
 		if (event->attr.precise_ip > precise)
 			return -EOPNOTSUPP;
-		/*
-		 * check that PEBS LBR correction does not conflict with
-		 * whatever the user is asking with attr->branch_sample_type
-		 */
-		if (event->attr.precise_ip > 1 &&
-		    x86_pmu.intel_cap.pebs_format < 2) {
-			u64 *br_type = &event->attr.branch_sample_type;
-
-			if (has_branch_stack(event)) {
-				if (!precise_br_compat(event))
-					return -EOPNOTSUPP;
-
-				/* branch_sample_type is compatible */
-
-			} else {
-				/*
-				 * user did not specify  branch_sample_type
-				 *
-				 * For PEBS fixups, we capture all
-				 * the branches at the priv level of the
-				 * event.
-				 */
-				*br_type = PERF_SAMPLE_BRANCH_ANY;
-
-				if (!event->attr.exclude_user)
-					*br_type |= PERF_SAMPLE_BRANCH_USER;
-
-				if (!event->attr.exclude_kernel)
-					*br_type |= PERF_SAMPLE_BRANCH_KERNEL;
-			}
+	}
+	/*
+	 * check that PEBS LBR correction does not conflict with
+	 * whatever the user is asking with attr->branch_sample_type
+	 */
+	if (event->attr.precise_ip > 1 && x86_pmu.intel_cap.pebs_format < 2) {
+		u64 *br_type = &event->attr.branch_sample_type;
+
+		if (has_branch_stack(event)) {
+			if (!precise_br_compat(event))
+				return -EOPNOTSUPP;
+
+			/* branch_sample_type is compatible */
+
+		} else {
+			/*
+			 * user did not specify  branch_sample_type
+			 *
+			 * For PEBS fixups, we capture all
+			 * the branches at the priv level of the
+			 * event.
+			 */
+			*br_type = PERF_SAMPLE_BRANCH_ANY;
+
+			if (!event->attr.exclude_user)
+				*br_type |= PERF_SAMPLE_BRANCH_USER;
+
+			if (!event->attr.exclude_kernel)
+				*br_type |= PERF_SAMPLE_BRANCH_KERNEL;
 		}
 	}
 
-- 
cgit v1.2.3


From 2c70d0086e4e9e2440f0f78098090f32bde14277 Mon Sep 17 00:00:00 2001
From: Yan, Zheng
Date: Tue, 4 Nov 2014 21:56:10 -0500
Subject: perf/x86/intel: Disable FREEZE_LBRS_ON_PMI when LBR operates in
 callstack mode

LBR callstack is designed for PEBS, It does not work well with
FREEZE_LBRS_ON_PMI for non PEBS event. If FREEZE_LBRS_ON_PMI is set for
non PEBS event, PMIs near call/return instructions may cause superfluous
increase/decrease of LBR_TOS.

This patch modifies __intel_pmu_lbr_enable() to not enable
FREEZE_LBRS_ON_PMI when LBR operates in callstack mode. We currently
don't use LBR callstack to capture kernel space callchain, so disabling
FREEZE_LBRS_ON_PMI should not be a problem.

Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
Signed-off-by: Kan Liang <kan.liang@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: eranian@google.com
Cc: jolsa@redhat.com
Link: http://lkml.kernel.org/r/1415156173-10035-15-git-send-email-kan.liang@intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event_intel_lbr.c | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
index a8b3f236cf37..92a44fdbc9d3 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
@@ -131,14 +131,23 @@ static void intel_pmu_lbr_filter(struct cpu_hw_events *cpuc);
 
 static void __intel_pmu_lbr_enable(void)
 {
-	u64 debugctl;
 	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+	u64 debugctl, lbr_select = 0;
 
-	if (cpuc->lbr_sel)
-		wrmsrl(MSR_LBR_SELECT, cpuc->lbr_sel->config);
+	if (cpuc->lbr_sel) {
+		lbr_select = cpuc->lbr_sel->config;
+		wrmsrl(MSR_LBR_SELECT, lbr_select);
+	}
 
 	rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
-	debugctl |= (DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI);
+	debugctl |= DEBUGCTLMSR_LBR;
+	/*
+	 * LBR callstack does not work well with FREEZE_LBRS_ON_PMI.
+	 * If FREEZE_LBRS_ON_PMI is set, PMI near call/return instructions
+	 * may cause superfluous increase/decrease of LBR_TOS.
+	 */
+	if (!(lbr_select & LBR_CALL_STACK))
+		debugctl |= DEBUGCTLMSR_FREEZE_LBRS_ON_PMI;
 	wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
 }
 
-- 
cgit v1.2.3


From aa54ae9b87b83af7edabcc34a299e7e014609af4 Mon Sep 17 00:00:00 2001
From: Yan, Zheng
Date: Tue, 4 Nov 2014 21:56:11 -0500
Subject: perf/x86/intel: Discard zero length call entries in LBR call stack

"Zero length call" uses the attribute of the call instruction to push
the immediate instruction pointer on to the stack and then pops off
that address into a register. This is accomplished without any matching
return instruction. It confuses the hardware and make the recorded call
stack incorrect.

We can partially resolve this issue by: decode call instructions and
discard any zero length call entry in the LBR stack.

Signed-off-by: Yan, Zheng <zheng.z.yan@intel.com>
Signed-off-by: Kan Liang <kan.liang@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: eranian@google.com
Cc: jolsa@redhat.com
Link: http://lkml.kernel.org/r/1415156173-10035-16-git-send-email-kan.liang@intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event_intel_lbr.c | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
index 92a44fdbc9d3..084f2eb20c8b 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
@@ -94,7 +94,8 @@ enum {
 	X86_BR_ABORT		= 1 << 12,/* transaction abort */
 	X86_BR_IN_TX		= 1 << 13,/* in transaction */
 	X86_BR_NO_TX		= 1 << 14,/* not in transaction */
-	X86_BR_CALL_STACK	= 1 << 15,/* call stack */
+	X86_BR_ZERO_CALL	= 1 << 15,/* zero length call */
+	X86_BR_CALL_STACK	= 1 << 16,/* call stack */
 };
 
 #define X86_BR_PLM (X86_BR_USER | X86_BR_KERNEL)
@@ -111,13 +112,15 @@ enum {
 	 X86_BR_JMP	 |\
 	 X86_BR_IRQ	 |\
 	 X86_BR_ABORT	 |\
-	 X86_BR_IND_CALL)
+	 X86_BR_IND_CALL |\
+	 X86_BR_ZERO_CALL)
 
 #define X86_BR_ALL (X86_BR_PLM | X86_BR_ANY)
 
 #define X86_BR_ANY_CALL		 \
 	(X86_BR_CALL		|\
 	 X86_BR_IND_CALL	|\
+	 X86_BR_ZERO_CALL	|\
 	 X86_BR_SYSCALL		|\
 	 X86_BR_IRQ		|\
 	 X86_BR_INT)
@@ -702,6 +705,12 @@ static int branch_type(unsigned long from, unsigned long to, int abort)
 		ret = X86_BR_INT;
 		break;
 	case 0xe8: /* call near rel */
+		insn_get_immediate(&insn);
+		if (insn.immediate1.value == 0) {
+			/* zero length call */
+			ret = X86_BR_ZERO_CALL;
+			break;
+		}
 	case 0x9a: /* call far absolute */
 		ret = X86_BR_CALL;
 		break;
-- 
cgit v1.2.3


From 2c44b1936bb3b135a3fac8b3493394d42e51cf70 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Wed, 5 Nov 2014 10:36:45 +0100
Subject: perf/x86/intel: Expose LBR callstack to user space tooling

With LBR call stack feature enable, there are three callchain options.
Enable the 3rd callchain option (LBR callstack) to user space tooling.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Kan Liang <kan.liang@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Cc: linux-api@vger.kernel.org
Link: http://lkml.kernel.org/r/20141105093759.GQ10501@worktop.programming.kicks-ass.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event.h           |  8 --------
 arch/x86/kernel/cpu/perf_event_intel_lbr.c |  8 ++++----
 include/uapi/linux/perf_event.h            | 16 ++++++++--------
 3 files changed, 12 insertions(+), 20 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index 69c26b396cf4..a371d27d6795 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -523,14 +523,6 @@ struct x86_perf_task_context {
 	int lbr_stack_state;
 };
 
-enum {
-	PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT = PERF_SAMPLE_BRANCH_MAX_SHIFT,
-	PERF_SAMPLE_BRANCH_SELECT_MAP_SIZE,
-
-	PERF_SAMPLE_BRANCH_CALL_STACK =
-				1U << PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT,
-};
-
 #define x86_add_quirk(func_)						\
 do {									\
 	static struct x86_pmu_quirk __quirk __initdata = {		\
diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
index 084f2eb20c8b..0473874109cb 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
@@ -537,7 +537,7 @@ static int intel_pmu_setup_hw_lbr_filter(struct perf_event *event)
 	u64 mask = 0, v;
 	int i;
 
-	for (i = 0; i < PERF_SAMPLE_BRANCH_SELECT_MAP_SIZE; i++) {
+	for (i = 0; i < PERF_SAMPLE_BRANCH_MAX_SHIFT; i++) {
 		if (!(br_type & (1ULL << i)))
 			continue;
 
@@ -821,7 +821,7 @@ intel_pmu_lbr_filter(struct cpu_hw_events *cpuc)
 /*
  * Map interface branch filters onto LBR filters
  */
-static const int nhm_lbr_sel_map[PERF_SAMPLE_BRANCH_SELECT_MAP_SIZE] = {
+static const int nhm_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = {
 	[PERF_SAMPLE_BRANCH_ANY_SHIFT]		= LBR_ANY,
 	[PERF_SAMPLE_BRANCH_USER_SHIFT]		= LBR_USER,
 	[PERF_SAMPLE_BRANCH_KERNEL_SHIFT]	= LBR_KERNEL,
@@ -840,7 +840,7 @@ static const int nhm_lbr_sel_map[PERF_SAMPLE_BRANCH_SELECT_MAP_SIZE] = {
 	[PERF_SAMPLE_BRANCH_COND_SHIFT]     = LBR_JCC,
 };
 
-static const int snb_lbr_sel_map[PERF_SAMPLE_BRANCH_SELECT_MAP_SIZE] = {
+static const int snb_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = {
 	[PERF_SAMPLE_BRANCH_ANY_SHIFT]		= LBR_ANY,
 	[PERF_SAMPLE_BRANCH_USER_SHIFT]		= LBR_USER,
 	[PERF_SAMPLE_BRANCH_KERNEL_SHIFT]	= LBR_KERNEL,
@@ -852,7 +852,7 @@ static const int snb_lbr_sel_map[PERF_SAMPLE_BRANCH_SELECT_MAP_SIZE] = {
 	[PERF_SAMPLE_BRANCH_COND_SHIFT]		= LBR_JCC,
 };
 
-static const int hsw_lbr_sel_map[PERF_SAMPLE_BRANCH_SELECT_MAP_SIZE] = {
+static const int hsw_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = {
 	[PERF_SAMPLE_BRANCH_ANY_SHIFT]		= LBR_ANY,
 	[PERF_SAMPLE_BRANCH_USER_SHIFT]		= LBR_USER,
 	[PERF_SAMPLE_BRANCH_KERNEL_SHIFT]	= LBR_KERNEL,
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index e46b93279e3d..1e3cd07cf76e 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -166,6 +166,8 @@ enum perf_branch_sample_type_shift {
 	PERF_SAMPLE_BRANCH_NO_TX_SHIFT		= 9, /* not in transaction */
 	PERF_SAMPLE_BRANCH_COND_SHIFT		= 10, /* conditional branches */
 
+	PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT	= 11, /* call/ret stack */
+
 	PERF_SAMPLE_BRANCH_MAX_SHIFT		/* non-ABI */
 };
 
@@ -175,18 +177,16 @@ enum perf_branch_sample_type {
 	PERF_SAMPLE_BRANCH_HV		= 1U << PERF_SAMPLE_BRANCH_HV_SHIFT,
 
 	PERF_SAMPLE_BRANCH_ANY		= 1U << PERF_SAMPLE_BRANCH_ANY_SHIFT,
-	PERF_SAMPLE_BRANCH_ANY_CALL	=
-				1U << PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT,
-	PERF_SAMPLE_BRANCH_ANY_RETURN	=
-				1U << PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT,
-	PERF_SAMPLE_BRANCH_IND_CALL	=
-				1U << PERF_SAMPLE_BRANCH_IND_CALL_SHIFT,
-	PERF_SAMPLE_BRANCH_ABORT_TX	=
-				1U << PERF_SAMPLE_BRANCH_ABORT_TX_SHIFT,
+	PERF_SAMPLE_BRANCH_ANY_CALL	= 1U << PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT,
+	PERF_SAMPLE_BRANCH_ANY_RETURN	= 1U << PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT,
+	PERF_SAMPLE_BRANCH_IND_CALL	= 1U << PERF_SAMPLE_BRANCH_IND_CALL_SHIFT,
+	PERF_SAMPLE_BRANCH_ABORT_TX	= 1U << PERF_SAMPLE_BRANCH_ABORT_TX_SHIFT,
 	PERF_SAMPLE_BRANCH_IN_TX	= 1U << PERF_SAMPLE_BRANCH_IN_TX_SHIFT,
 	PERF_SAMPLE_BRANCH_NO_TX	= 1U << PERF_SAMPLE_BRANCH_NO_TX_SHIFT,
 	PERF_SAMPLE_BRANCH_COND		= 1U << PERF_SAMPLE_BRANCH_COND_SHIFT,
 
+	PERF_SAMPLE_BRANCH_CALL_STACK	= 1U << PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT,
+
 	PERF_SAMPLE_BRANCH_MAX		= 1U << PERF_SAMPLE_BRANCH_MAX_SHIFT,
 };
 
-- 
cgit v1.2.3


From 4421f8f0fa02bc982b410cd773223cc280791c54 Mon Sep 17 00:00:00 2001
From: Miroslav Benes
Date: Wed, 18 Feb 2015 15:21:07 +0100
Subject: livepatch: remove extern specifier from header files

Storage-class specifier 'extern' is redundant in front of the function
declaration. According to the C specification it has the same meaning as
if not present at all. So remove it.

Signed-off-by: Miroslav Benes <mbenes@suse.cz>
Acked-by: Josh Poimboeuf <jpoimboe@redhat.com>
Reviewed-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 arch/x86/include/asm/livepatch.h | 4 ++--
 include/linux/livepatch.h        | 8 ++++----
 2 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/livepatch.h b/arch/x86/include/asm/livepatch.h
index a455a53d789a..2d29197bd2fb 100644
--- a/arch/x86/include/asm/livepatch.h
+++ b/arch/x86/include/asm/livepatch.h
@@ -32,8 +32,8 @@ static inline int klp_check_compiler_support(void)
 #endif
 	return 0;
 }
-extern int klp_write_module_reloc(struct module *mod, unsigned long type,
-				  unsigned long loc, unsigned long value);
+int klp_write_module_reloc(struct module *mod, unsigned long type,
+			   unsigned long loc, unsigned long value);
 
 static inline void klp_arch_set_pc(struct pt_regs *regs, unsigned long ip)
 {
diff --git a/include/linux/livepatch.h b/include/linux/livepatch.h
index 95023fd8b00d..ee6dbb39a809 100644
--- a/include/linux/livepatch.h
+++ b/include/linux/livepatch.h
@@ -123,10 +123,10 @@ struct klp_patch {
 	enum klp_state state;
 };
 
-extern int klp_register_patch(struct klp_patch *);
-extern int klp_unregister_patch(struct klp_patch *);
-extern int klp_enable_patch(struct klp_patch *);
-extern int klp_disable_patch(struct klp_patch *);
+int klp_register_patch(struct klp_patch *);
+int klp_unregister_patch(struct klp_patch *);
+int klp_enable_patch(struct klp_patch *);
+int klp_disable_patch(struct klp_patch *);
 
 #endif /* CONFIG_LIVEPATCH */
 
-- 
cgit v1.2.3


From 8a764a875fe3cf3a83296bacd00bfc41917e95e2 Mon Sep 17 00:00:00 2001
From: Denys Vlasenko
Date: Thu, 12 Feb 2015 20:04:39 +0100
Subject: x86/asm/decoder: Create artificial 3rd byte for 2-byte VEX

Before this patch, users need to do this to fetch vex.vvvv:

        if (insn->vex_prefix.nbytes == 2) {
                vex_vvvv = ((insn->vex_prefix.bytes[1] >> 3) & 0xf) ^ 0xf;
        }
        if (insn->vex_prefix.nbytes == 3) {
                vex_vvvv = ((insn->vex_prefix.bytes[2] >> 3) & 0xf) ^ 0xf;
        }

Make it so that insn->vex_prefix.bytes[2] always contains
vex.wvvvvLpp bits.

Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
Acked-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Cc: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Cc: Frank Ch. Eigler <fche@redhat.com>
Cc: Jim Keniston <jkenisto@linux.vnet.ibm.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Link: http://lkml.kernel.org/r/1423767879-31691-1-git-send-email-dvlasenk@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/lib/insn.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/lib/insn.c b/arch/x86/lib/insn.c
index 1313ae6b478b..5ea00ddc1825 100644
--- a/arch/x86/lib/insn.c
+++ b/arch/x86/lib/insn.c
@@ -164,6 +164,12 @@ found:
 				/* VEX.W overrides opnd_size */
 				insn->opnd_bytes = 8;
 		} else {
+			/*
+			 * For VEX2, fake VEX3-like byte#2.
+			 * Makes it easier to decode vex.W, vex.vvvv,
+			 * vex.L and vex.pp. Masking with 0x7f sets vex.W == 0.
+			 */
+			insn->vex_prefix.bytes[2] = b2 & 0x7f;
 			insn->vex_prefix.nbytes = 2;
 			insn->next_byte += 2;
 		}
-- 
cgit v1.2.3


From cbb53b9623a70f012e1fdfb6fc0af6878df4762b Mon Sep 17 00:00:00 2001
From: Denys Vlasenko
Date: Thu, 12 Feb 2015 20:06:57 +0100
Subject: x86/asm/decoder: Explain CALLW discrepancy between Intel and AMD

In 64-bit mode, AMD and Intel CPUs treat 0x66 prefix before
branch insns differently. For near branches, it affects decode
too since immediate offset's width is different.

See these empirical tests:

  http://marc.info/?l=linux-kernel&m=139714939728946&w=2

Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Link: http://lkml.kernel.org/r/1423768017-31766-1-git-send-email-dvlasenk@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/lib/x86-opcode-map.txt | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt
index 1a2be7c6895d..816488c0b97e 100644
--- a/arch/x86/lib/x86-opcode-map.txt
+++ b/arch/x86/lib/x86-opcode-map.txt
@@ -273,6 +273,9 @@ dd: ESC
 de: ESC
 df: ESC
 # 0xe0 - 0xef
+# Note: "forced64" is Intel CPU behavior: they ignore 0x66 prefix
+# in 64-bit mode. AMD CPUs accept 0x66 prefix, it causes RIP truncation
+# to 16 bits. In 32-bit mode, 0x66 is accepted by both Intel and AMD.
 e0: LOOPNE/LOOPNZ Jb (f64)
 e1: LOOPE/LOOPZ Jb (f64)
 e2: LOOP Jb (f64)
@@ -281,6 +284,10 @@ e4: IN AL,Ib
 e5: IN eAX,Ib
 e6: OUT Ib,AL
 e7: OUT Ib,eAX
+# With 0x66 prefix in 64-bit mode, for AMD CPUs immediate offset
+# in "near" jumps and calls is 16-bit. For CALL,
+# push of return address is 16-bit wide, RSP is decremented by 2
+# but is not truncated to 16 bits, unlike RIP.
 e8: CALL Jz (f64)
 e9: JMP-near Jz (f64)
 ea: JMP-far Ap (i64)
@@ -456,6 +463,7 @@ AVXcode: 1
 7e: movd/q Ey,Pd | vmovd/q Ey,Vy (66),(v1) | vmovq Vq,Wq (F3),(v1)
 7f: movq Qq,Pq | vmovdqa Wx,Vx (66) | vmovdqu Wx,Vx (F3)
 # 0x0f 0x80-0x8f
+# Note: "forced64" is Intel CPU behavior (see comment about CALL insn).
 80: JO Jz (f64)
 81: JNO Jz (f64)
 82: JB/JC/JNAE Jz (f64)
@@ -842,6 +850,7 @@ EndTable
 GrpTable: Grp5
 0: INC Ev
 1: DEC Ev
+# Note: "forced64" is Intel CPU behavior (see comment about CALL insn).
 2: CALLN Ev (f64)
 3: CALLF Ep
 4: JMPN Ev (f64)
-- 
cgit v1.2.3


From e0fd24a3b4ad7b4084b41944835952eedec53f98 Mon Sep 17 00:00:00 2001
From: Jan Beulich
Date: Thu, 5 Feb 2015 15:39:34 +0000
Subject: x86/Kconfig: Avoid issuing pointless turned off entries to .config

Settings without prompts shouldn't normally have defaults other
than Y, as otherwise they (a) needlessly enlarge the resulting
.config and (b) if they ever get a prompt added later, the
tracked setting of off will prevent the devloper from then being
prompted for his/her choice when doing an incremental update of
the configuration (make oldconfig).

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Link: http://lkml.kernel.org/r/54D39CC6020000780005D6AE@mail.emea.novell.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/Kconfig | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 5e28e2be3a41..463d8838f1db 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -233,12 +233,10 @@ config ARCH_WANT_GENERAL_HUGETLB
 	def_bool y
 
 config ZONE_DMA32
-	bool
-	default X86_64
+	def_bool y if X86_64
 
 config AUDIT_ARCH
-	bool
-	default X86_64
+	def_bool y if X86_64
 
 config ARCH_SUPPORTS_OPTIMIZED_INLINING
 	def_bool y
@@ -1115,10 +1113,10 @@ config MICROCODE_OLD_INTERFACE
 	depends on MICROCODE
 
 config MICROCODE_INTEL_EARLY
-	def_bool n
+	bool
 
 config MICROCODE_AMD_EARLY
-	def_bool n
+	bool
 
 config MICROCODE_EARLY
 	bool "Early load microcode"
-- 
cgit v1.2.3


From b1da1e715d4faf01468b7f45f7098922bc85ea8e Mon Sep 17 00:00:00 2001
From: Jan Beulich
Date: Thu, 5 Feb 2015 15:35:21 +0000
Subject: x86/Kconfig: Simplify X86_IO_APIC dependencies

Since dependencies are transitive, we don't really need to
repeat those of X86_UP_IOAPIC.

Furthermore avoid the symbol getting entered into .config when
it is off by having the default simply Y and the dependencies
solely handled via the intended for that purpose "depends on".

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Link: http://lkml.kernel.org/r/54D39BC9020000780005D688@mail.emea.novell.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/Kconfig | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 463d8838f1db..afb75f5c243c 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -893,8 +893,8 @@ config X86_LOCAL_APIC
 	select GENERIC_IRQ_LEGACY_ALLOC_HWIRQ
 
 config X86_IO_APIC
-	def_bool X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_IOAPIC
-	depends on X86_LOCAL_APIC
+	def_bool y
+	depends on X86_LOCAL_APIC || X86_UP_IOAPIC
 	select IRQ_DOMAIN
 
 config X86_REROUTE_FOR_BROKEN_BOOT_IRQS
-- 
cgit v1.2.3


From 50849eefea3ba8aa6e540e0cbdc9533098f25656 Mon Sep 17 00:00:00 2001
From: Jan Beulich
Date: Thu, 5 Feb 2015 15:31:56 +0000
Subject: x86/Kconfig: Simplify X86_UP_APIC handling

We don't really need a helper symbol for that. For one, it's
pointlessly getting set to Y for all configurations (even 64-bit
ones). And then the purpose can be fulfilled by suitably
adjusting X86_UP_APIC: Hide its prompt when PCI_MSI, and default
it to PCI_MSI.

Tested-by: Bryan O'Donoghue <pure.logic@nexus-software.ie>
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Link: http://lkml.kernel.org/r/54D39AFC020000780005D684@mail.emea.novell.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/Kconfig | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index afb75f5c243c..c226c2bda7f3 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -859,7 +859,8 @@ config UP_LATE_INIT
        depends on !SMP && X86_LOCAL_APIC
 
 config X86_UP_APIC
-	bool "Local APIC support on uniprocessors"
+	bool "Local APIC support on uniprocessors" if !PCI_MSI
+	default PCI_MSI
 	depends on X86_32 && !SMP && !X86_32_NON_STANDARD
 	---help---
 	  A local APIC (Advanced Programmable Interrupt Controller) is an
@@ -871,10 +872,6 @@ config X86_UP_APIC
 	  performance counters), and the NMI watchdog which detects hard
 	  lockups.
 
-config X86_UP_APIC_MSI
-	def_bool y
-	select X86_UP_APIC if X86_32 && !SMP && !X86_32_NON_STANDARD && PCI_MSI
-
 config X86_UP_IOAPIC
 	bool "IO-APIC support on uniprocessors"
 	depends on X86_UP_APIC
-- 
cgit v1.2.3


From 91e5ed49fca09c2b83b262b9757d1376ee2b46c3 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski
Date: Tue, 27 Jan 2015 16:06:02 -0800
Subject: x86/asm/decoder: Fix and enforce max instruction size in the insn
 decoder

x86 instructions cannot exceed 15 bytes, and the instruction
decoder should enforce that.  Prior to 6ba48ff46f76, the
instruction length limit was implicitly set to 16, which was an
approximation of 15, but there is currently no limit at all.

Fix MAX_INSN_SIZE (it should be 15, not 16), and fix the decoder
to reject instructions that exceed MAX_INSN_SIZE.

Other than potentially confusing some of the decoder sanity
checks, I'm not aware of any actual problems that omitting this
check would cause, nor am I aware of any practical problems
caused by the MAX_INSN_SIZE error.

Signed-off-by: Andy Lutomirski <luto@amacapital.net>
Acked-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Fixes: 6ba48ff46f76 ("x86: Remove arbitrary instruction size limit ...
Link: http://lkml.kernel.org/r/f8f0bc9b8c58cfd6830f7d88400bf1396cbdcd0f.1422403511.git.luto@amacapital.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/insn.h | 2 +-
 arch/x86/lib/insn.c         | 7 +++++++
 2 files changed, 8 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/insn.h b/arch/x86/include/asm/insn.h
index 47f29b1d1846..e7814b74caf8 100644
--- a/arch/x86/include/asm/insn.h
+++ b/arch/x86/include/asm/insn.h
@@ -69,7 +69,7 @@ struct insn {
 	const insn_byte_t *next_byte;
 };
 
-#define MAX_INSN_SIZE	16
+#define MAX_INSN_SIZE	15
 
 #define X86_MODRM_MOD(modrm) (((modrm) & 0xc0) >> 6)
 #define X86_MODRM_REG(modrm) (((modrm) & 0x38) >> 3)
diff --git a/arch/x86/lib/insn.c b/arch/x86/lib/insn.c
index 5ea00ddc1825..8f72b334aea0 100644
--- a/arch/x86/lib/insn.c
+++ b/arch/x86/lib/insn.c
@@ -52,6 +52,13 @@
  */
 void insn_init(struct insn *insn, const void *kaddr, int buf_len, int x86_64)
 {
+	/*
+	 * Instructions longer than MAX_INSN_SIZE (15 bytes) are invalid
+	 * even if the input buffer is long enough to hold them.
+	 */
+	if (buf_len > MAX_INSN_SIZE)
+		buf_len = MAX_INSN_SIZE;
+
 	memset(insn, 0, sizeof(*insn));
 	insn->kaddr = kaddr;
 	insn->end_kaddr = kaddr + buf_len;
-- 
cgit v1.2.3


From 5b171e8218044d3c951d20a39512df861e349722 Mon Sep 17 00:00:00 2001
From: Alexander Kuleshov
Date: Wed, 28 Jan 2015 00:16:28 +0600
Subject: x86/asm/boot: Fix path in comments

Signed-off-by: Alexander Kuleshov <kuleshovmail@gmail.com>
Cc: Martin Mares <mj@ucw.cz>
Link: http://lkml.kernel.org/r/1422382588-10367-1-git-send-email-kuleshovmail@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/head_64.S | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index a468c0a65c42..6975f5dcc368 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -1,5 +1,5 @@
 /*
- *  linux/arch/x86_64/kernel/head.S -- start in 32bit and switch to 64bit
+ *  linux/arch/x86/kernel/head_64.S -- start in 32bit and switch to 64bit
  *
  *  Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
  *  Copyright (C) 2000 Pavel Machek <pavel@suse.cz>
@@ -56,7 +56,7 @@ startup_64:
 	 * %rsi holds a physical pointer to real_mode_data.
 	 *
 	 * We come here either directly from a 64bit bootloader, or from
-	 * arch/x86_64/boot/compressed/head.S.
+	 * arch/x86/boot/compressed/head_64.S.
 	 *
 	 * We only come here initially at boot nothing else comes here.
 	 *
-- 
cgit v1.2.3


From 79287cf8778df21e5ad6c3ac01615e02cf9dbae6 Mon Sep 17 00:00:00 2001
From: Alexander Kuleshov
Date: Sun, 25 Jan 2015 00:11:32 +0600
Subject: x86/boot/video: Move the 'video_segment' variable to video.c

video.c is the only real user of the 'video_segment' variable,
so move it to video.c and make it static.

Signed-off-by: Alexander Kuleshov <kuleshovmail@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Martin Mares <mj@ucw.cz>
Link: http://lkml.kernel.org/r/1422123092-28750-1-git-send-email-kuleshovmail@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/boot/video-mode.c | 4 +---
 arch/x86/boot/video.c      | 2 ++
 arch/x86/boot/video.h      | 1 -
 3 files changed, 3 insertions(+), 4 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/boot/video-mode.c b/arch/x86/boot/video-mode.c
index 748e8d06290a..aa8a96b052e3 100644
--- a/arch/x86/boot/video-mode.c
+++ b/arch/x86/boot/video-mode.c
@@ -22,10 +22,8 @@
 /*
  * Common variables
  */
-int adapter;			/* 0=CGA/MDA/HGC, 1=EGA, 2=VGA+ */
-u16 video_segment;
+int adapter;		/* 0=CGA/MDA/HGC, 1=EGA, 2=VGA+ */
 int force_x, force_y;	/* Don't query the BIOS for cols/rows */
-
 int do_restore;		/* Screen contents changed during mode flip */
 int graphic_mode;	/* Graphic mode with linear frame buffer */
 
diff --git a/arch/x86/boot/video.c b/arch/x86/boot/video.c
index 43eda284d27f..05111bb8d018 100644
--- a/arch/x86/boot/video.c
+++ b/arch/x86/boot/video.c
@@ -17,6 +17,8 @@
 #include "video.h"
 #include "vesa.h"
 
+static u16 video_segment;
+
 static void store_cursor_position(void)
 {
 	struct biosregs ireg, oreg;
diff --git a/arch/x86/boot/video.h b/arch/x86/boot/video.h
index 0bb25491262d..b54e0328c449 100644
--- a/arch/x86/boot/video.h
+++ b/arch/x86/boot/video.h
@@ -91,7 +91,6 @@ int mode_defined(u16 mode);	/* video.c */
 #define ADAPTER_VGA	2
 
 extern int adapter;
-extern u16 video_segment;
 extern int force_x, force_y;	/* Don't query the BIOS for cols/rows */
 extern int do_restore;		/* Restore screen contents */
 extern int graphic_mode;	/* Graphics mode with linear frame buffer */
-- 
cgit v1.2.3


From 1db491f77b6ed0f32f1d4a3ac40a5be9524f1914 Mon Sep 17 00:00:00 2001
From: Fenghua Yu
Date: Thu, 15 Jan 2015 20:30:01 -0800
Subject: x86/mm: Reduce PAE-mode per task pgd allocation overhead from 4K to
 32 bytes

With more embedded systems emerging using Quark, among other
things, 32-bit kernel matters again. 32-bit machine and kernel
uses PAE paging, which currently wastes at least 4K of memory
per process on Linux where we have to reserve an entire page to
support a single 32-byte PGD structure. It would be a very good
thing if we could eliminate that wastage.

PAE paging is used to access more than 4GB memory on x86-32. And
it is required for NX.

In this patch, we still allocate one page for pgd for a Xen
domain and 64-bit kernel because one page pgd is assumed in
these cases. But we can save memory space by only allocating
32-byte pgd for 32-bit PAE kernel when it is not running as a
Xen domain.

Signed-off-by: Fenghua Yu <fenghua.yu@intel.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Christoph Lameter <cl@linux.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Glenn Williamson <glenn.p.williamson@intel.com>
Cc: H. Peter Anvin <hpa@linux.intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1421382601-46912-1-git-send-email-fenghua.yu@intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/mm/pgtable.c | 81 +++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 78 insertions(+), 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 6fb6927f9e76..d223e1fe1dd2 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -271,12 +271,87 @@ static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[])
 	}
 }
 
+/*
+ * Xen paravirt assumes pgd table should be in one page. 64 bit kernel also
+ * assumes that pgd should be in one page.
+ *
+ * But kernel with PAE paging that is not running as a Xen domain
+ * only needs to allocate 32 bytes for pgd instead of one page.
+ */
+#ifdef CONFIG_X86_PAE
+
+#include <linux/slab.h>
+
+#define PGD_SIZE	(PTRS_PER_PGD * sizeof(pgd_t))
+#define PGD_ALIGN	32
+
+static struct kmem_cache *pgd_cache;
+
+static int __init pgd_cache_init(void)
+{
+	/*
+	 * When PAE kernel is running as a Xen domain, it does not use
+	 * shared kernel pmd. And this requires a whole page for pgd.
+	 */
+	if (!SHARED_KERNEL_PMD)
+		return 0;
+
+	/*
+	 * when PAE kernel is not running as a Xen domain, it uses
+	 * shared kernel pmd. Shared kernel pmd does not require a whole
+	 * page for pgd. We are able to just allocate a 32-byte for pgd.
+	 * During boot time, we create a 32-byte slab for pgd table allocation.
+	 */
+	pgd_cache = kmem_cache_create("pgd_cache", PGD_SIZE, PGD_ALIGN,
+				      SLAB_PANIC, NULL);
+	if (!pgd_cache)
+		return -ENOMEM;
+
+	return 0;
+}
+core_initcall(pgd_cache_init);
+
+static inline pgd_t *_pgd_alloc(void)
+{
+	/*
+	 * If no SHARED_KERNEL_PMD, PAE kernel is running as a Xen domain.
+	 * We allocate one page for pgd.
+	 */
+	if (!SHARED_KERNEL_PMD)
+		return (pgd_t *)__get_free_page(PGALLOC_GFP);
+
+	/*
+	 * Now PAE kernel is not running as a Xen domain. We can allocate
+	 * a 32-byte slab for pgd to save memory space.
+	 */
+	return kmem_cache_alloc(pgd_cache, PGALLOC_GFP);
+}
+
+static inline void _pgd_free(pgd_t *pgd)
+{
+	if (!SHARED_KERNEL_PMD)
+		free_page((unsigned long)pgd);
+	else
+		kmem_cache_free(pgd_cache, pgd);
+}
+#else
+static inline pgd_t *_pgd_alloc(void)
+{
+	return (pgd_t *)__get_free_page(PGALLOC_GFP);
+}
+
+static inline void _pgd_free(pgd_t *pgd)
+{
+	free_page((unsigned long)pgd);
+}
+#endif /* CONFIG_X86_PAE */
+
 pgd_t *pgd_alloc(struct mm_struct *mm)
 {
 	pgd_t *pgd;
 	pmd_t *pmds[PREALLOCATED_PMDS];
 
-	pgd = (pgd_t *)__get_free_page(PGALLOC_GFP);
+	pgd = _pgd_alloc();
 
 	if (pgd == NULL)
 		goto out;
@@ -306,7 +381,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
 out_free_pmds:
 	free_pmds(pmds);
 out_free_pgd:
-	free_page((unsigned long)pgd);
+	_pgd_free(pgd);
 out:
 	return NULL;
 }
@@ -316,7 +391,7 @@ void pgd_free(struct mm_struct *mm, pgd_t *pgd)
 	pgd_mop_up_pmds(mm, pgd);
 	pgd_dtor(pgd);
 	paravirt_pgd_free(mm, pgd);
-	free_page((unsigned long)pgd);
+	_pgd_free(pgd);
 }
 
 /*
-- 
cgit v1.2.3


From 1f40a8bfa91adfa8d1ac5013c08c76f6fd6691ad Mon Sep 17 00:00:00 2001
From: Pavel Machek
Date: Sun, 28 Dec 2014 17:15:24 +0100
Subject: x86/mm/pat: Ensure different messages in STRICT_DEVMEM and PAT cases

STRICT_DEVMEM and PAT produce same failure accessing /dev/mem,
which is quite confusing to the user. Make printk messages
different to lessen confusion.

Signed-off-by: Pavel Machek <pavel@ucw.cz>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/mm/pat.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index 7ac68698406c..35af6771a95a 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -610,7 +610,7 @@ pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
 }
 
 #ifdef CONFIG_STRICT_DEVMEM
-/* This check is done in drivers/char/mem.c in case of STRICT_DEVMEM*/
+/* This check is done in drivers/char/mem.c in case of STRICT_DEVMEM */
 static inline int range_is_allowed(unsigned long pfn, unsigned long size)
 {
 	return 1;
@@ -628,8 +628,8 @@ static inline int range_is_allowed(unsigned long pfn, unsigned long size)
 
 	while (cursor < to) {
 		if (!devmem_is_allowed(pfn)) {
-			printk(KERN_INFO "Program %s tried to access /dev/mem between [mem %#010Lx-%#010Lx]\n",
-				current->comm, from, to - 1);
+			printk(KERN_INFO "Program %s tried to access /dev/mem between [mem %#010Lx-%#010Lx], PAT prevents it\n",
+			       current->comm, from, to - 1);
 			return 0;
 		}
 		cursor += PAGE_SIZE;
-- 
cgit v1.2.3


From 0cdb81bef20b1d9e12111fa6cd81f748ebd87778 Mon Sep 17 00:00:00 2001
From: Jan Beulich
Date: Fri, 23 Jan 2015 08:35:13 +0000
Subject: x86-64: Also clear _PAGE_GLOBAL from __supported_pte_mask if
 !cpu_has_pge

Not just setting it when the feature is available is for
consistency, and may allow Xen to drop its custom clearing of
the flag (unless it needs it cleared earlier than this code
executes). Note that the change is benign to ix86, as the flag
starts out clear there.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/54C215D10200007800058912@mail.emea.novell.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/mm/init.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 079c3b6a3ff1..090499a363cb 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -179,7 +179,8 @@ static void __init probe_page_size_mask(void)
 	if (cpu_has_pge) {
 		set_in_cr4(X86_CR4_PGE);
 		__supported_pte_mask |= _PAGE_GLOBAL;
-	}
+	} else
+		__supported_pte_mask &= ~_PAGE_GLOBAL;
 }
 
 #ifdef CONFIG_X86_32
-- 
cgit v1.2.3


From e85bd9892c4f10fd20fc575e62327efb7c77015d Mon Sep 17 00:00:00 2001
From: Sylvain BERTRAND
Date: Mon, 29 Dec 2014 16:43:24 +0100
Subject: x86/build: Fix mkcapflags.sh bash-ism

Chocked while compiling linux with dash shell instead of bash
shell. See:

  http://pubs.opengroup.org/onlinepubs/009695399/utilities/xcu_chap02.html#tag_02_09_05

Signed-off-by: Sylvain BERTRAND <sylvain.bertrand@gmail.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20141229154324.GA27533@dhcppc1
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/mkcapflags.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/mkcapflags.sh b/arch/x86/kernel/cpu/mkcapflags.sh
index 36d99a337b49..3f20710a5b23 100644
--- a/arch/x86/kernel/cpu/mkcapflags.sh
+++ b/arch/x86/kernel/cpu/mkcapflags.sh
@@ -6,7 +6,7 @@
 IN=$1
 OUT=$2
 
-function dump_array()
+dump_array()
 {
 	ARRAY=$1
 	SIZE=$2
-- 
cgit v1.2.3


From 9261dc1de11e158a5f6b4b92c8bf1ef4a02dbf0d Mon Sep 17 00:00:00 2001
From: Valdis Kletnieks
Date: Fri, 2 Jan 2015 11:19:57 -0500
Subject: x86/build/defconfig: Enable USB_EHCI_TT_NEWSCHED=y

Some Gentoo users are encountering problems because
USB_EHCI_TT_NEWSCHED isn't set in the defconfig (and Gentoo
differs from other distros in not providing a distro .config).
Alan Stern has said there's no reason to not set it, and the
ability to turn it off at all should probably be yanked:

  http://article.gmane.org/gmane.linux.usb.general/119920

This addresses issue:

  https://bugs.gentoo.org/show_bug.cgi?id=533472

(The problem also theoretically affects the sh, arm, mips,
powerpc, and sparc archs, but those would be other patches if
this one that fixes 98% of the problem is accepted).

Signed-off-by: Valdis Kletnieks <valdis.kletnieks@vt.edu>
Acked-by: Alan Stern <stern@rowland.harvard.edu>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/configs/i386_defconfig   | 2 +-
 arch/x86/configs/x86_64_defconfig | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig
index 419819d6dab3..aaa1118bf01e 100644
--- a/arch/x86/configs/i386_defconfig
+++ b/arch/x86/configs/i386_defconfig
@@ -248,7 +248,7 @@ CONFIG_USB=y
 CONFIG_USB_ANNOUNCE_NEW_DEVICES=y
 CONFIG_USB_MON=y
 CONFIG_USB_EHCI_HCD=y
-# CONFIG_USB_EHCI_TT_NEWSCHED is not set
+CONFIG_USB_EHCI_TT_NEWSCHED=y
 CONFIG_USB_OHCI_HCD=y
 CONFIG_USB_UHCI_HCD=y
 CONFIG_USB_PRINTER=y
diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig
index 4c311ddd973b..315b86106572 100644
--- a/arch/x86/configs/x86_64_defconfig
+++ b/arch/x86/configs/x86_64_defconfig
@@ -243,7 +243,7 @@ CONFIG_USB=y
 CONFIG_USB_ANNOUNCE_NEW_DEVICES=y
 CONFIG_USB_MON=y
 CONFIG_USB_EHCI_HCD=y
-# CONFIG_USB_EHCI_TT_NEWSCHED is not set
+CONFIG_USB_EHCI_TT_NEWSCHED=y
 CONFIG_USB_OHCI_HCD=y
 CONFIG_USB_UHCI_HCD=y
 CONFIG_USB_PRINTER=y
-- 
cgit v1.2.3


From b0bd96fe9a1428a04c82f8c3834db69468869d65 Mon Sep 17 00:00:00 2001
From: Rusty Russell
Date: Thu, 19 Feb 2015 12:30:49 +1030
Subject: lguest: now depends on PCI

Reported-by: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 arch/x86/lguest/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/lguest/Kconfig b/arch/x86/lguest/Kconfig
index 4a0890f815c4..21e89807244c 100644
--- a/arch/x86/lguest/Kconfig
+++ b/arch/x86/lguest/Kconfig
@@ -1,6 +1,6 @@
 config LGUEST_GUEST
 	bool "Lguest guest support"
-	depends on X86_32 && PARAVIRT
+	depends on X86_32 && PARAVIRT && PCI
 	select TTY
 	select VIRTUALIZATION
 	select VIRTIO
-- 
cgit v1.2.3


From f476893459318cb2eff3ecd2a05d4ceacf82e73e Mon Sep 17 00:00:00 2001
From: Rusty Russell
Date: Thu, 19 Feb 2015 14:43:21 +1030
Subject: lguest: update help text.

We now add about 10k, not 6k, when lguest support is compiled in.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 arch/x86/lguest/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/lguest/Kconfig b/arch/x86/lguest/Kconfig
index 21e89807244c..08f41caada45 100644
--- a/arch/x86/lguest/Kconfig
+++ b/arch/x86/lguest/Kconfig
@@ -8,7 +8,7 @@ config LGUEST_GUEST
 	help
 	  Lguest is a tiny in-kernel hypervisor.  Selecting this will
 	  allow your kernel to boot under lguest.  This option will increase
-	  your kernel size by about 6k.  If in doubt, say N.
+	  your kernel size by about 10k.  If in doubt, say N.
 
 	  If you say Y here, make sure you say Y (or M) to the virtio block
 	  and net drivers which lguest needs.
-- 
cgit v1.2.3


From fb148d83ec6924b7766731e58739d7281b6fb8c7 Mon Sep 17 00:00:00 2001
From: Alexander Kuleshov
Date: Thu, 19 Feb 2015 13:34:58 +0600
Subject: x86/asm/boot: Use already defined KEEP_SEGMENTS macro in
 head_{32,64}.S

There is already defined macro KEEP_SEGMENTS in
<asm/bootparam.h>, let's use it instead of hardcoded
constants.

Signed-off-by: Alexander Kuleshov <kuleshovmail@gmail.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1424331298-7456-1-git-send-email-kuleshovmail@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/boot/compressed/head_32.S | 3 ++-
 arch/x86/boot/compressed/head_64.S | 3 ++-
 arch/x86/kernel/head_32.S          | 3 ++-
 3 files changed, 6 insertions(+), 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S
index 1d7fbbcc196d..8ef964ddc18e 100644
--- a/arch/x86/boot/compressed/head_32.S
+++ b/arch/x86/boot/compressed/head_32.S
@@ -29,6 +29,7 @@
 #include <asm/page_types.h>
 #include <asm/boot.h>
 #include <asm/asm-offsets.h>
+#include <asm/bootparam.h>
 
 	__HEAD
 ENTRY(startup_32)
@@ -102,7 +103,7 @@ preferred_addr:
 	 * Test KEEP_SEGMENTS flag to see if the bootloader is asking
 	 * us to not reload segments
 	 */
-	testb	$(1<<6), BP_loadflags(%esi)
+	testb	$KEEP_SEGMENTS, BP_loadflags(%esi)
 	jnz	1f
 
 	cli
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index 6b1766c6c082..90c152135e92 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -31,6 +31,7 @@
 #include <asm/msr.h>
 #include <asm/processor-flags.h>
 #include <asm/asm-offsets.h>
+#include <asm/bootparam.h>
 
 	__HEAD
 	.code32
@@ -46,7 +47,7 @@ ENTRY(startup_32)
 	 * Test KEEP_SEGMENTS flag to see if the bootloader is asking
 	 * us to not reload segments
 	 */
-	testb $(1<<6), BP_loadflags(%esi)
+	testb $KEEP_SEGMENTS, BP_loadflags(%esi)
 	jnz 1f
 
 	cli
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index f36bd42d6f0c..d031bad9e07e 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -22,6 +22,7 @@
 #include <asm/cpufeature.h>
 #include <asm/percpu.h>
 #include <asm/nops.h>
+#include <asm/bootparam.h>
 
 /* Physical address */
 #define pa(X) ((X) - __PAGE_OFFSET)
@@ -90,7 +91,7 @@ ENTRY(startup_32)
 	
 	/* test KEEP_SEGMENTS flag to see if the bootloader is asking
 		us to not reload segments */
-	testb $(1<<6), BP_loadflags(%esi)
+	testb $KEEP_SEGMENTS, BP_loadflags(%esi)
 	jnz 2f
 
 /*
-- 
cgit v1.2.3


From a9241ea5fd709fc935dade130f4e3b2612bbe9e3 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov
Date: Fri, 6 Feb 2015 15:01:58 -0500
Subject: x86/fpu: Don't reset thread.fpu_counter

The "else" branch clears ->fpu_counter as a remnant of the lazy FPU
usage counting:

  e07e23e1fd30 ("[PATCH] non lazy "sleazy" fpu implementation")

However, switch_fpu_prepare() does this now so that else branch is
superfluous.

If we do use_eager_fpu(), then this has no effect. Otherwise, if we
actually wanted to prevent fpu preload after the context switch we would
need to reset it unconditionally, even if __thread_has_fpu().

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Rik van Riel <riel@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: http://lkml.kernel.org/r/1423252925-14451-2-git-send-email-riel@redhat.com
Signed-off-by: Borislav Petkov <bp@suse.de>
---
 arch/x86/kernel/i387.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index a9a4229f6161..4d0db9ed58e0 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -108,8 +108,7 @@ void unlazy_fpu(struct task_struct *tsk)
 	if (__thread_has_fpu(tsk)) {
 		__save_init_fpu(tsk);
 		__thread_fpu_end(tsk);
-	} else
-		tsk->thread.fpu_counter = 0;
+	}
 	preempt_enable();
 }
 EXPORT_SYMBOL(unlazy_fpu);
-- 
cgit v1.2.3


From 1a2a7f4ec8e3a7ac582dac4d01fcc7e8acd3bb30 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov
Date: Fri, 6 Feb 2015 15:01:59 -0500
Subject: x86/fpu: Don't do __thread_fpu_end() if use_eager_fpu()

unlazy_fpu()->__thread_fpu_end() doesn't look right if use_eager_fpu().
Unconditional __thread_fpu_end() is only correct if we know that this
thread can't return to user-mode and use FPU.

Fortunately it has only 2 callers. fpu_copy() checks use_eager_fpu(),
and init_fpu(current) can be only called by the coredumping thread via
regset->get(). But it is exported to modules, and imo this should be
fixed anyway.

And if we check use_eager_fpu() we can use __save_fpu() like fpu_copy()
and save_init_fpu() do.

- It seems that even !use_eager_fpu() case doesn't need the unconditional
  __thread_fpu_end(), we only need it if __save_init_fpu() returns 0.

- It is still not clear to me if __save_init_fpu() can safely nest with
  another save + restore from __kernel_fpu_begin(). If not, we can use
  kernel_fpu_disable() to fix the race.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Rik van Riel <riel@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: http://lkml.kernel.org/r/1423252925-14451-3-git-send-email-riel@redhat.com
Signed-off-by: Borislav Petkov <bp@suse.de>
---
 arch/x86/kernel/i387.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index 4d0db9ed58e0..f3ced6f4b2b6 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -106,8 +106,12 @@ void unlazy_fpu(struct task_struct *tsk)
 {
 	preempt_disable();
 	if (__thread_has_fpu(tsk)) {
-		__save_init_fpu(tsk);
-		__thread_fpu_end(tsk);
+		if (use_eager_fpu()) {
+			__save_fpu(tsk);
+		} else {
+			__save_init_fpu(tsk);
+			__thread_fpu_end(tsk);
+		}
 	}
 	preempt_enable();
 }
-- 
cgit v1.2.3


From 08a744c6bfded3d5fa66f94263f81773226113d1 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov
Date: Fri, 6 Feb 2015 15:02:00 -0500
Subject: x86/fpu: Change math_error() to use unlazy_fpu(), kill (now) unused
 save_init_fpu()

math_error() calls save_init_fpu() after conditional_sti(), this means
that the caller can be preempted. If !use_eager_fpu() we can hit the
WARN_ON_ONCE(!__thread_has_fpu(tsk)) and/or save the wrong FPU state.

Change math_error() to use unlazy_fpu() and kill save_init_fpu().

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Rik van Riel <riel@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: http://lkml.kernel.org/r/1423252925-14451-4-git-send-email-riel@redhat.com
Signed-off-by: Borislav Petkov <bp@suse.de>
---
 arch/x86/include/asm/fpu-internal.h | 18 ------------------
 arch/x86/kernel/traps.c             |  2 +-
 2 files changed, 1 insertion(+), 19 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/fpu-internal.h b/arch/x86/include/asm/fpu-internal.h
index e97622f57722..02f2e0817918 100644
--- a/arch/x86/include/asm/fpu-internal.h
+++ b/arch/x86/include/asm/fpu-internal.h
@@ -517,24 +517,6 @@ static inline void __save_fpu(struct task_struct *tsk)
 		fpu_fxsave(&tsk->thread.fpu);
 }
 
-/*
- * These disable preemption on their own and are safe
- */
-static inline void save_init_fpu(struct task_struct *tsk)
-{
-	WARN_ON_ONCE(!__thread_has_fpu(tsk));
-
-	if (use_eager_fpu()) {
-		__save_fpu(tsk);
-		return;
-	}
-
-	preempt_disable();
-	__save_init_fpu(tsk);
-	__thread_fpu_end(tsk);
-	preempt_enable();
-}
-
 /*
  * i387 state interaction
  */
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 88900e288021..9d889f74e806 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -663,7 +663,7 @@ static void math_error(struct pt_regs *regs, int error_code, int trapnr)
 	/*
 	 * Save the info for the exception handler and clear the error.
 	 */
-	save_init_fpu(task);
+	unlazy_fpu(task);
 	task->thread.trap_nr = trapnr;
 	task->thread.error_code = error_code;
 	info.si_signo = SIGFPE;
-- 
cgit v1.2.3


From 1c927eea4cad83c439cb51e9c96ad19cb005157d Mon Sep 17 00:00:00 2001
From: Rik van Riel
Date: Fri, 6 Feb 2015 15:02:01 -0500
Subject: x86/fpu: Move lazy restore functions up a few lines

We need another lazy restore related function, that will be called
from a function that is above where the lazy restore functions are
now. It would be nice to keep all three functions grouped together.

Signed-off-by: Rik van Riel <riel@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Link: http://lkml.kernel.org/r/1423252925-14451-5-git-send-email-riel@redhat.com
Signed-off-by: Borislav Petkov <bp@suse.de>
---
 arch/x86/include/asm/fpu-internal.h | 36 ++++++++++++++++++------------------
 1 file changed, 18 insertions(+), 18 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/fpu-internal.h b/arch/x86/include/asm/fpu-internal.h
index 02f2e0817918..217d6d7b9cb0 100644
--- a/arch/x86/include/asm/fpu-internal.h
+++ b/arch/x86/include/asm/fpu-internal.h
@@ -67,6 +67,24 @@ extern void finit_soft_fpu(struct i387_soft_struct *soft);
 static inline void finit_soft_fpu(struct i387_soft_struct *soft) {}
 #endif
 
+/*
+ * Must be run with preemption disabled: this clears the fpu_owner_task,
+ * on this CPU.
+ *
+ * This will disable any lazy FPU state restore of the current FPU state,
+ * but if the current thread owns the FPU, it will still be saved by.
+ */
+static inline void __cpu_disable_lazy_restore(unsigned int cpu)
+{
+	per_cpu(fpu_owner_task, cpu) = NULL;
+}
+
+static inline int fpu_lazy_restore(struct task_struct *new, unsigned int cpu)
+{
+	return new == this_cpu_read_stable(fpu_owner_task) &&
+		cpu == new->thread.fpu.last_cpu;
+}
+
 static inline int is_ia32_compat_frame(void)
 {
 	return config_enabled(CONFIG_IA32_EMULATION) &&
@@ -398,24 +416,6 @@ static inline void drop_init_fpu(struct task_struct *tsk)
  */
 typedef struct { int preload; } fpu_switch_t;
 
-/*
- * Must be run with preemption disabled: this clears the fpu_owner_task,
- * on this CPU.
- *
- * This will disable any lazy FPU state restore of the current FPU state,
- * but if the current thread owns the FPU, it will still be saved by.
- */
-static inline void __cpu_disable_lazy_restore(unsigned int cpu)
-{
-	per_cpu(fpu_owner_task, cpu) = NULL;
-}
-
-static inline int fpu_lazy_restore(struct task_struct *new, unsigned int cpu)
-{
-	return new == this_cpu_read_stable(fpu_owner_task) &&
-		cpu == new->thread.fpu.last_cpu;
-}
-
 static inline fpu_switch_t switch_fpu_prepare(struct task_struct *old, struct task_struct *new, int cpu)
 {
 	fpu_switch_t fpu;
-- 
cgit v1.2.3


From 33e03dedd759cc9396252d9641b25d01909a26bb Mon Sep 17 00:00:00 2001
From: Rik van Riel
Date: Fri, 6 Feb 2015 15:02:02 -0500
Subject: x86/fpu: Introduce task_disable_lazy_fpu_restore() helper

Currently there are a few magic assignments sprinkled through the
code that disable lazy FPU state restoring, some more effective than
others, and all equally mystifying.

It would be easier to have a helper to explicitly disable lazy
FPU state restoring for a task.

Signed-off-by: Rik van Riel <riel@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Link: http://lkml.kernel.org/r/1423252925-14451-6-git-send-email-riel@redhat.com
Signed-off-by: Borislav Petkov <bp@suse.de>
---
 arch/x86/include/asm/fpu-internal.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/fpu-internal.h b/arch/x86/include/asm/fpu-internal.h
index 217d6d7b9cb0..9c27f44e1c5c 100644
--- a/arch/x86/include/asm/fpu-internal.h
+++ b/arch/x86/include/asm/fpu-internal.h
@@ -79,6 +79,16 @@ static inline void __cpu_disable_lazy_restore(unsigned int cpu)
 	per_cpu(fpu_owner_task, cpu) = NULL;
 }
 
+/*
+ * Used to indicate that the FPU state in memory is newer than the FPU
+ * state in registers, and the FPU state should be reloaded next time the
+ * task is run. Only safe on the current task, or non-running tasks.
+ */
+static inline void task_disable_lazy_fpu_restore(struct task_struct *tsk)
+{
+	tsk->thread.fpu.last_cpu = ~0;
+}
+
 static inline int fpu_lazy_restore(struct task_struct *new, unsigned int cpu)
 {
 	return new == this_cpu_read_stable(fpu_owner_task) &&
-- 
cgit v1.2.3


From 1361ef29c7e49ae7cf37220c25fac1904b77f71a Mon Sep 17 00:00:00 2001
From: Rik van Riel
Date: Fri, 6 Feb 2015 15:02:03 -0500
Subject: x86/fpu: Use an explicit if/else in switch_fpu_prepare()

Use an explicit if/else branch after __save_init_fpu(old) in
switch_fpu_prepare(). This makes substituting the assignment with a call
in task_disable_lazy_fpu_restore() in the next patch easier to review.

Signed-off-by: Rik van Riel <riel@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Link: http://lkml.kernel.org/r/1423252925-14451-7-git-send-email-riel@redhat.com
[ Space out stuff for more readability. ]
Signed-off-by: Borislav Petkov <bp@suse.de>
---
 arch/x86/include/asm/fpu-internal.h | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/fpu-internal.h b/arch/x86/include/asm/fpu-internal.h
index 9c27f44e1c5c..04c2807aab66 100644
--- a/arch/x86/include/asm/fpu-internal.h
+++ b/arch/x86/include/asm/fpu-internal.h
@@ -434,13 +434,17 @@ static inline fpu_switch_t switch_fpu_prepare(struct task_struct *old, struct ta
 	 * If the task has used the math, pre-load the FPU on xsave processors
 	 * or if the past 5 consecutive context-switches used math.
 	 */
-	fpu.preload = tsk_used_math(new) && (use_eager_fpu() ||
-					     new->thread.fpu_counter > 5);
+	fpu.preload = tsk_used_math(new) &&
+		      (use_eager_fpu() || new->thread.fpu_counter > 5);
+
 	if (__thread_has_fpu(old)) {
 		if (!__save_init_fpu(old))
-			cpu = ~0;
-		old->thread.fpu.last_cpu = cpu;
-		old->thread.fpu.has_fpu = 0;	/* But leave fpu_owner_task! */
+			old->thread.fpu.last_cpu = ~0;
+		else
+			old->thread.fpu.last_cpu = cpu;
+
+		/* But leave fpu_owner_task! */
+		old->thread.fpu.has_fpu = 0;
 
 		/* Don't change CR0.TS if we just switch! */
 		if (fpu.preload) {
-- 
cgit v1.2.3


From 6a5fe8952bd676baf382d14df21e7b32b5d8943e Mon Sep 17 00:00:00 2001
From: Rik van Riel
Date: Fri, 6 Feb 2015 15:02:04 -0500
Subject: x86/fpu: Use task_disable_lazy_fpu_restore() helper

Replace magic assignments of fpu.last_cpu = ~0 with more explicit
task_disable_lazy_fpu_restore() calls.

Signed-off-by: Rik van Riel <riel@redhat.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: http://lkml.kernel.org/r/1423252925-14451-8-git-send-email-riel@redhat.com
Signed-off-by: Borislav Petkov <bp@suse.de>
---
 arch/x86/include/asm/fpu-internal.h | 4 ++--
 arch/x86/kernel/i387.c              | 2 +-
 arch/x86/kernel/process.c           | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/fpu-internal.h b/arch/x86/include/asm/fpu-internal.h
index 04c2807aab66..e5f8f8eaf225 100644
--- a/arch/x86/include/asm/fpu-internal.h
+++ b/arch/x86/include/asm/fpu-internal.h
@@ -439,7 +439,7 @@ static inline fpu_switch_t switch_fpu_prepare(struct task_struct *old, struct ta
 
 	if (__thread_has_fpu(old)) {
 		if (!__save_init_fpu(old))
-			old->thread.fpu.last_cpu = ~0;
+			task_disable_lazy_fpu_restore(old);
 		else
 			old->thread.fpu.last_cpu = cpu;
 
@@ -455,7 +455,7 @@ static inline fpu_switch_t switch_fpu_prepare(struct task_struct *old, struct ta
 			stts();
 	} else {
 		old->thread.fpu_counter = 0;
-		old->thread.fpu.last_cpu = ~0;
+		task_disable_lazy_fpu_restore(old);
 		if (fpu.preload) {
 			new->thread.fpu_counter++;
 			if (!use_eager_fpu() && fpu_lazy_restore(new, cpu))
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index f3ced6f4b2b6..5722ab6c7c36 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -236,7 +236,7 @@ int init_fpu(struct task_struct *tsk)
 	if (tsk_used_math(tsk)) {
 		if (cpu_has_fpu && tsk == current)
 			unlazy_fpu(tsk);
-		tsk->thread.fpu.last_cpu = ~0;
+		task_disable_lazy_fpu_restore(tsk);
 		return 0;
 	}
 
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index e127ddaa2d5a..ce8b10351e28 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -68,8 +68,8 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
 
 	dst->thread.fpu_counter = 0;
 	dst->thread.fpu.has_fpu = 0;
-	dst->thread.fpu.last_cpu = ~0;
 	dst->thread.fpu.state = NULL;
+	task_disable_lazy_fpu_restore(dst);
 	if (tsk_used_math(src)) {
 		int err = fpu_alloc(&dst->thread.fpu);
 		if (err)
-- 
cgit v1.2.3


From 728e53fef429a0f3c9dda3587c3ccc57ad268b70 Mon Sep 17 00:00:00 2001
From: Rik van Riel
Date: Fri, 6 Feb 2015 15:02:05 -0500
Subject: x86/fpu: Also check fpu_lazy_restore() when use_eager_fpu()

With Oleg's patch:

  33a3ebdc077f ("x86, fpu: Don't abuse has_fpu in __kernel_fpu_begin/end()")

kernel threads no longer have an FPU state even on systems with
use_eager_fpu().

That in turn means that a task may still have its FPU state
loaded in the FPU registers, if the task only got interrupted by
kernel threads from when it went to sleep, to when it woke up
again.

In that case, there is no need to restore the FPU state for
this task, since it is still in the registers.

The kernel can simply use the same logic to determine this as
is used for !use_eager_fpu() systems.

Signed-off-by: Rik van Riel <riel@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Link: http://lkml.kernel.org/r/1423252925-14451-9-git-send-email-riel@redhat.com
Signed-off-by: Borislav Petkov <bp@suse.de>
---
 arch/x86/include/asm/fpu-internal.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/fpu-internal.h b/arch/x86/include/asm/fpu-internal.h
index e5f8f8eaf225..19fb41cc4755 100644
--- a/arch/x86/include/asm/fpu-internal.h
+++ b/arch/x86/include/asm/fpu-internal.h
@@ -458,7 +458,7 @@ static inline fpu_switch_t switch_fpu_prepare(struct task_struct *old, struct ta
 		task_disable_lazy_fpu_restore(old);
 		if (fpu.preload) {
 			new->thread.fpu_counter++;
-			if (!use_eager_fpu() && fpu_lazy_restore(new, cpu))
+			if (fpu_lazy_restore(new, cpu))
 				fpu.preload = 0;
 			else
 				prefetch(new->thread.fpu.state);
-- 
cgit v1.2.3


From 2cd4c303a7b438482f73006a29add462ca1cb9d9 Mon Sep 17 00:00:00 2001
From: Jan Beulich
Date: Fri, 23 Jan 2015 08:32:01 +0000
Subject: x86/MCE/AMD: Drop bogus const modifier from AMD's bank4_names()

The compiler validly warns about it being ignored.

Signed-off-by: Jan Beulich <jbeulich@suse.com>
Link: http://lkml.kernel.org/r/54C21511020000780005890E@mail.emea.novell.com
Signed-off-by: Borislav Petkov <bp@suse.de>
---
 arch/x86/kernel/cpu/mcheck/mce_amd.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index f1c3769bbd64..39d073c20a5f 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -79,7 +79,7 @@ static inline bool is_shared_bank(int bank)
 	return (bank == 4);
 }
 
-static const char * const bank4_names(struct threshold_block *b)
+static const char *bank4_names(const struct threshold_block *b)
 {
 	switch (b->address) {
 	/* MSR4_MISC0 */
-- 
cgit v1.2.3


From 3f2f0680d1161df96a0e8fea16930f1bd487a9cf Mon Sep 17 00:00:00 2001
From: Borislav Petkov
Date: Tue, 13 Jan 2015 15:08:51 +0100
Subject: x86/MCE/intel: Cleanup CMCI storm logic

Initially, this started with the yet another report about a race
condition in the CMCI storm adaptive period length thing. Yes, we have
to admit, it is fragile and error prone. So let's simplify it.

The simpler logic is: now, after we enter storm mode, we go straight to
polling with CMCI_STORM_INTERVAL, i.e. once a second. We remain in storm
mode as long as we see errors being logged while polling.

Theoretically, if we see an uninterrupted error stream, we will remain
in storm mode indefinitely and keep polling the MSRs.

However, when the storm is actually a burst of errors, once we have
logged them all, we back out of it after ~5 mins of polling and no more
errors logged.

If we encounter an error during those 5 minutes, we reset the polling
interval to 5 mins.

Making machine_check_poll() return a bool and denoting whether it has
seen an error or not lets us simplify a bunch of code and move the storm
handling private to mce_intel.c.

Some minor cleanups while at it.

Reported-by: Calvin Owens <calvinowens@fb.com>
Tested-by: Tony Luck <tony.luck@intel.com>
Link: http://lkml.kernel.org/r/1417746575-23299-1-git-send-email-calvinowens@fb.com
Signed-off-by: Borislav Petkov <bp@suse.de>
---
 arch/x86/include/asm/mce.h                |  8 +--
 arch/x86/kernel/cpu/mcheck/mce-internal.h |  9 ++--
 arch/x86/kernel/cpu/mcheck/mce.c          | 86 ++++++++++++++++---------------
 arch/x86/kernel/cpu/mcheck/mce_intel.c    | 63 ++++++++++++++--------
 4 files changed, 96 insertions(+), 70 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index 51b26e895933..13eeea518233 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -183,11 +183,11 @@ typedef DECLARE_BITMAP(mce_banks_t, MAX_NR_BANKS);
 DECLARE_PER_CPU(mce_banks_t, mce_poll_banks);
 
 enum mcp_flags {
-	MCP_TIMESTAMP = (1 << 0),	/* log time stamp */
-	MCP_UC = (1 << 1),		/* log uncorrected errors */
-	MCP_DONTLOG = (1 << 2),		/* only clear, don't log */
+	MCP_TIMESTAMP	= BIT(0),	/* log time stamp */
+	MCP_UC		= BIT(1),	/* log uncorrected errors */
+	MCP_DONTLOG	= BIT(2),	/* only clear, don't log */
 };
-void machine_check_poll(enum mcp_flags flags, mce_banks_t *b);
+bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b);
 
 int mce_notify_irq(void);
 void mce_notify_process(void);
diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h
index 10b46906767f..e12f0bfb45c1 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-internal.h
+++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h
@@ -14,6 +14,7 @@ enum severity_level {
 };
 
 #define ATTR_LEN		16
+#define INITIAL_CHECK_INTERVAL	5 * 60 /* 5 minutes */
 
 /* One object for each MCE bank, shared by all CPUs */
 struct mce_bank {
@@ -30,13 +31,13 @@ extern struct mce_bank *mce_banks;
 extern mce_banks_t mce_banks_ce_disabled;
 
 #ifdef CONFIG_X86_MCE_INTEL
-unsigned long mce_intel_adjust_timer(unsigned long interval);
-void mce_intel_cmci_poll(void);
+unsigned long cmci_intel_adjust_timer(unsigned long interval);
+bool mce_intel_cmci_poll(void);
 void mce_intel_hcpu_update(unsigned long cpu);
 void cmci_disable_bank(int bank);
 #else
-# define mce_intel_adjust_timer mce_adjust_timer_default
-static inline void mce_intel_cmci_poll(void) { }
+# define cmci_intel_adjust_timer mce_adjust_timer_default
+static inline bool mce_intel_cmci_poll(void) { return false; }
 static inline void mce_intel_hcpu_update(unsigned long cpu) { }
 static inline void cmci_disable_bank(int bank) { }
 #endif
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index d2c611699cd9..d60cbb8d78f7 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -58,7 +58,7 @@ static DEFINE_MUTEX(mce_chrdev_read_mutex);
 #define CREATE_TRACE_POINTS
 #include <trace/events/mce.h>
 
-#define SPINUNIT 100	/* 100ns */
+#define SPINUNIT		100	/* 100ns */
 
 DEFINE_PER_CPU(unsigned, mce_exception_count);
 
@@ -87,9 +87,6 @@ static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait);
 static DEFINE_PER_CPU(struct mce, mces_seen);
 static int			cpu_missing;
 
-/* CMCI storm detection filter */
-static DEFINE_PER_CPU(unsigned long, mce_polled_error);
-
 /*
  * MCA banks polled by the period polling timer for corrected events.
  * With Intel CMCI, this only has MCA banks which do not support CMCI (if any).
@@ -623,8 +620,9 @@ DEFINE_PER_CPU(unsigned, mce_poll_count);
  * is already totally * confused. In this case it's likely it will
  * not fully execute the machine check handler either.
  */
-void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
+bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
 {
+	bool error_logged = false;
 	struct mce m;
 	int severity;
 	int i;
@@ -647,7 +645,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
 		if (!(m.status & MCI_STATUS_VAL))
 			continue;
 
-		this_cpu_write(mce_polled_error, 1);
+
 		/*
 		 * Uncorrected or signalled events are handled by the exception
 		 * handler when it is enabled, so don't process those here.
@@ -680,8 +678,10 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
 		 * Don't get the IP here because it's unlikely to
 		 * have anything to do with the actual error location.
 		 */
-		if (!(flags & MCP_DONTLOG) && !mca_cfg.dont_log_ce)
+		if (!(flags & MCP_DONTLOG) && !mca_cfg.dont_log_ce) {
+			error_logged = true;
 			mce_log(&m);
+		}
 
 		/*
 		 * Clear state for this bank.
@@ -695,6 +695,8 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
 	 */
 
 	sync_core();
+
+	return error_logged;
 }
 EXPORT_SYMBOL_GPL(machine_check_poll);
 
@@ -1311,7 +1313,7 @@ void mce_log_therm_throt_event(__u64 status)
  * poller finds an MCE, poll 2x faster.  When the poller finds no more
  * errors, poll 2x slower (up to check_interval seconds).
  */
-static unsigned long check_interval = 5 * 60; /* 5 minutes */
+static unsigned long check_interval = INITIAL_CHECK_INTERVAL;
 
 static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */
 static DEFINE_PER_CPU(struct timer_list, mce_timer);
@@ -1321,49 +1323,57 @@ static unsigned long mce_adjust_timer_default(unsigned long interval)
 	return interval;
 }
 
-static unsigned long (*mce_adjust_timer)(unsigned long interval) =
-	mce_adjust_timer_default;
+static unsigned long (*mce_adjust_timer)(unsigned long interval) = mce_adjust_timer_default;
 
-static int cmc_error_seen(void)
+static void __restart_timer(struct timer_list *t, unsigned long interval)
 {
-	unsigned long *v = this_cpu_ptr(&mce_polled_error);
+	unsigned long when = jiffies + interval;
+	unsigned long flags;
 
-	return test_and_clear_bit(0, v);
+	local_irq_save(flags);
+
+	if (timer_pending(t)) {
+		if (time_before(when, t->expires))
+			mod_timer_pinned(t, when);
+	} else {
+		t->expires = round_jiffies(when);
+		add_timer_on(t, smp_processor_id());
+	}
+
+	local_irq_restore(flags);
 }
 
 static void mce_timer_fn(unsigned long data)
 {
 	struct timer_list *t = this_cpu_ptr(&mce_timer);
+	int cpu = smp_processor_id();
 	unsigned long iv;
-	int notify;
 
-	WARN_ON(smp_processor_id() != data);
+	WARN_ON(cpu != data);
+
+	iv = __this_cpu_read(mce_next_interval);
 
 	if (mce_available(this_cpu_ptr(&cpu_info))) {
-		machine_check_poll(MCP_TIMESTAMP,
-				this_cpu_ptr(&mce_poll_banks));
-		mce_intel_cmci_poll();
+		machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_poll_banks));
+
+		if (mce_intel_cmci_poll()) {
+			iv = mce_adjust_timer(iv);
+			goto done;
+		}
 	}
 
 	/*
-	 * Alert userspace if needed.  If we logged an MCE, reduce the
-	 * polling interval, otherwise increase the polling interval.
+	 * Alert userspace if needed. If we logged an MCE, reduce the polling
+	 * interval, otherwise increase the polling interval.
 	 */
-	iv = __this_cpu_read(mce_next_interval);
-	notify = mce_notify_irq();
-	notify |= cmc_error_seen();
-	if (notify) {
+	if (mce_notify_irq())
 		iv = max(iv / 2, (unsigned long) HZ/100);
-	} else {
+	else
 		iv = min(iv * 2, round_jiffies_relative(check_interval * HZ));
-		iv = mce_adjust_timer(iv);
-	}
+
+done:
 	__this_cpu_write(mce_next_interval, iv);
-	/* Might have become 0 after CMCI storm subsided */
-	if (iv) {
-		t->expires = jiffies + iv;
-		add_timer_on(t, smp_processor_id());
-	}
+	__restart_timer(t, iv);
 }
 
 /*
@@ -1372,16 +1382,10 @@ static void mce_timer_fn(unsigned long data)
 void mce_timer_kick(unsigned long interval)
 {
 	struct timer_list *t = this_cpu_ptr(&mce_timer);
-	unsigned long when = jiffies + interval;
 	unsigned long iv = __this_cpu_read(mce_next_interval);
 
-	if (timer_pending(t)) {
-		if (time_before(when, t->expires))
-			mod_timer_pinned(t, when);
-	} else {
-		t->expires = round_jiffies(when);
-		add_timer_on(t, smp_processor_id());
-	}
+	__restart_timer(t, interval);
+
 	if (interval < iv)
 		__this_cpu_write(mce_next_interval, interval);
 }
@@ -1682,7 +1686,7 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
 	switch (c->x86_vendor) {
 	case X86_VENDOR_INTEL:
 		mce_intel_feature_init(c);
-		mce_adjust_timer = mce_intel_adjust_timer;
+		mce_adjust_timer = cmci_intel_adjust_timer;
 		break;
 	case X86_VENDOR_AMD:
 		mce_amd_feature_init(c);
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c
index b3c97bafc123..b4a41cf030ed 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c
@@ -38,6 +38,15 @@
  */
 static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned);
 
+/*
+ * CMCI storm detection backoff counter
+ *
+ * During storm, we reset this counter to INITIAL_CHECK_INTERVAL in case we've
+ * encountered an error. If not, we decrement it by one. We signal the end of
+ * the CMCI storm when it reaches 0.
+ */
+static DEFINE_PER_CPU(int, cmci_backoff_cnt);
+
 /*
  * cmci_discover_lock protects against parallel discovery attempts
  * which could race against each other.
@@ -46,7 +55,7 @@ static DEFINE_RAW_SPINLOCK(cmci_discover_lock);
 
 #define CMCI_THRESHOLD		1
 #define CMCI_POLL_INTERVAL	(30 * HZ)
-#define CMCI_STORM_INTERVAL	(1 * HZ)
+#define CMCI_STORM_INTERVAL	(HZ)
 #define CMCI_STORM_THRESHOLD	15
 
 static DEFINE_PER_CPU(unsigned long, cmci_time_stamp);
@@ -82,11 +91,21 @@ static int cmci_supported(int *banks)
 	return !!(cap & MCG_CMCI_P);
 }
 
-void mce_intel_cmci_poll(void)
+bool mce_intel_cmci_poll(void)
 {
 	if (__this_cpu_read(cmci_storm_state) == CMCI_STORM_NONE)
-		return;
-	machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_banks_owned));
+		return false;
+
+	/*
+	 * Reset the counter if we've logged an error in the last poll
+	 * during the storm.
+	 */
+	if (machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_banks_owned)))
+		this_cpu_write(cmci_backoff_cnt, INITIAL_CHECK_INTERVAL);
+	else
+		this_cpu_dec(cmci_backoff_cnt);
+
+	return true;
 }
 
 void mce_intel_hcpu_update(unsigned long cpu)
@@ -97,31 +116,32 @@ void mce_intel_hcpu_update(unsigned long cpu)
 	per_cpu(cmci_storm_state, cpu) = CMCI_STORM_NONE;
 }
 
-unsigned long mce_intel_adjust_timer(unsigned long interval)
+unsigned long cmci_intel_adjust_timer(unsigned long interval)
 {
-	int r;
-
-	if (interval < CMCI_POLL_INTERVAL)
-		return interval;
+	if ((this_cpu_read(cmci_backoff_cnt) > 0) &&
+	    (__this_cpu_read(cmci_storm_state) == CMCI_STORM_ACTIVE)) {
+		mce_notify_irq();
+		return CMCI_STORM_INTERVAL;
+	}
 
 	switch (__this_cpu_read(cmci_storm_state)) {
 	case CMCI_STORM_ACTIVE:
+
 		/*
 		 * We switch back to interrupt mode once the poll timer has
-		 * silenced itself. That means no events recorded and the
-		 * timer interval is back to our poll interval.
+		 * silenced itself. That means no events recorded and the timer
+		 * interval is back to our poll interval.
 		 */
 		__this_cpu_write(cmci_storm_state, CMCI_STORM_SUBSIDED);
-		r = atomic_sub_return(1, &cmci_storm_on_cpus);
-		if (r == 0)
+		if (!atomic_sub_return(1, &cmci_storm_on_cpus))
 			pr_notice("CMCI storm subsided: switching to interrupt mode\n");
+
 		/* FALLTHROUGH */
 
 	case CMCI_STORM_SUBSIDED:
 		/*
-		 * We wait for all cpus to go back to SUBSIDED
-		 * state. When that happens we switch back to
-		 * interrupt mode.
+		 * We wait for all CPUs to go back to SUBSIDED state. When that
+		 * happens we switch back to interrupt mode.
 		 */
 		if (!atomic_read(&cmci_storm_on_cpus)) {
 			__this_cpu_write(cmci_storm_state, CMCI_STORM_NONE);
@@ -130,10 +150,8 @@ unsigned long mce_intel_adjust_timer(unsigned long interval)
 		}
 		return CMCI_POLL_INTERVAL;
 	default:
-		/*
-		 * We have shiny weather. Let the poll do whatever it
-		 * thinks.
-		 */
+
+		/* We have shiny weather. Let the poll do whatever it thinks. */
 		return interval;
 	}
 }
@@ -178,7 +196,8 @@ static bool cmci_storm_detect(void)
 	cmci_storm_disable_banks();
 	__this_cpu_write(cmci_storm_state, CMCI_STORM_ACTIVE);
 	r = atomic_add_return(1, &cmci_storm_on_cpus);
-	mce_timer_kick(CMCI_POLL_INTERVAL);
+	mce_timer_kick(CMCI_STORM_INTERVAL);
+	this_cpu_write(cmci_backoff_cnt, INITIAL_CHECK_INTERVAL);
 
 	if (r == 1)
 		pr_notice("CMCI storm detected: switching to poll mode\n");
@@ -195,6 +214,7 @@ static void intel_threshold_interrupt(void)
 {
 	if (cmci_storm_detect())
 		return;
+
 	machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_banks_owned));
 	mce_notify_irq();
 }
@@ -286,6 +306,7 @@ void cmci_recheck(void)
 
 	if (!mce_available(raw_cpu_ptr(&cpu_info)) || !cmci_supported(&banks))
 		return;
+
 	local_irq_save(flags);
 	machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_banks_owned));
 	local_irq_restore(flags);
-- 
cgit v1.2.3


From 8af7043a3cd2c259e9f5d3158f3e736d06c05056 Mon Sep 17 00:00:00 2001
From: Derek Che
Date: Mon, 2 Feb 2015 10:30:21 -0800
Subject: x86/MCE: Make mce_panic() fatal machine check msg in the same pattern

There is another mce_panic call with "Fatal machine check on current CPU" in
the same mce.c file, why not keep them all in same pattern

	mce_panic("Fatal machine check on current CPU", &m, msg);

Signed-off-by: Derek Che <drc@yahoo-inc.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
---
 arch/x86/kernel/cpu/mcheck/mce.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index d60cbb8d78f7..aeeb93378cdb 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -817,7 +817,7 @@ static void mce_reign(void)
 	 * other CPUs.
 	 */
 	if (m && global_worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3)
-		mce_panic("Fatal Machine check", m, msg);
+		mce_panic("Fatal machine check", m, msg);
 
 	/*
 	 * For UC somewhere we let the CPU who detects it handle it.
@@ -830,7 +830,7 @@ static void mce_reign(void)
 	 * source or one CPU is hung. Panic.
 	 */
 	if (global_worst <= MCE_KEEP_SEVERITY && mca_cfg.tolerant < 3)
-		mce_panic("Machine check from unknown source", NULL, NULL);
+		mce_panic("Fatal machine check from unknown source", NULL, NULL);
 
 	/*
 	 * Now clear all the mces_seen so that they don't reappear on
-- 
cgit v1.2.3


From d79f931f1c8b6963e13a3738ef2906ba89bb8d12 Mon Sep 17 00:00:00 2001
From: Aravind Gopalakrishnan
Date: Mon, 2 Feb 2015 11:02:41 -0600
Subject: x86/MCE/AMD: Enable thresholding interrupts by default if supported

We setup APIC vectors for threshold errors if interrupt_capable.
However, we don't set interrupt_enable by default. Rework
threshold_restart_bank() so that when we set up lvt_offset, we also set
IntType to APIC and also enable thresholding interrupts for banks which
support it by default.

User is still allowed to disable interrupts through sysfs.

While at it, check if status is valid before we proceed to log error
using mce_log. This is because, in multi-node platforms, only the NBC
(Node Base Core, i.e. the first core in the node) has valid status info
in its MCA registers. So, the decoding of status values on the non-NBC
leads to noise on kernel logs like so:

  EDAC DEBUG: amd64_inject_write_store: section=0x80000000 word_bits=0x10020001
  [Hardware Error]: Corrected error, no action required.
  [Hardware Error]: CPU:25 (15:2:0) MC4_STATUS[-|CE|-|-|-
  [Hardware Error]: Corrected error, no action required.
  [Hardware Error]: CPU:26 (15:2:0) MC4_STATUS[-|CE|-|-|-
  <...>
  WARNING: CPU: 25 PID: 0 at drivers/edac/amd64_edac.c:2147 decode_bus_error+0x1ba/0x2a0()
  WARNING: CPU: 26 PID: 0 at drivers/edac/amd64_edac.c:2147 decode_bus_error+0x1ba/0x2a0()
  Something is rotten in the state of Denmark.

Suggested-by: Borislav Petkov <bp@suse.de>
Signed-off-by: Aravind Gopalakrishnan <aravind.gopalakrishnan@amd.com>
Link: http://lkml.kernel.org/r/1422896561-7695-1-git-send-email-aravind.gopalakrishnan@amd.com
[ Massage commit message. ]
Signed-off-by: Borislav Petkov <bp@suse.de>
---
 arch/x86/kernel/cpu/mcheck/mce_amd.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index 39d073c20a5f..55ad9b37cae8 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -250,6 +250,7 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
 			if (!b.interrupt_capable)
 				goto init;
 
+			b.interrupt_enable = 1;
 			new	= (high & MASK_LVTOFF_HI) >> 20;
 			offset  = setup_APIC_mce(offset, new);
 
@@ -322,6 +323,8 @@ static void amd_threshold_interrupt(void)
 log:
 	mce_setup(&m);
 	rdmsrl(MSR_IA32_MCx_STATUS(bank), m.status);
+	if (!(m.status & MCI_STATUS_VAL))
+		return;
 	m.misc = ((u64)high << 32) | low;
 	m.bank = bank;
 	mce_log(&m);
@@ -497,10 +500,12 @@ static int allocate_threshold_blocks(unsigned int cpu, unsigned int bank,
 	b->interrupt_capable	= lvt_interrupt_supported(bank, high);
 	b->threshold_limit	= THRESHOLD_MAX;
 
-	if (b->interrupt_capable)
+	if (b->interrupt_capable) {
 		threshold_ktype.default_attrs[2] = &interrupt_enable.attr;
-	else
+		b->interrupt_enable = 1;
+	} else {
 		threshold_ktype.default_attrs[2] = NULL;
+	}
 
 	INIT_LIST_HEAD(&b->miscj);
 
-- 
cgit v1.2.3


From 719d359dc7b6be3e43d6661f192ceb980b10ee26 Mon Sep 17 00:00:00 2001
From: Ross Zwisler
Date: Thu, 19 Feb 2015 10:37:28 -0700
Subject: x86/asm: Add support for the pcommit instruction

Add support for the new pcommit (persistent commit) instruction.
This instruction was announced in the document "Intel
Architecture Instruction Set Extensions Programming Reference"
with reference number 319433-022:

  https://software.intel.com/sites/default/files/managed/0d/53/319433-022.pdf

The pcommit instruction ensures that data that has been flushed
from the processor's cache hierarchy with clwb, clflushopt or
clflush is accepted to memory and is durable on the DIMM.  The
primary use case for this is persistent memory.

This function shows how to properly use clwb/clflushopt/clflush
and pcommit with appropriate fencing:

void flush_and_commit_buffer(void *vaddr, unsigned int size)
{
	void *vend = vaddr + size - 1;

	for (; vaddr < vend; vaddr += boot_cpu_data.x86_clflush_size)
		clwb(vaddr);

	/* Flush any possible final partial cacheline */
	clwb(vend);

	/*
	 * sfence to order clwb/clflushopt/clflush cache flushes
	 * mfence via mb() also works
	 */
	wmb();

	/* pcommit and the required sfence for ordering */
	pcommit_sfence();
}

After this function completes the data pointed to by vaddr is
has been accepted to memory and will be durable if the vaddr
points to persistent memory.

Pcommit must always be ordered by an mfence or sfence, so to
help simplify things we include both the pcommit and the
required sfence in the alternatives generated by
pcommit_sfence().  The other option is to keep them separated,
but on platforms that don't support pcommit this would then turn
into:

void flush_and_commit_buffer(void *vaddr, unsigned int size)
{
        void *vend = vaddr + size - 1;

        for (; vaddr < vend; vaddr += boot_cpu_data.x86_clflush_size)
                clwb(vaddr);

        /* Flush any possible final partial cacheline */
        clwb(vend);

        /*
         * sfence to order clwb/clflushopt/clflush cache flushes
         * mfence via mb() also works
         */
        wmb();

        nop(); /* from pcommit(), via alternatives */

        /*
         * sfence to order pcommit
         * mfence via mb() also works
         */
        wmb();
}

This is still correct, but now you've got two fences separated
by only a nop.  With the commit and the fence together in
pcommit_sfence() you avoid the final unneeded fence.

Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Acked-by: Borislav Petkov <bp@suse.de>
Acked-by: H. Peter Anvin <hpa@linux.intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1424367448-24254-1-git-send-email-ross.zwisler@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/cpufeature.h    |  1 +
 arch/x86/include/asm/special_insns.h | 10 ++++++++++
 2 files changed, 11 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 90a54851aedc..d6428ea5d316 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -231,6 +231,7 @@
 #define X86_FEATURE_RDSEED	( 9*32+18) /* The RDSEED instruction */
 #define X86_FEATURE_ADX		( 9*32+19) /* The ADCX and ADOX instructions */
 #define X86_FEATURE_SMAP	( 9*32+20) /* Supervisor Mode Access Prevention */
+#define X86_FEATURE_PCOMMIT	( 9*32+22) /* PCOMMIT instruction */
 #define X86_FEATURE_CLFLUSHOPT	( 9*32+23) /* CLFLUSHOPT instruction */
 #define X86_FEATURE_AVX512PF	( 9*32+26) /* AVX-512 Prefetch */
 #define X86_FEATURE_AVX512ER	( 9*32+27) /* AVX-512 Exponential and Reciprocal */
diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h
index e820c080a4e9..096250143832 100644
--- a/arch/x86/include/asm/special_insns.h
+++ b/arch/x86/include/asm/special_insns.h
@@ -4,6 +4,8 @@
 
 #ifdef __KERNEL__
 
+#include <asm/nops.h>
+
 static inline void native_clts(void)
 {
 	asm volatile("clts");
@@ -199,6 +201,14 @@ static inline void clflushopt(volatile void *__p)
 		       "+m" (*(volatile char __force *)__p));
 }
 
+static inline void pcommit_sfence(void)
+{
+	alternative(ASM_NOP7,
+		    ".byte 0x66, 0x0f, 0xae, 0xf8\n\t" /* pcommit */
+		    "sfence",
+		    X86_FEATURE_PCOMMIT);
+}
+
 #define nop() asm volatile ("nop")
 
 
-- 
cgit v1.2.3


From 650b7b23cb1e32d77daeefbac1ceb1329abf3b23 Mon Sep 17 00:00:00 2001
From: Petr Mladek
Date: Fri, 20 Feb 2015 15:07:29 +0100
Subject: kprobes/x86: Use 5-byte NOP when the code might be modified by ftrace

can_probe() checks if the given address points to the beginning
of an instruction. It analyzes all the instructions from the
beginning of the function until the given address. The code
might be modified by another Kprobe. In this case, the current
code is read into a buffer, int3 breakpoint is replaced by the
saved opcode in the buffer, and can_probe() analyzes the buffer
instead.

There is a bug that __recover_probed_insn() tries to restore
the original code even for Kprobes using the ftrace framework.
But in this case, the opcode is not stored. See the difference
between arch_prepare_kprobe() and arch_prepare_kprobe_ftrace().
The opcode is stored by arch_copy_kprobe() only from
arch_prepare_kprobe().

This patch makes Kprobe to use the ideal 5-byte NOP when the
code can be modified by ftrace. It is the original instruction,
see ftrace_make_nop() and ftrace_nop_replace().

Note that we always need to use the NOP for ftrace locations.
Kprobes do not block ftrace and the instruction might get
modified at anytime. It might even be in an inconsistent state
because it is modified step by step using the int3 breakpoint.

The patch also fixes indentation of the touched comment.

Note that I found this problem when playing with Kprobes. I did
it on x86_64 with gcc-4.8.3 that supported -mfentry. I modified
samples/kprobes/kprobe_example.c and added offset 5 to put
the probe right after the fentry area:

 static struct kprobe kp = {
 	.symbol_name	= "do_fork",
+	.offset = 5,
 };

Then I was able to load kprobe_example before jprobe_example
but not the other way around:

  $> modprobe jprobe_example
  $> modprobe kprobe_example
  modprobe: ERROR: could not insert 'kprobe_example': Invalid or incomplete multibyte or wide character

It did not make much sense and debugging pointed to the bug
described above.

Signed-off-by: Petr Mladek <pmladek@suse.cz>
Acked-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Cc: Ananth NMavinakayanahalli <ananth@in.ibm.com>
Cc: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
Cc: David S. Miller <davem@davemloft.net>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Jiri Kosina <jkosina@suse.cz>
Cc: Steven Rostedt <rostedt@goodmis.org>
Link: http://lkml.kernel.org/r/1424441250-27146-2-git-send-email-pmladek@suse.cz
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/kprobes/core.c | 42 ++++++++++++++++++++++++++++--------------
 1 file changed, 28 insertions(+), 14 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index 6a1146ea4d4d..c3b4b46b4797 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -223,27 +223,41 @@ static unsigned long
 __recover_probed_insn(kprobe_opcode_t *buf, unsigned long addr)
 {
 	struct kprobe *kp;
+	unsigned long faddr;
 
 	kp = get_kprobe((void *)addr);
-	/* There is no probe, return original address */
-	if (!kp)
+	faddr = ftrace_location(addr);
+	/*
+	 * Use the current code if it is not modified by Kprobe
+	 * and it cannot be modified by ftrace.
+	 */
+	if (!kp && !faddr)
 		return addr;
 
 	/*
-	 *  Basically, kp->ainsn.insn has an original instruction.
-	 *  However, RIP-relative instruction can not do single-stepping
-	 *  at different place, __copy_instruction() tweaks the displacement of
-	 *  that instruction. In that case, we can't recover the instruction
-	 *  from the kp->ainsn.insn.
+	 * Basically, kp->ainsn.insn has an original instruction.
+	 * However, RIP-relative instruction can not do single-stepping
+	 * at different place, __copy_instruction() tweaks the displacement of
+	 * that instruction. In that case, we can't recover the instruction
+	 * from the kp->ainsn.insn.
 	 *
-	 *  On the other hand, kp->opcode has a copy of the first byte of
-	 *  the probed instruction, which is overwritten by int3. And
-	 *  the instruction at kp->addr is not modified by kprobes except
-	 *  for the first byte, we can recover the original instruction
-	 *  from it and kp->opcode.
+	 * On the other hand, in case on normal Kprobe, kp->opcode has a copy
+	 * of the first byte of the probed instruction, which is overwritten
+	 * by int3. And the instruction at kp->addr is not modified by kprobes
+	 * except for the first byte, we can recover the original instruction
+	 * from it and kp->opcode.
+	 *
+	 * In case of Kprobes using ftrace, we do not have a copy of
+	 * the original instruction. In fact, the ftrace location might
+	 * be modified at anytime and even could be in an inconsistent state.
+	 * Fortunately, we know that the original code is the ideal 5-byte
+	 * long NOP.
 	 */
-	memcpy(buf, kp->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t));
-	buf[0] = kp->opcode;
+	memcpy(buf, (void *)addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t));
+	if (faddr)
+		memcpy(buf, ideal_nops[NOP_ATOMIC5], 5);
+	else
+		buf[0] = kp->opcode;
 	return (unsigned long)buf;
 }
 
-- 
cgit v1.2.3


From 2a6730c8b6e075adf826a89a3e2caa705807afdb Mon Sep 17 00:00:00 2001
From: Petr Mladek
Date: Fri, 20 Feb 2015 15:07:30 +0100
Subject: kprobes/x86: Check for invalid ftrace location in
 __recover_probed_insn()

__recover_probed_insn() should always be called from an address
where an instructions starts. The check for ftrace_location()
might help to discover a potential inconsistency.

This patch adds WARN_ON() when the inconsistency is detected.
Also it adds handling of the situation when the original code
can not get recovered.

Suggested-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Signed-off-by: Petr Mladek <pmladek@suse.cz>
Cc: Ananth NMavinakayanahalli <ananth@in.ibm.com>
Cc: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
Cc: David S. Miller <davem@davemloft.net>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Jiri Kosina <jkosina@suse.cz>
Cc: Steven Rostedt <rostedt@goodmis.org>
Link: http://lkml.kernel.org/r/1424441250-27146-3-git-send-email-pmladek@suse.cz
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/kprobes/core.c | 12 ++++++++++++
 arch/x86/kernel/kprobes/opt.c  |  2 ++
 2 files changed, 14 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index c3b4b46b4797..4e3d5a9621fe 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -227,6 +227,13 @@ __recover_probed_insn(kprobe_opcode_t *buf, unsigned long addr)
 
 	kp = get_kprobe((void *)addr);
 	faddr = ftrace_location(addr);
+	/*
+	 * Addresses inside the ftrace location are refused by
+	 * arch_check_ftrace_location(). Something went terribly wrong
+	 * if such an address is checked here.
+	 */
+	if (WARN_ON(faddr && faddr != addr))
+		return 0UL;
 	/*
 	 * Use the current code if it is not modified by Kprobe
 	 * and it cannot be modified by ftrace.
@@ -265,6 +272,7 @@ __recover_probed_insn(kprobe_opcode_t *buf, unsigned long addr)
  * Recover the probed instruction at addr for further analysis.
  * Caller must lock kprobes by kprobe_mutex, or disable preemption
  * for preventing to release referencing kprobes.
+ * Returns zero if the instruction can not get recovered.
  */
 unsigned long recover_probed_instruction(kprobe_opcode_t *buf, unsigned long addr)
 {
@@ -299,6 +307,8 @@ static int can_probe(unsigned long paddr)
 		 * normally used, we just go through if there is no kprobe.
 		 */
 		__addr = recover_probed_instruction(buf, addr);
+		if (!__addr)
+			return 0;
 		kernel_insn_init(&insn, (void *)__addr, MAX_INSN_SIZE);
 		insn_get_length(&insn);
 
@@ -347,6 +357,8 @@ int __copy_instruction(u8 *dest, u8 *src)
 	unsigned long recovered_insn =
 		recover_probed_instruction(buf, (unsigned long)src);
 
+	if (!recovered_insn)
+		return 0;
 	kernel_insn_init(&insn, (void *)recovered_insn, MAX_INSN_SIZE);
 	insn_get_length(&insn);
 	/* Another subsystem puts a breakpoint, failed to recover */
diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c
index 7c523bbf3dc8..3aef248ec1ee 100644
--- a/arch/x86/kernel/kprobes/opt.c
+++ b/arch/x86/kernel/kprobes/opt.c
@@ -259,6 +259,8 @@ static int can_optimize(unsigned long paddr)
 			 */
 			return 0;
 		recovered_insn = recover_probed_instruction(buf, addr);
+		if (!recovered_insn)
+			return 0;
 		kernel_insn_init(&insn, (void *)recovered_insn, MAX_INSN_SIZE);
 		insn_get_length(&insn);
 		/* Another subsystem puts a breakpoint */
-- 
cgit v1.2.3


From a927792c196f1c24410f3c12ccf45238a353783a Mon Sep 17 00:00:00 2001
From: Yannick Guerrini
Date: Sat, 21 Feb 2015 23:41:50 +0100
Subject: x86/cpu/intel: Fix trivial typo in intel_tlb_table[]

Change 'ssociative' to 'associative'

Signed-off-by: Yannick Guerrini <yguerrini@tomshardware.fr>
Cc: Borislav Petkov <bp@suse.de>
Cc: Bryan O'Donoghue <pure.logic@nexus-software.ie>
Cc: Chris Bainbridge <chris.bainbridge@gmail.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Steven Honeyman <stevenhoneyman@gmail.com>
Cc: trivial@kernel.org
Link: http://lkml.kernel.org/r/1424558510-1420-1-git-send-email-yguerrini@tomshardware.fr
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/intel.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 94d7dcb12145..50163fa9034f 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -565,8 +565,8 @@ static const struct _tlb_table intel_tlb_table[] = {
 	{ 0xb2, TLB_INST_4K,		64,	" TLB_INST 4KByte pages, 4-way set associative" },
 	{ 0xb3, TLB_DATA_4K,		128,	" TLB_DATA 4 KByte pages, 4-way set associative" },
 	{ 0xb4, TLB_DATA_4K,		256,	" TLB_DATA 4 KByte pages, 4-way associative" },
-	{ 0xb5, TLB_INST_4K,		64,	" TLB_INST 4 KByte pages, 8-way set ssociative" },
-	{ 0xb6, TLB_INST_4K,		128,	" TLB_INST 4 KByte pages, 8-way set ssociative" },
+	{ 0xb5, TLB_INST_4K,		64,	" TLB_INST 4 KByte pages, 8-way set associative" },
+	{ 0xb6, TLB_INST_4K,		128,	" TLB_INST 4 KByte pages, 8-way set associative" },
 	{ 0xba, TLB_DATA_4K,		64,	" TLB_DATA 4 KByte pages, 4-way associative" },
 	{ 0xc0, TLB_DATA_4K_4M,		8,	" TLB_DATA 4 KByte and 4 MByte pages, 4-way associative" },
 	{ 0xc1, STLB_4K_2M,		1024,	" STLB 4 KByte and 2 MByte pages, 8-way associative" },
-- 
cgit v1.2.3


From 338ea55579d1dbd83753b10db74da747b2f1998f Mon Sep 17 00:00:00 2001
From: Borislav Petkov
Date: Sun, 4 Jan 2015 11:49:56 +0100
Subject: x86/lib/copy_user_64.S: Remove FIX_ALIGNMENT define

It is unconditionally enabled so remove it. No object file change.

Signed-off-by: Borislav Petkov <bp@suse.de>
---
 arch/x86/lib/copy_user_64.S | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S
index dee945d55594..1530ec2c1b12 100644
--- a/arch/x86/lib/copy_user_64.S
+++ b/arch/x86/lib/copy_user_64.S
@@ -8,9 +8,6 @@
 
 #include <linux/linkage.h>
 #include <asm/dwarf2.h>
-
-#define FIX_ALIGNMENT 1
-
 #include <asm/current.h>
 #include <asm/asm-offsets.h>
 #include <asm/thread_info.h>
@@ -45,7 +42,6 @@
 	.endm
 
 	.macro ALIGN_DESTINATION
-#ifdef FIX_ALIGNMENT
 	/* check for bad alignment of destination */
 	movl %edi,%ecx
 	andl $7,%ecx
@@ -67,7 +63,6 @@
 
 	_ASM_EXTABLE(100b,103b)
 	_ASM_EXTABLE(101b,103b)
-#endif
 	.endm
 
 /* Standard copy_to_user with segment limit checking */
-- 
cgit v1.2.3


From db477a3386dee183130916d6bbf21f5828b0b2e2 Mon Sep 17 00:00:00 2001
From: Borislav Petkov
Date: Tue, 30 Dec 2014 20:27:09 +0100
Subject: x86/alternatives: Cleanup DPRINTK macro

Make it pass __func__ implicitly. Also, dump info about each replacing
we're doing. Fixup comments and style while at it.

Signed-off-by: Borislav Petkov <bp@suse.de>
---
 arch/x86/kernel/alternative.c | 41 +++++++++++++++++++++++++----------------
 1 file changed, 25 insertions(+), 16 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 703130f469ec..1e86e85bcf58 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -52,10 +52,10 @@ static int __init setup_noreplace_paravirt(char *str)
 __setup("noreplace-paravirt", setup_noreplace_paravirt);
 #endif
 
-#define DPRINTK(fmt, ...)				\
-do {							\
-	if (debug_alternative)				\
-		printk(KERN_DEBUG fmt, ##__VA_ARGS__);	\
+#define DPRINTK(fmt, args...)						\
+do {									\
+	if (debug_alternative)						\
+		printk(KERN_DEBUG "%s: " fmt "\n", __func__, ##args);	\
 } while (0)
 
 /*
@@ -243,12 +243,13 @@ extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
 extern s32 __smp_locks[], __smp_locks_end[];
 void *text_poke_early(void *addr, const void *opcode, size_t len);
 
-/* Replace instructions with better alternatives for this CPU type.
-   This runs before SMP is initialized to avoid SMP problems with
-   self modifying code. This implies that asymmetric systems where
-   APs have less capabilities than the boot processor are not handled.
-   Tough. Make sure you disable such features by hand. */
-
+/*
+ * Replace instructions with better alternatives for this CPU type. This runs
+ * before SMP is initialized to avoid SMP problems with self modifying code.
+ * This implies that asymmetric systems where APs have less capabilities than
+ * the boot processor are not handled. Tough. Make sure you disable such
+ * features by hand.
+ */
 void __init_or_module apply_alternatives(struct alt_instr *start,
 					 struct alt_instr *end)
 {
@@ -256,10 +257,10 @@ void __init_or_module apply_alternatives(struct alt_instr *start,
 	u8 *instr, *replacement;
 	u8 insnbuf[MAX_PATCH_LEN];
 
-	DPRINTK("%s: alt table %p -> %p\n", __func__, start, end);
+	DPRINTK("alt table %p -> %p", start, end);
 	/*
 	 * The scan order should be from start to end. A later scanned
-	 * alternative code can overwrite a previous scanned alternative code.
+	 * alternative code can overwrite previously scanned alternative code.
 	 * Some kernel functions (e.g. memcpy, memset, etc) use this order to
 	 * patch code.
 	 *
@@ -275,11 +276,19 @@ void __init_or_module apply_alternatives(struct alt_instr *start,
 		if (!boot_cpu_has(a->cpuid))
 			continue;
 
+		DPRINTK("feat: %d*32+%d, old: (%p, len: %d), repl: (%p, len: %d)",
+			a->cpuid >> 5,
+			a->cpuid & 0x1f,
+			instr, a->instrlen,
+			replacement, a->replacementlen);
+
 		memcpy(insnbuf, replacement, a->replacementlen);
 
 		/* 0xe8 is a relative jump; fix the offset. */
-		if (*insnbuf == 0xe8 && a->replacementlen == 5)
-		    *(s32 *)(insnbuf + 1) += replacement - instr;
+		if (*insnbuf == 0xe8 && a->replacementlen == 5) {
+			*(s32 *)(insnbuf + 1) += replacement - instr;
+			DPRINTK("Fix CALL offset: 0x%x", *(s32 *)(insnbuf + 1));
+		}
 
 		add_nops(insnbuf + a->replacementlen,
 			 a->instrlen - a->replacementlen);
@@ -371,8 +380,8 @@ void __init_or_module alternatives_smp_module_add(struct module *mod,
 	smp->locks_end	= locks_end;
 	smp->text	= text;
 	smp->text_end	= text_end;
-	DPRINTK("%s: locks %p -> %p, text %p -> %p, name %s\n",
-		__func__, smp->locks, smp->locks_end,
+	DPRINTK("locks %p -> %p, text %p -> %p, name %s\n",
+		smp->locks, smp->locks_end,
 		smp->text, smp->text_end, smp->name);
 
 	list_add_tail(&smp->next, &smp_alt_modules);
-- 
cgit v1.2.3


From 4332195c5615bf748624094ce4ff6797e475024d Mon Sep 17 00:00:00 2001
From: Borislav Petkov
Date: Sat, 27 Dec 2014 10:41:52 +0100
Subject: x86/alternatives: Add instruction padding

Up until now we have always paid attention to make sure the length of
the new instruction replacing the old one is at least less or equal to
the length of the old instruction. If the new instruction is longer, at
the time it replaces the old instruction it will overwrite the beginning
of the next instruction in the kernel image and cause your pants to
catch fire.

So instead of having to pay attention, teach the alternatives framework
to pad shorter old instructions with NOPs at buildtime - but only in the
case when

  len(old instruction(s)) < len(new instruction(s))

and add nothing in the >= case. (In that case we do add_nops() when
patching).

This way the alternatives user shouldn't have to care about instruction
sizes and simply use the macros.

Add asm ALTERNATIVE* flavor macros too, while at it.

Also, we need to save the pad length in a separate struct alt_instr
member for NOP optimization and the way to do that reliably is to carry
the pad length instead of trying to detect whether we're looking at
single-byte NOPs or at pathological instruction offsets like e9 90 90 90
90, for example, which is a valid instruction.

Thanks to Michael Matz for the great help with toolchain questions.

Signed-off-by: Borislav Petkov <bp@suse.de>
---
 arch/x86/include/asm/alternative-asm.h | 43 +++++++++++++++++++++-
 arch/x86/include/asm/alternative.h     | 65 +++++++++++++++++++++-------------
 arch/x86/include/asm/cpufeature.h      | 22 ++++++++----
 arch/x86/include/asm/smap.h            |  4 +--
 arch/x86/kernel/alternative.c          |  6 ++--
 arch/x86/kernel/entry_32.S             |  2 +-
 arch/x86/lib/clear_page_64.S           |  4 +--
 arch/x86/lib/copy_page_64.S            |  2 +-
 arch/x86/lib/copy_user_64.S            |  4 +--
 arch/x86/lib/memcpy_64.S               |  4 +--
 arch/x86/lib/memmove_64.S              |  2 +-
 arch/x86/lib/memset_64.S               |  4 +--
 12 files changed, 114 insertions(+), 48 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/alternative-asm.h b/arch/x86/include/asm/alternative-asm.h
index 372231c22a47..524bddce0b76 100644
--- a/arch/x86/include/asm/alternative-asm.h
+++ b/arch/x86/include/asm/alternative-asm.h
@@ -18,12 +18,53 @@
 	.endm
 #endif
 
-.macro altinstruction_entry orig alt feature orig_len alt_len
+.macro altinstruction_entry orig alt feature orig_len alt_len pad_len
 	.long \orig - .
 	.long \alt - .
 	.word \feature
 	.byte \orig_len
 	.byte \alt_len
+	.byte \pad_len
+.endm
+
+.macro ALTERNATIVE oldinstr, newinstr, feature
+140:
+	\oldinstr
+141:
+	.skip -(((144f-143f)-(141b-140b)) > 0) * ((144f-143f)-(141b-140b)),0x90
+142:
+
+	.pushsection .altinstructions,"a"
+	altinstruction_entry 140b,143f,\feature,142b-140b,144f-143f,142b-141b
+	.popsection
+
+	.pushsection .altinstr_replacement,"ax"
+143:
+	\newinstr
+144:
+	.popsection
+.endm
+
+.macro ALTERNATIVE_2 oldinstr, newinstr1, feature1, newinstr2, feature2
+140:
+	\oldinstr
+141:
+	.skip -(((144f-143f)-(141b-140b)) > 0) * ((144f-143f)-(141b-140b)),0x90
+	.skip -(((145f-144f)-(144f-143f)-(141b-140b)) > 0) * ((145f-144f)-(144f-143f)-(141b-140b)),0x90
+142:
+
+	.pushsection .altinstructions,"a"
+	altinstruction_entry 140b,143f,\feature1,142b-140b,144f-143f,142b-141b
+	altinstruction_entry 140b,144f,\feature2,142b-140b,145f-144f,142b-141b
+	.popsection
+
+	.pushsection .altinstr_replacement,"ax"
+143:
+	\newinstr1
+144:
+	\newinstr2
+145:
+	.popsection
 .endm
 
 #endif  /*  __ASSEMBLY__  */
diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index 473bdbee378a..5aef6a97d80e 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -48,8 +48,9 @@ struct alt_instr {
 	s32 repl_offset;	/* offset to replacement instruction */
 	u16 cpuid;		/* cpuid bit set for replacement */
 	u8  instrlen;		/* length of original instruction */
-	u8  replacementlen;	/* length of new instruction, <= instrlen */
-};
+	u8  replacementlen;	/* length of new instruction */
+	u8  padlen;		/* length of build-time padding */
+} __packed;
 
 extern void alternative_instructions(void);
 extern void apply_alternatives(struct alt_instr *start, struct alt_instr *end);
@@ -76,50 +77,61 @@ static inline int alternatives_text_reserved(void *start, void *end)
 }
 #endif	/* CONFIG_SMP */
 
-#define OLDINSTR(oldinstr)	"661:\n\t" oldinstr "\n662:\n"
+#define b_replacement(num)	"664"#num
+#define e_replacement(num)	"665"#num
 
-#define b_replacement(number)	"663"#number
-#define e_replacement(number)	"664"#number
+#define alt_end_marker		"663"
+#define alt_slen		"662b-661b"
+#define alt_pad_len		alt_end_marker"b-662b"
+#define alt_total_slen		alt_end_marker"b-661b"
+#define alt_rlen(num)		e_replacement(num)"f-"b_replacement(num)"f"
 
-#define alt_slen "662b-661b"
-#define alt_rlen(number) e_replacement(number)"f-"b_replacement(number)"f"
+#define __OLDINSTR(oldinstr, num)					\
+	"661:\n\t" oldinstr "\n662:\n"					\
+	".skip -(((" alt_rlen(num) ")-(" alt_slen ")) > 0) * "		\
+		"((" alt_rlen(num) ")-(" alt_slen ")),0x90\n"
 
-#define ALTINSTR_ENTRY(feature, number)					      \
+#define OLDINSTR(oldinstr, num)						\
+	__OLDINSTR(oldinstr, num)					\
+	alt_end_marker ":\n"
+
+/*
+ * Pad the second replacement alternative with additional NOPs if it is
+ * additionally longer than the first replacement alternative.
+ */
+#define OLDINSTR_2(oldinstr, num1, num2)					\
+	__OLDINSTR(oldinstr, num1)						\
+	".skip -(((" alt_rlen(num2) ")-(" alt_rlen(num1) ")-(662b-661b)) > 0) * " \
+		"((" alt_rlen(num2) ")-(" alt_rlen(num1) ")-(662b-661b)),0x90\n"  \
+	alt_end_marker ":\n"
+
+#define ALTINSTR_ENTRY(feature, num)					      \
 	" .long 661b - .\n"				/* label           */ \
-	" .long " b_replacement(number)"f - .\n"	/* new instruction */ \
+	" .long " b_replacement(num)"f - .\n"		/* new instruction */ \
 	" .word " __stringify(feature) "\n"		/* feature bit     */ \
-	" .byte " alt_slen "\n"				/* source len      */ \
-	" .byte " alt_rlen(number) "\n"			/* replacement len */
+	" .byte " alt_total_slen "\n"			/* source len      */ \
+	" .byte " alt_rlen(num) "\n"			/* replacement len */ \
+	" .byte " alt_pad_len "\n"			/* pad len */
 
-#define DISCARD_ENTRY(number)				/* rlen <= slen */    \
-	" .byte 0xff + (" alt_rlen(number) ") - (" alt_slen ")\n"
-
-#define ALTINSTR_REPLACEMENT(newinstr, feature, number)	/* replacement */     \
-	b_replacement(number)":\n\t" newinstr "\n" e_replacement(number) ":\n\t"
+#define ALTINSTR_REPLACEMENT(newinstr, feature, num)	/* replacement */     \
+	b_replacement(num)":\n\t" newinstr "\n" e_replacement(num) ":\n\t"
 
 /* alternative assembly primitive: */
 #define ALTERNATIVE(oldinstr, newinstr, feature)			\
-	OLDINSTR(oldinstr)						\
+	OLDINSTR(oldinstr, 1)						\
 	".pushsection .altinstructions,\"a\"\n"				\
 	ALTINSTR_ENTRY(feature, 1)					\
 	".popsection\n"							\
-	".pushsection .discard,\"aw\",@progbits\n"			\
-	DISCARD_ENTRY(1)						\
-	".popsection\n"							\
 	".pushsection .altinstr_replacement, \"ax\"\n"			\
 	ALTINSTR_REPLACEMENT(newinstr, feature, 1)			\
 	".popsection"
 
 #define ALTERNATIVE_2(oldinstr, newinstr1, feature1, newinstr2, feature2)\
-	OLDINSTR(oldinstr)						\
+	OLDINSTR_2(oldinstr, 1, 2)					\
 	".pushsection .altinstructions,\"a\"\n"				\
 	ALTINSTR_ENTRY(feature1, 1)					\
 	ALTINSTR_ENTRY(feature2, 2)					\
 	".popsection\n"							\
-	".pushsection .discard,\"aw\",@progbits\n"			\
-	DISCARD_ENTRY(1)						\
-	DISCARD_ENTRY(2)						\
-	".popsection\n"							\
 	".pushsection .altinstr_replacement, \"ax\"\n"			\
 	ALTINSTR_REPLACEMENT(newinstr1, feature1, 1)			\
 	ALTINSTR_REPLACEMENT(newinstr2, feature2, 2)			\
@@ -146,6 +158,9 @@ static inline int alternatives_text_reserved(void *start, void *end)
 #define alternative(oldinstr, newinstr, feature)			\
 	asm volatile (ALTERNATIVE(oldinstr, newinstr, feature) : : : "memory")
 
+#define alternative_2(oldinstr, newinstr1, feature1, newinstr2, feature2) \
+	asm volatile(ALTERNATIVE_2(oldinstr, newinstr1, feature1, newinstr2, feature2) ::: "memory")
+
 /*
  * Alternative inline assembly with input.
  *
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 90a54851aedc..9b1df43dadbb 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -418,6 +418,7 @@ static __always_inline __pure bool __static_cpu_has(u16 bit)
 			 " .word %P0\n"		/* 1: do replace */
 			 " .byte 2b - 1b\n"	/* source len */
 			 " .byte 0\n"		/* replacement len */
+			 " .byte 0\n"		/* pad len */
 			 ".previous\n"
 			 /* skipping size check since replacement size = 0 */
 			 : : "i" (X86_FEATURE_ALWAYS) : : t_warn);
@@ -432,6 +433,7 @@ static __always_inline __pure bool __static_cpu_has(u16 bit)
 			 " .word %P0\n"		/* feature bit */
 			 " .byte 2b - 1b\n"	/* source len */
 			 " .byte 0\n"		/* replacement len */
+			 " .byte 0\n"		/* pad len */
 			 ".previous\n"
 			 /* skipping size check since replacement size = 0 */
 			 : : "i" (bit) : : t_no);
@@ -457,6 +459,7 @@ static __always_inline __pure bool __static_cpu_has(u16 bit)
 			     " .word %P1\n"		/* feature bit */
 			     " .byte 2b - 1b\n"		/* source len */
 			     " .byte 4f - 3f\n"		/* replacement len */
+			     " .byte 0\n"		/* pad len */
 			     ".previous\n"
 			     ".section .discard,\"aw\",@progbits\n"
 			     " .byte 0xff + (4f-3f) - (2b-1b)\n" /* size check */
@@ -491,23 +494,28 @@ static __always_inline __pure bool _static_cpu_has_safe(u16 bit)
  */
 		asm_volatile_goto("1: .byte 0xe9\n .long %l[t_dynamic] - 2f\n"
 			 "2:\n"
+			 ".skip -(((5f-4f) - (2b-1b)) > 0) * "
+			         "((5f-4f) - (2b-1b)),0x90\n"
+			 "3:\n"
 			 ".section .altinstructions,\"a\"\n"
 			 " .long 1b - .\n"		/* src offset */
-			 " .long 3f - .\n"		/* repl offset */
+			 " .long 4f - .\n"		/* repl offset */
 			 " .word %P1\n"			/* always replace */
-			 " .byte 2b - 1b\n"		/* src len */
-			 " .byte 4f - 3f\n"		/* repl len */
+			 " .byte 3b - 1b\n"		/* src len */
+			 " .byte 5f - 4f\n"		/* repl len */
+			 " .byte 3b - 2b\n"		/* pad len */
 			 ".previous\n"
 			 ".section .altinstr_replacement,\"ax\"\n"
-			 "3: .byte 0xe9\n .long %l[t_no] - 2b\n"
-			 "4:\n"
+			 "4: .byte 0xe9\n .long %l[t_no] - 2b\n"
+			 "5:\n"
 			 ".previous\n"
 			 ".section .altinstructions,\"a\"\n"
 			 " .long 1b - .\n"		/* src offset */
 			 " .long 0\n"			/* no replacement */
 			 " .word %P0\n"			/* feature bit */
-			 " .byte 2b - 1b\n"		/* src len */
+			 " .byte 3b - 1b\n"		/* src len */
 			 " .byte 0\n"			/* repl len */
+			 " .byte 0\n"			/* pad len */
 			 ".previous\n"
 			 : : "i" (bit), "i" (X86_FEATURE_ALWAYS)
 			 : : t_dynamic, t_no);
@@ -527,6 +535,7 @@ static __always_inline __pure bool _static_cpu_has_safe(u16 bit)
 			     " .word %P2\n"		/* always replace */
 			     " .byte 2b - 1b\n"		/* source len */
 			     " .byte 4f - 3f\n"		/* replacement len */
+			     " .byte 0\n"		/* pad len */
 			     ".previous\n"
 			     ".section .discard,\"aw\",@progbits\n"
 			     " .byte 0xff + (4f-3f) - (2b-1b)\n" /* size check */
@@ -541,6 +550,7 @@ static __always_inline __pure bool _static_cpu_has_safe(u16 bit)
 			     " .word %P1\n"		/* feature bit */
 			     " .byte 4b - 3b\n"		/* src len */
 			     " .byte 6f - 5f\n"		/* repl len */
+			     " .byte 0\n"		/* pad len */
 			     ".previous\n"
 			     ".section .discard,\"aw\",@progbits\n"
 			     " .byte 0xff + (6f-5f) - (4b-3b)\n" /* size check */
diff --git a/arch/x86/include/asm/smap.h b/arch/x86/include/asm/smap.h
index 8d3120f4e270..c56cb4f37be9 100644
--- a/arch/x86/include/asm/smap.h
+++ b/arch/x86/include/asm/smap.h
@@ -33,7 +33,7 @@
 	662: __ASM_CLAC ;						\
 	.popsection ;							\
 	.pushsection .altinstructions, "a" ;				\
-	altinstruction_entry 661b, 662b, X86_FEATURE_SMAP, 3, 3 ;	\
+	altinstruction_entry 661b, 662b, X86_FEATURE_SMAP, 3, 3, 0 ;	\
 	.popsection
 
 #define ASM_STAC							\
@@ -42,7 +42,7 @@
 	662: __ASM_STAC ;						\
 	.popsection ;							\
 	.pushsection .altinstructions, "a" ;				\
-	altinstruction_entry 661b, 662b, X86_FEATURE_SMAP, 3, 3 ;	\
+	altinstruction_entry 661b, 662b, X86_FEATURE_SMAP, 3, 3, 0 ;	\
 	.popsection
 
 #else /* CONFIG_X86_SMAP */
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 1e86e85bcf58..c99b0f13a90e 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -270,7 +270,6 @@ void __init_or_module apply_alternatives(struct alt_instr *start,
 	for (a = start; a < end; a++) {
 		instr = (u8 *)&a->instr_offset + a->instr_offset;
 		replacement = (u8 *)&a->repl_offset + a->repl_offset;
-		BUG_ON(a->replacementlen > a->instrlen);
 		BUG_ON(a->instrlen > sizeof(insnbuf));
 		BUG_ON(a->cpuid >= (NCAPINTS + NBUGINTS) * 32);
 		if (!boot_cpu_has(a->cpuid))
@@ -290,8 +289,9 @@ void __init_or_module apply_alternatives(struct alt_instr *start,
 			DPRINTK("Fix CALL offset: 0x%x", *(s32 *)(insnbuf + 1));
 		}
 
-		add_nops(insnbuf + a->replacementlen,
-			 a->instrlen - a->replacementlen);
+		if (a->instrlen > a->replacementlen)
+			add_nops(insnbuf + a->replacementlen,
+				 a->instrlen - a->replacementlen);
 
 		text_poke_early(instr, insnbuf, a->instrlen);
 	}
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 000d4199b03e..d23c9a1d40e1 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -819,7 +819,7 @@ ENTRY(simd_coprocessor_error)
 661:	pushl_cfi $do_general_protection
 662:
 .section .altinstructions,"a"
-	altinstruction_entry 661b, 663f, X86_FEATURE_XMM, 662b-661b, 664f-663f
+	altinstruction_entry 661b, 663f, X86_FEATURE_XMM, 662b-661b, 664f-663f, 0
 .previous
 .section .altinstr_replacement,"ax"
 663:	pushl $do_simd_coprocessor_error
diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S
index f2145cfa12a6..38e57faefd71 100644
--- a/arch/x86/lib/clear_page_64.S
+++ b/arch/x86/lib/clear_page_64.S
@@ -67,7 +67,7 @@ ENDPROC(clear_page)
 	.previous
 	.section .altinstructions,"a"
 	altinstruction_entry clear_page,1b,X86_FEATURE_REP_GOOD,\
-			     .Lclear_page_end-clear_page, 2b-1b
+			     .Lclear_page_end-clear_page, 2b-1b, 0
 	altinstruction_entry clear_page,2b,X86_FEATURE_ERMS,   \
-			     .Lclear_page_end-clear_page,3b-2b
+			     .Lclear_page_end-clear_page,3b-2b, 0
 	.previous
diff --git a/arch/x86/lib/copy_page_64.S b/arch/x86/lib/copy_page_64.S
index 176cca67212b..f1ffdbb07755 100644
--- a/arch/x86/lib/copy_page_64.S
+++ b/arch/x86/lib/copy_page_64.S
@@ -106,5 +106,5 @@ ENDPROC(copy_page)
 	.previous
 	.section .altinstructions,"a"
 	altinstruction_entry copy_page, 1b, X86_FEATURE_REP_GOOD,	\
-		.Lcopy_page_end-copy_page, 2b-1b
+		.Lcopy_page_end-copy_page, 2b-1b, 0
 	.previous
diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S
index 1530ec2c1b12..a9aedd6aa7f7 100644
--- a/arch/x86/lib/copy_user_64.S
+++ b/arch/x86/lib/copy_user_64.S
@@ -36,8 +36,8 @@
 	.previous
 
 	.section .altinstructions,"a"
-	altinstruction_entry 0b,2b,\feature1,5,5
-	altinstruction_entry 0b,3b,\feature2,5,5
+	altinstruction_entry 0b,2b,\feature1,5,5,0
+	altinstruction_entry 0b,3b,\feature2,5,5,0
 	.previous
 	.endm
 
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index 89b53c9968e7..bbfdacc01760 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -202,7 +202,7 @@ ENDPROC(__memcpy)
 	 */
 	.section .altinstructions, "a"
 	altinstruction_entry __memcpy,.Lmemcpy_c,X86_FEATURE_REP_GOOD,\
-			     .Lmemcpy_e-.Lmemcpy_c,.Lmemcpy_e-.Lmemcpy_c
+			     .Lmemcpy_e-.Lmemcpy_c,.Lmemcpy_e-.Lmemcpy_c,0
 	altinstruction_entry __memcpy,.Lmemcpy_c_e,X86_FEATURE_ERMS, \
-			     .Lmemcpy_e_e-.Lmemcpy_c_e,.Lmemcpy_e_e-.Lmemcpy_c_e
+			     .Lmemcpy_e_e-.Lmemcpy_c_e,.Lmemcpy_e_e-.Lmemcpy_c_e,0
 	.previous
diff --git a/arch/x86/lib/memmove_64.S b/arch/x86/lib/memmove_64.S
index 9c4b530575da..bbfa6b269ece 100644
--- a/arch/x86/lib/memmove_64.S
+++ b/arch/x86/lib/memmove_64.S
@@ -221,7 +221,7 @@ ENTRY(__memmove)
 	altinstruction_entry .Lmemmove_begin_forward,		\
 		.Lmemmove_begin_forward_efs,X86_FEATURE_ERMS,	\
 		.Lmemmove_end_forward-.Lmemmove_begin_forward,	\
-		.Lmemmove_end_forward_efs-.Lmemmove_begin_forward_efs
+		.Lmemmove_end_forward_efs-.Lmemmove_begin_forward_efs,0
 	.previous
 ENDPROC(__memmove)
 ENDPROC(memmove)
diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S
index 6f44935c6a60..f6153c1cdddc 100644
--- a/arch/x86/lib/memset_64.S
+++ b/arch/x86/lib/memset_64.S
@@ -150,7 +150,7 @@ ENDPROC(__memset)
 	 */
 	.section .altinstructions,"a"
 	altinstruction_entry __memset,.Lmemset_c,X86_FEATURE_REP_GOOD,\
-			     .Lfinal-__memset,.Lmemset_e-.Lmemset_c
+			     .Lfinal-__memset,.Lmemset_e-.Lmemset_c,0
 	altinstruction_entry __memset,.Lmemset_c_e,X86_FEATURE_ERMS, \
-			     .Lfinal-__memset,.Lmemset_e_e-.Lmemset_c_e
+			     .Lfinal-__memset,.Lmemset_e_e-.Lmemset_c_e,0
 	.previous
-- 
cgit v1.2.3


From 48c7a2509f9e237d8465399d9cdfe487d3212a23 Mon Sep 17 00:00:00 2001
From: Borislav Petkov
Date: Mon, 5 Jan 2015 13:48:41 +0100
Subject: x86/alternatives: Make JMPs more robust

Up until now we had to pay attention to relative JMPs in alternatives
about how their relative offset gets computed so that the jump target
is still correct. Or, as it is the case for near CALLs (opcode e8), we
still have to go and readjust the offset at patching time.

What is more, the static_cpu_has_safe() facility had to forcefully
generate 5-byte JMPs since we couldn't rely on the compiler to generate
properly sized ones so we had to force the longest ones. Worse than
that, sometimes it would generate a replacement JMP which is longer than
the original one, thus overwriting the beginning of the next instruction
at patching time.

So, in order to alleviate all that and make using JMPs more
straight-forward we go and pad the original instruction in an
alternative block with NOPs at build time, should the replacement(s) be
longer. This way, alternatives users shouldn't pay special attention
so that original and replacement instruction sizes are fine but the
assembler would simply add padding where needed and not do anything
otherwise.

As a second aspect, we go and recompute JMPs at patching time so that we
can try to make 5-byte JMPs into two-byte ones if possible. If not, we
still have to recompute the offsets as the replacement JMP gets put far
away in the .altinstr_replacement section leading to a wrong offset if
copied verbatim.

For example, on a locally generated kernel image

  old insn VA: 0xffffffff810014bd, CPU feat: X86_FEATURE_ALWAYS, size: 2
  __switch_to:
   ffffffff810014bd:      eb 21                   jmp ffffffff810014e0
  repl insn: size: 5
  ffffffff81d0b23c:       e9 b1 62 2f ff          jmpq ffffffff810014f2

gets corrected to a 2-byte JMP:

  apply_alternatives: feat: 3*32+21, old: (ffffffff810014bd, len: 2), repl: (ffffffff81d0b23c, len: 5)
  alt_insn: e9 b1 62 2f ff
  recompute_jumps: next_rip: ffffffff81d0b241, tgt_rip: ffffffff810014f2, new_displ: 0x00000033, ret len: 2
  converted to: eb 33 90 90 90

and a 5-byte JMP:

  old insn VA: 0xffffffff81001516, CPU feat: X86_FEATURE_ALWAYS, size: 2
  __switch_to:
   ffffffff81001516:      eb 30                   jmp ffffffff81001548
  repl insn: size: 5
   ffffffff81d0b241:      e9 10 63 2f ff          jmpq ffffffff81001556

gets shortened into a two-byte one:

  apply_alternatives: feat: 3*32+21, old: (ffffffff81001516, len: 2), repl: (ffffffff81d0b241, len: 5)
  alt_insn: e9 10 63 2f ff
  recompute_jumps: next_rip: ffffffff81d0b246, tgt_rip: ffffffff81001556, new_displ: 0x0000003e, ret len: 2
  converted to: eb 3e 90 90 90

... and so on.

This leads to a net win of around

40ish replacements * 3 bytes savings =~ 120 bytes of I$

on an AMD guest which means some savings of precious instruction cache
bandwidth. The padding to the shorter 2-byte JMPs are single-byte NOPs
which on smart microarchitectures means discarding NOPs at decode time
and thus freeing up execution bandwidth.

Signed-off-by: Borislav Petkov <bp@suse.de>
---
 arch/x86/include/asm/cpufeature.h |  10 +---
 arch/x86/kernel/alternative.c     | 103 ++++++++++++++++++++++++++++++++++++--
 arch/x86/lib/copy_user_64.S       |  11 ++--
 3 files changed, 105 insertions(+), 19 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 9b1df43dadbb..e0571a24d582 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -486,13 +486,7 @@ static __always_inline __pure bool __static_cpu_has(u16 bit)
 static __always_inline __pure bool _static_cpu_has_safe(u16 bit)
 {
 #ifdef CC_HAVE_ASM_GOTO
-/*
- * We need to spell the jumps to the compiler because, depending on the offset,
- * the replacement jump can be bigger than the original jump, and this we cannot
- * have. Thus, we force the jump to the widest, 4-byte, signed relative
- * offset even though the last would often fit in less bytes.
- */
-		asm_volatile_goto("1: .byte 0xe9\n .long %l[t_dynamic] - 2f\n"
+		asm_volatile_goto("1: jmp %l[t_dynamic]\n"
 			 "2:\n"
 			 ".skip -(((5f-4f) - (2b-1b)) > 0) * "
 			         "((5f-4f) - (2b-1b)),0x90\n"
@@ -506,7 +500,7 @@ static __always_inline __pure bool _static_cpu_has_safe(u16 bit)
 			 " .byte 3b - 2b\n"		/* pad len */
 			 ".previous\n"
 			 ".section .altinstr_replacement,\"ax\"\n"
-			 "4: .byte 0xe9\n .long %l[t_no] - 2b\n"
+			 "4: jmp %l[t_no]\n"
 			 "5:\n"
 			 ".previous\n"
 			 ".section .altinstructions,\"a\"\n"
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index c99b0f13a90e..715af37bf008 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -58,6 +58,21 @@ do {									\
 		printk(KERN_DEBUG "%s: " fmt "\n", __func__, ##args);	\
 } while (0)
 
+#define DUMP_BYTES(buf, len, fmt, args...)				\
+do {									\
+	if (unlikely(debug_alternative)) {				\
+		int j;							\
+									\
+		if (!(len))						\
+			break;						\
+									\
+		printk(KERN_DEBUG fmt, ##args);				\
+		for (j = 0; j < (len) - 1; j++)				\
+			printk(KERN_CONT "%02hhx ", buf[j]);		\
+		printk(KERN_CONT "%02hhx\n", buf[j]);			\
+	}								\
+} while (0)
+
 /*
  * Each GENERIC_NOPX is of X bytes, and defined as an array of bytes
  * that correspond to that nop. Getting from one nop to the next, we
@@ -243,6 +258,71 @@ extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
 extern s32 __smp_locks[], __smp_locks_end[];
 void *text_poke_early(void *addr, const void *opcode, size_t len);
 
+/*
+ * Are we looking at a near JMP with a 1 or 4-byte displacement.
+ */
+static inline bool is_jmp(const u8 opcode)
+{
+	return opcode == 0xeb || opcode == 0xe9;
+}
+
+static void __init_or_module
+recompute_jump(struct alt_instr *a, u8 *orig_insn, u8 *repl_insn, u8 *insnbuf)
+{
+	u8 *next_rip, *tgt_rip;
+	s32 n_dspl, o_dspl;
+	int repl_len;
+
+	if (a->replacementlen != 5)
+		return;
+
+	o_dspl = *(s32 *)(insnbuf + 1);
+
+	/* next_rip of the replacement JMP */
+	next_rip = repl_insn + a->replacementlen;
+	/* target rip of the replacement JMP */
+	tgt_rip  = next_rip + o_dspl;
+	n_dspl = tgt_rip - orig_insn;
+
+	DPRINTK("target RIP: %p, new_displ: 0x%x", tgt_rip, n_dspl);
+
+	if (tgt_rip - orig_insn >= 0) {
+		if (n_dspl - 2 <= 127)
+			goto two_byte_jmp;
+		else
+			goto five_byte_jmp;
+	/* negative offset */
+	} else {
+		if (((n_dspl - 2) & 0xff) == (n_dspl - 2))
+			goto two_byte_jmp;
+		else
+			goto five_byte_jmp;
+	}
+
+two_byte_jmp:
+	n_dspl -= 2;
+
+	insnbuf[0] = 0xeb;
+	insnbuf[1] = (s8)n_dspl;
+	add_nops(insnbuf + 2, 3);
+
+	repl_len = 2;
+	goto done;
+
+five_byte_jmp:
+	n_dspl -= 5;
+
+	insnbuf[0] = 0xe9;
+	*(s32 *)&insnbuf[1] = n_dspl;
+
+	repl_len = 5;
+
+done:
+
+	DPRINTK("final displ: 0x%08x, JMP 0x%lx",
+		n_dspl, (unsigned long)orig_insn + n_dspl + repl_len);
+}
+
 /*
  * Replace instructions with better alternatives for this CPU type. This runs
  * before SMP is initialized to avoid SMP problems with self modifying code.
@@ -268,6 +348,8 @@ void __init_or_module apply_alternatives(struct alt_instr *start,
 	 * order.
 	 */
 	for (a = start; a < end; a++) {
+		int insnbuf_sz = 0;
+
 		instr = (u8 *)&a->instr_offset + a->instr_offset;
 		replacement = (u8 *)&a->repl_offset + a->repl_offset;
 		BUG_ON(a->instrlen > sizeof(insnbuf));
@@ -281,24 +363,35 @@ void __init_or_module apply_alternatives(struct alt_instr *start,
 			instr, a->instrlen,
 			replacement, a->replacementlen);
 
+		DUMP_BYTES(instr, a->instrlen, "%p: old_insn: ", instr);
+		DUMP_BYTES(replacement, a->replacementlen, "%p: rpl_insn: ", replacement);
+
 		memcpy(insnbuf, replacement, a->replacementlen);
+		insnbuf_sz = a->replacementlen;
 
 		/* 0xe8 is a relative jump; fix the offset. */
 		if (*insnbuf == 0xe8 && a->replacementlen == 5) {
 			*(s32 *)(insnbuf + 1) += replacement - instr;
-			DPRINTK("Fix CALL offset: 0x%x", *(s32 *)(insnbuf + 1));
+			DPRINTK("Fix CALL offset: 0x%x, CALL 0x%lx",
+				*(s32 *)(insnbuf + 1),
+				(unsigned long)instr + *(s32 *)(insnbuf + 1) + 5);
 		}
 
-		if (a->instrlen > a->replacementlen)
+		if (a->replacementlen && is_jmp(replacement[0]))
+			recompute_jump(a, instr, replacement, insnbuf);
+
+		if (a->instrlen > a->replacementlen) {
 			add_nops(insnbuf + a->replacementlen,
 				 a->instrlen - a->replacementlen);
+			insnbuf_sz += a->instrlen - a->replacementlen;
+		}
+		DUMP_BYTES(insnbuf, insnbuf_sz, "%p: final_insn: ", instr);
 
-		text_poke_early(instr, insnbuf, a->instrlen);
+		text_poke_early(instr, insnbuf, insnbuf_sz);
 	}
 }
 
 #ifdef CONFIG_SMP
-
 static void alternatives_smp_lock(const s32 *start, const s32 *end,
 				  u8 *text, u8 *text_end)
 {
@@ -449,7 +542,7 @@ int alternatives_text_reserved(void *start, void *end)
 
 	return 0;
 }
-#endif
+#endif /* CONFIG_SMP */
 
 #ifdef CONFIG_PARAVIRT
 void __init_or_module apply_paravirt(struct paravirt_patch_site *start,
diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S
index a9aedd6aa7f7..dad718ce805c 100644
--- a/arch/x86/lib/copy_user_64.S
+++ b/arch/x86/lib/copy_user_64.S
@@ -25,14 +25,13 @@
  */
 	.macro ALTERNATIVE_JUMP feature1,feature2,orig,alt1,alt2
 0:
-	.byte 0xe9	/* 32bit jump */
-	.long \orig-1f	/* by default jump to orig */
+	jmp \orig
 1:
 	.section .altinstr_replacement,"ax"
-2:	.byte 0xe9			/* near jump with 32bit immediate */
-	.long \alt1-1b /* offset */   /* or alternatively to alt1 */
-3:	.byte 0xe9			/* near jump with 32bit immediate */
-	.long \alt2-1b /* offset */   /* or alternatively to alt2 */
+2:
+	jmp \alt1
+3:
+	jmp \alt2
 	.previous
 
 	.section .altinstructions,"a"
-- 
cgit v1.2.3


From 4fd4b6e5537cec5b56db0b22546dd439ebb26830 Mon Sep 17 00:00:00 2001
From: Borislav Petkov
Date: Sat, 10 Jan 2015 20:34:07 +0100
Subject: x86/alternatives: Use optimized NOPs for padding

Alternatives allow now for an empty old instruction. In this case we go
and pad the space with NOPs at assembly time. However, there are the
optimal, longer NOPs which should be used. Do that at patching time by
adding alt_instr.padlen-sized NOPs at the old instruction address.

Cc: Andy Lutomirski <luto@amacapital.net>
Signed-off-by: Borislav Petkov <bp@suse.de>
---
 arch/x86/kernel/alternative.c | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 715af37bf008..af397cc98d05 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -323,6 +323,14 @@ done:
 		n_dspl, (unsigned long)orig_insn + n_dspl + repl_len);
 }
 
+static void __init_or_module optimize_nops(struct alt_instr *a, u8 *instr)
+{
+	add_nops(instr + (a->instrlen - a->padlen), a->padlen);
+
+	DUMP_BYTES(instr, a->instrlen, "%p: [%d:%d) optimized NOPs: ",
+		   instr, a->instrlen - a->padlen, a->padlen);
+}
+
 /*
  * Replace instructions with better alternatives for this CPU type. This runs
  * before SMP is initialized to avoid SMP problems with self modifying code.
@@ -354,8 +362,12 @@ void __init_or_module apply_alternatives(struct alt_instr *start,
 		replacement = (u8 *)&a->repl_offset + a->repl_offset;
 		BUG_ON(a->instrlen > sizeof(insnbuf));
 		BUG_ON(a->cpuid >= (NCAPINTS + NBUGINTS) * 32);
-		if (!boot_cpu_has(a->cpuid))
+		if (!boot_cpu_has(a->cpuid)) {
+			if (a->padlen > 1)
+				optimize_nops(a, instr);
+
 			continue;
+		}
 
 		DPRINTK("feat: %d*32+%d, old: (%p, len: %d), repl: (%p, len: %d)",
 			a->cpuid >> 5,
-- 
cgit v1.2.3


From 090a3f615524c3f75d09fdb37f15ea1868d79f7e Mon Sep 17 00:00:00 2001
From: Borislav Petkov
Date: Mon, 12 Jan 2015 18:19:40 +0100
Subject: x86/lib/copy_page_64.S: Use generic ALTERNATIVE macro

... instead of the semi-version with the spelled out sections.

What is more, make the REP_GOOD version be the default copy_page()
version as the majority of the relevant x86 CPUs do set
X86_FEATURE_REP_GOOD. Thus, copy_page gets compiled to:

  ffffffff8130af80 <copy_page>:
  ffffffff8130af80:       e9 0b 00 00 00          jmpq   ffffffff8130af90 <copy_page_regs>
  ffffffff8130af85:       b9 00 02 00 00          mov    $0x200,%ecx
  ffffffff8130af8a:       f3 48 a5                rep movsq %ds:(%rsi),%es:(%rdi)
  ffffffff8130af8d:       c3                      retq
  ffffffff8130af8e:       66 90                   xchg   %ax,%ax

  ffffffff8130af90 <copy_page_regs>:
  ...

and after the alternatives have run, the JMP to the old, unrolled
version gets NOPed out:

  ffffffff8130af80 <copy_page>:
  ffffffff8130af80:  66 66 90		xchg   %ax,%ax
  ffffffff8130af83:  66 90		xchg   %ax,%ax
  ffffffff8130af85:  b9 00 02 00 00	mov    $0x200,%ecx
  ffffffff8130af8a:  f3 48 a5		rep movsq %ds:(%rsi),%es:(%rdi)
  ffffffff8130af8d:  c3			retq

On modern uarches, those NOPs are cheaper than the unconditional JMP
previously.

Signed-off-by: Borislav Petkov <bp@suse.de>
---
 arch/x86/lib/copy_page_64.S | 37 ++++++++++++-------------------------
 1 file changed, 12 insertions(+), 25 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/lib/copy_page_64.S b/arch/x86/lib/copy_page_64.S
index f1ffdbb07755..8239dbcbf984 100644
--- a/arch/x86/lib/copy_page_64.S
+++ b/arch/x86/lib/copy_page_64.S
@@ -2,23 +2,26 @@
 
 #include <linux/linkage.h>
 #include <asm/dwarf2.h>
+#include <asm/cpufeature.h>
 #include <asm/alternative-asm.h>
 
+/*
+ * Some CPUs run faster using the string copy instructions (sane microcode).
+ * It is also a lot simpler. Use this when possible. But, don't use streaming
+ * copy unless the CPU indicates X86_FEATURE_REP_GOOD. Could vary the
+ * prefetch distance based on SMP/UP.
+ */
 	ALIGN
-copy_page_rep:
+ENTRY(copy_page)
 	CFI_STARTPROC
+	ALTERNATIVE "jmp copy_page_regs", "", X86_FEATURE_REP_GOOD
 	movl	$4096/8, %ecx
 	rep	movsq
 	ret
 	CFI_ENDPROC
-ENDPROC(copy_page_rep)
-
-/*
- *  Don't use streaming copy unless the CPU indicates X86_FEATURE_REP_GOOD.
- *  Could vary the prefetch distance based on SMP/UP.
-*/
+ENDPROC(copy_page)
 
-ENTRY(copy_page)
+ENTRY(copy_page_regs)
 	CFI_STARTPROC
 	subq	$2*8,	%rsp
 	CFI_ADJUST_CFA_OFFSET 2*8
@@ -90,21 +93,5 @@ ENTRY(copy_page)
 	addq	$2*8, %rsp
 	CFI_ADJUST_CFA_OFFSET -2*8
 	ret
-.Lcopy_page_end:
 	CFI_ENDPROC
-ENDPROC(copy_page)
-
-	/* Some CPUs run faster using the string copy instructions.
-	   It is also a lot simpler. Use this when possible */
-
-#include <asm/cpufeature.h>
-
-	.section .altinstr_replacement,"ax"
-1:	.byte 0xeb					/* jmp <disp8> */
-	.byte (copy_page_rep - copy_page) - (2f - 1b)	/* offset */
-2:
-	.previous
-	.section .altinstructions,"a"
-	altinstruction_entry copy_page, 1b, X86_FEATURE_REP_GOOD,	\
-		.Lcopy_page_end-copy_page, 2b-1b, 0
-	.previous
+ENDPROC(copy_page_regs)
-- 
cgit v1.2.3


From de2ff888843a22918ef15306922083739d23dad3 Mon Sep 17 00:00:00 2001
From: Borislav Petkov
Date: Tue, 13 Jan 2015 01:38:17 +0100
Subject: x86/lib/copy_user_64.S: Convert to ALTERNATIVE_2

Use the asm macro and drop the locally grown version.

Signed-off-by: Borislav Petkov <bp@suse.de>
---
 arch/x86/lib/copy_user_64.S | 40 ++++++++++------------------------------
 1 file changed, 10 insertions(+), 30 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S
index dad718ce805c..fa997dfaef24 100644
--- a/arch/x86/lib/copy_user_64.S
+++ b/arch/x86/lib/copy_user_64.S
@@ -16,30 +16,6 @@
 #include <asm/asm.h>
 #include <asm/smap.h>
 
-/*
- * By placing feature2 after feature1 in altinstructions section, we logically
- * implement:
- * If CPU has feature2, jmp to alt2 is used
- * else if CPU has feature1, jmp to alt1 is used
- * else jmp to orig is used.
- */
-	.macro ALTERNATIVE_JUMP feature1,feature2,orig,alt1,alt2
-0:
-	jmp \orig
-1:
-	.section .altinstr_replacement,"ax"
-2:
-	jmp \alt1
-3:
-	jmp \alt2
-	.previous
-
-	.section .altinstructions,"a"
-	altinstruction_entry 0b,2b,\feature1,5,5,0
-	altinstruction_entry 0b,3b,\feature2,5,5,0
-	.previous
-	.endm
-
 	.macro ALIGN_DESTINATION
 	/* check for bad alignment of destination */
 	movl %edi,%ecx
@@ -73,9 +49,11 @@ ENTRY(_copy_to_user)
 	jc bad_to_user
 	cmpq TI_addr_limit(%rax),%rcx
 	ja bad_to_user
-	ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,X86_FEATURE_ERMS,	\
-		copy_user_generic_unrolled,copy_user_generic_string,	\
-		copy_user_enhanced_fast_string
+	ALTERNATIVE_2 "jmp copy_user_generic_unrolled",		\
+		      "jmp copy_user_generic_string",		\
+		      X86_FEATURE_REP_GOOD,			\
+		      "jmp copy_user_enhanced_fast_string",	\
+		      X86_FEATURE_ERMS
 	CFI_ENDPROC
 ENDPROC(_copy_to_user)
 
@@ -88,9 +66,11 @@ ENTRY(_copy_from_user)
 	jc bad_from_user
 	cmpq TI_addr_limit(%rax),%rcx
 	ja bad_from_user
-	ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,X86_FEATURE_ERMS,	\
-		copy_user_generic_unrolled,copy_user_generic_string,	\
-		copy_user_enhanced_fast_string
+	ALTERNATIVE_2 "jmp copy_user_generic_unrolled",		\
+		      "jmp copy_user_generic_string",		\
+		      X86_FEATURE_REP_GOOD,			\
+		      "jmp copy_user_enhanced_fast_string",	\
+		      X86_FEATURE_ERMS
 	CFI_ENDPROC
 ENDPROC(_copy_from_user)
 
-- 
cgit v1.2.3


From 669f8a900198599d3c2e2e463bafe12d30d96507 Mon Sep 17 00:00:00 2001
From: Borislav Petkov
Date: Thu, 15 Jan 2015 09:17:12 +0100
Subject: x86/smap: Use ALTERNATIVE macro

... and drop unfolded version. No need for ASM_NOP3 anymore either as
the alternatives do the proper padding at build time and insert proper
NOPs at boot time.

There should be no apparent operational change from this patch.

Signed-off-by: Borislav Petkov <bp@suse.de>
---
 arch/x86/include/asm/smap.h | 30 +++++++++---------------------
 1 file changed, 9 insertions(+), 21 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/smap.h b/arch/x86/include/asm/smap.h
index c56cb4f37be9..ba665ebd17bb 100644
--- a/arch/x86/include/asm/smap.h
+++ b/arch/x86/include/asm/smap.h
@@ -27,23 +27,11 @@
 
 #ifdef CONFIG_X86_SMAP
 
-#define ASM_CLAC							\
-	661: ASM_NOP3 ;							\
-	.pushsection .altinstr_replacement, "ax" ;			\
-	662: __ASM_CLAC ;						\
-	.popsection ;							\
-	.pushsection .altinstructions, "a" ;				\
-	altinstruction_entry 661b, 662b, X86_FEATURE_SMAP, 3, 3, 0 ;	\
-	.popsection
-
-#define ASM_STAC							\
-	661: ASM_NOP3 ;							\
-	.pushsection .altinstr_replacement, "ax" ;			\
-	662: __ASM_STAC ;						\
-	.popsection ;							\
-	.pushsection .altinstructions, "a" ;				\
-	altinstruction_entry 661b, 662b, X86_FEATURE_SMAP, 3, 3, 0 ;	\
-	.popsection
+#define ASM_CLAC \
+	ALTERNATIVE "", __stringify(__ASM_CLAC), X86_FEATURE_SMAP
+
+#define ASM_STAC \
+	ALTERNATIVE "", __stringify(__ASM_STAC), X86_FEATURE_SMAP
 
 #else /* CONFIG_X86_SMAP */
 
@@ -61,20 +49,20 @@
 static __always_inline void clac(void)
 {
 	/* Note: a barrier is implicit in alternative() */
-	alternative(ASM_NOP3, __stringify(__ASM_CLAC), X86_FEATURE_SMAP);
+	alternative("", __stringify(__ASM_CLAC), X86_FEATURE_SMAP);
 }
 
 static __always_inline void stac(void)
 {
 	/* Note: a barrier is implicit in alternative() */
-	alternative(ASM_NOP3, __stringify(__ASM_STAC), X86_FEATURE_SMAP);
+	alternative("", __stringify(__ASM_STAC), X86_FEATURE_SMAP);
 }
 
 /* These macros can be used in asm() statements */
 #define ASM_CLAC \
-	ALTERNATIVE(ASM_NOP3, __stringify(__ASM_CLAC), X86_FEATURE_SMAP)
+	ALTERNATIVE("", __stringify(__ASM_CLAC), X86_FEATURE_SMAP)
 #define ASM_STAC \
-	ALTERNATIVE(ASM_NOP3, __stringify(__ASM_STAC), X86_FEATURE_SMAP)
+	ALTERNATIVE("", __stringify(__ASM_STAC), X86_FEATURE_SMAP)
 
 #else /* CONFIG_X86_SMAP */
 
-- 
cgit v1.2.3


From 8e65f6e03a90927b8de16c15da976baa6c3fff69 Mon Sep 17 00:00:00 2001
From: Borislav Petkov
Date: Sun, 18 Jan 2015 12:35:55 +0100
Subject: x86/entry_32: Convert X86_INVD_BUG to ALTERNATIVE macro

Booting a 486 kernel on an AMD guest with this patch applied, says:

  apply_alternatives: feat: 0*32+25, old: (c160a475, len: 5), repl: (c19557d4, len: 5)
  c160a475: alt_insn: 68 10 35 00 c1
  c19557d4: rpl_insn: 68 80 39 00 c1

which is:

  old insn VA: 0xc160a475, CPU feat: X86_FEATURE_XMM, size: 5
  simd_coprocessor_error:
           c160a475:      68 10 35 00 c1          push $0xc1003510 <do_general_protection>
  repl insn: 0xc19557d4, size: 5
           c160a475:      68 80 39 00 c1          push $0xc1003980 <do_simd_coprocessor_error>

Signed-off-by: Borislav Petkov <bp@suse.de>
---
 arch/x86/kernel/entry_32.S | 12 +++---------
 1 file changed, 3 insertions(+), 9 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index d23c9a1d40e1..b910577d1ff9 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -816,15 +816,9 @@ ENTRY(simd_coprocessor_error)
 	pushl_cfi $0
 #ifdef CONFIG_X86_INVD_BUG
 	/* AMD 486 bug: invd from userspace calls exception 19 instead of #GP */
-661:	pushl_cfi $do_general_protection
-662:
-.section .altinstructions,"a"
-	altinstruction_entry 661b, 663f, X86_FEATURE_XMM, 662b-661b, 664f-663f, 0
-.previous
-.section .altinstr_replacement,"ax"
-663:	pushl $do_simd_coprocessor_error
-664:
-.previous
+	ALTERNATIVE "pushl_cfi $do_general_protection",	\
+		    "pushl $do_simd_coprocessor_error", \
+		    X86_FEATURE_XMM
 #else
 	pushl_cfi $do_simd_coprocessor_error
 #endif
-- 
cgit v1.2.3


From 6620ef28c812b4782b10adb676580c2847d2bec8 Mon Sep 17 00:00:00 2001
From: Borislav Petkov
Date: Sun, 18 Jan 2015 12:57:41 +0100
Subject: x86/lib/clear_page_64.S: Convert to ALTERNATIVE_2 macro

Move clear_page() up so that we can get 2-byte forward JMPs when
patching:

  apply_alternatives: feat: 3*32+16, old: (ffffffff8130adb0, len: 5), repl: (ffffffff81d0b859, len: 5)
  ffffffff8130adb0: alt_insn: 90 90 90 90 90
  recompute_jump: new_displ: 0x0000003e
  ffffffff81d0b859: rpl_insn: eb 3e 66 66 90

even though the compiler generated 5-byte JMPs which we padded with 5
NOPs.

Also, make the REP_GOOD version be the default as the majority of
machines set REP_GOOD. This way we get to save ourselves the JMP:

  old insn VA: 0xffffffff813038b0, CPU feat: X86_FEATURE_REP_GOOD, size: 5, padlen: 0
  clear_page:

  ffffffff813038b0 <clear_page>:
  ffffffff813038b0:       e9 0b 00 00 00          jmpq ffffffff813038c0
  repl insn: 0xffffffff81cf0e92, size: 0

  old insn VA: 0xffffffff813038b0, CPU feat: X86_FEATURE_ERMS, size: 5, padlen: 0
  clear_page:

  ffffffff813038b0 <clear_page>:
  ffffffff813038b0:       e9 0b 00 00 00          jmpq ffffffff813038c0
  repl insn: 0xffffffff81cf0e92, size: 5
   ffffffff81cf0e92:      e9 69 2a 61 ff          jmpq ffffffff81303900

  ffffffff813038b0 <clear_page>:
  ffffffff813038b0:       e9 69 2a 61 ff          jmpq ffffffff8091631e

Signed-off-by: Borislav Petkov <bp@suse.de>
---
 arch/x86/lib/clear_page_64.S | 66 ++++++++++++++++++--------------------------
 1 file changed, 27 insertions(+), 39 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S
index 38e57faefd71..e67e579c93bd 100644
--- a/arch/x86/lib/clear_page_64.S
+++ b/arch/x86/lib/clear_page_64.S
@@ -1,31 +1,35 @@
 #include <linux/linkage.h>
 #include <asm/dwarf2.h>
+#include <asm/cpufeature.h>
 #include <asm/alternative-asm.h>
 
 /*
- * Zero a page. 	
- * rdi	page
- */			
-ENTRY(clear_page_c)
+ * Most CPUs support enhanced REP MOVSB/STOSB instructions. It is
+ * recommended to use this when possible and we do use them by default.
+ * If enhanced REP MOVSB/STOSB is not available, try to use fast string.
+ * Otherwise, use original.
+ */
+
+/*
+ * Zero a page.
+ * %rdi	- page
+ */
+ENTRY(clear_page)
 	CFI_STARTPROC
+
+	ALTERNATIVE_2 "jmp clear_page_orig", "", X86_FEATURE_REP_GOOD, \
+		      "jmp clear_page_c_e", X86_FEATURE_ERMS
+
 	movl $4096/8,%ecx
 	xorl %eax,%eax
 	rep stosq
 	ret
 	CFI_ENDPROC
-ENDPROC(clear_page_c)
+ENDPROC(clear_page)
 
-ENTRY(clear_page_c_e)
+ENTRY(clear_page_orig)
 	CFI_STARTPROC
-	movl $4096,%ecx
-	xorl %eax,%eax
-	rep stosb
-	ret
-	CFI_ENDPROC
-ENDPROC(clear_page_c_e)
 
-ENTRY(clear_page)
-	CFI_STARTPROC
 	xorl   %eax,%eax
 	movl   $4096/64,%ecx
 	.p2align 4
@@ -45,29 +49,13 @@ ENTRY(clear_page)
 	nop
 	ret
 	CFI_ENDPROC
-.Lclear_page_end:
-ENDPROC(clear_page)
-
-	/*
-	 * Some CPUs support enhanced REP MOVSB/STOSB instructions.
-	 * It is recommended to use this when possible.
-	 * If enhanced REP MOVSB/STOSB is not available, try to use fast string.
-	 * Otherwise, use original function.
-	 *
-	 */
+ENDPROC(clear_page_orig)
 
-#include <asm/cpufeature.h>
-
-	.section .altinstr_replacement,"ax"
-1:	.byte 0xeb					/* jmp <disp8> */
-	.byte (clear_page_c - clear_page) - (2f - 1b)	/* offset */
-2:	.byte 0xeb					/* jmp <disp8> */
-	.byte (clear_page_c_e - clear_page) - (3f - 2b)	/* offset */
-3:
-	.previous
-	.section .altinstructions,"a"
-	altinstruction_entry clear_page,1b,X86_FEATURE_REP_GOOD,\
-			     .Lclear_page_end-clear_page, 2b-1b, 0
-	altinstruction_entry clear_page,2b,X86_FEATURE_ERMS,   \
-			     .Lclear_page_end-clear_page,3b-2b, 0
-	.previous
+ENTRY(clear_page_c_e)
+	CFI_STARTPROC
+	movl $4096,%ecx
+	xorl %eax,%eax
+	rep stosb
+	ret
+	CFI_ENDPROC
+ENDPROC(clear_page_c_e)
-- 
cgit v1.2.3


From c70e1b475f37f07ab7181ad28458666d59aae634 Mon Sep 17 00:00:00 2001
From: Borislav Petkov
Date: Sun, 18 Jan 2015 15:19:55 +0100
Subject: x86/asm: Use alternative_2() in rdtsc_barrier()

... now that we have it.

Acked-by: Andy Lutomirski <luto@amacapital.net>
Cc: Richard Weinberger <richard@nod.at>
Signed-off-by: Borislav Petkov <bp@suse.de>
---
 arch/x86/include/asm/barrier.h | 6 ++----
 arch/x86/um/asm/barrier.h      | 4 ++--
 2 files changed, 4 insertions(+), 6 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h
index 2ab1eb33106e..959e45b81fe2 100644
--- a/arch/x86/include/asm/barrier.h
+++ b/arch/x86/include/asm/barrier.h
@@ -95,13 +95,11 @@ do {									\
  * Stop RDTSC speculation. This is needed when you need to use RDTSC
  * (or get_cycles or vread that possibly accesses the TSC) in a defined
  * code region.
- *
- * (Could use an alternative three way for this if there was one.)
  */
 static __always_inline void rdtsc_barrier(void)
 {
-	alternative(ASM_NOP3, "mfence", X86_FEATURE_MFENCE_RDTSC);
-	alternative(ASM_NOP3, "lfence", X86_FEATURE_LFENCE_RDTSC);
+	alternative_2("", "mfence", X86_FEATURE_MFENCE_RDTSC,
+			  "lfence", X86_FEATURE_LFENCE_RDTSC);
 }
 
 #endif /* _ASM_X86_BARRIER_H */
diff --git a/arch/x86/um/asm/barrier.h b/arch/x86/um/asm/barrier.h
index 2d7d9a1f5b53..8ffd2146fa6a 100644
--- a/arch/x86/um/asm/barrier.h
+++ b/arch/x86/um/asm/barrier.h
@@ -64,8 +64,8 @@
  */
 static inline void rdtsc_barrier(void)
 {
-	alternative(ASM_NOP3, "mfence", X86_FEATURE_MFENCE_RDTSC);
-	alternative(ASM_NOP3, "lfence", X86_FEATURE_LFENCE_RDTSC);
+	alternative_2("", "mfence", X86_FEATURE_MFENCE_RDTSC,
+			  "lfence", X86_FEATURE_LFENCE_RDTSC);
 }
 
 #endif
-- 
cgit v1.2.3


From a930dc4543a2b213deb9fde12682716edff8a4a6 Mon Sep 17 00:00:00 2001
From: Borislav Petkov
Date: Sun, 18 Jan 2015 17:48:18 +0100
Subject: x86/asm: Cleanup prefetch primitives

This is based on a patch originally by hpa.

With the current improvements to the alternatives, we can simply use %P1
as a mem8 operand constraint and rely on the toolchain to generate the
proper instruction sizes. For example, on 32-bit, where we use an empty
old instruction we get:

  apply_alternatives: feat: 6*32+8, old: (c104648b, len: 4), repl: (c195566c, len: 4)
  c104648b: alt_insn: 90 90 90 90
  c195566c: rpl_insn: 0f 0d 4b 5c

  ...

  apply_alternatives: feat: 6*32+8, old: (c18e09b4, len: 3), repl: (c1955948, len: 3)
  c18e09b4: alt_insn: 90 90 90
  c1955948: rpl_insn: 0f 0d 08

  ...

  apply_alternatives: feat: 6*32+8, old: (c1190cf9, len: 7), repl: (c1955a79, len: 7)
  c1190cf9: alt_insn: 90 90 90 90 90 90 90
  c1955a79: rpl_insn: 0f 0d 0d a0 d4 85 c1

all with the proper padding done depending on the size of the
replacement instruction the compiler generates.

Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: H. Peter Anvin <hpa@linux.intel.com>
---
 arch/x86/include/asm/apic.h      |  2 +-
 arch/x86/include/asm/processor.h | 16 +++++++---------
 arch/x86/kernel/cpu/amd.c        |  5 +++++
 3 files changed, 13 insertions(+), 10 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index efc3b22d896e..8118e94d50ab 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -91,7 +91,7 @@ static inline void native_apic_mem_write(u32 reg, u32 v)
 {
 	volatile u32 *addr = (volatile u32 *)(APIC_BASE + reg);
 
-	alternative_io("movl %0, %1", "xchgl %0, %1", X86_BUG_11AP,
+	alternative_io("movl %0, %P1", "xchgl %0, %P1", X86_BUG_11AP,
 		       ASM_OUTPUT2("=r" (v), "=m" (*addr)),
 		       ASM_OUTPUT2("0" (v), "m" (*addr)));
 }
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index ec1c93588cef..7be2c9a6caba 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -761,10 +761,10 @@ extern char			ignore_fpu_irq;
 #define ARCH_HAS_SPINLOCK_PREFETCH
 
 #ifdef CONFIG_X86_32
-# define BASE_PREFETCH		ASM_NOP4
+# define BASE_PREFETCH		""
 # define ARCH_HAS_PREFETCH
 #else
-# define BASE_PREFETCH		"prefetcht0 (%1)"
+# define BASE_PREFETCH		"prefetcht0 %P1"
 #endif
 
 /*
@@ -775,10 +775,9 @@ extern char			ignore_fpu_irq;
  */
 static inline void prefetch(const void *x)
 {
-	alternative_input(BASE_PREFETCH,
-			  "prefetchnta (%1)",
+	alternative_input(BASE_PREFETCH, "prefetchnta %P1",
 			  X86_FEATURE_XMM,
-			  "r" (x));
+			  "m" (*(const char *)x));
 }
 
 /*
@@ -788,10 +787,9 @@ static inline void prefetch(const void *x)
  */
 static inline void prefetchw(const void *x)
 {
-	alternative_input(BASE_PREFETCH,
-			  "prefetchw (%1)",
-			  X86_FEATURE_3DNOW,
-			  "r" (x));
+	alternative_input(BASE_PREFETCH, "prefetchw %P1",
+			  X86_FEATURE_3DNOWPREFETCH,
+			  "m" (*(const char *)x));
 }
 
 static inline void spin_lock_prefetch(const void *x)
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index a220239cea65..dd9e50500297 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -711,6 +711,11 @@ static void init_amd(struct cpuinfo_x86 *c)
 		set_cpu_bug(c, X86_BUG_AMD_APIC_C1E);
 
 	rdmsr_safe(MSR_AMD64_PATCH_LEVEL, &c->microcode, &dummy);
+
+	/* 3DNow or LM implies PREFETCHW */
+	if (!cpu_has(c, X86_FEATURE_3DNOWPREFETCH))
+		if (cpu_has(c, X86_FEATURE_3DNOW) || cpu_has(c, X86_FEATURE_LM))
+			set_cpu_cap(c, X86_FEATURE_3DNOWPREFETCH);
 }
 
 #ifdef CONFIG_X86_32
-- 
cgit v1.2.3


From 84d95ad4cb9015ea953bf14cea05ba371d4d42bb Mon Sep 17 00:00:00 2001
From: Borislav Petkov
Date: Wed, 4 Feb 2015 08:57:00 +0100
Subject: x86/lib/memset_64.S: Convert to ALTERNATIVE_2 macro

Make alternatives replace single JMPs instead of whole memset functions,
thus decreasing the amount of instructions copied during patching time
at boot.

While at it, make it use the REP_GOOD version by default which means
alternatives NOP out the JMP to the other versions, as REP_GOOD is set
by default on the majority of relevant x86 processors.

Signed-off-by: Borislav Petkov <bp@suse.de>
---
 arch/x86/lib/memset_64.S | 61 +++++++++++++++++++-----------------------------
 1 file changed, 24 insertions(+), 37 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S
index f6153c1cdddc..93118fb23976 100644
--- a/arch/x86/lib/memset_64.S
+++ b/arch/x86/lib/memset_64.S
@@ -5,19 +5,30 @@
 #include <asm/cpufeature.h>
 #include <asm/alternative-asm.h>
 
+.weak memset
+
 /*
  * ISO C memset - set a memory block to a byte value. This function uses fast
  * string to get better performance than the original function. The code is
  * simpler and shorter than the orignal function as well.
- *	
+ *
  * rdi   destination
- * rsi   value (char) 
- * rdx   count (bytes) 
- * 
+ * rsi   value (char)
+ * rdx   count (bytes)
+ *
  * rax   original destination
- */	
-	.section .altinstr_replacement, "ax", @progbits
-.Lmemset_c:
+ */
+ENTRY(memset)
+ENTRY(__memset)
+	/*
+	 * Some CPUs support enhanced REP MOVSB/STOSB feature. It is recommended
+	 * to use it when possible. If not available, use fast string instructions.
+	 *
+	 * Otherwise, use original memset function.
+	 */
+	ALTERNATIVE_2 "jmp memset_orig", "", X86_FEATURE_REP_GOOD, \
+		      "jmp memset_erms", X86_FEATURE_ERMS
+
 	movq %rdi,%r9
 	movq %rdx,%rcx
 	andl $7,%edx
@@ -31,8 +42,8 @@
 	rep stosb
 	movq %r9,%rax
 	ret
-.Lmemset_e:
-	.previous
+ENDPROC(memset)
+ENDPROC(__memset)
 
 /*
  * ISO C memset - set a memory block to a byte value. This function uses
@@ -45,21 +56,16 @@
  *
  * rax   original destination
  */
-	.section .altinstr_replacement, "ax", @progbits
-.Lmemset_c_e:
+ENTRY(memset_erms)
 	movq %rdi,%r9
 	movb %sil,%al
 	movq %rdx,%rcx
 	rep stosb
 	movq %r9,%rax
 	ret
-.Lmemset_e_e:
-	.previous
-
-.weak memset
+ENDPROC(memset_erms)
 
-ENTRY(memset)
-ENTRY(__memset)
+ENTRY(memset_orig)
 	CFI_STARTPROC
 	movq %rdi,%r10
 
@@ -134,23 +140,4 @@ ENTRY(__memset)
 	jmp .Lafter_bad_alignment
 .Lfinal:
 	CFI_ENDPROC
-ENDPROC(memset)
-ENDPROC(__memset)
-
-	/* Some CPUs support enhanced REP MOVSB/STOSB feature.
-	 * It is recommended to use this when possible.
-	 *
-	 * If enhanced REP MOVSB/STOSB feature is not available, use fast string
-	 * instructions.
-	 *
-	 * Otherwise, use original memset function.
-	 *
-	 * In .altinstructions section, ERMS feature is placed after REG_GOOD
-         * feature to implement the right patch order.
-	 */
-	.section .altinstructions,"a"
-	altinstruction_entry __memset,.Lmemset_c,X86_FEATURE_REP_GOOD,\
-			     .Lfinal-__memset,.Lmemset_e-.Lmemset_c,0
-	altinstruction_entry __memset,.Lmemset_c_e,X86_FEATURE_ERMS, \
-			     .Lfinal-__memset,.Lmemset_e_e-.Lmemset_c_e,0
-	.previous
+ENDPROC(memset_orig)
-- 
cgit v1.2.3


From a77600cd03a44e54df99ea742e02f4899b82c8e1 Mon Sep 17 00:00:00 2001
From: Borislav Petkov
Date: Wed, 4 Feb 2015 09:11:17 +0100
Subject: x86/lib/memmove_64.S: Convert memmove() to ALTERNATIVE macro

Make it execute the ERMS version if support is present and we're in the
forward memmove() part and remove the unfolded alternatives section
definition.

Signed-off-by: Borislav Petkov <bp@suse.de>
---
 arch/x86/lib/memmove_64.S | 19 ++-----------------
 1 file changed, 2 insertions(+), 17 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/lib/memmove_64.S b/arch/x86/lib/memmove_64.S
index bbfa6b269ece..0f8a0d0331b9 100644
--- a/arch/x86/lib/memmove_64.S
+++ b/arch/x86/lib/memmove_64.S
@@ -5,7 +5,6 @@
  * This assembly file is re-written from memmove_64.c file.
  *	- Copyright 2011 Fenghua Yu <fenghua.yu@intel.com>
  */
-#define _STRING_C
 #include <linux/linkage.h>
 #include <asm/dwarf2.h>
 #include <asm/cpufeature.h>
@@ -44,6 +43,8 @@ ENTRY(__memmove)
 	jg 2f
 
 .Lmemmove_begin_forward:
+	ALTERNATIVE "", "movq %rdx, %rcx; rep movsb; retq", X86_FEATURE_ERMS
+
 	/*
 	 * movsq instruction have many startup latency
 	 * so we handle small size by general register.
@@ -207,21 +208,5 @@ ENTRY(__memmove)
 13:
 	retq
 	CFI_ENDPROC
-
-	.section .altinstr_replacement,"ax"
-.Lmemmove_begin_forward_efs:
-	/* Forward moving data. */
-	movq %rdx, %rcx
-	rep movsb
-	retq
-.Lmemmove_end_forward_efs:
-	.previous
-
-	.section .altinstructions,"a"
-	altinstruction_entry .Lmemmove_begin_forward,		\
-		.Lmemmove_begin_forward_efs,X86_FEATURE_ERMS,	\
-		.Lmemmove_end_forward-.Lmemmove_begin_forward,	\
-		.Lmemmove_end_forward_efs-.Lmemmove_begin_forward_efs,0
-	.previous
 ENDPROC(__memmove)
 ENDPROC(memmove)
-- 
cgit v1.2.3


From e0bc8d179e39a31bb3a56974374e55374fbf29be Mon Sep 17 00:00:00 2001
From: Borislav Petkov
Date: Wed, 4 Feb 2015 15:36:49 +0100
Subject: x86/lib/memcpy_64.S: Convert memcpy to ALTERNATIVE_2 macro

Make REP_GOOD variant the default after alternatives have run.

Signed-off-by: Borislav Petkov <bp@suse.de>
---
 arch/x86/lib/memcpy_64.S | 68 +++++++++++++++---------------------------------
 1 file changed, 21 insertions(+), 47 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index bbfdacc01760..b046664f5a1c 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -1,11 +1,19 @@
 /* Copyright 2002 Andi Kleen */
 
 #include <linux/linkage.h>
-
 #include <asm/cpufeature.h>
 #include <asm/dwarf2.h>
 #include <asm/alternative-asm.h>
 
+/*
+ * We build a jump to memcpy_orig by default which gets NOPped out on
+ * the majority of x86 CPUs which set REP_GOOD. In addition, CPUs which
+ * have the enhanced REP MOVSB/STOSB feature (ERMS), change those NOPs
+ * to a jmp to memcpy_erms which does the REP; MOVSB mem copy.
+ */
+
+.weak memcpy
+
 /*
  * memcpy - Copy a memory block.
  *
@@ -17,15 +25,11 @@
  * Output:
  * rax original destination
  */
+ENTRY(__memcpy)
+ENTRY(memcpy)
+	ALTERNATIVE_2 "jmp memcpy_orig", "", X86_FEATURE_REP_GOOD, \
+		      "jmp memcpy_erms", X86_FEATURE_ERMS
 
-/*
- * memcpy_c() - fast string ops (REP MOVSQ) based variant.
- *
- * This gets patched over the unrolled variant (below) via the
- * alternative instructions framework:
- */
-	.section .altinstr_replacement, "ax", @progbits
-.Lmemcpy_c:
 	movq %rdi, %rax
 	movq %rdx, %rcx
 	shrq $3, %rcx
@@ -34,29 +38,21 @@
 	movl %edx, %ecx
 	rep movsb
 	ret
-.Lmemcpy_e:
-	.previous
+ENDPROC(memcpy)
+ENDPROC(__memcpy)
 
 /*
- * memcpy_c_e() - enhanced fast string memcpy. This is faster and simpler than
- * memcpy_c. Use memcpy_c_e when possible.
- *
- * This gets patched over the unrolled variant (below) via the
- * alternative instructions framework:
+ * memcpy_erms() - enhanced fast string memcpy. This is faster and
+ * simpler than memcpy. Use memcpy_erms when possible.
  */
-	.section .altinstr_replacement, "ax", @progbits
-.Lmemcpy_c_e:
+ENTRY(memcpy_erms)
 	movq %rdi, %rax
 	movq %rdx, %rcx
 	rep movsb
 	ret
-.Lmemcpy_e_e:
-	.previous
-
-.weak memcpy
+ENDPROC(memcpy_erms)
 
-ENTRY(__memcpy)
-ENTRY(memcpy)
+ENTRY(memcpy_orig)
 	CFI_STARTPROC
 	movq %rdi, %rax
 
@@ -183,26 +179,4 @@ ENTRY(memcpy)
 .Lend:
 	retq
 	CFI_ENDPROC
-ENDPROC(memcpy)
-ENDPROC(__memcpy)
-
-	/*
-	 * Some CPUs are adding enhanced REP MOVSB/STOSB feature
-	 * If the feature is supported, memcpy_c_e() is the first choice.
-	 * If enhanced rep movsb copy is not available, use fast string copy
-	 * memcpy_c() when possible. This is faster and code is simpler than
-	 * original memcpy().
-	 * Otherwise, original memcpy() is used.
-	 * In .altinstructions section, ERMS feature is placed after REG_GOOD
-         * feature to implement the right patch order.
-	 *
-	 * Replace only beginning, memcpy is used to apply alternatives,
-	 * so it is silly to overwrite itself with nops - reboot is the
-	 * only outcome...
-	 */
-	.section .altinstructions, "a"
-	altinstruction_entry __memcpy,.Lmemcpy_c,X86_FEATURE_REP_GOOD,\
-			     .Lmemcpy_e-.Lmemcpy_c,.Lmemcpy_e-.Lmemcpy_c,0
-	altinstruction_entry __memcpy,.Lmemcpy_c_e,X86_FEATURE_ERMS, \
-			     .Lmemcpy_e_e-.Lmemcpy_c_e,.Lmemcpy_e_e-.Lmemcpy_c_e,0
-	.previous
+ENDPROC(memcpy_orig)
-- 
cgit v1.2.3


From 7aeccb83e76316b365e4b44a1dd982ee22a7d8b2 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov
Date: Mon, 19 Jan 2015 19:51:32 +0100
Subject: x86/fpu: __kernel_fpu_begin() should clear fpu_owner_task even if
 use_eager_fpu()

__kernel_fpu_begin() does nothing if !__thread_has_fpu() && use_eager_fpu(),
perhaps it assumes that this case is simply impossible. This is certainly
not possible if in_interrupt() == T; interrupted_user_mode() should have
FPU, and interrupted_kernel_fpu_idle() should fail if !__thread_has_fpu().

However, even if use_eager_fpu() == T a task can do drop_fpu(), then switch
to another thread which becomes fpu_owner_task, then resume and call some
function which does kernel_fpu_begin(). Say, an exiting task does a lot of
things after exit_thread(), it is not safe to assume that it can't use FPU
in these paths.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Reviewed-by: Rik van Riel <riel@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Suresh Siddha <sbsiddha@gmail.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Pekka Riikonen <priikone@iki.fi>
Link: http://lkml.kernel.org/r/20150119185132.GB16427@redhat.com
Signed-off-by: Borislav Petkov <bp@suse.de>
---
 arch/x86/kernel/i387.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index f59d80622e60..ad3a2a23f248 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -93,9 +93,10 @@ void __kernel_fpu_begin(void)
 
 	if (__thread_has_fpu(me)) {
 		__save_init_fpu(me);
-	} else if (!use_eager_fpu()) {
+	} else {
 		this_cpu_write(fpu_owner_task, NULL);
-		clts();
+		if (!use_eager_fpu())
+			clts();
 	}
 }
 EXPORT_SYMBOL(__kernel_fpu_begin);
-- 
cgit v1.2.3


From 4b2e762e2e53c721458a83d547b222178bb72a34 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov
Date: Mon, 19 Jan 2015 19:51:51 +0100
Subject: x86/fpu: Always allow FPU in interrupt if use_eager_fpu()

The __thread_has_fpu() check in interrupted_kernel_fpu_idle() was needed
to prevent the nested kernel_fpu_begin(). Now that we have in_kernel_fpu
and !__thread_has_fpu() case in __kernel_fpu_begin() does not depend on
use_eager_fpu() (except clts) we can remove it.

__thread_has_fpu() can be false even if use_eager_fpu(), but this case
does not differ from !use_eager_fpu() case except we should not worry
about X86_CR0_TS, __kernel_fpu_begin()/end() will not touch this bit.

Note: I think we can kill all irq_fpu_usable() checks except in_kernel_fpu,
just we need to record the state of X86_CR0_TS in __kernel_fpu_begin() and
conditionalize stts() in __kernel_fpu_end(), but this needs another patch.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Reviewed-by: Rik van Riel <riel@redhat.com>
Acked-by: Andy Lutomirski <luto@amacapital.net>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Suresh Siddha <sbsiddha@gmail.com>
Link: http://lkml.kernel.org/r/20150119185151.GC16427@redhat.com
Signed-off-by: Borislav Petkov <bp@suse.de>
---
 arch/x86/kernel/i387.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index ad3a2a23f248..8416b5f85806 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -41,8 +41,8 @@ void kernel_fpu_enable(void)
  * be set (so that the clts/stts pair does nothing that is
  * visible in the interrupted kernel thread).
  *
- * Except for the eagerfpu case when we return 1 unless we've already
- * been eager and saved the state in kernel_fpu_begin().
+ * Except for the eagerfpu case when we return true; in the likely case
+ * the thread has FPU but we are not going to set/clear TS.
  */
 static inline bool interrupted_kernel_fpu_idle(void)
 {
@@ -50,7 +50,7 @@ static inline bool interrupted_kernel_fpu_idle(void)
 		return false;
 
 	if (use_eager_fpu())
-		return __thread_has_fpu(current);
+		return true;
 
 	return !__thread_has_fpu(current) &&
 		(read_cr0() & X86_CR0_TS);
-- 
cgit v1.2.3


From 110d7f7513bbb916b8654da9e2973ac5bed929a9 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov
Date: Mon, 19 Jan 2015 19:52:12 +0100
Subject: x86/fpu: Don't abuse FPU in kernel threads if use_eager_fpu()

AFAICS, there is no reason why kernel threads should have FPU context
even if use_eager_fpu() == T. Now that interrupted_kernel_fpu_idle()
does not check __thread_has_fpu() in the use_eager_fpu() case, we
can remove the init_fpu() code from eager_fpu_init() and change
flush_thread() called by do_execve() to initialize FPU.

Note: of course, the change in flush_thread() is horrible and must be
cleanuped. We need the new helper, and flush_thread() should return the
error if init_fpu() fails.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Reviewed-by: Rik van Riel <riel@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Suresh Siddha <sbsiddha@gmail.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Link: http://lkml.kernel.org/r/20150119185212.GD16427@redhat.com
Signed-off-by: Borislav Petkov <bp@suse.de>
---
 arch/x86/kernel/process.c |  7 +++++++
 arch/x86/kernel/xsave.c   | 13 +------------
 2 files changed, 8 insertions(+), 12 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index ce8b10351e28..83480373a642 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -130,6 +130,7 @@ void flush_thread(void)
 
 	flush_ptrace_hw_breakpoint(tsk);
 	memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
+
 	drop_init_fpu(tsk);
 	/*
 	 * Free the FPU state for non xsave platforms. They get reallocated
@@ -137,6 +138,12 @@ void flush_thread(void)
 	 */
 	if (!use_eager_fpu())
 		free_thread_xstate(tsk);
+	else if (!used_math()) {
+		/* kthread execs. TODO: cleanup this horror. */
+		if (WARN_ON(init_fpu(current)))
+			force_sig(SIGKILL, current);
+		math_state_restore();
+	}
 }
 
 static void hard_disable_TSC(void)
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
index 0de1fae2bdf0..de9dcf89a302 100644
--- a/arch/x86/kernel/xsave.c
+++ b/arch/x86/kernel/xsave.c
@@ -688,7 +688,7 @@ void eager_fpu_init(void)
 {
 	static __refdata void (*boot_func)(void) = eager_fpu_init_bp;
 
-	clear_used_math();
+	WARN_ON(used_math());
 	current_thread_info()->status = 0;
 
 	if (eagerfpu == ENABLE)
@@ -703,17 +703,6 @@ void eager_fpu_init(void)
 		boot_func();
 		boot_func = NULL;
 	}
-
-	/*
-	 * This is same as math_state_restore(). But use_xsave() is
-	 * not yet patched to use math_state_restore().
-	 */
-	init_fpu(current);
-	__thread_fpu_begin(current);
-	if (cpu_has_xsave)
-		xrstor_state(init_xstate_buf, -1);
-	else
-		fxrstor_checking(&init_xstate_buf->i387);
 }
 
 /*
-- 
cgit v1.2.3


From 31795b470b0872b66f7fa26f791b695c79821220 Mon Sep 17 00:00:00 2001
From: Boris Ostrovsky
Date: Wed, 11 Feb 2015 14:39:18 -0500
Subject: x86/xen: Make sure X2APIC_ENABLE bit of MSR_IA32_APICBASE is not set

Commit d524165cb8db ("x86/apic: Check x2apic early") tests X2APIC_ENABLE
bit of MSR_IA32_APICBASE when CONFIG_X86_X2APIC is off and panics
the kernel when this bit is set.

Xen's PV guests will pass this MSR read to the hypervisor which will
return its version of the MSR, where this bit might be set. Make sure
we clear it before returning MSR value to the caller.

Signed-off-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Signed-off-by: David Vrabel <david.vrabel@citrix.com>
---
 arch/x86/xen/enlighten.c | 19 ++++++++++++++++++-
 1 file changed, 18 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index bd8b8459c3d0..efee14db009b 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -1070,6 +1070,23 @@ static inline void xen_write_cr8(unsigned long val)
 	BUG_ON(val);
 }
 #endif
+
+static u64 xen_read_msr_safe(unsigned int msr, int *err)
+{
+	u64 val;
+
+	val = native_read_msr_safe(msr, err);
+	switch (msr) {
+	case MSR_IA32_APICBASE:
+#ifdef CONFIG_X86_X2APIC
+		if (!(cpuid_ecx(1) & (1 << (X86_FEATURE_X2APIC & 31))))
+#endif
+			val &= ~X2APIC_ENABLE;
+		break;
+	}
+	return val;
+}
+
 static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high)
 {
 	int ret;
@@ -1240,7 +1257,7 @@ static const struct pv_cpu_ops xen_cpu_ops __initconst = {
 
 	.wbinvd = native_wbinvd,
 
-	.read_msr = native_read_msr_safe,
+	.read_msr = xen_read_msr_safe,
 	.write_msr = xen_write_msr_safe,
 
 	.read_tsc = native_read_tsc,
-- 
cgit v1.2.3


From fdfd811ddde3678247248ca9a27faa999ca4cd51 Mon Sep 17 00:00:00 2001
From: David Vrabel
Date: Thu, 19 Feb 2015 15:23:17 +0000
Subject: x86/xen: allow privcmd hypercalls to be preempted

Hypercalls submitted by user space tools via the privcmd driver can
take a long time (potentially many 10s of seconds) if the hypercall
has many sub-operations.

A fully preemptible kernel may deschedule such as task in any upcall
called from a hypercall continuation.

However, in a kernel with voluntary or no preemption, hypercall
continuations in Xen allow event handlers to be run but the task
issuing the hypercall will not be descheduled until the hypercall is
complete and the ioctl returns to user space.  These long running
tasks may also trigger the kernel's soft lockup detection.

Add xen_preemptible_hcall_begin() and xen_preemptible_hcall_end() to
bracket hypercalls that may be preempted.  Use these in the privcmd
driver.

When returning from an upcall, call xen_maybe_preempt_hcall() which
adds a schedule point if if the current task was within a preemptible
hypercall.

Since _cond_resched() can move the task to a different CPU, clear and
set xen_in_preemptible_hcall around the call.

Signed-off-by: David Vrabel <david.vrabel@citrix.com>
Reviewed-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
---
 arch/x86/kernel/entry_32.S |  3 +++
 arch/x86/kernel/entry_64.S |  3 +++
 drivers/xen/Makefile       |  2 +-
 drivers/xen/preempt.c      | 44 ++++++++++++++++++++++++++++++++++++++++++++
 drivers/xen/privcmd.c      |  2 ++
 include/xen/xen-ops.h      | 26 ++++++++++++++++++++++++++
 6 files changed, 79 insertions(+), 1 deletion(-)
 create mode 100644 drivers/xen/preempt.c

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 000d4199b03e..31e2d5bf3e38 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -982,6 +982,9 @@ ENTRY(xen_hypervisor_callback)
 ENTRY(xen_do_upcall)
 1:	mov %esp, %eax
 	call xen_evtchn_do_upcall
+#ifndef CONFIG_PREEMPT
+	call xen_maybe_preempt_hcall
+#endif
 	jmp  ret_from_intr
 	CFI_ENDPROC
 ENDPROC(xen_hypervisor_callback)
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index db13655c3a2a..10074ad9ebf8 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1208,6 +1208,9 @@ ENTRY(xen_do_hypervisor_callback)   # do_hypervisor_callback(struct *pt_regs)
 	popq %rsp
 	CFI_DEF_CFA_REGISTER rsp
 	decl PER_CPU_VAR(irq_count)
+#ifndef CONFIG_PREEMPT
+	call xen_maybe_preempt_hcall
+#endif
 	jmp  error_exit
 	CFI_ENDPROC
 END(xen_do_hypervisor_callback)
diff --git a/drivers/xen/Makefile b/drivers/xen/Makefile
index 2140398a2a8c..2ccd3592d41f 100644
--- a/drivers/xen/Makefile
+++ b/drivers/xen/Makefile
@@ -2,7 +2,7 @@ ifeq ($(filter y, $(CONFIG_ARM) $(CONFIG_ARM64)),)
 obj-$(CONFIG_HOTPLUG_CPU)		+= cpu_hotplug.o
 endif
 obj-$(CONFIG_X86)			+= fallback.o
-obj-y	+= grant-table.o features.o balloon.o manage.o
+obj-y	+= grant-table.o features.o balloon.o manage.o preempt.o
 obj-y	+= events/
 obj-y	+= xenbus/
 
diff --git a/drivers/xen/preempt.c b/drivers/xen/preempt.c
new file mode 100644
index 000000000000..a1800c150839
--- /dev/null
+++ b/drivers/xen/preempt.c
@@ -0,0 +1,44 @@
+/*
+ * Preemptible hypercalls
+ *
+ * Copyright (C) 2014 Citrix Systems R&D ltd.
+ *
+ * This source code is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ */
+
+#include <linux/sched.h>
+#include <xen/xen-ops.h>
+
+#ifndef CONFIG_PREEMPT
+
+/*
+ * Some hypercalls issued by the toolstack can take many 10s of
+ * seconds. Allow tasks running hypercalls via the privcmd driver to
+ * be voluntarily preempted even if full kernel preemption is
+ * disabled.
+ *
+ * Such preemptible hypercalls are bracketed by
+ * xen_preemptible_hcall_begin() and xen_preemptible_hcall_end()
+ * calls.
+ */
+
+DEFINE_PER_CPU(bool, xen_in_preemptible_hcall);
+EXPORT_SYMBOL_GPL(xen_in_preemptible_hcall);
+
+asmlinkage __visible void xen_maybe_preempt_hcall(void)
+{
+	if (unlikely(__this_cpu_read(xen_in_preemptible_hcall)
+		     && should_resched())) {
+		/*
+		 * Clear flag as we may be rescheduled on a different
+		 * cpu.
+		 */
+		__this_cpu_write(xen_in_preemptible_hcall, false);
+		_cond_resched();
+		__this_cpu_write(xen_in_preemptible_hcall, true);
+	}
+}
+#endif /* CONFIG_PREEMPT */
diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c
index 569a13b9e856..59ac71c4a043 100644
--- a/drivers/xen/privcmd.c
+++ b/drivers/xen/privcmd.c
@@ -56,10 +56,12 @@ static long privcmd_ioctl_hypercall(void __user *udata)
 	if (copy_from_user(&hypercall, udata, sizeof(hypercall)))
 		return -EFAULT;
 
+	xen_preemptible_hcall_begin();
 	ret = privcmd_call(hypercall.op,
 			   hypercall.arg[0], hypercall.arg[1],
 			   hypercall.arg[2], hypercall.arg[3],
 			   hypercall.arg[4]);
+	xen_preemptible_hcall_end();
 
 	return ret;
 }
diff --git a/include/xen/xen-ops.h b/include/xen/xen-ops.h
index 7491ee5d8164..83338210ee04 100644
--- a/include/xen/xen-ops.h
+++ b/include/xen/xen-ops.h
@@ -46,4 +46,30 @@ static inline efi_system_table_t __init *xen_efi_probe(void)
 }
 #endif
 
+#ifdef CONFIG_PREEMPT
+
+static inline void xen_preemptible_hcall_begin(void)
+{
+}
+
+static inline void xen_preemptible_hcall_end(void)
+{
+}
+
+#else
+
+DECLARE_PER_CPU(bool, xen_in_preemptible_hcall);
+
+static inline void xen_preemptible_hcall_begin(void)
+{
+	__this_cpu_write(xen_in_preemptible_hcall, true);
+}
+
+static inline void xen_preemptible_hcall_end(void)
+{
+	__this_cpu_write(xen_in_preemptible_hcall, false);
+}
+
+#endif /* CONFIG_PREEMPT */
+
 #endif /* INCLUDE_XEN_OPS_H */
-- 
cgit v1.2.3


From 5054daa285beaf706f051fbd395dc36c9f0f907f Mon Sep 17 00:00:00 2001
From: Boris Ostrovsky
Date: Mon, 23 Feb 2015 11:01:00 -0500
Subject: x86/xen: Initialize cr4 shadow for 64-bit PV(H) guests

Commit 1e02ce4cccdc ("x86: Store a per-cpu shadow copy of CR4")
introduced CR4 shadows.

These shadows are initialized in early boot code. The commit missed
initialization for 64-bit PV(H) guests that this patch adds.

Signed-off-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Signed-off-by: David Vrabel <david.vrabel@citrix.com>
---
 arch/x86/xen/enlighten.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch/x86')

diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index efee14db009b..5240f563076d 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -1758,6 +1758,7 @@ asmlinkage __visible void __init xen_start_kernel(void)
 #ifdef CONFIG_X86_32
 	i386_start_kernel();
 #else
+	cr4_init_shadow(); /* 32b kernel does this in i386_start_kernel() */
 	x86_64_start_reservations((char *)__pa_symbol(&boot_params));
 #endif
 }
-- 
cgit v1.2.3


From 1fc7f61c3e604f6bf778b5c6afc2715d79ab7f36 Mon Sep 17 00:00:00 2001
From: Adrien Schildknecht
Date: Fri, 20 Feb 2015 03:34:21 +0100
Subject: x86/kernel: Fix output of show_stack_log_lvl()

show_stack_log_lvl() does not set the log level after a new line, the
following messages printed with pr_cont() are thus assigned to the
default log level.

This patch prepends the log level to the next message following a new
line.

print_trace_address() uses printk(log_lvl). Using printk() with just
a log level is ignored and thus has no effect on the next pr_cont().
We need to prepend the log level directly into the message.

Signed-off-by: Adrien Schildknecht <adrien+dev@schischi.me>
Acked-by: Ingo Molnar <mingo@kernel.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: http://lkml.kernel.org/r/1424399661-20327-1-git-send-email-adrien+dev@schischi.me
Signed-off-by: Borislav Petkov <bp@suse.de>
---
 arch/x86/kernel/dumpstack.c    | 11 ++++++-----
 arch/x86/kernel/dumpstack_32.c |  9 ++++++---
 arch/x86/kernel/dumpstack_64.c |  9 ++++++---
 3 files changed, 18 insertions(+), 11 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index cf3df1d8d039..81b3932edbdc 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -25,10 +25,12 @@ unsigned int code_bytes = 64;
 int kstack_depth_to_print = 3 * STACKSLOTS_PER_LINE;
 static int die_counter;
 
-static void printk_stack_address(unsigned long address, int reliable)
+static void printk_stack_address(unsigned long address, int reliable,
+		void *data)
 {
-	pr_cont(" [<%p>] %s%pB\n",
-		(void *)address, reliable ? "" : "? ", (void *)address);
+	printk("%s [<%p>] %s%pB\n",
+		(char *)data, (void *)address, reliable ? "" : "? ",
+		(void *)address);
 }
 
 void printk_address(unsigned long address)
@@ -155,8 +157,7 @@ static int print_trace_stack(void *data, char *name)
 static void print_trace_address(void *data, unsigned long addr, int reliable)
 {
 	touch_nmi_watchdog();
-	printk(data);
-	printk_stack_address(addr, reliable);
+	printk_stack_address(addr, reliable, data);
 }
 
 static const struct stacktrace_ops print_trace_ops = {
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
index 5abd4cd4230c..efff5ed6045d 100644
--- a/arch/x86/kernel/dumpstack_32.c
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -108,9 +108,12 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
 	for (i = 0; i < kstack_depth_to_print; i++) {
 		if (kstack_end(stack))
 			break;
-		if (i && ((i % STACKSLOTS_PER_LINE) == 0))
-			pr_cont("\n");
-		pr_cont(" %08lx", *stack++);
+		if ((i % STACKSLOTS_PER_LINE) == 0) {
+			if (i != 0)
+				pr_cont("\n");
+			printk("%s %08lx", log_lvl, *stack++);
+		} else
+			pr_cont(" %08lx", *stack++);
 		touch_nmi_watchdog();
 	}
 	pr_cont("\n");
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index ff86f19b5758..553573bcf0ee 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -283,9 +283,12 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
 		if (((long) stack & (THREAD_SIZE-1)) == 0)
 			break;
 		}
-		if (i && ((i % STACKSLOTS_PER_LINE) == 0))
-			pr_cont("\n");
-		pr_cont(" %016lx", *stack++);
+		if ((i % STACKSLOTS_PER_LINE) == 0) {
+			if (i != 0)
+				pr_cont("\n");
+			printk("%s %016lx", log_lvl, *stack++);
+		} else
+			pr_cont(" %016lx", *stack++);
 		touch_nmi_watchdog();
 	}
 	preempt_enable();
-- 
cgit v1.2.3


From 04769ae3ac72f86324a189b69f53bf3bfb61acfd Mon Sep 17 00:00:00 2001
From: Adrien Schildknecht
Date: Sun, 22 Feb 2015 16:23:58 +0100
Subject: x86/kernel: Use kstack_end() in dumpstack_64.c

i386 is already using kstack_end() in dumpstack_32.c. We should also
use it to make the code clearer and unify the stack printing logic some
more.

Suggested-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Adrien Schildknecht <adrien+dev@schischi.me>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Cc: c: Linus Torvalds <torvalds@linux-foundation.org>
Link: http://lkml.kernel.org/r/1424618638-6375-1-git-send-email-adrien+dev@schischi.me
Signed-off-by: Borislav Petkov <bp@suse.de>
---
 arch/x86/kernel/dumpstack_64.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index 553573bcf0ee..5f1c6266eb30 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -280,7 +280,7 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
 				pr_cont(" <EOI> ");
 			}
 		} else {
-		if (((long) stack & (THREAD_SIZE-1)) == 0)
+		if (kstack_end(stack))
 			break;
 		}
 		if ((i % STACKSLOTS_PER_LINE) == 0) {
-- 
cgit v1.2.3


From 21bc8dc5b729dbeecb43adff23b74b51321e1897 Mon Sep 17 00:00:00 2001
From: Radim Krčmář
Date: Mon, 16 Feb 2015 15:36:33 +0100
Subject: KVM: VMX: fix build without CONFIG_SMP

'apic' is not defined if !CONFIG_X86_64 && !CONFIG_X86_LOCAL_APIC.
Posted interrupt makes no sense without CONFIG_SMP, and
CONFIG_X86_LOCAL_APIC will be set with it.

Reported-by: kbuild test robot <fengguang.wu@intel.com>
Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/vmx.c | 23 ++++++++++++++---------
 1 file changed, 14 insertions(+), 9 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 14c1a18d206a..f7b20b417a3a 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -4367,6 +4367,18 @@ static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
 	return 0;
 }
 
+static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu)
+{
+#ifdef CONFIG_SMP
+	if (vcpu->mode == IN_GUEST_MODE) {
+		apic->send_IPI_mask(get_cpu_mask(vcpu->cpu),
+				POSTED_INTR_VECTOR);
+		return true;
+	}
+#endif
+	return false;
+}
+
 static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu,
 						int vector)
 {
@@ -4375,9 +4387,7 @@ static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu,
 	if (is_guest_mode(vcpu) &&
 	    vector == vmx->nested.posted_intr_nv) {
 		/* the PIR and ON have been set by L1. */
-		if (vcpu->mode == IN_GUEST_MODE)
-			apic->send_IPI_mask(get_cpu_mask(vcpu->cpu),
-				POSTED_INTR_VECTOR);
+		kvm_vcpu_trigger_posted_interrupt(vcpu);
 		/*
 		 * If a posted intr is not recognized by hardware,
 		 * we will accomplish it in the next vmentry.
@@ -4409,12 +4419,7 @@ static void vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector)
 
 	r = pi_test_and_set_on(&vmx->pi_desc);
 	kvm_make_request(KVM_REQ_EVENT, vcpu);
-#ifdef CONFIG_SMP
-	if (!r && (vcpu->mode == IN_GUEST_MODE))
-		apic->send_IPI_mask(get_cpu_mask(vcpu->cpu),
-				POSTED_INTR_VECTOR);
-	else
-#endif
+	if (r || !kvm_vcpu_trigger_posted_interrupt(vcpu))
 		kvm_vcpu_kick(vcpu);
 }
 
-- 
cgit v1.2.3


From 4ff6f8e61eb7f96d3ca535c6d240f863ccd6fb7d Mon Sep 17 00:00:00 2001
From: Paolo Bonzini
Date: Thu, 12 Feb 2015 17:04:47 +0100
Subject: KVM: emulate: fix CMPXCHG8B on 32-bit hosts

This has been broken for a long time: it broke first in 2.6.35, then was
almost fixed in 2.6.36 but this one-liner slipped through the cracks.
The bug shows up as an infinite loop in Windows 7 (and newer) boot on
32-bit hosts without EPT.

Windows uses CMPXCHG8B to write to page tables, which causes a
page fault if running without EPT; the emulator is then called from
kvm_mmu_page_fault.  The loop then happens if the higher 4 bytes are
not 0; the common case for this is that the NX bit (bit 63) is 1.

Fixes: 6550e1f165f384f3a46b60a1be9aba4bc3c2adad
Fixes: 16518d5ada690643453eb0aef3cc7841d3623c2d
Cc: stable@vger.kernel.org   # 2.6.35+
Reported-by: Erik Rull <erik.rull@rdsoftware.de>
Tested-by: Erik Rull <erik.rull@rdsoftware.de>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/emulate.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index e0b794a84c35..106c01557f2b 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -4950,7 +4950,8 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
 			goto done;
 		}
 	}
-	ctxt->dst.orig_val = ctxt->dst.val;
+	/* Copy full 64-bit value for CMPXCHG8B.  */
+	ctxt->dst.orig_val64 = ctxt->dst.val64;
 
 special_insn:
 
-- 
cgit v1.2.3


From 579deee571a755c485ad702ef82c77a98a2ccc05 Mon Sep 17 00:00:00 2001
From: Yannick Guerrini
Date: Mon, 23 Feb 2015 17:52:38 +0100
Subject: x86/platform/intel-mid: Fix trivial printk message typo in
 intel_mid_arch_setup()

Change 'Uknown' to 'Unknown'

Signed-off-by: Yannick Guerrini <yguerrini@tomshardware.fr>
Cc: trivial@kernel.org
Link: http://lkml.kernel.org/r/1424710358-10140-1-git-send-email-yguerrini@tomshardware.fr
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/platform/intel-mid/intel-mid.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/platform/intel-mid/intel-mid.c b/arch/x86/platform/intel-mid/intel-mid.c
index 1bbedc4b0f88..3005f0c89f2e 100644
--- a/arch/x86/platform/intel-mid/intel-mid.c
+++ b/arch/x86/platform/intel-mid/intel-mid.c
@@ -130,7 +130,7 @@ static void intel_mid_arch_setup(void)
 		intel_mid_ops = get_intel_mid_ops[__intel_mid_cpu_chip]();
 	else {
 		intel_mid_ops = get_intel_mid_ops[INTEL_MID_CPU_CHIP_PENWELL]();
-		pr_info("ARCH: Uknown SoC, assuming PENWELL!\n");
+		pr_info("ARCH: Unknown SoC, assuming PENWELL!\n");
 	}
 
 out:
-- 
cgit v1.2.3


From 8d4a40bc0651ea51c196a3d3016d041c41ec19a2 Mon Sep 17 00:00:00 2001
From: Juergen Gross
Date: Tue, 24 Feb 2015 10:13:28 +0100
Subject: x86/mm: Use early_memunmap() instead of early_iounmap()

Memory mapped via early_memremap() should be unmapped with
early_memunmap() instead of early_iounmap().

Signed-off-by: Juergen Gross <jgross@suse.com>
Cc: matt.fleming@intel.com
Link: http://lkml.kernel.org/r/1424769211-11378-2-git-send-email-jgross@suse.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/devicetree.c | 4 ++--
 arch/x86/kernel/e820.c       | 2 +-
 arch/x86/kernel/setup.c      | 8 ++++----
 3 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c
index 3d3503351242..6367a780cc8c 100644
--- a/arch/x86/kernel/devicetree.c
+++ b/arch/x86/kernel/devicetree.c
@@ -286,13 +286,13 @@ static void __init x86_flattree_get_config(void)
 	initial_boot_params = dt = early_memremap(initial_dtb, map_len);
 	size = of_get_flat_dt_size();
 	if (map_len < size) {
-		early_iounmap(dt, map_len);
+		early_memunmap(dt, map_len);
 		initial_boot_params = dt = early_memremap(initial_dtb, size);
 		map_len = size;
 	}
 
 	unflatten_and_copy_device_tree();
-	early_iounmap(dt, map_len);
+	early_memunmap(dt, map_len);
 }
 #else
 static inline void x86_flattree_get_config(void) { }
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 46201deee923..7d46bb260334 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -661,7 +661,7 @@ void __init parse_e820_ext(u64 phys_addr, u32 data_len)
 	extmap = (struct e820entry *)(sdata->data);
 	__append_e820_map(extmap, entries);
 	sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
-	early_iounmap(sdata, data_len);
+	early_memunmap(sdata, data_len);
 	printk(KERN_INFO "e820: extended physical RAM map:\n");
 	e820_print_map("extended");
 }
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 98dc9317286e..733864a653ab 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -356,7 +356,7 @@ static void __init relocate_initrd(void)
 		mapaddr = ramdisk_image & PAGE_MASK;
 		p = early_memremap(mapaddr, clen+slop);
 		memcpy(q, p+slop, clen);
-		early_iounmap(p, clen+slop);
+		early_memunmap(p, clen+slop);
 		q += clen;
 		ramdisk_image += clen;
 		ramdisk_size  -= clen;
@@ -445,7 +445,7 @@ static void __init parse_setup_data(void)
 		data_len = data->len + sizeof(struct setup_data);
 		data_type = data->type;
 		pa_next = data->next;
-		early_iounmap(data, sizeof(*data));
+		early_memunmap(data, sizeof(*data));
 
 		switch (data_type) {
 		case SETUP_E820_EXT:
@@ -480,7 +480,7 @@ static void __init e820_reserve_setup_data(void)
 			 E820_RAM, E820_RESERVED_KERN);
 		found = 1;
 		pa_data = data->next;
-		early_iounmap(data, sizeof(*data));
+		early_memunmap(data, sizeof(*data));
 	}
 	if (!found)
 		return;
@@ -501,7 +501,7 @@ static void __init memblock_x86_reserve_range_setup_data(void)
 		data = early_memremap(pa_data, sizeof(*data));
 		memblock_reserve(pa_data, sizeof(*data) + data->len);
 		pa_data = data->next;
-		early_iounmap(data, sizeof(*data));
+		early_memunmap(data, sizeof(*data));
 	}
 }
 
-- 
cgit v1.2.3


From 954e12f7a800ce38b4722ca1d7a6d0293d377b55 Mon Sep 17 00:00:00 2001
From: Juergen Gross
Date: Tue, 24 Feb 2015 10:13:31 +0100
Subject: x86/mm, efi: Use early_ioremap() in arch/x86/platform/efi/efi-bgrt.c

Use early_ioremap() to map an I/O-area instead of
early_memremap().

Signed-off-by: Juergen Gross <jgross@suse.com>
Cc: matt.fleming@intel.com
Link: http://lkml.kernel.org/r/1424769211-11378-5-git-send-email-jgross@suse.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/platform/efi/efi-bgrt.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/platform/efi/efi-bgrt.c b/arch/x86/platform/efi/efi-bgrt.c
index d143d216d52b..d7f997f7c26d 100644
--- a/arch/x86/platform/efi/efi-bgrt.c
+++ b/arch/x86/platform/efi/efi-bgrt.c
@@ -67,7 +67,7 @@ void __init efi_bgrt_init(void)
 
 	image = efi_lookup_mapped_addr(bgrt_tab->image_address);
 	if (!image) {
-		image = early_memremap(bgrt_tab->image_address,
+		image = early_ioremap(bgrt_tab->image_address,
 				       sizeof(bmp_header));
 		ioremapped = true;
 		if (!image) {
@@ -89,7 +89,7 @@ void __init efi_bgrt_init(void)
 	}
 
 	if (ioremapped) {
-		image = early_memremap(bgrt_tab->image_address,
+		image = early_ioremap(bgrt_tab->image_address,
 				       bmp_header.size);
 		if (!image) {
 			pr_err("Ignoring BGRT: failed to map image memory\n");
-- 
cgit v1.2.3


From 08571f1ae327bfb631cb7171bde5ea605df626c6 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski
Date: Tue, 24 Feb 2015 16:01:38 -0800
Subject: x86/ptrace: Remove checks for TIF_IA32 when changing CS and SS

The ability for modified CS and/or SS to be useful has nothing
to do with TIF_IA32.  Similarly, if there's an exploit involving
changing CS or SS, it's exploitable with or without a TIF_IA32
check.

So just delete the check.

Signed-off-by: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Link: http://lkml.kernel.org/r/71c7ab36456855d11ae07edd4945a7dfe80f9915.1424822291.git.luto@amacapital.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/ptrace.c | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index e510618b2e91..1e125817cf9f 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -364,18 +364,12 @@ static int set_segment_reg(struct task_struct *task,
 	case offsetof(struct user_regs_struct,cs):
 		if (unlikely(value == 0))
 			return -EIO;
-#ifdef CONFIG_IA32_EMULATION
-		if (test_tsk_thread_flag(task, TIF_IA32))
-			task_pt_regs(task)->cs = value;
-#endif
+		task_pt_regs(task)->cs = value;
 		break;
 	case offsetof(struct user_regs_struct,ss):
 		if (unlikely(value == 0))
 			return -EIO;
-#ifdef CONFIG_IA32_EMULATION
-		if (test_tsk_thread_flag(task, TIF_IA32))
-			task_pt_regs(task)->ss = value;
-#endif
+		task_pt_regs(task)->ss = value;
 		break;
 	}
 
-- 
cgit v1.2.3


From 72c6fb4f74b6b3797f5b1abd6944d7a1d2adbf04 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski
Date: Tue, 24 Feb 2015 16:01:39 -0800
Subject: x86/ia32-compat: Fix CLONE_SETTLS bitness of copy_thread()

CLONE_SETTLS is expected to write a TLS entry in the GDT for
32-bit callers and to set FSBASE for 64-bit callers.

The correct check is is_ia32_task(), which returns true in the
context of a 32-bit syscall.  TIF_IA32 is set if the task itself
has a 32-bit personality, which is not the same thing.

Signed-off-by: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Link: http://lkml.kernel.org/r/45e2d0d695393d76406a0c7225b82c76223e0cc5.1424822291.git.luto@amacapital.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/process_64.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 5a2c02913af3..936d43461dca 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -207,7 +207,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
 	 */
 	if (clone_flags & CLONE_SETTLS) {
 #ifdef CONFIG_IA32_EMULATION
-		if (test_thread_flag(TIF_IA32))
+		if (is_ia32_task())
 			err = do_set_thread_area(p, -1,
 				(struct user_desc __user *)childregs->si, 0);
 		else
-- 
cgit v1.2.3


From cbc82b17263877ea5d21e84c58ce03f0292458a1 Mon Sep 17 00:00:00 2001
From: Peter P Waskiewicz Jr
Date: Fri, 23 Jan 2015 18:45:43 +0000
Subject: x86: Add support for Intel Cache QoS Monitoring (CQM) detection

This patch adds support for the new Cache QoS Monitoring (CQM)
feature found in future Intel Xeon processors.  It includes the
new values to track CQM resources to the cpuinfo_x86 structure,
plus the CPUID detection routines for CQM.

CQM allows a process, or set of processes, to be tracked by the CPU
to determine the cache usage of that task group.  Using this data
from the CPU, software can be written to extract this data and
report cache usage and occupancy for a particular process, or
group of processes.

More information about Cache QoS Monitoring can be found in the
Intel (R) x86 Architecture Software Developer Manual, section 17.14.

Signed-off-by: Peter P Waskiewicz Jr <peter.p.waskiewicz.jr@intel.com>
Signed-off-by: Matt Fleming <matt.fleming@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Borislav Petkov <bp@suse.de>
Cc: Chris Webb <chris@arachsys.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Igor Mammedov <imammedo@redhat.com>
Cc: Jacob Shin <jacob.w.shin@gmail.com>
Cc: Jan Beulich <JBeulich@suse.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Kanaka Juvva <kanaka.d.juvva@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Steven Honeyman <stevenhoneyman@gmail.com>
Cc: Steven Rostedt <srostedt@redhat.com>
Cc: Vikas Shivappa <vikas.shivappa@linux.intel.com>
Link: http://lkml.kernel.org/r/1422038748-21397-5-git-send-email-matt@codeblueprint.co.uk
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/cpufeature.h |  9 ++++++++-
 arch/x86/include/asm/processor.h  |  3 +++
 arch/x86/kernel/cpu/common.c      | 39 +++++++++++++++++++++++++++++++++++++++
 3 files changed, 50 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 90a54851aedc..361922dcc9b1 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -12,7 +12,7 @@
 #include <asm/disabled-features.h>
 #endif
 
-#define NCAPINTS	11	/* N 32-bit words worth of info */
+#define NCAPINTS	13	/* N 32-bit words worth of info */
 #define NBUGINTS	1	/* N 32-bit bug flags */
 
 /*
@@ -226,6 +226,7 @@
 #define X86_FEATURE_ERMS	( 9*32+ 9) /* Enhanced REP MOVSB/STOSB */
 #define X86_FEATURE_INVPCID	( 9*32+10) /* Invalidate Processor Context ID */
 #define X86_FEATURE_RTM		( 9*32+11) /* Restricted Transactional Memory */
+#define X86_FEATURE_CQM		( 9*32+12) /* Cache QoS Monitoring */
 #define X86_FEATURE_MPX		( 9*32+14) /* Memory Protection Extension */
 #define X86_FEATURE_AVX512F	( 9*32+16) /* AVX-512 Foundation */
 #define X86_FEATURE_RDSEED	( 9*32+18) /* The RDSEED instruction */
@@ -242,6 +243,12 @@
 #define X86_FEATURE_XGETBV1	(10*32+ 2) /* XGETBV with ECX = 1 */
 #define X86_FEATURE_XSAVES	(10*32+ 3) /* XSAVES/XRSTORS */
 
+/* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:0 (edx), word 11 */
+#define X86_FEATURE_CQM_LLC	(11*32+ 1) /* LLC QoS if 1 */
+
+/* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:1 (edx), word 12 */
+#define X86_FEATURE_CQM_OCCUP_LLC (12*32+ 0) /* LLC occupancy monitoring if 1 */
+
 /*
  * BUG word(s)
  */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index ec1c93588cef..a12d50e04d7a 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -109,6 +109,9 @@ struct cpuinfo_x86 {
 	/* in KB - valid for CPUS which support this call: */
 	int			x86_cache_size;
 	int			x86_cache_alignment;	/* In bytes */
+	/* Cache QoS architectural values: */
+	int			x86_cache_max_rmid;	/* max index */
+	int			x86_cache_occ_scale;	/* scale to bytes */
 	int			x86_power;
 	unsigned long		loops_per_jiffy;
 	/* cpuid returned max cores value: */
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 07f2fc3c13a4..9fa00b2ea0ee 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -645,6 +645,30 @@ void get_cpu_cap(struct cpuinfo_x86 *c)
 		c->x86_capability[10] = eax;
 	}
 
+	/* Additional Intel-defined flags: level 0x0000000F */
+	if (c->cpuid_level >= 0x0000000F) {
+		u32 eax, ebx, ecx, edx;
+
+		/* QoS sub-leaf, EAX=0Fh, ECX=0 */
+		cpuid_count(0x0000000F, 0, &eax, &ebx, &ecx, &edx);
+		c->x86_capability[11] = edx;
+		if (cpu_has(c, X86_FEATURE_CQM_LLC)) {
+			/* will be overridden if occupancy monitoring exists */
+			c->x86_cache_max_rmid = ebx;
+
+			/* QoS sub-leaf, EAX=0Fh, ECX=1 */
+			cpuid_count(0x0000000F, 1, &eax, &ebx, &ecx, &edx);
+			c->x86_capability[12] = edx;
+			if (cpu_has(c, X86_FEATURE_CQM_OCCUP_LLC)) {
+				c->x86_cache_max_rmid = ecx;
+				c->x86_cache_occ_scale = ebx;
+			}
+		} else {
+			c->x86_cache_max_rmid = -1;
+			c->x86_cache_occ_scale = -1;
+		}
+	}
+
 	/* AMD-defined flags: level 0x80000001 */
 	xlvl = cpuid_eax(0x80000000);
 	c->extended_cpuid_level = xlvl;
@@ -833,6 +857,20 @@ static void generic_identify(struct cpuinfo_x86 *c)
 	detect_nopl(c);
 }
 
+static void x86_init_cache_qos(struct cpuinfo_x86 *c)
+{
+	/*
+	 * The heavy lifting of max_rmid and cache_occ_scale are handled
+	 * in get_cpu_cap().  Here we just set the max_rmid for the boot_cpu
+	 * in case CQM bits really aren't there in this CPU.
+	 */
+	if (c != &boot_cpu_data) {
+		boot_cpu_data.x86_cache_max_rmid =
+			min(boot_cpu_data.x86_cache_max_rmid,
+			    c->x86_cache_max_rmid);
+	}
+}
+
 /*
  * This does the hard work of actually picking apart the CPU stuff...
  */
@@ -922,6 +960,7 @@ static void identify_cpu(struct cpuinfo_x86 *c)
 
 	init_hypervisor(c);
 	x86_init_rdrand(c);
+	x86_init_cache_qos(c);
 
 	/*
 	 * Clear/Set all flags overriden by options, need do it
-- 
cgit v1.2.3


From 4afbb24ce5e723c8a093a6674a3c33062175078a Mon Sep 17 00:00:00 2001
From: Matt Fleming
Date: Fri, 23 Jan 2015 18:45:44 +0000
Subject: perf/x86/intel: Add Intel Cache QoS Monitoring support

Future Intel Xeon processors support a Cache QoS Monitoring feature that
allows tracking of the LLC occupancy for a task or task group, i.e. the
amount of data in pulled into the LLC for the task (group).

Currently the PMU only supports per-cpu events. We create an event for
each cpu and read out all the LLC occupancy values.

Because this results in duplicate values being written out to userspace,
we also export a .per-pkg event file so that the perf tools only
accumulate values for one cpu per package.

Signed-off-by: Matt Fleming <matt.fleming@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Kanaka Juvva <kanaka.d.juvva@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Vikas Shivappa <vikas.shivappa@linux.intel.com>
Link: http://lkml.kernel.org/r/1422038748-21397-6-git-send-email-matt@codeblueprint.co.uk
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event_intel_cqm.c | 530 +++++++++++++++++++++++++++++
 include/linux/perf_event.h                 |   7 +
 2 files changed, 537 insertions(+)
 create mode 100644 arch/x86/kernel/cpu/perf_event_intel_cqm.c

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event_intel_cqm.c b/arch/x86/kernel/cpu/perf_event_intel_cqm.c
new file mode 100644
index 000000000000..05b4cd26f426
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event_intel_cqm.c
@@ -0,0 +1,530 @@
+/*
+ * Intel Cache Quality-of-Service Monitoring (CQM) support.
+ *
+ * Based very, very heavily on work by Peter Zijlstra.
+ */
+
+#include <linux/perf_event.h>
+#include <linux/slab.h>
+#include <asm/cpu_device_id.h>
+#include "perf_event.h"
+
+#define MSR_IA32_PQR_ASSOC	0x0c8f
+#define MSR_IA32_QM_CTR		0x0c8e
+#define MSR_IA32_QM_EVTSEL	0x0c8d
+
+static unsigned int cqm_max_rmid = -1;
+static unsigned int cqm_l3_scale; /* supposedly cacheline size */
+
+struct intel_cqm_state {
+	raw_spinlock_t		lock;
+	int			rmid;
+	int			cnt;
+};
+
+static DEFINE_PER_CPU(struct intel_cqm_state, cqm_state);
+
+/*
+ * Protects cache_cgroups.
+ */
+static DEFINE_MUTEX(cache_mutex);
+
+/*
+ * Groups of events that have the same target(s), one RMID per group.
+ */
+static LIST_HEAD(cache_groups);
+
+/*
+ * Mask of CPUs for reading CQM values. We only need one per-socket.
+ */
+static cpumask_t cqm_cpumask;
+
+#define RMID_VAL_ERROR		(1ULL << 63)
+#define RMID_VAL_UNAVAIL	(1ULL << 62)
+
+#define QOS_L3_OCCUP_EVENT_ID	(1 << 0)
+
+#define QOS_EVENT_MASK	QOS_L3_OCCUP_EVENT_ID
+
+static u64 __rmid_read(unsigned long rmid)
+{
+	u64 val;
+
+	/*
+	 * Ignore the SDM, this thing is _NOTHING_ like a regular perfcnt,
+	 * it just says that to increase confusion.
+	 */
+	wrmsr(MSR_IA32_QM_EVTSEL, QOS_L3_OCCUP_EVENT_ID, rmid);
+	rdmsrl(MSR_IA32_QM_CTR, val);
+
+	/*
+	 * Aside from the ERROR and UNAVAIL bits, assume this thing returns
+	 * the number of cachelines tagged with @rmid.
+	 */
+	return val;
+}
+
+static unsigned long *cqm_rmid_bitmap;
+
+/*
+ * Returns < 0 on fail.
+ */
+static int __get_rmid(void)
+{
+	return bitmap_find_free_region(cqm_rmid_bitmap, cqm_max_rmid, 0);
+}
+
+static void __put_rmid(int rmid)
+{
+	bitmap_release_region(cqm_rmid_bitmap, rmid, 0);
+}
+
+static int intel_cqm_setup_rmid_cache(void)
+{
+	cqm_rmid_bitmap = kmalloc(sizeof(long) * BITS_TO_LONGS(cqm_max_rmid), GFP_KERNEL);
+	if (!cqm_rmid_bitmap)
+		return -ENOMEM;
+
+	bitmap_zero(cqm_rmid_bitmap, cqm_max_rmid);
+
+	/*
+	 * RMID 0 is special and is always allocated. It's used for all
+	 * tasks that are not monitored.
+	 */
+	bitmap_allocate_region(cqm_rmid_bitmap, 0, 0);
+
+	return 0;
+}
+
+/*
+ * Determine if @a and @b measure the same set of tasks.
+ */
+static bool __match_event(struct perf_event *a, struct perf_event *b)
+{
+	if ((a->attach_state & PERF_ATTACH_TASK) !=
+	    (b->attach_state & PERF_ATTACH_TASK))
+		return false;
+
+	/* not task */
+
+	return true; /* if not task, we're machine wide */
+}
+
+/*
+ * Determine if @a's tasks intersect with @b's tasks
+ */
+static bool __conflict_event(struct perf_event *a, struct perf_event *b)
+{
+	/*
+	 * If one of them is not a task, same story as above with cgroups.
+	 */
+	if (!(a->attach_state & PERF_ATTACH_TASK) ||
+	    !(b->attach_state & PERF_ATTACH_TASK))
+		return true;
+
+	/*
+	 * Must be non-overlapping.
+	 */
+	return false;
+}
+
+/*
+ * Find a group and setup RMID.
+ *
+ * If we're part of a group, we use the group's RMID.
+ */
+static int intel_cqm_setup_event(struct perf_event *event,
+				 struct perf_event **group)
+{
+	struct perf_event *iter;
+	int rmid;
+
+	list_for_each_entry(iter, &cache_groups, hw.cqm_groups_entry) {
+		if (__match_event(iter, event)) {
+			/* All tasks in a group share an RMID */
+			event->hw.cqm_rmid = iter->hw.cqm_rmid;
+			*group = iter;
+			return 0;
+		}
+
+		if (__conflict_event(iter, event))
+			return -EBUSY;
+	}
+
+	rmid = __get_rmid();
+	if (rmid < 0)
+		return rmid;
+
+	event->hw.cqm_rmid = rmid;
+	return 0;
+}
+
+static void intel_cqm_event_read(struct perf_event *event)
+{
+	unsigned long rmid = event->hw.cqm_rmid;
+	u64 val;
+
+	val = __rmid_read(rmid);
+
+	/*
+	 * Ignore this reading on error states and do not update the value.
+	 */
+	if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
+		return;
+
+	local64_set(&event->count, val);
+}
+
+static void intel_cqm_event_start(struct perf_event *event, int mode)
+{
+	struct intel_cqm_state *state = this_cpu_ptr(&cqm_state);
+	unsigned long rmid = event->hw.cqm_rmid;
+	unsigned long flags;
+
+	if (!(event->hw.cqm_state & PERF_HES_STOPPED))
+		return;
+
+	event->hw.cqm_state &= ~PERF_HES_STOPPED;
+
+	raw_spin_lock_irqsave(&state->lock, flags);
+
+	if (state->cnt++)
+		WARN_ON_ONCE(state->rmid != rmid);
+	else
+		WARN_ON_ONCE(state->rmid);
+
+	state->rmid = rmid;
+	wrmsrl(MSR_IA32_PQR_ASSOC, state->rmid);
+
+	raw_spin_unlock_irqrestore(&state->lock, flags);
+}
+
+static void intel_cqm_event_stop(struct perf_event *event, int mode)
+{
+	struct intel_cqm_state *state = this_cpu_ptr(&cqm_state);
+	unsigned long flags;
+
+	if (event->hw.cqm_state & PERF_HES_STOPPED)
+		return;
+
+	event->hw.cqm_state |= PERF_HES_STOPPED;
+
+	raw_spin_lock_irqsave(&state->lock, flags);
+	intel_cqm_event_read(event);
+
+	if (!--state->cnt) {
+		state->rmid = 0;
+		wrmsrl(MSR_IA32_PQR_ASSOC, 0);
+	} else {
+		WARN_ON_ONCE(!state->rmid);
+	}
+
+	raw_spin_unlock_irqrestore(&state->lock, flags);
+}
+
+static int intel_cqm_event_add(struct perf_event *event, int mode)
+{
+	int rmid;
+
+	event->hw.cqm_state = PERF_HES_STOPPED;
+	rmid = event->hw.cqm_rmid;
+	WARN_ON_ONCE(!rmid);
+
+	if (mode & PERF_EF_START)
+		intel_cqm_event_start(event, mode);
+
+	return 0;
+}
+
+static void intel_cqm_event_del(struct perf_event *event, int mode)
+{
+	intel_cqm_event_stop(event, mode);
+}
+
+static void intel_cqm_event_destroy(struct perf_event *event)
+{
+	struct perf_event *group_other = NULL;
+
+	mutex_lock(&cache_mutex);
+
+	/*
+	 * If there's another event in this group...
+	 */
+	if (!list_empty(&event->hw.cqm_group_entry)) {
+		group_other = list_first_entry(&event->hw.cqm_group_entry,
+					       struct perf_event,
+					       hw.cqm_group_entry);
+		list_del(&event->hw.cqm_group_entry);
+	}
+
+	/*
+	 * And we're the group leader..
+	 */
+	if (!list_empty(&event->hw.cqm_groups_entry)) {
+		/*
+		 * If there was a group_other, make that leader, otherwise
+		 * destroy the group and return the RMID.
+		 */
+		if (group_other) {
+			list_replace(&event->hw.cqm_groups_entry,
+				     &group_other->hw.cqm_groups_entry);
+		} else {
+			int rmid = event->hw.cqm_rmid;
+
+			__put_rmid(rmid);
+			list_del(&event->hw.cqm_groups_entry);
+		}
+	}
+
+	mutex_unlock(&cache_mutex);
+}
+
+static struct pmu intel_cqm_pmu;
+
+/*
+ * XXX there's a bit of a problem in that we cannot simply do the one
+ * event per node as one would want, since that one event would one get
+ * scheduled on the one cpu. But we want to 'schedule' the RMID on all
+ * CPUs.
+ *
+ * This means we want events for each CPU, however, that generates a lot
+ * of duplicate values out to userspace -- this is not to be helped
+ * unless we want to change the core code in some way. Fore more info,
+ * see intel_cqm_event_read().
+ */
+static int intel_cqm_event_init(struct perf_event *event)
+{
+	struct perf_event *group = NULL;
+	int err;
+
+	if (event->attr.type != intel_cqm_pmu.type)
+		return -ENOENT;
+
+	if (event->attr.config & ~QOS_EVENT_MASK)
+		return -EINVAL;
+
+	if (event->cpu == -1)
+		return -EINVAL;
+
+	/* unsupported modes and filters */
+	if (event->attr.exclude_user   ||
+	    event->attr.exclude_kernel ||
+	    event->attr.exclude_hv     ||
+	    event->attr.exclude_idle   ||
+	    event->attr.exclude_host   ||
+	    event->attr.exclude_guest  ||
+	    event->attr.sample_period) /* no sampling */
+		return -EINVAL;
+
+	INIT_LIST_HEAD(&event->hw.cqm_group_entry);
+	INIT_LIST_HEAD(&event->hw.cqm_groups_entry);
+
+	event->destroy = intel_cqm_event_destroy;
+
+	mutex_lock(&cache_mutex);
+
+	err = intel_cqm_setup_event(event, &group); /* will also set rmid */
+	if (err)
+		goto out;
+
+	if (group) {
+		list_add_tail(&event->hw.cqm_group_entry,
+			      &group->hw.cqm_group_entry);
+	} else {
+		list_add_tail(&event->hw.cqm_groups_entry,
+			      &cache_groups);
+	}
+
+out:
+	mutex_unlock(&cache_mutex);
+	return err;
+}
+
+EVENT_ATTR_STR(llc_occupancy, intel_cqm_llc, "event=0x01");
+EVENT_ATTR_STR(llc_occupancy.per-pkg, intel_cqm_llc_pkg, "1");
+EVENT_ATTR_STR(llc_occupancy.unit, intel_cqm_llc_unit, "Bytes");
+EVENT_ATTR_STR(llc_occupancy.scale, intel_cqm_llc_scale, NULL);
+EVENT_ATTR_STR(llc_occupancy.snapshot, intel_cqm_llc_snapshot, "1");
+
+static struct attribute *intel_cqm_events_attr[] = {
+	EVENT_PTR(intel_cqm_llc),
+	EVENT_PTR(intel_cqm_llc_pkg),
+	EVENT_PTR(intel_cqm_llc_unit),
+	EVENT_PTR(intel_cqm_llc_scale),
+	EVENT_PTR(intel_cqm_llc_snapshot),
+	NULL,
+};
+
+static struct attribute_group intel_cqm_events_group = {
+	.name = "events",
+	.attrs = intel_cqm_events_attr,
+};
+
+PMU_FORMAT_ATTR(event, "config:0-7");
+static struct attribute *intel_cqm_formats_attr[] = {
+	&format_attr_event.attr,
+	NULL,
+};
+
+static struct attribute_group intel_cqm_format_group = {
+	.name = "format",
+	.attrs = intel_cqm_formats_attr,
+};
+
+static const struct attribute_group *intel_cqm_attr_groups[] = {
+	&intel_cqm_events_group,
+	&intel_cqm_format_group,
+	NULL,
+};
+
+static struct pmu intel_cqm_pmu = {
+	.attr_groups	= intel_cqm_attr_groups,
+	.task_ctx_nr	= perf_sw_context,
+	.event_init	= intel_cqm_event_init,
+	.add		= intel_cqm_event_add,
+	.del		= intel_cqm_event_del,
+	.start		= intel_cqm_event_start,
+	.stop		= intel_cqm_event_stop,
+	.read		= intel_cqm_event_read,
+};
+
+static inline void cqm_pick_event_reader(int cpu)
+{
+	int phys_id = topology_physical_package_id(cpu);
+	int i;
+
+	for_each_cpu(i, &cqm_cpumask) {
+		if (phys_id == topology_physical_package_id(i))
+			return;	/* already got reader for this socket */
+	}
+
+	cpumask_set_cpu(cpu, &cqm_cpumask);
+}
+
+static void intel_cqm_cpu_prepare(unsigned int cpu)
+{
+	struct intel_cqm_state *state = &per_cpu(cqm_state, cpu);
+	struct cpuinfo_x86 *c = &cpu_data(cpu);
+
+	raw_spin_lock_init(&state->lock);
+	state->rmid = 0;
+	state->cnt  = 0;
+
+	WARN_ON(c->x86_cache_max_rmid != cqm_max_rmid);
+	WARN_ON(c->x86_cache_occ_scale != cqm_l3_scale);
+}
+
+static void intel_cqm_cpu_exit(unsigned int cpu)
+{
+	int phys_id = topology_physical_package_id(cpu);
+	int i;
+
+	/*
+	 * Is @cpu a designated cqm reader?
+	 */
+	if (!cpumask_test_and_clear_cpu(cpu, &cqm_cpumask))
+		return;
+
+	for_each_online_cpu(i) {
+		if (i == cpu)
+			continue;
+
+		if (phys_id == topology_physical_package_id(i)) {
+			cpumask_set_cpu(i, &cqm_cpumask);
+			break;
+		}
+	}
+}
+
+static int intel_cqm_cpu_notifier(struct notifier_block *nb,
+				  unsigned long action, void *hcpu)
+{
+	unsigned int cpu  = (unsigned long)hcpu;
+
+	switch (action & ~CPU_TASKS_FROZEN) {
+	case CPU_UP_PREPARE:
+		intel_cqm_cpu_prepare(cpu);
+		break;
+	case CPU_DOWN_PREPARE:
+		intel_cqm_cpu_exit(cpu);
+		break;
+	case CPU_STARTING:
+		cqm_pick_event_reader(cpu);
+		break;
+	}
+
+	return NOTIFY_OK;
+}
+
+static const struct x86_cpu_id intel_cqm_match[] = {
+	{ .vendor = X86_VENDOR_INTEL, .feature = X86_FEATURE_CQM_OCCUP_LLC },
+	{}
+};
+
+static int __init intel_cqm_init(void)
+{
+	char *str, scale[20];
+	int i, cpu, ret;
+
+	if (!x86_match_cpu(intel_cqm_match))
+		return -ENODEV;
+
+	cqm_l3_scale = boot_cpu_data.x86_cache_occ_scale;
+
+	/*
+	 * It's possible that not all resources support the same number
+	 * of RMIDs. Instead of making scheduling much more complicated
+	 * (where we have to match a task's RMID to a cpu that supports
+	 * that many RMIDs) just find the minimum RMIDs supported across
+	 * all cpus.
+	 *
+	 * Also, check that the scales match on all cpus.
+	 */
+	cpu_notifier_register_begin();
+
+	for_each_online_cpu(cpu) {
+		struct cpuinfo_x86 *c = &cpu_data(cpu);
+
+		if (c->x86_cache_max_rmid < cqm_max_rmid)
+			cqm_max_rmid = c->x86_cache_max_rmid;
+
+		if (c->x86_cache_occ_scale != cqm_l3_scale) {
+			pr_err("Multiple LLC scale values, disabling\n");
+			ret = -EINVAL;
+			goto out;
+		}
+	}
+
+	snprintf(scale, sizeof(scale), "%u", cqm_l3_scale);
+	str = kstrdup(scale, GFP_KERNEL);
+	if (!str) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	event_attr_intel_cqm_llc_scale.event_str = str;
+
+	ret = intel_cqm_setup_rmid_cache();
+	if (ret)
+		goto out;
+
+	for_each_online_cpu(i) {
+		intel_cqm_cpu_prepare(i);
+		cqm_pick_event_reader(i);
+	}
+
+	__perf_cpu_notifier(intel_cqm_cpu_notifier);
+
+	ret = perf_pmu_register(&intel_cqm_pmu, "intel_cqm", -1);
+
+	if (ret)
+		pr_err("Intel CQM perf registration failed: %d\n", ret);
+	else
+		pr_info("Intel CQM monitoring enabled\n");
+
+out:
+	cpu_notifier_register_done();
+
+	return ret;
+}
+device_initcall(intel_cqm_init);
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 9fc9b0d31442..ca5504c48f4f 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -123,6 +123,13 @@ struct hw_perf_event {
 			/* for tp_event->class */
 			struct list_head	tp_list;
 		};
+		struct { /* intel_cqm */
+			int			cqm_state;
+			int			cqm_rmid;
+			struct list_head	cqm_events_entry;
+			struct list_head	cqm_groups_entry;
+			struct list_head	cqm_group_entry;
+		};
 #ifdef CONFIG_HAVE_HW_BREAKPOINT
 		struct { /* breakpoint */
 			/*
-- 
cgit v1.2.3


From 35298e554c74b7849875e3676ba8eaf833c7b917 Mon Sep 17 00:00:00 2001
From: Matt Fleming
Date: Fri, 23 Jan 2015 18:45:45 +0000
Subject: perf/x86/intel: Implement LRU monitoring ID allocation for CQM

It's possible to run into issues with re-using unused monitoring IDs
because there may be stale cachelines associated with that ID from a
previous allocation. This can cause the LLC occupancy values to be
inaccurate.

To attempt to mitigate this problem we place the IDs on a least recently
used list, essentially a FIFO. The basic idea is that the longer the
time period between ID re-use the lower the probability that stale
cachelines exist in the cache.

Signed-off-by: Matt Fleming <matt.fleming@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Kanaka Juvva <kanaka.d.juvva@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Vikas Shivappa <vikas.shivappa@linux.intel.com>
Link: http://lkml.kernel.org/r/1422038748-21397-7-git-send-email-matt@codeblueprint.co.uk
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event_intel_cqm.c | 100 ++++++++++++++++++++++++++---
 1 file changed, 92 insertions(+), 8 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event_intel_cqm.c b/arch/x86/kernel/cpu/perf_event_intel_cqm.c
index 05b4cd26f426..b5d9d746dbc0 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_cqm.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_cqm.c
@@ -25,7 +25,7 @@ struct intel_cqm_state {
 static DEFINE_PER_CPU(struct intel_cqm_state, cqm_state);
 
 /*
- * Protects cache_cgroups.
+ * Protects cache_cgroups and cqm_rmid_lru.
  */
 static DEFINE_MUTEX(cache_mutex);
 
@@ -64,36 +64,120 @@ static u64 __rmid_read(unsigned long rmid)
 	return val;
 }
 
-static unsigned long *cqm_rmid_bitmap;
+struct cqm_rmid_entry {
+	u64 rmid;
+	struct list_head list;
+};
+
+/*
+ * A least recently used list of RMIDs.
+ *
+ * Oldest entry at the head, newest (most recently used) entry at the
+ * tail. This list is never traversed, it's only used to keep track of
+ * the lru order. That is, we only pick entries of the head or insert
+ * them on the tail.
+ *
+ * All entries on the list are 'free', and their RMIDs are not currently
+ * in use. To mark an RMID as in use, remove its entry from the lru
+ * list.
+ *
+ * This list is protected by cache_mutex.
+ */
+static LIST_HEAD(cqm_rmid_lru);
+
+/*
+ * We use a simple array of pointers so that we can lookup a struct
+ * cqm_rmid_entry in O(1). This alleviates the callers of __get_rmid()
+ * and __put_rmid() from having to worry about dealing with struct
+ * cqm_rmid_entry - they just deal with rmids, i.e. integers.
+ *
+ * Once this array is initialized it is read-only. No locks are required
+ * to access it.
+ *
+ * All entries for all RMIDs can be looked up in the this array at all
+ * times.
+ */
+static struct cqm_rmid_entry **cqm_rmid_ptrs;
+
+static inline struct cqm_rmid_entry *__rmid_entry(int rmid)
+{
+	struct cqm_rmid_entry *entry;
+
+	entry = cqm_rmid_ptrs[rmid];
+	WARN_ON(entry->rmid != rmid);
+
+	return entry;
+}
 
 /*
  * Returns < 0 on fail.
+ *
+ * We expect to be called with cache_mutex held.
  */
 static int __get_rmid(void)
 {
-	return bitmap_find_free_region(cqm_rmid_bitmap, cqm_max_rmid, 0);
+	struct cqm_rmid_entry *entry;
+
+	lockdep_assert_held(&cache_mutex);
+
+	if (list_empty(&cqm_rmid_lru))
+		return -EAGAIN;
+
+	entry = list_first_entry(&cqm_rmid_lru, struct cqm_rmid_entry, list);
+	list_del(&entry->list);
+
+	return entry->rmid;
 }
 
 static void __put_rmid(int rmid)
 {
-	bitmap_release_region(cqm_rmid_bitmap, rmid, 0);
+	struct cqm_rmid_entry *entry;
+
+	lockdep_assert_held(&cache_mutex);
+
+	entry = __rmid_entry(rmid);
+
+	list_add_tail(&entry->list, &cqm_rmid_lru);
 }
 
 static int intel_cqm_setup_rmid_cache(void)
 {
-	cqm_rmid_bitmap = kmalloc(sizeof(long) * BITS_TO_LONGS(cqm_max_rmid), GFP_KERNEL);
-	if (!cqm_rmid_bitmap)
+	struct cqm_rmid_entry *entry;
+	int r;
+
+	cqm_rmid_ptrs = kmalloc(sizeof(struct cqm_rmid_entry *) *
+				(cqm_max_rmid + 1), GFP_KERNEL);
+	if (!cqm_rmid_ptrs)
 		return -ENOMEM;
 
-	bitmap_zero(cqm_rmid_bitmap, cqm_max_rmid);
+	for (r = 0; r <= cqm_max_rmid; r++) {
+		struct cqm_rmid_entry *entry;
+
+		entry = kmalloc(sizeof(*entry), GFP_KERNEL);
+		if (!entry)
+			goto fail;
+
+		INIT_LIST_HEAD(&entry->list);
+		entry->rmid = r;
+		cqm_rmid_ptrs[r] = entry;
+
+		list_add_tail(&entry->list, &cqm_rmid_lru);
+	}
 
 	/*
 	 * RMID 0 is special and is always allocated. It's used for all
 	 * tasks that are not monitored.
 	 */
-	bitmap_allocate_region(cqm_rmid_bitmap, 0, 0);
+	entry = __rmid_entry(0);
+	list_del(&entry->list);
 
 	return 0;
+fail:
+	while (r--)
+		kfree(cqm_rmid_ptrs[r]);
+
+	kfree(cqm_rmid_ptrs);
+	return -ENOMEM;
 }
 
 /*
-- 
cgit v1.2.3


From bfe1fcd2688f557a6b6a88f59ea7619228728bd7 Mon Sep 17 00:00:00 2001
From: Matt Fleming
Date: Fri, 23 Jan 2015 18:45:46 +0000
Subject: perf/x86/intel: Support task events with Intel CQM

Add support for task events as well as system-wide events. This change
has a big impact on the way that we gather LLC occupancy values in
intel_cqm_event_read().

Currently, for system-wide (per-cpu) events we defer processing to
userspace which knows how to discard all but one cpu result per package.

Things aren't so simple for task events because we need to do the value
aggregation ourselves. To do this, we defer updating the LLC occupancy
value in event->count from intel_cqm_event_read() and do an SMP
cross-call to read values for all packages in intel_cqm_event_count().
We need to ensure that we only do this for one task event per cache
group, otherwise we'll report duplicate values.

If we're a system-wide event we want to fallback to the default
perf_event_count() implementation. Refactor this into a common function
so that we don't duplicate the code.

Also, introduce PERF_TYPE_INTEL_CQM, since we need a way to track an
event's task (if the event isn't per-cpu) inside of the Intel CQM PMU
driver.  This task information is only availble in the upper layers of
the perf infrastructure.

Other perf backends stash the target task in event->hw.*target so we
need to do something similar. The task is used to determine whether
events should share a cache group and an RMID.

Signed-off-by: Matt Fleming <matt.fleming@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Kanaka Juvva <kanaka.d.juvva@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Vikas Shivappa <vikas.shivappa@linux.intel.com>
Cc: linux-api@vger.kernel.org
Link: http://lkml.kernel.org/r/1422038748-21397-8-git-send-email-matt@codeblueprint.co.uk
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event_intel_cqm.c | 195 +++++++++++++++++++++++++----
 include/linux/perf_event.h                 |   1 +
 include/uapi/linux/perf_event.h            |   1 +
 kernel/events/core.c                       |   2 +
 4 files changed, 178 insertions(+), 21 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event_intel_cqm.c b/arch/x86/kernel/cpu/perf_event_intel_cqm.c
index b5d9d746dbc0..8003d87afd89 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_cqm.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_cqm.c
@@ -182,23 +182,124 @@ fail:
 
 /*
  * Determine if @a and @b measure the same set of tasks.
+ *
+ * If @a and @b measure the same set of tasks then we want to share a
+ * single RMID.
  */
 static bool __match_event(struct perf_event *a, struct perf_event *b)
 {
+	/* Per-cpu and task events don't mix */
 	if ((a->attach_state & PERF_ATTACH_TASK) !=
 	    (b->attach_state & PERF_ATTACH_TASK))
 		return false;
 
-	/* not task */
+#ifdef CONFIG_CGROUP_PERF
+	if (a->cgrp != b->cgrp)
+		return false;
+#endif
+
+	/* If not task event, we're machine wide */
+	if (!(b->attach_state & PERF_ATTACH_TASK))
+		return true;
+
+	/*
+	 * Events that target same task are placed into the same cache group.
+	 */
+	if (a->hw.cqm_target == b->hw.cqm_target)
+		return true;
+
+	/*
+	 * Are we an inherited event?
+	 */
+	if (b->parent == a)
+		return true;
+
+	return false;
+}
+
+#ifdef CONFIG_CGROUP_PERF
+static inline struct perf_cgroup *event_to_cgroup(struct perf_event *event)
+{
+	if (event->attach_state & PERF_ATTACH_TASK)
+		return perf_cgroup_from_task(event->hw.cqm_target);
 
-	return true; /* if not task, we're machine wide */
+	return event->cgrp;
 }
+#endif
 
 /*
  * Determine if @a's tasks intersect with @b's tasks
+ *
+ * There are combinations of events that we explicitly prohibit,
+ *
+ *		   PROHIBITS
+ *     system-wide    -> 	cgroup and task
+ *     cgroup 	      ->	system-wide
+ *     		      ->	task in cgroup
+ *     task 	      -> 	system-wide
+ *     		      ->	task in cgroup
+ *
+ * Call this function before allocating an RMID.
  */
 static bool __conflict_event(struct perf_event *a, struct perf_event *b)
 {
+#ifdef CONFIG_CGROUP_PERF
+	/*
+	 * We can have any number of cgroups but only one system-wide
+	 * event at a time.
+	 */
+	if (a->cgrp && b->cgrp) {
+		struct perf_cgroup *ac = a->cgrp;
+		struct perf_cgroup *bc = b->cgrp;
+
+		/*
+		 * This condition should have been caught in
+		 * __match_event() and we should be sharing an RMID.
+		 */
+		WARN_ON_ONCE(ac == bc);
+
+		if (cgroup_is_descendant(ac->css.cgroup, bc->css.cgroup) ||
+		    cgroup_is_descendant(bc->css.cgroup, ac->css.cgroup))
+			return true;
+
+		return false;
+	}
+
+	if (a->cgrp || b->cgrp) {
+		struct perf_cgroup *ac, *bc;
+
+		/*
+		 * cgroup and system-wide events are mutually exclusive
+		 */
+		if ((a->cgrp && !(b->attach_state & PERF_ATTACH_TASK)) ||
+		    (b->cgrp && !(a->attach_state & PERF_ATTACH_TASK)))
+			return true;
+
+		/*
+		 * Ensure neither event is part of the other's cgroup
+		 */
+		ac = event_to_cgroup(a);
+		bc = event_to_cgroup(b);
+		if (ac == bc)
+			return true;
+
+		/*
+		 * Must have cgroup and non-intersecting task events.
+		 */
+		if (!ac || !bc)
+			return false;
+
+		/*
+		 * We have cgroup and task events, and the task belongs
+		 * to a cgroup. Check for for overlap.
+		 */
+		if (cgroup_is_descendant(ac->css.cgroup, bc->css.cgroup) ||
+		    cgroup_is_descendant(bc->css.cgroup, ac->css.cgroup))
+			return true;
+
+		return false;
+	}
+#endif
 	/*
 	 * If one of them is not a task, same story as above with cgroups.
 	 */
@@ -245,9 +346,16 @@ static int intel_cqm_setup_event(struct perf_event *event,
 
 static void intel_cqm_event_read(struct perf_event *event)
 {
-	unsigned long rmid = event->hw.cqm_rmid;
+	unsigned long rmid;
 	u64 val;
 
+	/*
+	 * Task events are handled by intel_cqm_event_count().
+	 */
+	if (event->cpu == -1)
+		return;
+
+	rmid = event->hw.cqm_rmid;
 	val = __rmid_read(rmid);
 
 	/*
@@ -259,6 +367,63 @@ static void intel_cqm_event_read(struct perf_event *event)
 	local64_set(&event->count, val);
 }
 
+struct rmid_read {
+	unsigned int rmid;
+	atomic64_t value;
+};
+
+static void __intel_cqm_event_count(void *info)
+{
+	struct rmid_read *rr = info;
+	u64 val;
+
+	val = __rmid_read(rr->rmid);
+
+	if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
+		return;
+
+	atomic64_add(val, &rr->value);
+}
+
+static inline bool cqm_group_leader(struct perf_event *event)
+{
+	return !list_empty(&event->hw.cqm_groups_entry);
+}
+
+static u64 intel_cqm_event_count(struct perf_event *event)
+{
+	struct rmid_read rr = {
+		.rmid = event->hw.cqm_rmid,
+		.value = ATOMIC64_INIT(0),
+	};
+
+	/*
+	 * We only need to worry about task events. System-wide events
+	 * are handled like usual, i.e. entirely with
+	 * intel_cqm_event_read().
+	 */
+	if (event->cpu != -1)
+		return __perf_event_count(event);
+
+	/*
+	 * Only the group leader gets to report values. This stops us
+	 * reporting duplicate values to userspace, and gives us a clear
+	 * rule for which task gets to report the values.
+	 *
+	 * Note that it is impossible to attribute these values to
+	 * specific packages - we forfeit that ability when we create
+	 * task events.
+	 */
+	if (!cqm_group_leader(event))
+		return 0;
+
+	on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_count, &rr, 1);
+
+	local64_set(&event->count, atomic64_read(&rr.value));
+
+	return __perf_event_count(event);
+}
+
 static void intel_cqm_event_start(struct perf_event *event, int mode)
 {
 	struct intel_cqm_state *state = this_cpu_ptr(&cqm_state);
@@ -344,7 +509,7 @@ static void intel_cqm_event_destroy(struct perf_event *event)
 	/*
 	 * And we're the group leader..
 	 */
-	if (!list_empty(&event->hw.cqm_groups_entry)) {
+	if (cqm_group_leader(event)) {
 		/*
 		 * If there was a group_other, make that leader, otherwise
 		 * destroy the group and return the RMID.
@@ -365,17 +530,6 @@ static void intel_cqm_event_destroy(struct perf_event *event)
 
 static struct pmu intel_cqm_pmu;
 
-/*
- * XXX there's a bit of a problem in that we cannot simply do the one
- * event per node as one would want, since that one event would one get
- * scheduled on the one cpu. But we want to 'schedule' the RMID on all
- * CPUs.
- *
- * This means we want events for each CPU, however, that generates a lot
- * of duplicate values out to userspace -- this is not to be helped
- * unless we want to change the core code in some way. Fore more info,
- * see intel_cqm_event_read().
- */
 static int intel_cqm_event_init(struct perf_event *event)
 {
 	struct perf_event *group = NULL;
@@ -387,9 +541,6 @@ static int intel_cqm_event_init(struct perf_event *event)
 	if (event->attr.config & ~QOS_EVENT_MASK)
 		return -EINVAL;
 
-	if (event->cpu == -1)
-		return -EINVAL;
-
 	/* unsupported modes and filters */
 	if (event->attr.exclude_user   ||
 	    event->attr.exclude_kernel ||
@@ -407,7 +558,8 @@ static int intel_cqm_event_init(struct perf_event *event)
 
 	mutex_lock(&cache_mutex);
 
-	err = intel_cqm_setup_event(event, &group); /* will also set rmid */
+	/* Will also set rmid */
+	err = intel_cqm_setup_event(event, &group);
 	if (err)
 		goto out;
 
@@ -470,6 +622,7 @@ static struct pmu intel_cqm_pmu = {
 	.start		= intel_cqm_event_start,
 	.stop		= intel_cqm_event_stop,
 	.read		= intel_cqm_event_read,
+	.count		= intel_cqm_event_count,
 };
 
 static inline void cqm_pick_event_reader(int cpu)
@@ -599,8 +752,8 @@ static int __init intel_cqm_init(void)
 
 	__perf_cpu_notifier(intel_cqm_cpu_notifier);
 
-	ret = perf_pmu_register(&intel_cqm_pmu, "intel_cqm", -1);
-
+	ret = perf_pmu_register(&intel_cqm_pmu, "intel_cqm",
+				PERF_TYPE_INTEL_CQM);
 	if (ret)
 		pr_err("Intel CQM perf registration failed: %d\n", ret);
 	else
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index ca5504c48f4f..dac4c2831d82 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -129,6 +129,7 @@ struct hw_perf_event {
 			struct list_head	cqm_events_entry;
 			struct list_head	cqm_groups_entry;
 			struct list_head	cqm_group_entry;
+			struct task_struct	*cqm_target;
 		};
 #ifdef CONFIG_HAVE_HW_BREAKPOINT
 		struct { /* breakpoint */
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 1e3cd07cf76e..3c8b45de57ec 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -32,6 +32,7 @@ enum perf_type_id {
 	PERF_TYPE_HW_CACHE			= 3,
 	PERF_TYPE_RAW				= 4,
 	PERF_TYPE_BREAKPOINT			= 5,
+	PERF_TYPE_INTEL_CQM			= 6,
 
 	PERF_TYPE_MAX,				/* non-ABI */
 };
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 1fc3bae5904a..71109a045450 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -7181,6 +7181,8 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
 		else if (attr->type == PERF_TYPE_BREAKPOINT)
 			event->hw.bp_target = task;
 #endif
+		else if (attr->type == PERF_TYPE_INTEL_CQM)
+			event->hw.cqm_target = task;
 	}
 
 	if (!overflow_handler && parent_event) {
-- 
cgit v1.2.3


From bff671dba7981195a644a5dc210d65de8ae2d251 Mon Sep 17 00:00:00 2001
From: Matt Fleming
Date: Fri, 23 Jan 2015 18:45:47 +0000
Subject: perf/x86/intel: Perform rotation on Intel CQM RMIDs

There are many use cases where people will want to monitor more tasks
than there exist RMIDs in the hardware, meaning that we have to perform
some kind of multiplexing.

We do this by "rotating" the RMIDs in a workqueue, and assigning an RMID
to a waiting event when the RMID becomes unused.

This scheme reserves one RMID at all times for rotation. When we need to
schedule a new event we give it the reserved RMID, pick a victim event
from the front of the global CQM list and wait for the victim's RMID to
drop to zero occupancy, before it becomes the new reserved RMID.

We put the victim's RMID onto the limbo list, where it resides for a
"minimum queue time", which is intended to save ourselves an expensive
smp IPI when the RMID is unlikely to have a occupancy value below
__intel_cqm_threshold.

If we fail to recycle an RMID, even after waiting the minimum queue time
then we need to increment __intel_cqm_threshold. There is an upper bound
on this threshold, __intel_cqm_max_threshold, which is programmable from
userland as /sys/devices/intel_cqm/max_recycling_threshold.

The comments above __intel_cqm_rmid_rotate() have more details.

Signed-off-by: Matt Fleming <matt.fleming@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Kanaka Juvva <kanaka.d.juvva@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Vikas Shivappa <vikas.shivappa@linux.intel.com>
Link: http://lkml.kernel.org/r/1422038748-21397-9-git-send-email-matt@codeblueprint.co.uk
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event_intel_cqm.c | 671 ++++++++++++++++++++++++++---
 1 file changed, 623 insertions(+), 48 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event_intel_cqm.c b/arch/x86/kernel/cpu/perf_event_intel_cqm.c
index 8003d87afd89..e31f5086f2b5 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_cqm.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_cqm.c
@@ -25,9 +25,13 @@ struct intel_cqm_state {
 static DEFINE_PER_CPU(struct intel_cqm_state, cqm_state);
 
 /*
- * Protects cache_cgroups and cqm_rmid_lru.
+ * Protects cache_cgroups and cqm_rmid_free_lru and cqm_rmid_limbo_lru.
+ * Also protects event->hw.cqm_rmid
+ *
+ * Hold either for stability, both for modification of ->hw.cqm_rmid.
  */
 static DEFINE_MUTEX(cache_mutex);
+static DEFINE_RAW_SPINLOCK(cache_lock);
 
 /*
  * Groups of events that have the same target(s), one RMID per group.
@@ -46,7 +50,34 @@ static cpumask_t cqm_cpumask;
 
 #define QOS_EVENT_MASK	QOS_L3_OCCUP_EVENT_ID
 
-static u64 __rmid_read(unsigned long rmid)
+/*
+ * This is central to the rotation algorithm in __intel_cqm_rmid_rotate().
+ *
+ * This rmid is always free and is guaranteed to have an associated
+ * near-zero occupancy value, i.e. no cachelines are tagged with this
+ * RMID, once __intel_cqm_rmid_rotate() returns.
+ */
+static unsigned int intel_cqm_rotation_rmid;
+
+#define INVALID_RMID		(-1)
+
+/*
+ * Is @rmid valid for programming the hardware?
+ *
+ * rmid 0 is reserved by the hardware for all non-monitored tasks, which
+ * means that we should never come across an rmid with that value.
+ * Likewise, an rmid value of -1 is used to indicate "no rmid currently
+ * assigned" and is used as part of the rotation code.
+ */
+static inline bool __rmid_valid(unsigned int rmid)
+{
+	if (!rmid || rmid == INVALID_RMID)
+		return false;
+
+	return true;
+}
+
+static u64 __rmid_read(unsigned int rmid)
 {
 	u64 val;
 
@@ -64,13 +95,21 @@ static u64 __rmid_read(unsigned long rmid)
 	return val;
 }
 
+enum rmid_recycle_state {
+	RMID_YOUNG = 0,
+	RMID_AVAILABLE,
+	RMID_DIRTY,
+};
+
 struct cqm_rmid_entry {
-	u64 rmid;
+	unsigned int rmid;
+	enum rmid_recycle_state state;
 	struct list_head list;
+	unsigned long queue_time;
 };
 
 /*
- * A least recently used list of RMIDs.
+ * cqm_rmid_free_lru - A least recently used list of RMIDs.
  *
  * Oldest entry at the head, newest (most recently used) entry at the
  * tail. This list is never traversed, it's only used to keep track of
@@ -81,9 +120,18 @@ struct cqm_rmid_entry {
  * in use. To mark an RMID as in use, remove its entry from the lru
  * list.
  *
- * This list is protected by cache_mutex.
+ *
+ * cqm_rmid_limbo_lru - list of currently unused but (potentially) dirty RMIDs.
+ *
+ * This list is contains RMIDs that no one is currently using but that
+ * may have a non-zero occupancy value associated with them. The
+ * rotation worker moves RMIDs from the limbo list to the free list once
+ * the occupancy value drops below __intel_cqm_threshold.
+ *
+ * Both lists are protected by cache_mutex.
  */
-static LIST_HEAD(cqm_rmid_lru);
+static LIST_HEAD(cqm_rmid_free_lru);
+static LIST_HEAD(cqm_rmid_limbo_lru);
 
 /*
  * We use a simple array of pointers so that we can lookup a struct
@@ -120,37 +168,43 @@ static int __get_rmid(void)
 
 	lockdep_assert_held(&cache_mutex);
 
-	if (list_empty(&cqm_rmid_lru))
-		return -EAGAIN;
+	if (list_empty(&cqm_rmid_free_lru))
+		return INVALID_RMID;
 
-	entry = list_first_entry(&cqm_rmid_lru, struct cqm_rmid_entry, list);
+	entry = list_first_entry(&cqm_rmid_free_lru, struct cqm_rmid_entry, list);
 	list_del(&entry->list);
 
 	return entry->rmid;
 }
 
-static void __put_rmid(int rmid)
+static void __put_rmid(unsigned int rmid)
 {
 	struct cqm_rmid_entry *entry;
 
 	lockdep_assert_held(&cache_mutex);
 
+	WARN_ON(!__rmid_valid(rmid));
 	entry = __rmid_entry(rmid);
 
-	list_add_tail(&entry->list, &cqm_rmid_lru);
+	entry->queue_time = jiffies;
+	entry->state = RMID_YOUNG;
+
+	list_add_tail(&entry->list, &cqm_rmid_limbo_lru);
 }
 
 static int intel_cqm_setup_rmid_cache(void)
 {
 	struct cqm_rmid_entry *entry;
-	int r;
+	unsigned int nr_rmids;
+	int r = 0;
 
+	nr_rmids = cqm_max_rmid + 1;
 	cqm_rmid_ptrs = kmalloc(sizeof(struct cqm_rmid_entry *) *
-				(cqm_max_rmid + 1), GFP_KERNEL);
+				nr_rmids, GFP_KERNEL);
 	if (!cqm_rmid_ptrs)
 		return -ENOMEM;
 
-	for (r = 0; r <= cqm_max_rmid; r++) {
+	for (; r <= cqm_max_rmid; r++) {
 		struct cqm_rmid_entry *entry;
 
 		entry = kmalloc(sizeof(*entry), GFP_KERNEL);
@@ -161,7 +215,7 @@ static int intel_cqm_setup_rmid_cache(void)
 		entry->rmid = r;
 		cqm_rmid_ptrs[r] = entry;
 
-		list_add_tail(&entry->list, &cqm_rmid_lru);
+		list_add_tail(&entry->list, &cqm_rmid_free_lru);
 	}
 
 	/*
@@ -171,6 +225,10 @@ static int intel_cqm_setup_rmid_cache(void)
 	entry = __rmid_entry(0);
 	list_del(&entry->list);
 
+	mutex_lock(&cache_mutex);
+	intel_cqm_rotation_rmid = __get_rmid();
+	mutex_unlock(&cache_mutex);
+
 	return 0;
 fail:
 	while (r--)
@@ -313,6 +371,424 @@ static bool __conflict_event(struct perf_event *a, struct perf_event *b)
 	return false;
 }
 
+struct rmid_read {
+	unsigned int rmid;
+	atomic64_t value;
+};
+
+static void __intel_cqm_event_count(void *info);
+
+/*
+ * Exchange the RMID of a group of events.
+ */
+static unsigned int
+intel_cqm_xchg_rmid(struct perf_event *group, unsigned int rmid)
+{
+	struct perf_event *event;
+	unsigned int old_rmid = group->hw.cqm_rmid;
+	struct list_head *head = &group->hw.cqm_group_entry;
+
+	lockdep_assert_held(&cache_mutex);
+
+	/*
+	 * If our RMID is being deallocated, perform a read now.
+	 */
+	if (__rmid_valid(old_rmid) && !__rmid_valid(rmid)) {
+		struct rmid_read rr = {
+			.value = ATOMIC64_INIT(0),
+			.rmid = old_rmid,
+		};
+
+		on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_count,
+				 &rr, 1);
+		local64_set(&group->count, atomic64_read(&rr.value));
+	}
+
+	raw_spin_lock_irq(&cache_lock);
+
+	group->hw.cqm_rmid = rmid;
+	list_for_each_entry(event, head, hw.cqm_group_entry)
+		event->hw.cqm_rmid = rmid;
+
+	raw_spin_unlock_irq(&cache_lock);
+
+	return old_rmid;
+}
+
+/*
+ * If we fail to assign a new RMID for intel_cqm_rotation_rmid because
+ * cachelines are still tagged with RMIDs in limbo, we progressively
+ * increment the threshold until we find an RMID in limbo with <=
+ * __intel_cqm_threshold lines tagged. This is designed to mitigate the
+ * problem where cachelines tagged with an RMID are not steadily being
+ * evicted.
+ *
+ * On successful rotations we decrease the threshold back towards zero.
+ *
+ * __intel_cqm_max_threshold provides an upper bound on the threshold,
+ * and is measured in bytes because it's exposed to userland.
+ */
+static unsigned int __intel_cqm_threshold;
+static unsigned int __intel_cqm_max_threshold;
+
+/*
+ * Test whether an RMID has a zero occupancy value on this cpu.
+ */
+static void intel_cqm_stable(void *arg)
+{
+	struct cqm_rmid_entry *entry;
+
+	list_for_each_entry(entry, &cqm_rmid_limbo_lru, list) {
+		if (entry->state != RMID_AVAILABLE)
+			break;
+
+		if (__rmid_read(entry->rmid) > __intel_cqm_threshold)
+			entry->state = RMID_DIRTY;
+	}
+}
+
+/*
+ * If we have group events waiting for an RMID that don't conflict with
+ * events already running, assign @rmid.
+ */
+static bool intel_cqm_sched_in_event(unsigned int rmid)
+{
+	struct perf_event *leader, *event;
+
+	lockdep_assert_held(&cache_mutex);
+
+	leader = list_first_entry(&cache_groups, struct perf_event,
+				  hw.cqm_groups_entry);
+	event = leader;
+
+	list_for_each_entry_continue(event, &cache_groups,
+				     hw.cqm_groups_entry) {
+		if (__rmid_valid(event->hw.cqm_rmid))
+			continue;
+
+		if (__conflict_event(event, leader))
+			continue;
+
+		intel_cqm_xchg_rmid(event, rmid);
+		return true;
+	}
+
+	return false;
+}
+
+/*
+ * Initially use this constant for both the limbo queue time and the
+ * rotation timer interval, pmu::hrtimer_interval_ms.
+ *
+ * They don't need to be the same, but the two are related since if you
+ * rotate faster than you recycle RMIDs, you may run out of available
+ * RMIDs.
+ */
+#define RMID_DEFAULT_QUEUE_TIME 250	/* ms */
+
+static unsigned int __rmid_queue_time_ms = RMID_DEFAULT_QUEUE_TIME;
+
+/*
+ * intel_cqm_rmid_stabilize - move RMIDs from limbo to free list
+ * @nr_available: number of freeable RMIDs on the limbo list
+ *
+ * Quiescent state; wait for all 'freed' RMIDs to become unused, i.e. no
+ * cachelines are tagged with those RMIDs. After this we can reuse them
+ * and know that the current set of active RMIDs is stable.
+ *
+ * Return %true or %false depending on whether stabilization needs to be
+ * reattempted.
+ *
+ * If we return %true then @nr_available is updated to indicate the
+ * number of RMIDs on the limbo list that have been queued for the
+ * minimum queue time (RMID_AVAILABLE), but whose data occupancy values
+ * are above __intel_cqm_threshold.
+ */
+static bool intel_cqm_rmid_stabilize(unsigned int *available)
+{
+	struct cqm_rmid_entry *entry, *tmp;
+	struct perf_event *event;
+
+	lockdep_assert_held(&cache_mutex);
+
+	*available = 0;
+	list_for_each_entry(entry, &cqm_rmid_limbo_lru, list) {
+		unsigned long min_queue_time;
+		unsigned long now = jiffies;
+
+		/*
+		 * We hold RMIDs placed into limbo for a minimum queue
+		 * time. Before the minimum queue time has elapsed we do
+		 * not recycle RMIDs.
+		 *
+		 * The reasoning is that until a sufficient time has
+		 * passed since we stopped using an RMID, any RMID
+		 * placed onto the limbo list will likely still have
+		 * data tagged in the cache, which means we'll probably
+		 * fail to recycle it anyway.
+		 *
+		 * We can save ourselves an expensive IPI by skipping
+		 * any RMIDs that have not been queued for the minimum
+		 * time.
+		 */
+		min_queue_time = entry->queue_time +
+			msecs_to_jiffies(__rmid_queue_time_ms);
+
+		if (time_after(min_queue_time, now))
+			break;
+
+		entry->state = RMID_AVAILABLE;
+		(*available)++;
+	}
+
+	/*
+	 * Fast return if none of the RMIDs on the limbo list have been
+	 * sitting on the queue for the minimum queue time.
+	 */
+	if (!*available)
+		return false;
+
+	/*
+	 * Test whether an RMID is free for each package.
+	 */
+	on_each_cpu_mask(&cqm_cpumask, intel_cqm_stable, NULL, true);
+
+	list_for_each_entry_safe(entry, tmp, &cqm_rmid_limbo_lru, list) {
+		/*
+		 * Exhausted all RMIDs that have waited min queue time.
+		 */
+		if (entry->state == RMID_YOUNG)
+			break;
+
+		if (entry->state == RMID_DIRTY)
+			continue;
+
+		list_del(&entry->list);	/* remove from limbo */
+
+		/*
+		 * The rotation RMID gets priority if it's
+		 * currently invalid. In which case, skip adding
+		 * the RMID to the the free lru.
+		 */
+		if (!__rmid_valid(intel_cqm_rotation_rmid)) {
+			intel_cqm_rotation_rmid = entry->rmid;
+			continue;
+		}
+
+		/*
+		 * If we have groups waiting for RMIDs, hand
+		 * them one now.
+		 */
+		list_for_each_entry(event, &cache_groups,
+				    hw.cqm_groups_entry) {
+			if (__rmid_valid(event->hw.cqm_rmid))
+				continue;
+
+			intel_cqm_xchg_rmid(event, entry->rmid);
+			entry = NULL;
+			break;
+		}
+
+		if (!entry)
+			continue;
+
+		/*
+		 * Otherwise place it onto the free list.
+		 */
+		list_add_tail(&entry->list, &cqm_rmid_free_lru);
+	}
+
+
+	return __rmid_valid(intel_cqm_rotation_rmid);
+}
+
+/*
+ * Pick a victim group and move it to the tail of the group list.
+ */
+static struct perf_event *
+__intel_cqm_pick_and_rotate(void)
+{
+	struct perf_event *rotor;
+
+	lockdep_assert_held(&cache_mutex);
+	lockdep_assert_held(&cache_lock);
+
+	rotor = list_first_entry(&cache_groups, struct perf_event,
+				 hw.cqm_groups_entry);
+	list_rotate_left(&cache_groups);
+
+	return rotor;
+}
+
+/*
+ * Attempt to rotate the groups and assign new RMIDs.
+ *
+ * Rotating RMIDs is complicated because the hardware doesn't give us
+ * any clues.
+ *
+ * There's problems with the hardware interface; when you change the
+ * task:RMID map cachelines retain their 'old' tags, giving a skewed
+ * picture. In order to work around this, we must always keep one free
+ * RMID - intel_cqm_rotation_rmid.
+ *
+ * Rotation works by taking away an RMID from a group (the old RMID),
+ * and assigning the free RMID to another group (the new RMID). We must
+ * then wait for the old RMID to not be used (no cachelines tagged).
+ * This ensure that all cachelines are tagged with 'active' RMIDs. At
+ * this point we can start reading values for the new RMID and treat the
+ * old RMID as the free RMID for the next rotation.
+ *
+ * Return %true or %false depending on whether we did any rotating.
+ */
+static bool __intel_cqm_rmid_rotate(void)
+{
+	struct perf_event *group, *rotor, *start = NULL;
+	unsigned int threshold_limit;
+	unsigned int nr_needed = 0;
+	unsigned int nr_available;
+	unsigned int rmid;
+	bool rotated = false;
+
+	mutex_lock(&cache_mutex);
+
+again:
+	/*
+	 * Fast path through this function if there are no groups and no
+	 * RMIDs that need cleaning.
+	 */
+	if (list_empty(&cache_groups) && list_empty(&cqm_rmid_limbo_lru))
+		goto out;
+
+	list_for_each_entry(group, &cache_groups, hw.cqm_groups_entry) {
+		if (!__rmid_valid(group->hw.cqm_rmid)) {
+			if (!start)
+				start = group;
+			nr_needed++;
+		}
+	}
+
+	/*
+	 * We have some event groups, but they all have RMIDs assigned
+	 * and no RMIDs need cleaning.
+	 */
+	if (!nr_needed && list_empty(&cqm_rmid_limbo_lru))
+		goto out;
+
+	if (!nr_needed)
+		goto stabilize;
+
+	/*
+	 * We have more event groups without RMIDs than available RMIDs.
+	 *
+	 * We force deallocate the rmid of the group at the head of
+	 * cache_groups. The first event group without an RMID then gets
+	 * assigned intel_cqm_rotation_rmid. This ensures we always make
+	 * forward progress.
+	 *
+	 * Rotate the cache_groups list so the previous head is now the
+	 * tail.
+	 */
+	rotor = __intel_cqm_pick_and_rotate();
+	rmid = intel_cqm_xchg_rmid(rotor, INVALID_RMID);
+
+	/*
+	 * The group at the front of the list should always have a valid
+	 * RMID. If it doesn't then no groups have RMIDs assigned.
+	 */
+	if (!__rmid_valid(rmid))
+		goto stabilize;
+
+	/*
+	 * If the rotation is going to succeed, reduce the threshold so
+	 * that we don't needlessly reuse dirty RMIDs.
+	 */
+	if (__rmid_valid(intel_cqm_rotation_rmid)) {
+		intel_cqm_xchg_rmid(start, intel_cqm_rotation_rmid);
+		intel_cqm_rotation_rmid = INVALID_RMID;
+
+		if (__intel_cqm_threshold)
+			__intel_cqm_threshold--;
+	}
+
+	__put_rmid(rmid);
+
+	rotated = true;
+
+stabilize:
+	/*
+	 * We now need to stablize the RMID we freed above (if any) to
+	 * ensure that the next time we rotate we have an RMID with zero
+	 * occupancy value.
+	 *
+	 * Alternatively, if we didn't need to perform any rotation,
+	 * we'll have a bunch of RMIDs in limbo that need stabilizing.
+	 */
+	threshold_limit = __intel_cqm_max_threshold / cqm_l3_scale;
+
+	while (intel_cqm_rmid_stabilize(&nr_available) &&
+	       __intel_cqm_threshold < threshold_limit) {
+		unsigned int steal_limit;
+
+		/*
+		 * Don't spin if nobody is actively waiting for an RMID,
+		 * the rotation worker will be kicked as soon as an
+		 * event needs an RMID anyway.
+		 */
+		if (!nr_needed)
+			break;
+
+		/* Allow max 25% of RMIDs to be in limbo. */
+		steal_limit = (cqm_max_rmid + 1) / 4;
+
+		/*
+		 * We failed to stabilize any RMIDs so our rotation
+		 * logic is now stuck. In order to make forward progress
+		 * we have a few options:
+		 *
+		 *   1. rotate ("steal") another RMID
+		 *   2. increase the threshold
+		 *   3. do nothing
+		 *
+		 * We do both of 1. and 2. until we hit the steal limit.
+		 *
+		 * The steal limit prevents all RMIDs ending up on the
+		 * limbo list. This can happen if every RMID has a
+		 * non-zero occupancy above threshold_limit, and the
+		 * occupancy values aren't dropping fast enough.
+		 *
+		 * Note that there is prioritisation at work here - we'd
+		 * rather increase the number of RMIDs on the limbo list
+		 * than increase the threshold, because increasing the
+		 * threshold skews the event data (because we reuse
+		 * dirty RMIDs) - threshold bumps are a last resort.
+		 */
+		if (nr_available < steal_limit)
+			goto again;
+
+		__intel_cqm_threshold++;
+	}
+
+out:
+	mutex_unlock(&cache_mutex);
+	return rotated;
+}
+
+static void intel_cqm_rmid_rotate(struct work_struct *work);
+
+static DECLARE_DELAYED_WORK(intel_cqm_rmid_work, intel_cqm_rmid_rotate);
+
+static struct pmu intel_cqm_pmu;
+
+static void intel_cqm_rmid_rotate(struct work_struct *work)
+{
+	unsigned long delay;
+
+	__intel_cqm_rmid_rotate();
+
+	delay = msecs_to_jiffies(intel_cqm_pmu.hrtimer_interval_ms);
+	schedule_delayed_work(&intel_cqm_rmid_work, delay);
+}
+
 /*
  * Find a group and setup RMID.
  *
@@ -322,7 +798,6 @@ static int intel_cqm_setup_event(struct perf_event *event,
 				 struct perf_event **group)
 {
 	struct perf_event *iter;
-	int rmid;
 
 	list_for_each_entry(iter, &cache_groups, hw.cqm_groups_entry) {
 		if (__match_event(iter, event)) {
@@ -336,17 +811,14 @@ static int intel_cqm_setup_event(struct perf_event *event,
 			return -EBUSY;
 	}
 
-	rmid = __get_rmid();
-	if (rmid < 0)
-		return rmid;
-
-	event->hw.cqm_rmid = rmid;
+	event->hw.cqm_rmid = __get_rmid();
 	return 0;
 }
 
 static void intel_cqm_event_read(struct perf_event *event)
 {
-	unsigned long rmid;
+	unsigned long flags;
+	unsigned int rmid;
 	u64 val;
 
 	/*
@@ -355,23 +827,25 @@ static void intel_cqm_event_read(struct perf_event *event)
 	if (event->cpu == -1)
 		return;
 
+	raw_spin_lock_irqsave(&cache_lock, flags);
 	rmid = event->hw.cqm_rmid;
+
+	if (!__rmid_valid(rmid))
+		goto out;
+
 	val = __rmid_read(rmid);
 
 	/*
 	 * Ignore this reading on error states and do not update the value.
 	 */
 	if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
-		return;
+		goto out;
 
 	local64_set(&event->count, val);
+out:
+	raw_spin_unlock_irqrestore(&cache_lock, flags);
 }
 
-struct rmid_read {
-	unsigned int rmid;
-	atomic64_t value;
-};
-
 static void __intel_cqm_event_count(void *info)
 {
 	struct rmid_read *rr = info;
@@ -392,8 +866,8 @@ static inline bool cqm_group_leader(struct perf_event *event)
 
 static u64 intel_cqm_event_count(struct perf_event *event)
 {
+	unsigned long flags;
 	struct rmid_read rr = {
-		.rmid = event->hw.cqm_rmid,
 		.value = ATOMIC64_INIT(0),
 	};
 
@@ -417,17 +891,36 @@ static u64 intel_cqm_event_count(struct perf_event *event)
 	if (!cqm_group_leader(event))
 		return 0;
 
-	on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_count, &rr, 1);
+	/*
+	 * Notice that we don't perform the reading of an RMID
+	 * atomically, because we can't hold a spin lock across the
+	 * IPIs.
+	 *
+	 * Speculatively perform the read, since @event might be
+	 * assigned a different (possibly invalid) RMID while we're
+	 * busying performing the IPI calls. It's therefore necessary to
+	 * check @event's RMID afterwards, and if it has changed,
+	 * discard the result of the read.
+	 */
+	rr.rmid = ACCESS_ONCE(event->hw.cqm_rmid);
 
-	local64_set(&event->count, atomic64_read(&rr.value));
+	if (!__rmid_valid(rr.rmid))
+		goto out;
+
+	on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_count, &rr, 1);
 
+	raw_spin_lock_irqsave(&cache_lock, flags);
+	if (event->hw.cqm_rmid == rr.rmid)
+		local64_set(&event->count, atomic64_read(&rr.value));
+	raw_spin_unlock_irqrestore(&cache_lock, flags);
+out:
 	return __perf_event_count(event);
 }
 
 static void intel_cqm_event_start(struct perf_event *event, int mode)
 {
 	struct intel_cqm_state *state = this_cpu_ptr(&cqm_state);
-	unsigned long rmid = event->hw.cqm_rmid;
+	unsigned int rmid = event->hw.cqm_rmid;
 	unsigned long flags;
 
 	if (!(event->hw.cqm_state & PERF_HES_STOPPED))
@@ -473,15 +966,19 @@ static void intel_cqm_event_stop(struct perf_event *event, int mode)
 
 static int intel_cqm_event_add(struct perf_event *event, int mode)
 {
-	int rmid;
+	unsigned long flags;
+	unsigned int rmid;
+
+	raw_spin_lock_irqsave(&cache_lock, flags);
 
 	event->hw.cqm_state = PERF_HES_STOPPED;
 	rmid = event->hw.cqm_rmid;
-	WARN_ON_ONCE(!rmid);
 
-	if (mode & PERF_EF_START)
+	if (__rmid_valid(rmid) && (mode & PERF_EF_START))
 		intel_cqm_event_start(event, mode);
 
+	raw_spin_unlock_irqrestore(&cache_lock, flags);
+
 	return 0;
 }
 
@@ -518,9 +1015,10 @@ static void intel_cqm_event_destroy(struct perf_event *event)
 			list_replace(&event->hw.cqm_groups_entry,
 				     &group_other->hw.cqm_groups_entry);
 		} else {
-			int rmid = event->hw.cqm_rmid;
+			unsigned int rmid = event->hw.cqm_rmid;
 
-			__put_rmid(rmid);
+			if (__rmid_valid(rmid))
+				__put_rmid(rmid);
 			list_del(&event->hw.cqm_groups_entry);
 		}
 	}
@@ -528,11 +1026,10 @@ static void intel_cqm_event_destroy(struct perf_event *event)
 	mutex_unlock(&cache_mutex);
 }
 
-static struct pmu intel_cqm_pmu;
-
 static int intel_cqm_event_init(struct perf_event *event)
 {
 	struct perf_event *group = NULL;
+	bool rotate = false;
 	int err;
 
 	if (event->attr.type != intel_cqm_pmu.type)
@@ -569,10 +1066,24 @@ static int intel_cqm_event_init(struct perf_event *event)
 	} else {
 		list_add_tail(&event->hw.cqm_groups_entry,
 			      &cache_groups);
+
+		/*
+		 * All RMIDs are either in use or have recently been
+		 * used. Kick the rotation worker to clean/free some.
+		 *
+		 * We only do this for the group leader, rather than for
+		 * every event in a group to save on needless work.
+		 */
+		if (!__rmid_valid(event->hw.cqm_rmid))
+			rotate = true;
 	}
 
 out:
 	mutex_unlock(&cache_mutex);
+
+	if (rotate)
+		schedule_delayed_work(&intel_cqm_rmid_work, 0);
+
 	return err;
 }
 
@@ -607,22 +1118,76 @@ static struct attribute_group intel_cqm_format_group = {
 	.attrs = intel_cqm_formats_attr,
 };
 
+static ssize_t
+max_recycle_threshold_show(struct device *dev, struct device_attribute *attr,
+			   char *page)
+{
+	ssize_t rv;
+
+	mutex_lock(&cache_mutex);
+	rv = snprintf(page, PAGE_SIZE-1, "%u\n", __intel_cqm_max_threshold);
+	mutex_unlock(&cache_mutex);
+
+	return rv;
+}
+
+static ssize_t
+max_recycle_threshold_store(struct device *dev,
+			    struct device_attribute *attr,
+			    const char *buf, size_t count)
+{
+	unsigned int bytes, cachelines;
+	int ret;
+
+	ret = kstrtouint(buf, 0, &bytes);
+	if (ret)
+		return ret;
+
+	mutex_lock(&cache_mutex);
+
+	__intel_cqm_max_threshold = bytes;
+	cachelines = bytes / cqm_l3_scale;
+
+	/*
+	 * The new maximum takes effect immediately.
+	 */
+	if (__intel_cqm_threshold > cachelines)
+		__intel_cqm_threshold = cachelines;
+
+	mutex_unlock(&cache_mutex);
+
+	return count;
+}
+
+static DEVICE_ATTR_RW(max_recycle_threshold);
+
+static struct attribute *intel_cqm_attrs[] = {
+	&dev_attr_max_recycle_threshold.attr,
+	NULL,
+};
+
+static const struct attribute_group intel_cqm_group = {
+	.attrs = intel_cqm_attrs,
+};
+
 static const struct attribute_group *intel_cqm_attr_groups[] = {
 	&intel_cqm_events_group,
 	&intel_cqm_format_group,
+	&intel_cqm_group,
 	NULL,
 };
 
 static struct pmu intel_cqm_pmu = {
-	.attr_groups	= intel_cqm_attr_groups,
-	.task_ctx_nr	= perf_sw_context,
-	.event_init	= intel_cqm_event_init,
-	.add		= intel_cqm_event_add,
-	.del		= intel_cqm_event_del,
-	.start		= intel_cqm_event_start,
-	.stop		= intel_cqm_event_stop,
-	.read		= intel_cqm_event_read,
-	.count		= intel_cqm_event_count,
+	.hrtimer_interval_ms = RMID_DEFAULT_QUEUE_TIME,
+	.attr_groups	     = intel_cqm_attr_groups,
+	.task_ctx_nr	     = perf_sw_context,
+	.event_init	     = intel_cqm_event_init,
+	.add		     = intel_cqm_event_add,
+	.del		     = intel_cqm_event_del,
+	.start		     = intel_cqm_event_start,
+	.stop		     = intel_cqm_event_stop,
+	.read		     = intel_cqm_event_read,
+	.count		     = intel_cqm_event_count,
 };
 
 static inline void cqm_pick_event_reader(int cpu)
@@ -732,6 +1297,16 @@ static int __init intel_cqm_init(void)
 		}
 	}
 
+	/*
+	 * A reasonable upper limit on the max threshold is the number
+	 * of lines tagged per RMID if all RMIDs have the same number of
+	 * lines tagged in the LLC.
+	 *
+	 * For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC.
+	 */
+	__intel_cqm_max_threshold =
+		boot_cpu_data.x86_cache_size * 1024 / (cqm_max_rmid + 1);
+
 	snprintf(scale, sizeof(scale), "%u", cqm_l3_scale);
 	str = kstrdup(scale, GFP_KERNEL);
 	if (!str) {
-- 
cgit v1.2.3


From 59bf7fd45c90a8fde22a7717b5413e4ed9666c32 Mon Sep 17 00:00:00 2001
From: Matt Fleming
Date: Fri, 23 Jan 2015 18:45:48 +0000
Subject: perf/x86/intel: Enable conflicting event scheduling for CQM

We can leverage the workqueue that we use for RMID rotation to support
scheduling of conflicting monitoring events. Allowing events that
monitor conflicting things is done at various other places in the perf
subsystem, so there's precedent there.

An example of two conflicting events would be monitoring a cgroup and
simultaneously monitoring a task within that cgroup.

This uses the cache_groups list as a queuing mechanism, where every
event that reaches the front of the list gets the chance to be scheduled
in, possibly descheduling any conflicting events that are running.

Signed-off-by: Matt Fleming <matt.fleming@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Kanaka Juvva <kanaka.d.juvva@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Vikas Shivappa <vikas.shivappa@linux.intel.com>
Link: http://lkml.kernel.org/r/1422038748-21397-10-git-send-email-matt@codeblueprint.co.uk
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event_intel_cqm.c | 130 +++++++++++++++++++----------
 1 file changed, 84 insertions(+), 46 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event_intel_cqm.c b/arch/x86/kernel/cpu/perf_event_intel_cqm.c
index e31f5086f2b5..9a8ef8376fcd 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_cqm.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_cqm.c
@@ -507,7 +507,6 @@ static unsigned int __rmid_queue_time_ms = RMID_DEFAULT_QUEUE_TIME;
 static bool intel_cqm_rmid_stabilize(unsigned int *available)
 {
 	struct cqm_rmid_entry *entry, *tmp;
-	struct perf_event *event;
 
 	lockdep_assert_held(&cache_mutex);
 
@@ -577,19 +576,9 @@ static bool intel_cqm_rmid_stabilize(unsigned int *available)
 
 		/*
 		 * If we have groups waiting for RMIDs, hand
-		 * them one now.
+		 * them one now provided they don't conflict.
 		 */
-		list_for_each_entry(event, &cache_groups,
-				    hw.cqm_groups_entry) {
-			if (__rmid_valid(event->hw.cqm_rmid))
-				continue;
-
-			intel_cqm_xchg_rmid(event, entry->rmid);
-			entry = NULL;
-			break;
-		}
-
-		if (!entry)
+		if (intel_cqm_sched_in_event(entry->rmid))
 			continue;
 
 		/*
@@ -604,25 +593,73 @@ static bool intel_cqm_rmid_stabilize(unsigned int *available)
 
 /*
  * Pick a victim group and move it to the tail of the group list.
+ * @next: The first group without an RMID
  */
-static struct perf_event *
-__intel_cqm_pick_and_rotate(void)
+static void __intel_cqm_pick_and_rotate(struct perf_event *next)
 {
 	struct perf_event *rotor;
+	unsigned int rmid;
 
 	lockdep_assert_held(&cache_mutex);
-	lockdep_assert_held(&cache_lock);
 
 	rotor = list_first_entry(&cache_groups, struct perf_event,
 				 hw.cqm_groups_entry);
+
+	/*
+	 * The group at the front of the list should always have a valid
+	 * RMID. If it doesn't then no groups have RMIDs assigned and we
+	 * don't need to rotate the list.
+	 */
+	if (next == rotor)
+		return;
+
+	rmid = intel_cqm_xchg_rmid(rotor, INVALID_RMID);
+	__put_rmid(rmid);
+
 	list_rotate_left(&cache_groups);
+}
+
+/*
+ * Deallocate the RMIDs from any events that conflict with @event, and
+ * place them on the back of the group list.
+ */
+static void intel_cqm_sched_out_conflicting_events(struct perf_event *event)
+{
+	struct perf_event *group, *g;
+	unsigned int rmid;
+
+	lockdep_assert_held(&cache_mutex);
+
+	list_for_each_entry_safe(group, g, &cache_groups, hw.cqm_groups_entry) {
+		if (group == event)
+			continue;
+
+		rmid = group->hw.cqm_rmid;
+
+		/*
+		 * Skip events that don't have a valid RMID.
+		 */
+		if (!__rmid_valid(rmid))
+			continue;
+
+		/*
+		 * No conflict? No problem! Leave the event alone.
+		 */
+		if (!__conflict_event(group, event))
+			continue;
 
-	return rotor;
+		intel_cqm_xchg_rmid(group, INVALID_RMID);
+		__put_rmid(rmid);
+	}
 }
 
 /*
  * Attempt to rotate the groups and assign new RMIDs.
  *
+ * We rotate for two reasons,
+ *   1. To handle the scheduling of conflicting events
+ *   2. To recycle RMIDs
+ *
  * Rotating RMIDs is complicated because the hardware doesn't give us
  * any clues.
  *
@@ -642,11 +679,10 @@ __intel_cqm_pick_and_rotate(void)
  */
 static bool __intel_cqm_rmid_rotate(void)
 {
-	struct perf_event *group, *rotor, *start = NULL;
+	struct perf_event *group, *start = NULL;
 	unsigned int threshold_limit;
 	unsigned int nr_needed = 0;
 	unsigned int nr_available;
-	unsigned int rmid;
 	bool rotated = false;
 
 	mutex_lock(&cache_mutex);
@@ -678,7 +714,9 @@ again:
 		goto stabilize;
 
 	/*
-	 * We have more event groups without RMIDs than available RMIDs.
+	 * We have more event groups without RMIDs than available RMIDs,
+	 * or we have event groups that conflict with the ones currently
+	 * scheduled.
 	 *
 	 * We force deallocate the rmid of the group at the head of
 	 * cache_groups. The first event group without an RMID then gets
@@ -688,15 +726,7 @@ again:
 	 * Rotate the cache_groups list so the previous head is now the
 	 * tail.
 	 */
-	rotor = __intel_cqm_pick_and_rotate();
-	rmid = intel_cqm_xchg_rmid(rotor, INVALID_RMID);
-
-	/*
-	 * The group at the front of the list should always have a valid
-	 * RMID. If it doesn't then no groups have RMIDs assigned.
-	 */
-	if (!__rmid_valid(rmid))
-		goto stabilize;
+	__intel_cqm_pick_and_rotate(start);
 
 	/*
 	 * If the rotation is going to succeed, reduce the threshold so
@@ -704,14 +734,14 @@ again:
 	 */
 	if (__rmid_valid(intel_cqm_rotation_rmid)) {
 		intel_cqm_xchg_rmid(start, intel_cqm_rotation_rmid);
-		intel_cqm_rotation_rmid = INVALID_RMID;
+		intel_cqm_rotation_rmid = __get_rmid();
+
+		intel_cqm_sched_out_conflicting_events(start);
 
 		if (__intel_cqm_threshold)
 			__intel_cqm_threshold--;
 	}
 
-	__put_rmid(rmid);
-
 	rotated = true;
 
 stabilize:
@@ -794,25 +824,37 @@ static void intel_cqm_rmid_rotate(struct work_struct *work)
  *
  * If we're part of a group, we use the group's RMID.
  */
-static int intel_cqm_setup_event(struct perf_event *event,
-				 struct perf_event **group)
+static void intel_cqm_setup_event(struct perf_event *event,
+				  struct perf_event **group)
 {
 	struct perf_event *iter;
+	unsigned int rmid;
+	bool conflict = false;
 
 	list_for_each_entry(iter, &cache_groups, hw.cqm_groups_entry) {
+		rmid = iter->hw.cqm_rmid;
+
 		if (__match_event(iter, event)) {
 			/* All tasks in a group share an RMID */
-			event->hw.cqm_rmid = iter->hw.cqm_rmid;
+			event->hw.cqm_rmid = rmid;
 			*group = iter;
-			return 0;
+			return;
 		}
 
-		if (__conflict_event(iter, event))
-			return -EBUSY;
+		/*
+		 * We only care about conflicts for events that are
+		 * actually scheduled in (and hence have a valid RMID).
+		 */
+		if (__conflict_event(iter, event) && __rmid_valid(rmid))
+			conflict = true;
 	}
 
-	event->hw.cqm_rmid = __get_rmid();
-	return 0;
+	if (conflict)
+		rmid = INVALID_RMID;
+	else
+		rmid = __get_rmid();
+
+	event->hw.cqm_rmid = rmid;
 }
 
 static void intel_cqm_event_read(struct perf_event *event)
@@ -1030,7 +1072,6 @@ static int intel_cqm_event_init(struct perf_event *event)
 {
 	struct perf_event *group = NULL;
 	bool rotate = false;
-	int err;
 
 	if (event->attr.type != intel_cqm_pmu.type)
 		return -ENOENT;
@@ -1056,9 +1097,7 @@ static int intel_cqm_event_init(struct perf_event *event)
 	mutex_lock(&cache_mutex);
 
 	/* Will also set rmid */
-	err = intel_cqm_setup_event(event, &group);
-	if (err)
-		goto out;
+	intel_cqm_setup_event(event, &group);
 
 	if (group) {
 		list_add_tail(&event->hw.cqm_group_entry,
@@ -1078,13 +1117,12 @@ static int intel_cqm_event_init(struct perf_event *event)
 			rotate = true;
 	}
 
-out:
 	mutex_unlock(&cache_mutex);
 
 	if (rotate)
 		schedule_delayed_work(&intel_cqm_rmid_work, 0);
 
-	return err;
+	return 0;
 }
 
 EVENT_ATTR_STR(llc_occupancy, intel_cqm_llc, "event=0x01");
-- 
cgit v1.2.3


From b4d8327024637cb2a1f7910dcb5d0ad7a096f473 Mon Sep 17 00:00:00 2001
From: Wang Nan
Date: Thu, 26 Feb 2015 13:49:39 +0800
Subject: x86/traps: Enable DEBUG_STACK after cpu_init() for TRAP_DB/BP

Before this patch early_trap_init() installs DEBUG_STACK for
X86_TRAP_BP and X86_TRAP_DB. However, DEBUG_STACK doesn't work
correctly until cpu_init() <-- trap_init().

This patch passes 0 to set_intr_gate_ist() and
set_system_intr_gate_ist() instead of DEBUG_STACK to let it use
same stack as kernel, and installs DEBUG_STACK for them in
trap_init().

As core runs at ring 0 between early_trap_init() and
trap_init(), there is no chance to get a bad stack before
trap_init().

As NMI is also enabled in trap_init(), we don't need to care
about is_debug_stack() and related things used in
arch/x86/kernel/nmi.c.

Signed-off-by: Wang Nan <wangnan0@huawei.com>
Reviewed-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Cc: <dave.hansen@linux.intel.com>
Cc: <lizefan@huawei.com>
Cc: <luto@amacapital.net>
Cc: <oleg@redhat.com>
Link: http://lkml.kernel.org/r/1424929779-13174-1-git-send-email-wangnan0@huawei.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/traps.c | 21 +++++++++++++++++++--
 1 file changed, 19 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 9d2073e2ecc9..42819886be0c 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -925,9 +925,17 @@ dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code)
 /* Set of traps needed for early debugging. */
 void __init early_trap_init(void)
 {
-	set_intr_gate_ist(X86_TRAP_DB, &debug, DEBUG_STACK);
+	/*
+	 * Don't set ist to DEBUG_STACK as it doesn't work until TSS is
+	 * ready in cpu_init() <-- trap_init(). Before trap_init(), CPU
+	 * runs at ring 0 so it is impossible to hit an invalid stack.
+	 * Using the original stack works well enough at this early
+	 * stage. DEBUG_STACK will be equipped after cpu_init() in
+	 * trap_init().
+	 */
+	set_intr_gate_ist(X86_TRAP_DB, &debug, 0);
 	/* int3 can be called from all */
-	set_system_intr_gate_ist(X86_TRAP_BP, &int3, DEBUG_STACK);
+	set_system_intr_gate_ist(X86_TRAP_BP, &int3, 0);
 #ifdef CONFIG_X86_32
 	set_intr_gate(X86_TRAP_PF, page_fault);
 #endif
@@ -1005,6 +1013,15 @@ void __init trap_init(void)
 	 */
 	cpu_init();
 
+	/*
+	 * X86_TRAP_DB and X86_TRAP_BP have been set
+	 * in early_trap_init(). However, DEBUG_STACK works only after
+	 * cpu_init() loads TSS. See comments in early_trap_init().
+	 */
+	set_intr_gate_ist(X86_TRAP_DB, &debug, DEBUG_STACK);
+	/* int3 can be called from all */
+	set_system_intr_gate_ist(X86_TRAP_BP, &int3, DEBUG_STACK);
+
 	x86_init.irqs.trap_init();
 
 #ifdef CONFIG_X86_64
-- 
cgit v1.2.3


From 66c046b407166686aeea2e8c0abfa1c4e4b94f26 Mon Sep 17 00:00:00 2001
From: Lad, Prabhakar
Date: Thu, 5 Feb 2015 10:29:35 +0000
Subject: crypto: sha-mb - Fix big integer constant sparse warning

this patch fixes following sparse warning:

sha1_mb_mgr_init_avx2.c:59:31: warning: constant 0xF76543210 is so big it is long

Signed-off-by: Lad, Prabhakar <prabhakar.csengg@gmail.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/sha-mb/sha1_mb_mgr_init_avx2.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/crypto/sha-mb/sha1_mb_mgr_init_avx2.c b/arch/x86/crypto/sha-mb/sha1_mb_mgr_init_avx2.c
index 4ca7e166a2aa..822acb5b464c 100644
--- a/arch/x86/crypto/sha-mb/sha1_mb_mgr_init_avx2.c
+++ b/arch/x86/crypto/sha-mb/sha1_mb_mgr_init_avx2.c
@@ -56,7 +56,7 @@
 void sha1_mb_mgr_init_avx2(struct sha1_mb_mgr *state)
 {
 	unsigned int j;
-	state->unused_lanes = 0xF76543210;
+	state->unused_lanes = 0xF76543210ULL;
 	for (j = 0; j < 8; j++) {
 		state->lens[j] = 0xFFFFFFFF;
 		state->ldata[j].job_in_lane = NULL;
-- 
cgit v1.2.3


From b8f05c8803fce899d79ca66f8d7f348cf15fb40e Mon Sep 17 00:00:00 2001
From: Juergen Gross
Date: Fri, 27 Feb 2015 15:45:29 +0100
Subject: x86/xen: correct bug in p2m list initialization

Commit 054954eb051f35e74b75a566a96fe756015352c8 ("xen: switch to
linear virtual mapped sparse p2m list") introduced an error.

During initialization of the p2m list a p2m identity area mapped by
a complete identity pmd entry has to be split up into smaller chunks
sometimes, if a non-identity pfn is introduced in this area.

If this non-identity pfn is not at index 0 of a p2m page the new
p2m page needed is initialized with wrong identity entries, as the
identity pfns don't start with the value corresponding to index 0,
but with the initial non-identity pfn. This results in weird wrong
mappings.

Correct the wrong initialization by starting with the correct pfn.

Cc: stable@vger.kernel.org # 3.19
Reported-by: Stefan Bader <stefan.bader@canonical.com>
Signed-off-by: Juergen Gross <jgross@suse.com>
Tested-by: Stefan Bader <stefan.bader@canonical.com>
Signed-off-by: David Vrabel <david.vrabel@citrix.com>
---
 arch/x86/xen/p2m.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c
index 740ae3026a14..9f93af56a5fc 100644
--- a/arch/x86/xen/p2m.c
+++ b/arch/x86/xen/p2m.c
@@ -563,7 +563,7 @@ static bool alloc_p2m(unsigned long pfn)
 		if (p2m_pfn == PFN_DOWN(__pa(p2m_missing)))
 			p2m_init(p2m);
 		else
-			p2m_init_identity(p2m, pfn);
+			p2m_init_identity(p2m, pfn & ~(P2M_PER_PAGE - 1));
 
 		spin_lock_irqsave(&p2m_update_lock, flags);
 
-- 
cgit v1.2.3


From 5b2bdbc84556774afbe11bcfd24c2f6411cfa92b Mon Sep 17 00:00:00 2001
From: Steven Rostedt
Date: Fri, 27 Feb 2015 14:50:19 -0500
Subject: x86: Init per-cpu shadow copy of CR4 on 32-bit CPUs too

Commit:

   1e02ce4cccdc ("x86: Store a per-cpu shadow copy of CR4")

added a shadow CR4 such that reads and writes that do not
modify the CR4 execute much faster than always reading the
register itself.

The change modified cpu_init() in common.c, so that the
shadow CR4 gets initialized before anything uses it.

Unfortunately, there's two cpu_init()s in common.c. There's
one for 64-bit and one for 32-bit. The commit only added
the shadow init to the 64-bit path, but the 32-bit path
needs the init too.

Link: http://lkml.kernel.org/r/20150227125208.71c36402@gandalf.local.home Fixes: 1e02ce4cccdc "x86: Store a per-cpu shadow copy of CR4"
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
Acked-by: Andy Lutomirski <luto@amacapital.net>
Cc: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: http://lkml.kernel.org/r/20150227145019.2bdd4354@gandalf.local.home
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/common.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index b5c8ff5e9dfc..2346c95c6ab1 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1396,6 +1396,12 @@ void cpu_init(void)
 
 	wait_for_master_cpu(cpu);
 
+	/*
+	 * Initialize the CR4 shadow before doing anything that could
+	 * try to read it.
+	 */
+	cr4_init_shadow();
+
 	show_ucode_info_early();
 
 	printk(KERN_INFO "Initializing CPU#%d\n", cpu);
-- 
cgit v1.2.3


From 6bbb614ec478961c7443086bdf7fd6784479c14a Mon Sep 17 00:00:00 2001
From: Daniel Borkmann
Date: Fri, 27 Feb 2015 15:55:40 +0100
Subject: x86/mm: Unexport set_memory_ro() and set_memory_rw()

This effectively unexports set_memory_ro() and set_memory_rw()
functions, and thus reverts:

  a03352d2c1dc ("x86: export set_memory_ro and set_memory_rw").

They have been introduced for debugging purposes in e1000e, but
no module user is in mainline kernel (anymore?) and we
explicitly do not want modules to use these functions, as they
i.e. protect eBPF (interpreted & JIT'ed) images from malicious
modifications or bugs.

Outside of eBPF scope, I believe also other set_memory_*()
functions should be unexported on x86 for modules.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Acked-by: Alexei Starovoitov <ast@plumgrid.com>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Borislav Petkov <bp@suse.de>
Cc: Bruce Allan <bruce.w.allan@intel.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Jesse Brandeburg <jesse.brandeburg@intel.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: davem@davemloft.net
Link: http://lkml.kernel.org/r/a064393a0a5d319eebde5c761cfd743132d4f213.1425040940.git.daniel@iogearbox.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/mm/pageattr.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 536ea2fb6e33..81e8282d8c2f 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -1654,13 +1654,11 @@ int set_memory_ro(unsigned long addr, int numpages)
 {
 	return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_RW), 0);
 }
-EXPORT_SYMBOL_GPL(set_memory_ro);
 
 int set_memory_rw(unsigned long addr, int numpages)
 {
 	return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_RW), 0);
 }
-EXPORT_SYMBOL_GPL(set_memory_rw);
 
 int set_memory_np(unsigned long addr, int numpages)
 {
-- 
cgit v1.2.3


From 81e397d937a8e9f46f024a9f876cf14d8e2b45a7 Mon Sep 17 00:00:00 2001
From: Tadeusz Struk
Date: Fri, 6 Feb 2015 10:25:20 -0800
Subject: crypto: aesni - make driver-gcm-aes-aesni helper a proper aead alg

Changed the __driver-gcm-aes-aesni to be a proper aead algorithm.
This required a valid setkey and setauthsize functions to be added and also
some changes to make sure that math context is not corrupted when the alg is
used directly.
Note that the __driver-gcm-aes-aesni should not be used directly by modules
that can use it in interrupt context as we don't have a good fallback mechanism
in this case.

Signed-off-by: Adrian Hoban <adrian.hoban@intel.com>
Signed-off-by: Tadeusz Struk <tadeusz.struk@intel.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/aesni-intel_glue.c | 164 +++++++++++++++++++++++++------------
 1 file changed, 110 insertions(+), 54 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index 947c6bf52c33..6893f4947583 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -890,15 +890,12 @@ out_free_ablkcipher:
 	return ret;
 }
 
-static int rfc4106_set_key(struct crypto_aead *parent, const u8 *key,
-						   unsigned int key_len)
+static int common_rfc4106_set_key(struct crypto_aead *aead, const u8 *key,
+				  unsigned int key_len)
 {
 	int ret = 0;
-	struct crypto_tfm *tfm = crypto_aead_tfm(parent);
-	struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(parent);
-	struct crypto_aead *cryptd_child = cryptd_aead_child(ctx->cryptd_tfm);
-	struct aesni_rfc4106_gcm_ctx *child_ctx =
-                                 aesni_rfc4106_gcm_ctx_get(cryptd_child);
+	struct crypto_tfm *tfm = crypto_aead_tfm(aead);
+	struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(aead);
 	u8 *new_key_align, *new_key_mem = NULL;
 
 	if (key_len < 4) {
@@ -943,20 +940,31 @@ static int rfc4106_set_key(struct crypto_aead *parent, const u8 *key,
 		goto exit;
 	}
 	ret = rfc4106_set_hash_subkey(ctx->hash_subkey, key, key_len);
-	memcpy(child_ctx, ctx, sizeof(*ctx));
 exit:
 	kfree(new_key_mem);
 	return ret;
 }
 
-/* This is the Integrity Check Value (aka the authentication tag length and can
- * be 8, 12 or 16 bytes long. */
-static int rfc4106_set_authsize(struct crypto_aead *parent,
-				unsigned int authsize)
+static int rfc4106_set_key(struct crypto_aead *parent, const u8 *key,
+			   unsigned int key_len)
 {
 	struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(parent);
-	struct crypto_aead *cryptd_child = cryptd_aead_child(ctx->cryptd_tfm);
+	struct crypto_aead *child = cryptd_aead_child(ctx->cryptd_tfm);
+	struct aesni_rfc4106_gcm_ctx *c_ctx = aesni_rfc4106_gcm_ctx_get(child);
+	struct cryptd_aead *cryptd_tfm = ctx->cryptd_tfm;
+	int ret;
 
+	ret = crypto_aead_setkey(child, key, key_len);
+	if (!ret) {
+		memcpy(ctx, c_ctx, sizeof(*ctx));
+		ctx->cryptd_tfm = cryptd_tfm;
+	}
+	return ret;
+}
+
+static int common_rfc4106_set_authsize(struct crypto_aead *aead,
+				       unsigned int authsize)
+{
 	switch (authsize) {
 	case 8:
 	case 12:
@@ -965,51 +973,23 @@ static int rfc4106_set_authsize(struct crypto_aead *parent,
 	default:
 		return -EINVAL;
 	}
-	crypto_aead_crt(parent)->authsize = authsize;
-	crypto_aead_crt(cryptd_child)->authsize = authsize;
+	crypto_aead_crt(aead)->authsize = authsize;
 	return 0;
 }
 
-static int rfc4106_encrypt(struct aead_request *req)
-{
-	int ret;
-	struct crypto_aead *tfm = crypto_aead_reqtfm(req);
-	struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm);
-
-	if (!irq_fpu_usable()) {
-		struct aead_request *cryptd_req =
-			(struct aead_request *) aead_request_ctx(req);
-		memcpy(cryptd_req, req, sizeof(*req));
-		aead_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base);
-		return crypto_aead_encrypt(cryptd_req);
-	} else {
-		struct crypto_aead *cryptd_child = cryptd_aead_child(ctx->cryptd_tfm);
-		kernel_fpu_begin();
-		ret = cryptd_child->base.crt_aead.encrypt(req);
-		kernel_fpu_end();
-		return ret;
-	}
-}
-
-static int rfc4106_decrypt(struct aead_request *req)
+/* This is the Integrity Check Value (aka the authentication tag length and can
+ * be 8, 12 or 16 bytes long. */
+static int rfc4106_set_authsize(struct crypto_aead *parent,
+				unsigned int authsize)
 {
+	struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(parent);
+	struct crypto_aead *child = cryptd_aead_child(ctx->cryptd_tfm);
 	int ret;
-	struct crypto_aead *tfm = crypto_aead_reqtfm(req);
-	struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm);
 
-	if (!irq_fpu_usable()) {
-		struct aead_request *cryptd_req =
-			(struct aead_request *) aead_request_ctx(req);
-		memcpy(cryptd_req, req, sizeof(*req));
-		aead_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base);
-		return crypto_aead_decrypt(cryptd_req);
-	} else {
-		struct crypto_aead *cryptd_child = cryptd_aead_child(ctx->cryptd_tfm);
-		kernel_fpu_begin();
-		ret = cryptd_child->base.crt_aead.decrypt(req);
-		kernel_fpu_end();
-		return ret;
-	}
+	ret = crypto_aead_setauthsize(child, authsize);
+	if (!ret)
+		crypto_aead_crt(parent)->authsize = authsize;
+	return ret;
 }
 
 static int __driver_rfc4106_encrypt(struct aead_request *req)
@@ -1185,6 +1165,78 @@ static int __driver_rfc4106_decrypt(struct aead_request *req)
 	}
 	return retval;
 }
+
+static int rfc4106_encrypt(struct aead_request *req)
+{
+	int ret;
+	struct crypto_aead *tfm = crypto_aead_reqtfm(req);
+	struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm);
+
+	if (!irq_fpu_usable()) {
+		struct aead_request *cryptd_req =
+			(struct aead_request *) aead_request_ctx(req);
+
+		memcpy(cryptd_req, req, sizeof(*req));
+		aead_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base);
+		ret = crypto_aead_encrypt(cryptd_req);
+	} else {
+		kernel_fpu_begin();
+		ret = __driver_rfc4106_encrypt(req);
+		kernel_fpu_end();
+	}
+	return ret;
+}
+
+static int rfc4106_decrypt(struct aead_request *req)
+{
+	int ret;
+	struct crypto_aead *tfm = crypto_aead_reqtfm(req);
+	struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm);
+
+	if (!irq_fpu_usable()) {
+		struct aead_request *cryptd_req =
+			(struct aead_request *) aead_request_ctx(req);
+
+		memcpy(cryptd_req, req, sizeof(*req));
+		aead_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base);
+		ret = crypto_aead_decrypt(cryptd_req);
+	} else {
+		kernel_fpu_begin();
+		ret = __driver_rfc4106_decrypt(req);
+		kernel_fpu_end();
+	}
+	return ret;
+}
+
+static int helper_rfc4106_encrypt(struct aead_request *req)
+{
+	int ret;
+
+	if (unlikely(!irq_fpu_usable())) {
+		WARN_ONCE(1, "__gcm-aes-aesni alg used in invalid context");
+		ret = -EINVAL;
+	} else {
+		kernel_fpu_begin();
+		ret = __driver_rfc4106_encrypt(req);
+		kernel_fpu_end();
+	}
+	return ret;
+}
+
+static int helper_rfc4106_decrypt(struct aead_request *req)
+{
+	int ret;
+
+	if (unlikely(!irq_fpu_usable())) {
+		WARN_ONCE(1, "__gcm-aes-aesni alg used in invalid context");
+		ret = -EINVAL;
+	} else {
+		kernel_fpu_begin();
+		ret = __driver_rfc4106_decrypt(req);
+		kernel_fpu_end();
+	}
+	return ret;
+}
 #endif
 
 static struct crypto_alg aesni_algs[] = { {
@@ -1366,8 +1418,12 @@ static struct crypto_alg aesni_algs[] = { {
 	.cra_module		= THIS_MODULE,
 	.cra_u = {
 		.aead = {
-			.encrypt	= __driver_rfc4106_encrypt,
-			.decrypt	= __driver_rfc4106_decrypt,
+			.setkey		= common_rfc4106_set_key,
+			.setauthsize	= common_rfc4106_set_authsize,
+			.encrypt	= helper_rfc4106_encrypt,
+			.decrypt	= helper_rfc4106_decrypt,
+			.ivsize		= 8,
+			.maxauthsize	= 16,
 		},
 	},
 }, {
-- 
cgit v1.2.3


From 020b37ac66c5fcec70b6fa51113b84bdfff6a4bc Mon Sep 17 00:00:00 2001
From: Rusty Russell
Date: Mon, 2 Mar 2015 22:05:49 +1030
Subject: x86: Fix up obsolete __cpu_set() function usage

Thanks to spatch, plus manual removal of "&*".

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1425296150-4722-8-git-send-email-rusty@rustcorp.com.au
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/apic/x2apic_cluster.c | 8 ++++----
 arch/x86/kernel/irq.c                 | 4 ++--
 arch/x86/platform/uv/tlb_uv.c         | 6 +++---
 3 files changed, 9 insertions(+), 9 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index e658f21681c8..d9d0bd2faaf4 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -135,12 +135,12 @@ static void init_x2apic_ldr(void)
 
 	per_cpu(x86_cpu_to_logical_apicid, this_cpu) = apic_read(APIC_LDR);
 
-	__cpu_set(this_cpu, per_cpu(cpus_in_cluster, this_cpu));
+	cpumask_set_cpu(this_cpu, per_cpu(cpus_in_cluster, this_cpu));
 	for_each_online_cpu(cpu) {
 		if (x2apic_cluster(this_cpu) != x2apic_cluster(cpu))
 			continue;
-		__cpu_set(this_cpu, per_cpu(cpus_in_cluster, cpu));
-		__cpu_set(cpu, per_cpu(cpus_in_cluster, this_cpu));
+		cpumask_set_cpu(this_cpu, per_cpu(cpus_in_cluster, cpu));
+		cpumask_set_cpu(cpu, per_cpu(cpus_in_cluster, this_cpu));
 	}
 }
 
@@ -195,7 +195,7 @@ static int x2apic_init_cpu_notifier(void)
 
 	BUG_ON(!per_cpu(cpus_in_cluster, cpu) || !per_cpu(ipi_mask, cpu));
 
-	__cpu_set(cpu, per_cpu(cpus_in_cluster, cpu));
+	cpumask_set_cpu(cpu, per_cpu(cpus_in_cluster, cpu));
 	register_hotcpu_notifier(&x2apic_cpu_notifier);
 	return 1;
 }
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 67b1cbe0093a..e5952c225532 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -295,7 +295,7 @@ int check_irq_vectors_for_cpu_disable(void)
 
 	this_cpu = smp_processor_id();
 	cpumask_copy(&online_new, cpu_online_mask);
-	cpu_clear(this_cpu, online_new);
+	cpumask_clear_cpu(this_cpu, &online_new);
 
 	this_count = 0;
 	for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
@@ -307,7 +307,7 @@ int check_irq_vectors_for_cpu_disable(void)
 
 			data = irq_desc_get_irq_data(desc);
 			cpumask_copy(&affinity_new, data->affinity);
-			cpu_clear(this_cpu, affinity_new);
+			cpumask_clear_cpu(this_cpu, &affinity_new);
 
 			/* Do not count inactive or per-cpu irqs. */
 			if (!irq_has_action(irq) || irqd_is_per_cpu(data))
diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c
index 994798548b1a..3b6ec42718e4 100644
--- a/arch/x86/platform/uv/tlb_uv.c
+++ b/arch/x86/platform/uv/tlb_uv.c
@@ -415,7 +415,7 @@ static void reset_with_ipi(struct pnmask *distribution, struct bau_control *bcp)
 	struct reset_args reset_args;
 
 	reset_args.sender = sender;
-	cpus_clear(*mask);
+	cpumask_clear(mask);
 	/* find a single cpu for each uvhub in this distribution mask */
 	maskbits = sizeof(struct pnmask) * BITSPERBYTE;
 	/* each bit is a pnode relative to the partition base pnode */
@@ -425,7 +425,7 @@ static void reset_with_ipi(struct pnmask *distribution, struct bau_control *bcp)
 			continue;
 		apnode = pnode + bcp->partition_base_pnode;
 		cpu = pnode_to_first_cpu(apnode, smaster);
-		cpu_set(cpu, *mask);
+		cpumask_set_cpu(cpu, mask);
 	}
 
 	/* IPI all cpus; preemption is already disabled */
@@ -1126,7 +1126,7 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
 	/* don't actually do a shootdown of the local cpu */
 	cpumask_andnot(flush_mask, cpumask, cpumask_of(cpu));
 
-	if (cpu_isset(cpu, *cpumask))
+	if (cpumask_test_cpu(cpu, cpumask))
 		stat->s_ntargself++;
 
 	bau_desc = bcp->descriptor_base;
-- 
cgit v1.2.3


From d496a002ae1f02425168e5211c237abee588651a Mon Sep 17 00:00:00 2001
From: Quentin Casasnovas
Date: Thu, 26 Feb 2015 18:03:59 +0100
Subject: x86/microcode/intel: Fix out of bounds memory access to the extended
 header

Improper pointer arithmetics when calculating the address of the
extended header could lead to an out of bounds memory read and kernel
panic.

Signed-off-by: Quentin Casasnovas <quentin.casasnovas@oracle.com>
Link: http://lkml.kernel.org/r/20150225094125.GB30434@chrystal.uk.oracle.com
Signed-off-by: Borislav Petkov <bp@suse.de>
---
 arch/x86/kernel/cpu/microcode/intel_early.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/microcode/intel_early.c b/arch/x86/kernel/cpu/microcode/intel_early.c
index 420eb933189c..3a6c6136c9da 100644
--- a/arch/x86/kernel/cpu/microcode/intel_early.c
+++ b/arch/x86/kernel/cpu/microcode/intel_early.c
@@ -180,8 +180,7 @@ matching_model_microcode(struct microcode_header_intel *mc_header,
 	if (total_size <= data_size + MC_HEADER_SIZE)
 		return UCODE_NFOUND;
 
-	ext_header = (struct extended_sigtable *)
-		     mc_header + data_size + MC_HEADER_SIZE;
+	ext_header = (void *) mc_header + data_size + MC_HEADER_SIZE;
 	ext_sigcount = ext_header->count;
 	ext_sig = (void *)ext_header + EXT_HEADER_SIZE;
 
@@ -457,8 +456,7 @@ static void __ref show_saved_mc(void)
 		if (total_size <= data_size + MC_HEADER_SIZE)
 			continue;
 
-		ext_header = (struct extended_sigtable *)
-			     mc_saved_header + data_size + MC_HEADER_SIZE;
+		ext_header = (void *) mc_saved_header + data_size + MC_HEADER_SIZE;
 		ext_sigcount = ext_header->count;
 		ext_sig = (void *)ext_header + EXT_HEADER_SIZE;
 
-- 
cgit v1.2.3


From 776d3cdc93d83808bf5929d716a56c69bbe01d2f Mon Sep 17 00:00:00 2001
From: Borislav Petkov
Date: Thu, 5 Feb 2015 19:35:19 +0100
Subject: x86/microcode/intel: Check if microcode was found before applying

We should check the return value of the routines fishing out the proper
microcode and not try to apply if we haven't found a suitable blob.

Signed-off-by: Borislav Petkov <bp@suse.de>
---
 arch/x86/kernel/cpu/microcode/intel_early.c | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/microcode/intel_early.c b/arch/x86/kernel/cpu/microcode/intel_early.c
index 3a6c6136c9da..d515ff3feb8b 100644
--- a/arch/x86/kernel/cpu/microcode/intel_early.c
+++ b/arch/x86/kernel/cpu/microcode/intel_early.c
@@ -727,9 +727,10 @@ _load_ucode_intel_bsp(struct mc_saved_data *mc_saved_data,
 
 	ret = load_microcode(mc_saved_data, mc_saved_in_initrd,
 			     initrd_start_early, uci);
+	if (ret != UCODE_OK)
+		return;
 
-	if (ret == UCODE_OK)
-		apply_microcode_early(uci, true);
+	apply_microcode_early(uci, true);
 }
 
 void __init
@@ -769,6 +770,7 @@ void load_ucode_intel_ap(void)
 	struct ucode_cpu_info uci;
 	unsigned long *mc_saved_in_initrd_p;
 	unsigned long initrd_start_addr;
+	enum ucode_state ret;
 #ifdef CONFIG_X86_32
 	unsigned long *initrd_start_p;
 
@@ -791,8 +793,12 @@ void load_ucode_intel_ap(void)
 		return;
 
 	collect_cpu_info_early(&uci);
-	load_microcode(mc_saved_data_p, mc_saved_in_initrd_p,
-		       initrd_start_addr, &uci);
+	ret = load_microcode(mc_saved_data_p, mc_saved_in_initrd_p,
+			     initrd_start_addr, &uci);
+
+	if (ret != UCODE_OK)
+		return;
+
 	apply_microcode_early(&uci, true);
 }
 
-- 
cgit v1.2.3


From f9524e6f5447277e238b419afc3d0712941fa2a5 Mon Sep 17 00:00:00 2001
From: Borislav Petkov
Date: Thu, 5 Feb 2015 20:11:41 +0100
Subject: x86/microcode/intel: Do the mc_saved_src NULL check first

... and only then deref it. Also, shorten some variable names and rename
others so as to diminish the ubiquitous presence of the "mc_" prefix
everywhere and make it a bit more readable.

Use kcalloc so that we don't kfree() uninitialized memory on the unwind
path, as suggested by Quentin.

Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: Quentin Casasnovas <quentin.casasnovas@oracle.com>
---
 arch/x86/kernel/cpu/microcode/intel_early.c | 38 +++++++++++++++++------------
 1 file changed, 22 insertions(+), 16 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/microcode/intel_early.c b/arch/x86/kernel/cpu/microcode/intel_early.c
index d515ff3feb8b..95516006958d 100644
--- a/arch/x86/kernel/cpu/microcode/intel_early.c
+++ b/arch/x86/kernel/cpu/microcode/intel_early.c
@@ -203,7 +203,7 @@ save_microcode(struct mc_saved_data *mc_saved_data,
 	       unsigned int mc_saved_count)
 {
 	int i, j;
-	struct microcode_intel **mc_saved_p;
+	struct microcode_intel **saved_ptr;
 	int ret;
 
 	if (!mc_saved_count)
@@ -212,39 +212,45 @@ save_microcode(struct mc_saved_data *mc_saved_data,
 	/*
 	 * Copy new microcode data.
 	 */
-	mc_saved_p = kmalloc(mc_saved_count*sizeof(struct microcode_intel *),
-			     GFP_KERNEL);
-	if (!mc_saved_p)
+	saved_ptr = kcalloc(mc_saved_count, sizeof(struct microcode_intel *), GFP_KERNEL);
+	if (!saved_ptr)
 		return -ENOMEM;
 
 	for (i = 0; i < mc_saved_count; i++) {
-		struct microcode_intel *mc = mc_saved_src[i];
-		struct microcode_header_intel *mc_header = &mc->hdr;
-		unsigned long mc_size = get_totalsize(mc_header);
-		mc_saved_p[i] = kmalloc(mc_size, GFP_KERNEL);
-		if (!mc_saved_p[i]) {
-			ret = -ENOMEM;
-			goto err;
-		}
+		struct microcode_header_intel *mc_hdr;
+		struct microcode_intel *mc;
+		unsigned long size;
+
 		if (!mc_saved_src[i]) {
 			ret = -EINVAL;
 			goto err;
 		}
-		memcpy(mc_saved_p[i], mc, mc_size);
+
+		mc     = mc_saved_src[i];
+		mc_hdr = &mc->hdr;
+		size   = get_totalsize(mc_hdr);
+
+		saved_ptr[i] = kmalloc(size, GFP_KERNEL);
+		if (!saved_ptr[i]) {
+			ret = -ENOMEM;
+			goto err;
+		}
+
+		memcpy(saved_ptr[i], mc, size);
 	}
 
 	/*
 	 * Point to newly saved microcode.
 	 */
-	mc_saved_data->mc_saved = mc_saved_p;
+	mc_saved_data->mc_saved = saved_ptr;
 	mc_saved_data->mc_saved_count = mc_saved_count;
 
 	return 0;
 
 err:
 	for (j = 0; j <= i; j++)
-		kfree(mc_saved_p[j]);
-	kfree(mc_saved_p);
+		kfree(saved_ptr[j]);
+	kfree(saved_ptr);
 
 	return ret;
 }
-- 
cgit v1.2.3


From 2d48bb9b6ec6fb0f28e5135fd080edc23152ae45 Mon Sep 17 00:00:00 2001
From: Borislav Petkov
Date: Mon, 9 Feb 2015 17:40:03 +0100
Subject: x86/microcode/intel: Get rid of last arg to load_ucode_intel_bsp()

Allocate it on the helper's _load_ucode_intel_bsp() stack instead and do
not hand it down.

Signed-off-by: Borislav Petkov <bp@suse.de>
---
 arch/x86/kernel/cpu/microcode/intel_early.c | 18 ++++++++----------
 1 file changed, 8 insertions(+), 10 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/microcode/intel_early.c b/arch/x86/kernel/cpu/microcode/intel_early.c
index 95516006958d..ed30bc77dd38 100644
--- a/arch/x86/kernel/cpu/microcode/intel_early.c
+++ b/arch/x86/kernel/cpu/microcode/intel_early.c
@@ -722,21 +722,21 @@ static void __init
 _load_ucode_intel_bsp(struct mc_saved_data *mc_saved_data,
 		      unsigned long *mc_saved_in_initrd,
 		      unsigned long initrd_start_early,
-		      unsigned long initrd_end_early,
-		      struct ucode_cpu_info *uci)
+		      unsigned long initrd_end_early)
 {
+	struct ucode_cpu_info uci;
 	enum ucode_state ret;
 
-	collect_cpu_info_early(uci);
+	collect_cpu_info_early(&uci);
 	scan_microcode(initrd_start_early, initrd_end_early, mc_saved_data,
-		       mc_saved_in_initrd, uci);
+		       mc_saved_in_initrd, &uci);
 
 	ret = load_microcode(mc_saved_data, mc_saved_in_initrd,
-			     initrd_start_early, uci);
+			     initrd_start_early, &uci);
 	if (ret != UCODE_OK)
 		return;
 
-	apply_microcode_early(uci, true);
+	apply_microcode_early(&uci, true);
 }
 
 void __init
@@ -744,7 +744,6 @@ load_ucode_intel_bsp(void)
 {
 	u64 ramdisk_image, ramdisk_size;
 	unsigned long initrd_start_early, initrd_end_early;
-	struct ucode_cpu_info uci;
 #ifdef CONFIG_X86_32
 	struct boot_params *boot_params_p;
 
@@ -757,7 +756,7 @@ load_ucode_intel_bsp(void)
 	_load_ucode_intel_bsp(
 		(struct mc_saved_data *)__pa_nodebug(&mc_saved_data),
 		(unsigned long *)__pa_nodebug(&mc_saved_in_initrd),
-		initrd_start_early, initrd_end_early, &uci);
+		initrd_start_early, initrd_end_early);
 #else
 	ramdisk_image = boot_params.hdr.ramdisk_image;
 	ramdisk_size  = boot_params.hdr.ramdisk_size;
@@ -765,8 +764,7 @@ load_ucode_intel_bsp(void)
 	initrd_end_early = initrd_start_early + ramdisk_size;
 
 	_load_ucode_intel_bsp(&mc_saved_data, mc_saved_in_initrd,
-			      initrd_start_early, initrd_end_early,
-			      &uci);
+			      initrd_start_early, initrd_end_early);
 #endif
 }
 
-- 
cgit v1.2.3


From 02f35177fb00ee0d73f9af2d6006af6b75b58c11 Mon Sep 17 00:00:00 2001
From: Borislav Petkov
Date: Mon, 9 Feb 2015 22:59:51 +0100
Subject: x86/microcode/intel: Simplify load_ucode_intel_bsp()

Don't compute start and end from start and size in order to compute size
again down the path in scan_microcode(). So pass size directly instead
and simplify a bunch. Shorten variable names and remove useless ones.

Signed-off-by: Borislav Petkov <bp@suse.de>
---
 arch/x86/kernel/cpu/microcode/intel_early.c | 49 +++++++++++------------------
 1 file changed, 18 insertions(+), 31 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/microcode/intel_early.c b/arch/x86/kernel/cpu/microcode/intel_early.c
index ed30bc77dd38..34a2ff9217f8 100644
--- a/arch/x86/kernel/cpu/microcode/intel_early.c
+++ b/arch/x86/kernel/cpu/microcode/intel_early.c
@@ -552,12 +552,10 @@ EXPORT_SYMBOL_GPL(save_mc_for_early);
 
 static __initdata char ucode_name[] = "kernel/x86/microcode/GenuineIntel.bin";
 static __init enum ucode_state
-scan_microcode(unsigned long start, unsigned long end,
-		struct mc_saved_data *mc_saved_data,
-		unsigned long *mc_saved_in_initrd,
-		struct ucode_cpu_info *uci)
+scan_microcode(unsigned long start, unsigned long size,
+	       struct mc_saved_data *mc_saved_data,
+	       unsigned long *mc_saved_in_initrd, struct ucode_cpu_info *uci)
 {
-	unsigned int size = end - start + 1;
 	struct cpio_data cd;
 	long offset = 0;
 #ifdef CONFIG_X86_32
@@ -573,7 +571,6 @@ scan_microcode(unsigned long start, unsigned long end,
 	if (!cd.data)
 		return UCODE_ERROR;
 
-
 	return get_matching_model_microcode(0, start, cd.data, cd.size,
 					    mc_saved_data, mc_saved_in_initrd,
 					    uci);
@@ -721,50 +718,40 @@ int __init save_microcode_in_initrd_intel(void)
 static void __init
 _load_ucode_intel_bsp(struct mc_saved_data *mc_saved_data,
 		      unsigned long *mc_saved_in_initrd,
-		      unsigned long initrd_start_early,
-		      unsigned long initrd_end_early)
+		      unsigned long start, unsigned long size)
 {
 	struct ucode_cpu_info uci;
 	enum ucode_state ret;
 
 	collect_cpu_info_early(&uci);
-	scan_microcode(initrd_start_early, initrd_end_early, mc_saved_data,
-		       mc_saved_in_initrd, &uci);
+	scan_microcode(start, size, mc_saved_data, mc_saved_in_initrd, &uci);
 
-	ret = load_microcode(mc_saved_data, mc_saved_in_initrd,
-			     initrd_start_early, &uci);
+	ret = load_microcode(mc_saved_data, mc_saved_in_initrd, start, &uci);
 	if (ret != UCODE_OK)
 		return;
 
 	apply_microcode_early(&uci, true);
 }
 
-void __init
-load_ucode_intel_bsp(void)
+void __init load_ucode_intel_bsp(void)
 {
-	u64 ramdisk_image, ramdisk_size;
-	unsigned long initrd_start_early, initrd_end_early;
+	u64 start, size;
 #ifdef CONFIG_X86_32
-	struct boot_params *boot_params_p;
+	struct boot_params *p;
 
-	boot_params_p = (struct boot_params *)__pa_nodebug(&boot_params);
-	ramdisk_image = boot_params_p->hdr.ramdisk_image;
-	ramdisk_size  = boot_params_p->hdr.ramdisk_size;
-	initrd_start_early = ramdisk_image;
-	initrd_end_early = initrd_start_early + ramdisk_size;
+	p	= (struct boot_params *)__pa_nodebug(&boot_params);
+	start	= p->hdr.ramdisk_image;
+	size	= p->hdr.ramdisk_size;
 
 	_load_ucode_intel_bsp(
-		(struct mc_saved_data *)__pa_nodebug(&mc_saved_data),
-		(unsigned long *)__pa_nodebug(&mc_saved_in_initrd),
-		initrd_start_early, initrd_end_early);
+			(struct mc_saved_data *)__pa_nodebug(&mc_saved_data),
+			(unsigned long *)__pa_nodebug(&mc_saved_in_initrd),
+			start, size);
 #else
-	ramdisk_image = boot_params.hdr.ramdisk_image;
-	ramdisk_size  = boot_params.hdr.ramdisk_size;
-	initrd_start_early = ramdisk_image + PAGE_OFFSET;
-	initrd_end_early = initrd_start_early + ramdisk_size;
+	start	= boot_params.hdr.ramdisk_image + PAGE_OFFSET;
+	size	= boot_params.hdr.ramdisk_size;
 
-	_load_ucode_intel_bsp(&mc_saved_data, mc_saved_in_initrd,
-			      initrd_start_early, initrd_end_early);
+	_load_ucode_intel_bsp(&mc_saved_data, mc_saved_in_initrd, start, size);
 #endif
 }
 
-- 
cgit v1.2.3


From a5de5e242b80c085913041e99ab245408083f273 Mon Sep 17 00:00:00 2001
From: Borislav Petkov
Date: Mon, 9 Feb 2015 17:49:43 +0100
Subject: x86/microcode/intel: Make _save_mc() return the updated saved count

... of microcode patches instead of handing in a pointer which is used
for I/O in an otherwise void function.

Signed-off-by: Borislav Petkov <bp@suse.de>
---
 arch/x86/kernel/cpu/microcode/intel_early.c | 29 ++++++++++++++---------------
 1 file changed, 14 insertions(+), 15 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/microcode/intel_early.c b/arch/x86/kernel/cpu/microcode/intel_early.c
index 34a2ff9217f8..f1d9a321d424 100644
--- a/arch/x86/kernel/cpu/microcode/intel_early.c
+++ b/arch/x86/kernel/cpu/microcode/intel_early.c
@@ -262,17 +262,18 @@ err:
  * - or if it is a newly discovered microcode patch.
  *
  * The microcode patch should have matching model with CPU.
+ *
+ * Returns: The updated number @num_saved of saved microcode patches.
  */
-static void _save_mc(struct microcode_intel **mc_saved, u8 *ucode_ptr,
-		     unsigned int *mc_saved_count_p)
+static unsigned int _save_mc(struct microcode_intel **mc_saved,
+			     u8 *ucode_ptr, unsigned int num_saved)
 {
-	int i;
-	int found = 0;
-	unsigned int mc_saved_count = *mc_saved_count_p;
 	struct microcode_header_intel *mc_header;
+	int found = 0, i;
 
 	mc_header = (struct microcode_header_intel *)ucode_ptr;
-	for (i = 0; i < mc_saved_count; i++) {
+
+	for (i = 0; i < num_saved; i++) {
 		unsigned int sig, pf;
 		unsigned int new_rev;
 		struct microcode_header_intel *mc_saved_header =
@@ -289,21 +290,20 @@ static void _save_mc(struct microcode_intel **mc_saved, u8 *ucode_ptr,
 				 * Replace the older one with this newer
 				 * one.
 				 */
-				mc_saved[i] =
-					(struct microcode_intel *)ucode_ptr;
+				mc_saved[i] = (struct microcode_intel *)ucode_ptr;
 				break;
 			}
 		}
 	}
-	if (i >= mc_saved_count && !found)
+
+	if (i >= num_saved && !found)
 		/*
 		 * This ucode is first time discovered in ucode file.
 		 * Save it to memory.
 		 */
-		mc_saved[mc_saved_count++] =
-				 (struct microcode_intel *)ucode_ptr;
+		mc_saved[num_saved++] = (struct microcode_intel *)ucode_ptr;
 
-	*mc_saved_count_p = mc_saved_count;
+	return num_saved;
 }
 
 /*
@@ -351,7 +351,7 @@ get_matching_model_microcode(int cpu, unsigned long start,
 			continue;
 		}
 
-		_save_mc(mc_saved_tmp, ucode_ptr, &mc_saved_count);
+		mc_saved_count = _save_mc(mc_saved_tmp, ucode_ptr, mc_saved_count);
 
 		ucode_ptr += mc_size;
 	}
@@ -519,8 +519,7 @@ int save_mc_for_early(u8 *mc)
 	 * Save the microcode patch mc in mc_save_tmp structure if it's a newer
 	 * version.
 	 */
-
-	_save_mc(mc_saved_tmp, mc, &mc_saved_count);
+	mc_saved_count = _save_mc(mc_saved_tmp, mc, mc_saved_count);
 
 	/*
 	 * Save the mc_save_tmp in global mc_saved_data.
-- 
cgit v1.2.3


From c868570e745781736366b83191aae2ccb9de2d1c Mon Sep 17 00:00:00 2001
From: Borislav Petkov
Date: Mon, 9 Feb 2015 17:59:01 +0100
Subject: x86/microcode/intel: Sanitize _save_mc()

Shorten local variable names for better readability and flatten loop
indentation levels.

No functionality change.

Signed-off-by: Borislav Petkov <bp@suse.de>
---
 arch/x86/kernel/cpu/microcode/intel_early.c | 49 ++++++++++++++---------------
 1 file changed, 23 insertions(+), 26 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/microcode/intel_early.c b/arch/x86/kernel/cpu/microcode/intel_early.c
index f1d9a321d424..94ec304b7af3 100644
--- a/arch/x86/kernel/cpu/microcode/intel_early.c
+++ b/arch/x86/kernel/cpu/microcode/intel_early.c
@@ -268,39 +268,36 @@ err:
 static unsigned int _save_mc(struct microcode_intel **mc_saved,
 			     u8 *ucode_ptr, unsigned int num_saved)
 {
-	struct microcode_header_intel *mc_header;
+	struct microcode_header_intel *mc_hdr, *mc_saved_hdr;
+	unsigned int sig, pf, new_rev;
 	int found = 0, i;
 
-	mc_header = (struct microcode_header_intel *)ucode_ptr;
+	mc_hdr = (struct microcode_header_intel *)ucode_ptr;
 
 	for (i = 0; i < num_saved; i++) {
-		unsigned int sig, pf;
-		unsigned int new_rev;
-		struct microcode_header_intel *mc_saved_header =
-			     (struct microcode_header_intel *)mc_saved[i];
-		sig = mc_saved_header->sig;
-		pf = mc_saved_header->pf;
-		new_rev = mc_header->rev;
-
-		if (get_matching_sig(sig, pf, ucode_ptr, new_rev)) {
-			found = 1;
-			if (update_match_revision(mc_header, new_rev)) {
-				/*
-				 * Found an older ucode saved before.
-				 * Replace the older one with this newer
-				 * one.
-				 */
-				mc_saved[i] = (struct microcode_intel *)ucode_ptr;
-				break;
-			}
-		}
-	}
+		mc_saved_hdr = (struct microcode_header_intel *)mc_saved[i];
+		sig	     = mc_saved_hdr->sig;
+		pf	     = mc_saved_hdr->pf;
+		new_rev	     = mc_hdr->rev;
+
+		if (!get_matching_sig(sig, pf, ucode_ptr, new_rev))
+			continue;
+
+		found = 1;
+
+		if (!update_match_revision(mc_hdr, new_rev))
+			continue;
 
-	if (i >= num_saved && !found)
 		/*
-		 * This ucode is first time discovered in ucode file.
-		 * Save it to memory.
+		 * Found an older ucode saved earlier. Replace it with
+		 * this newer one.
 		 */
+		mc_saved[i] = (struct microcode_intel *)ucode_ptr;
+		break;
+	}
+
+	/* Newly detected microcode, save it to memory. */
+	if (i >= num_saved && !found)
 		mc_saved[num_saved++] = (struct microcode_intel *)ucode_ptr;
 
 	return num_saved;
-- 
cgit v1.2.3


From 4f5e5f2b574804ed330e20456d5b86a4259544ad Mon Sep 17 00:00:00 2001
From: Borislav Petkov
Date: Mon, 9 Feb 2015 18:10:29 +0100
Subject: x86/microcode/intel: Rename update_match_revision()

... to revision_is_newer() and push it up into the header and make it an
inline function.

Signed-off-by: Borislav Petkov <bp@suse.de>
---
 arch/x86/include/asm/microcode_intel.h      | 8 ++++++--
 arch/x86/kernel/cpu/microcode/intel_early.c | 2 +-
 arch/x86/kernel/cpu/microcode/intel_lib.c   | 8 +-------
 3 files changed, 8 insertions(+), 10 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/microcode_intel.h b/arch/x86/include/asm/microcode_intel.h
index dd4c20043ce7..a6b753a131cd 100644
--- a/arch/x86/include/asm/microcode_intel.h
+++ b/arch/x86/include/asm/microcode_intel.h
@@ -60,8 +60,12 @@ extern int
 get_matching_microcode(unsigned int csig, int cpf, void *mc, int rev);
 extern int microcode_sanity_check(void *mc, int print_err);
 extern int get_matching_sig(unsigned int csig, int cpf, void *mc, int rev);
-extern int
-update_match_revision(struct microcode_header_intel *mc_header, int rev);
+
+static inline int
+revision_is_newer(struct microcode_header_intel *mc_header, int rev)
+{
+	return (mc_header->rev <= rev) ? 0 : 1;
+}
 
 #ifdef CONFIG_MICROCODE_INTEL_EARLY
 extern void __init load_ucode_intel_bsp(void);
diff --git a/arch/x86/kernel/cpu/microcode/intel_early.c b/arch/x86/kernel/cpu/microcode/intel_early.c
index 94ec304b7af3..cd42b3a55897 100644
--- a/arch/x86/kernel/cpu/microcode/intel_early.c
+++ b/arch/x86/kernel/cpu/microcode/intel_early.c
@@ -285,7 +285,7 @@ static unsigned int _save_mc(struct microcode_intel **mc_saved,
 
 		found = 1;
 
-		if (!update_match_revision(mc_hdr, new_rev))
+		if (!revision_is_newer(mc_hdr, new_rev))
 			continue;
 
 		/*
diff --git a/arch/x86/kernel/cpu/microcode/intel_lib.c b/arch/x86/kernel/cpu/microcode/intel_lib.c
index ce69320d0179..7e259d99b0aa 100644
--- a/arch/x86/kernel/cpu/microcode/intel_lib.c
+++ b/arch/x86/kernel/cpu/microcode/intel_lib.c
@@ -38,12 +38,6 @@ update_match_cpu(unsigned int csig, unsigned int cpf,
 	return (!sigmatch(sig, csig, pf, cpf)) ? 0 : 1;
 }
 
-int
-update_match_revision(struct microcode_header_intel *mc_header, int rev)
-{
-	return (mc_header->rev <= rev) ? 0 : 1;
-}
-
 int microcode_sanity_check(void *mc, int print_err)
 {
 	unsigned long total_size, data_size, ext_table_size;
@@ -166,7 +160,7 @@ int get_matching_microcode(unsigned int csig, int cpf, void *mc, int rev)
 {
 	struct microcode_header_intel *mc_header = mc;
 
-	if (!update_match_revision(mc_header, rev))
+	if (!revision_is_newer(mc_header, rev))
 		return 0;
 
 	return get_matching_sig(csig, cpf, mc, rev);
-- 
cgit v1.2.3


From 58ce8d6d3a7616014dc70fd8d8f945176d74957c Mon Sep 17 00:00:00 2001
From: Borislav Petkov
Date: Mon, 9 Feb 2015 21:42:34 +0100
Subject: x86/microcode: Consolidate family,model, ... code

... to the header. Split the family acquiring function into a
main one, doing CPUID and a helper which computes the extended
family and is used in multiple places. Get rid of the locally-grown
get_x86_{family,model}().

While at it, rename local variables to something more descriptive and
vertically align assignments for better readability.

There should be no functionality change resulting from this patch.

Signed-off-by: Borislav Petkov <bp@suse.de>
---
 arch/x86/include/asm/microcode.h            | 73 ++++++++++++++++++++++++++++
 arch/x86/kernel/cpu/microcode/core_early.c  | 75 +++++------------------------
 arch/x86/kernel/cpu/microcode/intel_early.c | 58 ++++++----------------
 3 files changed, 101 insertions(+), 105 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h
index 201b520521ed..2fb20d6f7e23 100644
--- a/arch/x86/include/asm/microcode.h
+++ b/arch/x86/include/asm/microcode.h
@@ -75,6 +75,79 @@ static inline void __exit exit_amd_microcode(void) {}
 
 #ifdef CONFIG_MICROCODE_EARLY
 #define MAX_UCODE_COUNT 128
+
+#define QCHAR(a, b, c, d) ((a) + ((b) << 8) + ((c) << 16) + ((d) << 24))
+#define CPUID_INTEL1 QCHAR('G', 'e', 'n', 'u')
+#define CPUID_INTEL2 QCHAR('i', 'n', 'e', 'I')
+#define CPUID_INTEL3 QCHAR('n', 't', 'e', 'l')
+#define CPUID_AMD1 QCHAR('A', 'u', 't', 'h')
+#define CPUID_AMD2 QCHAR('e', 'n', 't', 'i')
+#define CPUID_AMD3 QCHAR('c', 'A', 'M', 'D')
+
+#define CPUID_IS(a, b, c, ebx, ecx, edx)	\
+		(!((ebx ^ (a))|(edx ^ (b))|(ecx ^ (c))))
+
+/*
+ * In early loading microcode phase on BSP, boot_cpu_data is not set up yet.
+ * x86_vendor() gets vendor id for BSP.
+ *
+ * In 32 bit AP case, accessing boot_cpu_data needs linear address. To simplify
+ * coding, we still use x86_vendor() to get vendor id for AP.
+ *
+ * x86_vendor() gets vendor information directly from CPUID.
+ */
+static inline int x86_vendor(void)
+{
+	u32 eax = 0x00000000;
+	u32 ebx, ecx = 0, edx;
+
+	native_cpuid(&eax, &ebx, &ecx, &edx);
+
+	if (CPUID_IS(CPUID_INTEL1, CPUID_INTEL2, CPUID_INTEL3, ebx, ecx, edx))
+		return X86_VENDOR_INTEL;
+
+	if (CPUID_IS(CPUID_AMD1, CPUID_AMD2, CPUID_AMD3, ebx, ecx, edx))
+		return X86_VENDOR_AMD;
+
+	return X86_VENDOR_UNKNOWN;
+}
+
+static inline unsigned int __x86_family(unsigned int sig)
+{
+	unsigned int x86;
+
+	x86 = (sig >> 8) & 0xf;
+
+	if (x86 == 0xf)
+		x86 += (sig >> 20) & 0xff;
+
+	return x86;
+}
+
+static inline unsigned int x86_family(void)
+{
+	u32 eax = 0x00000001;
+	u32 ebx, ecx = 0, edx;
+
+	native_cpuid(&eax, &ebx, &ecx, &edx);
+
+	return __x86_family(eax);
+}
+
+static inline unsigned int x86_model(unsigned int sig)
+{
+	unsigned int x86, model;
+
+	x86 = __x86_family(sig);
+
+	model = (sig >> 4) & 0xf;
+
+	if (x86 == 0x6 || x86 == 0xf)
+		model += ((sig >> 16) & 0xf) << 4;
+
+	return model;
+}
+
 extern void __init load_ucode_bsp(void);
 extern void load_ucode_ap(void);
 extern int __init save_microcode_in_initrd(void);
diff --git a/arch/x86/kernel/cpu/microcode/core_early.c b/arch/x86/kernel/cpu/microcode/core_early.c
index d45df4bd16ab..a413a69cbd74 100644
--- a/arch/x86/kernel/cpu/microcode/core_early.c
+++ b/arch/x86/kernel/cpu/microcode/core_early.c
@@ -23,57 +23,6 @@
 #include <asm/processor.h>
 #include <asm/cmdline.h>
 
-#define QCHAR(a, b, c, d) ((a) + ((b) << 8) + ((c) << 16) + ((d) << 24))
-#define CPUID_INTEL1 QCHAR('G', 'e', 'n', 'u')
-#define CPUID_INTEL2 QCHAR('i', 'n', 'e', 'I')
-#define CPUID_INTEL3 QCHAR('n', 't', 'e', 'l')
-#define CPUID_AMD1 QCHAR('A', 'u', 't', 'h')
-#define CPUID_AMD2 QCHAR('e', 'n', 't', 'i')
-#define CPUID_AMD3 QCHAR('c', 'A', 'M', 'D')
-
-#define CPUID_IS(a, b, c, ebx, ecx, edx)	\
-		(!((ebx ^ (a))|(edx ^ (b))|(ecx ^ (c))))
-
-/*
- * In early loading microcode phase on BSP, boot_cpu_data is not set up yet.
- * x86_vendor() gets vendor id for BSP.
- *
- * In 32 bit AP case, accessing boot_cpu_data needs linear address. To simplify
- * coding, we still use x86_vendor() to get vendor id for AP.
- *
- * x86_vendor() gets vendor information directly through cpuid.
- */
-static int x86_vendor(void)
-{
-	u32 eax = 0x00000000;
-	u32 ebx, ecx = 0, edx;
-
-	native_cpuid(&eax, &ebx, &ecx, &edx);
-
-	if (CPUID_IS(CPUID_INTEL1, CPUID_INTEL2, CPUID_INTEL3, ebx, ecx, edx))
-		return X86_VENDOR_INTEL;
-
-	if (CPUID_IS(CPUID_AMD1, CPUID_AMD2, CPUID_AMD3, ebx, ecx, edx))
-		return X86_VENDOR_AMD;
-
-	return X86_VENDOR_UNKNOWN;
-}
-
-static int x86_family(void)
-{
-	u32 eax = 0x00000001;
-	u32 ebx, ecx = 0, edx;
-	int x86;
-
-	native_cpuid(&eax, &ebx, &ecx, &edx);
-
-	x86 = (eax >> 8) & 0xf;
-	if (x86 == 15)
-		x86 += (eax >> 20) & 0xff;
-
-	return x86;
-}
-
 static bool __init check_loader_disabled_bsp(void)
 {
 #ifdef CONFIG_X86_32
@@ -96,7 +45,7 @@ static bool __init check_loader_disabled_bsp(void)
 
 void __init load_ucode_bsp(void)
 {
-	int vendor, x86;
+	int vendor, family;
 
 	if (check_loader_disabled_bsp())
 		return;
@@ -105,15 +54,15 @@ void __init load_ucode_bsp(void)
 		return;
 
 	vendor = x86_vendor();
-	x86 = x86_family();
+	family = x86_family();
 
 	switch (vendor) {
 	case X86_VENDOR_INTEL:
-		if (x86 >= 6)
+		if (family >= 6)
 			load_ucode_intel_bsp();
 		break;
 	case X86_VENDOR_AMD:
-		if (x86 >= 0x10)
+		if (family >= 0x10)
 			load_ucode_amd_bsp();
 		break;
 	default:
@@ -132,7 +81,7 @@ static bool check_loader_disabled_ap(void)
 
 void load_ucode_ap(void)
 {
-	int vendor, x86;
+	int vendor, family;
 
 	if (check_loader_disabled_ap())
 		return;
@@ -141,15 +90,15 @@ void load_ucode_ap(void)
 		return;
 
 	vendor = x86_vendor();
-	x86 = x86_family();
+	family = x86_family();
 
 	switch (vendor) {
 	case X86_VENDOR_INTEL:
-		if (x86 >= 6)
+		if (family >= 6)
 			load_ucode_intel_ap();
 		break;
 	case X86_VENDOR_AMD:
-		if (x86 >= 0x10)
+		if (family >= 0x10)
 			load_ucode_amd_ap();
 		break;
 	default:
@@ -179,18 +128,18 @@ int __init save_microcode_in_initrd(void)
 
 void reload_early_microcode(void)
 {
-	int vendor, x86;
+	int vendor, family;
 
 	vendor = x86_vendor();
-	x86 = x86_family();
+	family = x86_family();
 
 	switch (vendor) {
 	case X86_VENDOR_INTEL:
-		if (x86 >= 6)
+		if (family >= 6)
 			reload_ucode_intel();
 		break;
 	case X86_VENDOR_AMD:
-		if (x86 >= 0x10)
+		if (family >= 0x10)
 			reload_ucode_amd();
 		break;
 	default:
diff --git a/arch/x86/kernel/cpu/microcode/intel_early.c b/arch/x86/kernel/cpu/microcode/intel_early.c
index cd42b3a55897..5c7896bf0e4d 100644
--- a/arch/x86/kernel/cpu/microcode/intel_early.c
+++ b/arch/x86/kernel/cpu/microcode/intel_early.c
@@ -126,31 +126,6 @@ load_microcode(struct mc_saved_data *mc_saved_data,
 	}
 }
 
-static u8 get_x86_family(unsigned long sig)
-{
-	u8 x86;
-
-	x86 = (sig >> 8) & 0xf;
-
-	if (x86 == 0xf)
-		x86 += (sig >> 20) & 0xff;
-
-	return x86;
-}
-
-static u8 get_x86_model(unsigned long sig)
-{
-	u8 x86, x86_model;
-
-	x86 = get_x86_family(sig);
-	x86_model = (sig >> 4) & 0xf;
-
-	if (x86 == 0x6 || x86 == 0xf)
-		x86_model += ((sig >> 16) & 0xf) << 4;
-
-	return x86_model;
-}
-
 /*
  * Given CPU signature and a microcode patch, this function finds if the
  * microcode patch has matching family and model with the CPU.
@@ -159,41 +134,40 @@ static enum ucode_state
 matching_model_microcode(struct microcode_header_intel *mc_header,
 			unsigned long sig)
 {
-	u8 x86, x86_model;
-	u8 x86_ucode, x86_model_ucode;
+	unsigned int fam, model;
+	unsigned int fam_ucode, model_ucode;
 	struct extended_sigtable *ext_header;
 	unsigned long total_size = get_totalsize(mc_header);
 	unsigned long data_size = get_datasize(mc_header);
 	int ext_sigcount, i;
 	struct extended_signature *ext_sig;
 
-	x86 = get_x86_family(sig);
-	x86_model = get_x86_model(sig);
+	fam   = __x86_family(sig);
+	model = x86_model(sig);
 
-	x86_ucode = get_x86_family(mc_header->sig);
-	x86_model_ucode = get_x86_model(mc_header->sig);
+	fam_ucode   = __x86_family(mc_header->sig);
+	model_ucode = x86_model(mc_header->sig);
 
-	if (x86 == x86_ucode && x86_model == x86_model_ucode)
+	if (fam == fam_ucode && model == model_ucode)
 		return UCODE_OK;
 
 	/* Look for ext. headers: */
 	if (total_size <= data_size + MC_HEADER_SIZE)
 		return UCODE_NFOUND;
 
-	ext_header = (void *) mc_header + data_size + MC_HEADER_SIZE;
+	ext_header   = (void *) mc_header + data_size + MC_HEADER_SIZE;
+	ext_sig      = (void *)ext_header + EXT_HEADER_SIZE;
 	ext_sigcount = ext_header->count;
-	ext_sig = (void *)ext_header + EXT_HEADER_SIZE;
 
 	for (i = 0; i < ext_sigcount; i++) {
-		x86_ucode = get_x86_family(ext_sig->sig);
-		x86_model_ucode = get_x86_model(ext_sig->sig);
+		fam_ucode   = __x86_family(ext_sig->sig);
+		model_ucode = x86_model(ext_sig->sig);
 
-		if (x86 == x86_ucode && x86_model == x86_model_ucode)
+		if (fam == fam_ucode && model == model_ucode)
 			return UCODE_OK;
 
 		ext_sig++;
 	}
-
 	return UCODE_NFOUND;
 }
 
@@ -374,7 +348,7 @@ out:
 static int collect_cpu_info_early(struct ucode_cpu_info *uci)
 {
 	unsigned int val[2];
-	u8 x86, x86_model;
+	unsigned int family, model;
 	struct cpu_signature csig;
 	unsigned int eax, ebx, ecx, edx;
 
@@ -389,10 +363,10 @@ static int collect_cpu_info_early(struct ucode_cpu_info *uci)
 	native_cpuid(&eax, &ebx, &ecx, &edx);
 	csig.sig = eax;
 
-	x86 = get_x86_family(csig.sig);
-	x86_model = get_x86_model(csig.sig);
+	family = __x86_family(csig.sig);
+	model  = x86_model(csig.sig);
 
-	if ((x86_model >= 5) || (x86 > 6)) {
+	if ((model >= 5) || (family > 6)) {
 		/* get processor flags from MSR 0x17 */
 		native_rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]);
 		csig.pf = 1 << ((val[1] >> 18) & 7);
-- 
cgit v1.2.3


From 9e02bb46d366b3635da966e29c5a53920ee7fde8 Mon Sep 17 00:00:00 2001
From: Borislav Petkov
Date: Tue, 10 Feb 2015 10:56:37 +0100
Subject: x86/microcode/intel: Simplify generic_load_microcode_early()

* remove state variable and out label
* get rid of completely unused mc_size
* shorten variable names
* get rid of local variables
* don't do assignments in local var declarations for less cluttered code
* finally rename it to the shorter and perfectly fine load_microcode_early()

No functionality change.

Signed-off-by: Borislav Petkov <bp@suse.de>
---
 arch/x86/kernel/cpu/microcode/intel_early.c | 55 ++++++++++++++---------------
 1 file changed, 26 insertions(+), 29 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/microcode/intel_early.c b/arch/x86/kernel/cpu/microcode/intel_early.c
index 5c7896bf0e4d..d7192124f50b 100644
--- a/arch/x86/kernel/cpu/microcode/intel_early.c
+++ b/arch/x86/kernel/cpu/microcode/intel_early.c
@@ -35,38 +35,35 @@ static struct mc_saved_data {
 } mc_saved_data;
 
 static enum ucode_state
-generic_load_microcode_early(struct microcode_intel **mc_saved_p,
-			     unsigned int mc_saved_count,
-			     struct ucode_cpu_info *uci)
+load_microcode_early(struct microcode_intel **saved,
+		     unsigned int num_saved, struct ucode_cpu_info *uci)
 {
 	struct microcode_intel *ucode_ptr, *new_mc = NULL;
-	int new_rev = uci->cpu_sig.rev;
-	enum ucode_state state = UCODE_OK;
-	unsigned int mc_size;
-	struct microcode_header_intel *mc_header;
-	unsigned int csig = uci->cpu_sig.sig;
-	unsigned int cpf = uci->cpu_sig.pf;
-	int i;
+	struct microcode_header_intel *mc_hdr;
+	int new_rev, ret, i;
 
-	for (i = 0; i < mc_saved_count; i++) {
-		ucode_ptr = mc_saved_p[i];
+	new_rev = uci->cpu_sig.rev;
 
-		mc_header = (struct microcode_header_intel *)ucode_ptr;
-		mc_size = get_totalsize(mc_header);
-		if (get_matching_microcode(csig, cpf, ucode_ptr, new_rev)) {
-			new_rev = mc_header->rev;
-			new_mc  = ucode_ptr;
-		}
-	}
+	for (i = 0; i < num_saved; i++) {
+		ucode_ptr = saved[i];
+		mc_hdr	  = (struct microcode_header_intel *)ucode_ptr;
+
+		ret = get_matching_microcode(uci->cpu_sig.sig,
+					     uci->cpu_sig.pf,
+					     ucode_ptr,
+					     new_rev);
+		if (!ret)
+			continue;
 
-	if (!new_mc) {
-		state = UCODE_NFOUND;
-		goto out;
+		new_rev = mc_hdr->rev;
+		new_mc  = ucode_ptr;
 	}
 
+	if (!new_mc)
+		return UCODE_NFOUND;
+
 	uci->mc = (struct microcode_intel *)new_mc;
-out:
-	return state;
+	return UCODE_OK;
 }
 
 static void
@@ -114,13 +111,13 @@ load_microcode(struct mc_saved_data *mc_saved_data,
 		microcode_pointer(mc_saved_tmp, mc_saved_in_initrd,
 				  initrd_start, count);
 
-		return generic_load_microcode_early(mc_saved_tmp, count, uci);
+		return load_microcode_early(mc_saved_tmp, count, uci);
 	} else {
 #ifdef CONFIG_X86_32
 		microcode_phys(mc_saved_tmp, mc_saved_data);
-		return generic_load_microcode_early(mc_saved_tmp, count, uci);
+		return load_microcode_early(mc_saved_tmp, count, uci);
 #else
-		return generic_load_microcode_early(mc_saved_data->mc_saved,
+		return load_microcode_early(mc_saved_data->mc_saved,
 						    count, uci);
 #endif
 	}
@@ -773,8 +770,8 @@ void reload_ucode_intel(void)
 
 	collect_cpu_info_early(&uci);
 
-	ret = generic_load_microcode_early(mc_saved_data.mc_saved,
-					   mc_saved_data.mc_saved_count, &uci);
+	ret = load_microcode_early(mc_saved_data.mc_saved,
+				   mc_saved_data.mc_saved_count, &uci);
 	if (ret != UCODE_OK)
 		return;
 
-- 
cgit v1.2.3


From e3d8f6747663b468ccedb8af0f38f2be82874c63 Mon Sep 17 00:00:00 2001
From: Borislav Petkov
Date: Tue, 10 Feb 2015 11:28:23 +0100
Subject: x86/microcode/intel: Move mc arg last in get_matching_{microcode|sig}

... arguments list so that it comes more natural for those functions to
have the signature, processor flags and revision together, before the
rest of the args.

No functionality change.

Signed-off-by: Borislav Petkov <bp@suse.de>
---
 arch/x86/include/asm/microcode_intel.h      |  5 ++---
 arch/x86/kernel/cpu/microcode/intel.c       |  4 ++--
 arch/x86/kernel/cpu/microcode/intel_early.c |  6 +++---
 arch/x86/kernel/cpu/microcode/intel_lib.c   | 16 +++++++---------
 4 files changed, 14 insertions(+), 17 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/microcode_intel.h b/arch/x86/include/asm/microcode_intel.h
index a6b753a131cd..2b9209c46ca9 100644
--- a/arch/x86/include/asm/microcode_intel.h
+++ b/arch/x86/include/asm/microcode_intel.h
@@ -56,10 +56,9 @@ struct extended_sigtable {
 
 #define exttable_size(et) ((et)->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE)
 
-extern int
-get_matching_microcode(unsigned int csig, int cpf, void *mc, int rev);
+extern int get_matching_microcode(unsigned int csig, int cpf, int rev, void *mc);
 extern int microcode_sanity_check(void *mc, int print_err);
-extern int get_matching_sig(unsigned int csig, int cpf, void *mc, int rev);
+extern int get_matching_sig(unsigned int csig, int cpf, int rev, void *mc);
 
 static inline int
 revision_is_newer(struct microcode_header_intel *mc_header, int rev)
diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c
index 746e7fd08aad..a41beadb3db9 100644
--- a/arch/x86/kernel/cpu/microcode/intel.c
+++ b/arch/x86/kernel/cpu/microcode/intel.c
@@ -124,7 +124,7 @@ static int get_matching_mc(struct microcode_intel *mc_intel, int cpu)
 	cpf = cpu_sig.pf;
 	crev = cpu_sig.rev;
 
-	return get_matching_microcode(csig, cpf, mc_intel, crev);
+	return get_matching_microcode(csig, cpf, crev, mc_intel);
 }
 
 static int apply_microcode_intel(int cpu)
@@ -226,7 +226,7 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
 
 		csig = uci->cpu_sig.sig;
 		cpf = uci->cpu_sig.pf;
-		if (get_matching_microcode(csig, cpf, mc, new_rev)) {
+		if (get_matching_microcode(csig, cpf, new_rev, mc)) {
 			vfree(new_mc);
 			new_rev = mc_header.rev;
 			new_mc  = mc;
diff --git a/arch/x86/kernel/cpu/microcode/intel_early.c b/arch/x86/kernel/cpu/microcode/intel_early.c
index d7192124f50b..844c75895160 100644
--- a/arch/x86/kernel/cpu/microcode/intel_early.c
+++ b/arch/x86/kernel/cpu/microcode/intel_early.c
@@ -50,8 +50,8 @@ load_microcode_early(struct microcode_intel **saved,
 
 		ret = get_matching_microcode(uci->cpu_sig.sig,
 					     uci->cpu_sig.pf,
-					     ucode_ptr,
-					     new_rev);
+					     new_rev,
+					     ucode_ptr);
 		if (!ret)
 			continue;
 
@@ -251,7 +251,7 @@ static unsigned int _save_mc(struct microcode_intel **mc_saved,
 		pf	     = mc_saved_hdr->pf;
 		new_rev	     = mc_hdr->rev;
 
-		if (!get_matching_sig(sig, pf, ucode_ptr, new_rev))
+		if (!get_matching_sig(sig, pf, new_rev, ucode_ptr))
 			continue;
 
 		found = 1;
diff --git a/arch/x86/kernel/cpu/microcode/intel_lib.c b/arch/x86/kernel/cpu/microcode/intel_lib.c
index 7e259d99b0aa..cd47a510a3f1 100644
--- a/arch/x86/kernel/cpu/microcode/intel_lib.c
+++ b/arch/x86/kernel/cpu/microcode/intel_lib.c
@@ -122,10 +122,9 @@ int microcode_sanity_check(void *mc, int print_err)
 EXPORT_SYMBOL_GPL(microcode_sanity_check);
 
 /*
- * return 0 - no update found
- * return 1 - found update
+ * Returns 1 if update has been found, 0 otherwise.
  */
-int get_matching_sig(unsigned int csig, int cpf, void *mc, int rev)
+int get_matching_sig(unsigned int csig, int cpf, int rev, void *mc)
 {
 	struct microcode_header_intel *mc_header = mc;
 	struct extended_sigtable *ext_header;
@@ -153,16 +152,15 @@ int get_matching_sig(unsigned int csig, int cpf, void *mc, int rev)
 }
 
 /*
- * return 0 - no update found
- * return 1 - found update
+ * Returns 1 if update has been found, 0 otherwise.
  */
-int get_matching_microcode(unsigned int csig, int cpf, void *mc, int rev)
+int get_matching_microcode(unsigned int csig, int cpf, int rev, void *mc)
 {
-	struct microcode_header_intel *mc_header = mc;
+	struct microcode_header_intel *mc_hdr = mc;
 
-	if (!revision_is_newer(mc_header, rev))
+	if (!revision_is_newer(mc_hdr, rev))
 		return 0;
 
-	return get_matching_sig(csig, cpf, mc, rev);
+	return get_matching_sig(csig, cpf, rev, mc);
 }
 EXPORT_SYMBOL_GPL(get_matching_microcode);
-- 
cgit v1.2.3


From 140f74fcedbfc862178c082c50b5f54dd52c1ce4 Mon Sep 17 00:00:00 2001
From: Borislav Petkov
Date: Tue, 10 Feb 2015 12:00:56 +0100
Subject: x86/microcode/intel: Sanitize microcode_pointer()

Shorten variable names and rename it to what it does.

No functionality change.

Signed-off-by: Borislav Petkov <bp@suse.de>
---
 arch/x86/kernel/cpu/microcode/intel_early.c | 23 +++++++++--------------
 1 file changed, 9 insertions(+), 14 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/microcode/intel_early.c b/arch/x86/kernel/cpu/microcode/intel_early.c
index 844c75895160..7bcad1aacce3 100644
--- a/arch/x86/kernel/cpu/microcode/intel_early.c
+++ b/arch/x86/kernel/cpu/microcode/intel_early.c
@@ -66,16 +66,14 @@ load_microcode_early(struct microcode_intel **saved,
 	return UCODE_OK;
 }
 
-static void
-microcode_pointer(struct microcode_intel **mc_saved,
-		  unsigned long *mc_saved_in_initrd,
-		  unsigned long initrd_start, int mc_saved_count)
+static inline void
+copy_initrd_ptrs(struct microcode_intel **mc_saved, unsigned long *initrd,
+		  unsigned long off, int num_saved)
 {
 	int i;
 
-	for (i = 0; i < mc_saved_count; i++)
-		mc_saved[i] = (struct microcode_intel *)
-			      (mc_saved_in_initrd[i] + initrd_start);
+	for (i = 0; i < num_saved; i++)
+		mc_saved[i] = (struct microcode_intel *)(initrd[i] + off);
 }
 
 #ifdef CONFIG_X86_32
@@ -99,17 +97,14 @@ microcode_phys(struct microcode_intel **mc_saved_tmp,
 #endif
 
 static enum ucode_state
-load_microcode(struct mc_saved_data *mc_saved_data,
-	       unsigned long *mc_saved_in_initrd,
-	       unsigned long initrd_start,
-	       struct ucode_cpu_info *uci)
+load_microcode(struct mc_saved_data *mc_saved_data, unsigned long *initrd,
+	       unsigned long initrd_start, struct ucode_cpu_info *uci)
 {
 	struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT];
 	unsigned int count = mc_saved_data->mc_saved_count;
 
 	if (!mc_saved_data->mc_saved) {
-		microcode_pointer(mc_saved_tmp, mc_saved_in_initrd,
-				  initrd_start, count);
+		copy_initrd_ptrs(mc_saved_tmp, initrd, initrd_start, count);
 
 		return load_microcode_early(mc_saved_tmp, count, uci);
 	} else {
@@ -672,7 +667,7 @@ int __init save_microcode_in_initrd_intel(void)
 	if (count == 0)
 		return ret;
 
-	microcode_pointer(mc_saved, mc_saved_in_initrd, initrd_start, count);
+	copy_initrd_ptrs(mc_saved, mc_saved_in_initrd, initrd_start, count);
 	ret = save_microcode(&mc_saved_data, mc_saved, count);
 	if (ret)
 		pr_err("Cannot save microcode patches from initrd.\n");
-- 
cgit v1.2.3


From 4f1f605cfe3046c4c9f07920c47d3f801d31c0ba Mon Sep 17 00:00:00 2001
From: Borislav Petkov
Date: Tue, 10 Feb 2015 12:45:16 +0100
Subject: x86/microcode/intel: Check scan_microcode()'s retval

... and do not attempt to load anything in case of error.

Signed-off-by: Borislav Petkov <bp@suse.de>
---
 arch/x86/kernel/cpu/microcode/intel_early.c | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/microcode/intel_early.c b/arch/x86/kernel/cpu/microcode/intel_early.c
index 7bcad1aacce3..88a0348f455c 100644
--- a/arch/x86/kernel/cpu/microcode/intel_early.c
+++ b/arch/x86/kernel/cpu/microcode/intel_early.c
@@ -514,9 +514,9 @@ EXPORT_SYMBOL_GPL(save_mc_for_early);
 
 static __initdata char ucode_name[] = "kernel/x86/microcode/GenuineIntel.bin";
 static __init enum ucode_state
-scan_microcode(unsigned long start, unsigned long size,
-	       struct mc_saved_data *mc_saved_data,
-	       unsigned long *mc_saved_in_initrd, struct ucode_cpu_info *uci)
+scan_microcode(struct mc_saved_data *mc_saved_data, unsigned long *initrd,
+	       unsigned long start, unsigned long size,
+	       struct ucode_cpu_info *uci)
 {
 	struct cpio_data cd;
 	long offset = 0;
@@ -534,8 +534,7 @@ scan_microcode(unsigned long start, unsigned long size,
 		return UCODE_ERROR;
 
 	return get_matching_model_microcode(0, start, cd.data, cd.size,
-					    mc_saved_data, mc_saved_in_initrd,
-					    uci);
+					    mc_saved_data, initrd, uci);
 }
 
 /*
@@ -679,16 +678,19 @@ int __init save_microcode_in_initrd_intel(void)
 
 static void __init
 _load_ucode_intel_bsp(struct mc_saved_data *mc_saved_data,
-		      unsigned long *mc_saved_in_initrd,
+		      unsigned long *initrd,
 		      unsigned long start, unsigned long size)
 {
 	struct ucode_cpu_info uci;
 	enum ucode_state ret;
 
 	collect_cpu_info_early(&uci);
-	scan_microcode(start, size, mc_saved_data, mc_saved_in_initrd, &uci);
 
-	ret = load_microcode(mc_saved_data, mc_saved_in_initrd, start, &uci);
+	ret = scan_microcode(mc_saved_data, initrd, start, size, &uci);
+	if (ret != UCODE_OK)
+		return;
+
+	ret = load_microcode(mc_saved_data, initrd, start, &uci);
 	if (ret != UCODE_OK)
 		return;
 
-- 
cgit v1.2.3


From a858b5e50442579a73112c7c79bf6096e96de535 Mon Sep 17 00:00:00 2001
From: Borislav Petkov
Date: Wed, 11 Feb 2015 14:54:42 +0100
Subject: x86/microcode/intel: Fix printing of microcode blobs in
 show_saved_mc()

When doing

  echo 1 > /sys/devices/system/cpu/microcode/reload

in order to reload microcode, I get:

  microcode: Total microcode saved: 1
  BUG: using smp_processor_id() in preemptible [00000000] code: bash/2606
  caller is debug_smp_processor_id+0x17/0x20
  CPU: 1 PID: 2606 Comm: bash Not tainted 3.19.0-rc7+ #9
  Hardware name: LENOVO 2320CTO/2320CTO, BIOS G2ET86WW (2.06 ) 11/13/2012
   ffffffff81a4266d ffff8802131db808 ffffffff81666588 0000000000000007
   0000000000000001 ffff8802131db838 ffffffff812e6eef ffff8802131db868
   00000000000306a9 0000000000000010 0000000000000015 ffff8802131db848
  Call Trace:
   dump_stack
   check_preemption_disabled
   debug_smp_processor_id
   show_saved_mc
   ? save_microcode.constprop.8
   save_mc_for_early
   ? print_context_stack
   ? dump_trace
   ? __bfs
   ? mark_held_locks
   ? get_page_from_freelist
   ? trace_hardirqs_on_caller
   ? trace_hardirqs_on
   ? __alloc_pages_nodemask
   ? __get_vm_area_node
   ? map_vm_area
   ? __vmalloc_node_range
   ? generic_load_microcode
   generic_load_microcode
   ? microcode_fini_cpu
   request_microcode_fw
   reload_store
   dev_attr_store
   sysfs_kf_write
   kernfs_fop_write
   vfs_write
   ? sysret_check
   SyS_write
   system_call_fastpath
  microcode: CPU1: sig=0x306a9, pf=0x10, rev=0x15
  microcode: mc_saved[0]: sig=0x306a9, pf=0x12, rev=0x1b, toal size=0x3000, date = 2014-05-29

because we're using smp_processor_id() in preemtible context. And we
don't really need to use it there because the microcode container we're
dumping is global and CPU-specific info is irrelevant.

While at it, make pr_* stuff use "microcode: " prefix for easier
grepping and document how to enable the DEBUG build.

Signed-off-by: Borislav Petkov <bp@suse.de>
---
 arch/x86/kernel/cpu/microcode/intel_early.c | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/microcode/intel_early.c b/arch/x86/kernel/cpu/microcode/intel_early.c
index 88a0348f455c..2f49ab4ac0ae 100644
--- a/arch/x86/kernel/cpu/microcode/intel_early.c
+++ b/arch/x86/kernel/cpu/microcode/intel_early.c
@@ -16,6 +16,14 @@
  *	as published by the Free Software Foundation; either version
  *	2 of the License, or (at your option) any later version.
  */
+
+/*
+ * This needs to be before all headers so that pr_debug in printk.h doesn't turn
+ * printk calls into no_printk().
+ *
+ *#define DEBUG
+ */
+
 #include <linux/module.h>
 #include <linux/mm.h>
 #include <linux/slab.h>
@@ -28,6 +36,9 @@
 #include <asm/tlbflush.h>
 #include <asm/setup.h>
 
+#undef pr_fmt
+#define pr_fmt(fmt)	"microcode: " fmt
+
 static unsigned long mc_saved_in_initrd[MAX_UCODE_COUNT];
 static struct mc_saved_data {
 	unsigned int mc_saved_count;
@@ -397,8 +408,7 @@ static void __ref show_saved_mc(void)
 	sig = uci.cpu_sig.sig;
 	pf = uci.cpu_sig.pf;
 	rev = uci.cpu_sig.rev;
-	pr_debug("CPU%d: sig=0x%x, pf=0x%x, rev=0x%x\n",
-		 smp_processor_id(), sig, pf, rev);
+	pr_debug("CPU: sig=0x%x, pf=0x%x, rev=0x%x\n", sig, pf, rev);
 
 	for (i = 0; i < mc_saved_data.mc_saved_count; i++) {
 		struct microcode_header_intel *mc_saved_header;
-- 
cgit v1.2.3


From f563db4bdb8ef5ea73d0f5ea2b20384c10fbd617 Mon Sep 17 00:00:00 2001
From: Radim Krčmář
Date: Fri, 27 Feb 2015 16:32:38 +0100
Subject: KVM: SVM: fix interrupt injection (apic->isr_count always 0)

In commit b4eef9b36db4, we started to use hwapic_isr_update() != NULL
instead of kvm_apic_vid_enabled(vcpu->kvm).  This didn't work because
SVM had it defined and "apicv" path in apic_{set,clear}_isr() does not
change apic->isr_count, because it should always be 1.  The initial
value of apic->isr_count was based on kvm_apic_vid_enabled(vcpu->kvm),
which is always 0 for SVM, so KVM could have injected interrupts when it
shouldn't.

Fix it by implicitly setting SVM's hwapic_isr_update to NULL and make the
initial isr_count depend on hwapic_isr_update() for good measure.

Fixes: b4eef9b36db4 ("kvm: x86: vmx: NULL out hwapic_isr_update() in case of !enable_apicv")
Reported-and-tested-by: Borislav Petkov <bp@suse.de>
Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/lapic.c | 4 ++--
 arch/x86/kvm/svm.c   | 6 ------
 2 files changed, 2 insertions(+), 8 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index e55b5fc344eb..bd4e34de24c7 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1572,7 +1572,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu)
 		apic_set_reg(apic, APIC_TMR + 0x10 * i, 0);
 	}
 	apic->irr_pending = kvm_apic_vid_enabled(vcpu->kvm);
-	apic->isr_count = kvm_apic_vid_enabled(vcpu->kvm);
+	apic->isr_count = kvm_x86_ops->hwapic_isr_update ? 1 : 0;
 	apic->highest_isr_cache = -1;
 	update_divide_count(apic);
 	atomic_set(&apic->lapic_timer.pending, 0);
@@ -1782,7 +1782,7 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu,
 	update_divide_count(apic);
 	start_apic_timer(apic);
 	apic->irr_pending = true;
-	apic->isr_count = kvm_apic_vid_enabled(vcpu->kvm) ?
+	apic->isr_count = kvm_x86_ops->hwapic_isr_update ?
 				1 : count_vectors(apic->regs + APIC_ISR);
 	apic->highest_isr_cache = -1;
 	if (kvm_x86_ops->hwapic_irr_update)
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index d319e0c24758..cc618c882f90 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -3649,11 +3649,6 @@ static void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
 	return;
 }
 
-static void svm_hwapic_isr_update(struct kvm *kvm, int isr)
-{
-	return;
-}
-
 static void svm_sync_pir_to_irr(struct kvm_vcpu *vcpu)
 {
 	return;
@@ -4403,7 +4398,6 @@ static struct kvm_x86_ops svm_x86_ops = {
 	.set_virtual_x2apic_mode = svm_set_virtual_x2apic_mode,
 	.vm_has_apicv = svm_vm_has_apicv,
 	.load_eoi_exitmap = svm_load_eoi_exitmap,
-	.hwapic_isr_update = svm_hwapic_isr_update,
 	.sync_pir_to_irr = svm_sync_pir_to_irr,
 
 	.set_tss_addr = svm_set_tss_addr,
-- 
cgit v1.2.3


From 29a5ff97fa0d8045d262a772c3853e3ef1ed98d8 Mon Sep 17 00:00:00 2001
From: Brian Gerst
Date: Tue, 3 Mar 2015 22:31:32 -0500
Subject: x86/compat: Remove compat_ni_syscall()

compat_ni_syscall() does the same thing as sys_ni_syscall().

Signed-off-by: Brian Gerst <brgerst@gmail.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1425439896-8322-2-git-send-email-brgerst@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/ia32/Makefile       | 2 +-
 arch/x86/ia32/nosyscall.c    | 7 -------
 arch/x86/ia32/syscall_ia32.c | 4 ++--
 3 files changed, 3 insertions(+), 10 deletions(-)
 delete mode 100644 arch/x86/ia32/nosyscall.c

(limited to 'arch/x86')

diff --git a/arch/x86/ia32/Makefile b/arch/x86/ia32/Makefile
index e785b422b766..e66d85021aa2 100644
--- a/arch/x86/ia32/Makefile
+++ b/arch/x86/ia32/Makefile
@@ -3,7 +3,7 @@
 #
 
 obj-$(CONFIG_IA32_EMULATION) := ia32entry.o sys_ia32.o ia32_signal.o
-obj-$(CONFIG_IA32_EMULATION) += nosyscall.o syscall_ia32.o
+obj-$(CONFIG_IA32_EMULATION) += syscall_ia32.o
 
 obj-$(CONFIG_IA32_AOUT) += ia32_aout.o
 
diff --git a/arch/x86/ia32/nosyscall.c b/arch/x86/ia32/nosyscall.c
deleted file mode 100644
index 51ecd5b4e787..000000000000
--- a/arch/x86/ia32/nosyscall.c
+++ /dev/null
@@ -1,7 +0,0 @@
-#include <linux/kernel.h>
-#include <linux/errno.h>
-
-long compat_ni_syscall(void)
-{
-	return -ENOSYS;
-}
diff --git a/arch/x86/ia32/syscall_ia32.c b/arch/x86/ia32/syscall_ia32.c
index 4754ba0f5d9f..3429b14793e3 100644
--- a/arch/x86/ia32/syscall_ia32.c
+++ b/arch/x86/ia32/syscall_ia32.c
@@ -13,13 +13,13 @@
 
 typedef void (*sys_call_ptr_t)(void);
 
-extern void compat_ni_syscall(void);
+extern asmlinkage void sys_ni_syscall(void);
 
 const sys_call_ptr_t ia32_sys_call_table[__NR_ia32_syscall_max+1] = {
 	/*
 	 * Smells like a compiler bug -- it doesn't work
 	 * when the & below is removed.
 	 */
-	[0 ... __NR_ia32_syscall_max] = &compat_ni_syscall,
+	[0 ... __NR_ia32_syscall_max] = &sys_ni_syscall,
 #include <asm/syscalls_32.h>
 };
-- 
cgit v1.2.3


From 2aa4a710928863e84cb71e60b7c839d12403f5ca Mon Sep 17 00:00:00 2001
From: Brian Gerst
Date: Tue, 3 Mar 2015 22:31:33 -0500
Subject: x86/compat: Merge native and compat 32-bit syscall tables

Combine the 32-bit syscall tables into one file.

Signed-off-by: Brian Gerst <brgerst@gmail.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1425439896-8322-3-git-send-email-brgerst@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/ia32/Makefile       |  1 -
 arch/x86/ia32/syscall_ia32.c | 25 -------------------------
 arch/x86/kernel/Makefile     |  1 +
 arch/x86/kernel/syscall_32.c | 16 ++++++++++++----
 4 files changed, 13 insertions(+), 30 deletions(-)
 delete mode 100644 arch/x86/ia32/syscall_ia32.c

(limited to 'arch/x86')

diff --git a/arch/x86/ia32/Makefile b/arch/x86/ia32/Makefile
index e66d85021aa2..bb635c641869 100644
--- a/arch/x86/ia32/Makefile
+++ b/arch/x86/ia32/Makefile
@@ -3,7 +3,6 @@
 #
 
 obj-$(CONFIG_IA32_EMULATION) := ia32entry.o sys_ia32.o ia32_signal.o
-obj-$(CONFIG_IA32_EMULATION) += syscall_ia32.o
 
 obj-$(CONFIG_IA32_AOUT) += ia32_aout.o
 
diff --git a/arch/x86/ia32/syscall_ia32.c b/arch/x86/ia32/syscall_ia32.c
deleted file mode 100644
index 3429b14793e3..000000000000
--- a/arch/x86/ia32/syscall_ia32.c
+++ /dev/null
@@ -1,25 +0,0 @@
-/* System call table for ia32 emulation. */
-
-#include <linux/linkage.h>
-#include <linux/sys.h>
-#include <linux/cache.h>
-#include <asm/asm-offsets.h>
-
-#define __SYSCALL_I386(nr, sym, compat) extern asmlinkage void compat(void) ;
-#include <asm/syscalls_32.h>
-#undef __SYSCALL_I386
-
-#define __SYSCALL_I386(nr, sym, compat) [nr] = compat,
-
-typedef void (*sys_call_ptr_t)(void);
-
-extern asmlinkage void sys_ni_syscall(void);
-
-const sys_call_ptr_t ia32_sys_call_table[__NR_ia32_syscall_max+1] = {
-	/*
-	 * Smells like a compiler bug -- it doesn't work
-	 * when the & below is removed.
-	 */
-	[0 ... __NR_ia32_syscall_max] = &sys_ni_syscall,
-#include <asm/syscalls_32.h>
-};
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 5d4502c8b983..62fbe7177cdd 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -28,6 +28,7 @@ obj-$(CONFIG_X86_32)	+= i386_ksyms_32.o
 obj-$(CONFIG_X86_64)	+= sys_x86_64.o x8664_ksyms_64.o
 obj-$(CONFIG_X86_64)	+= mcount_64.o
 obj-y			+= syscall_$(BITS).o vsyscall_gtod.o
+obj-$(CONFIG_IA32_EMULATION)	+= syscall_32.o
 obj-$(CONFIG_X86_VSYSCALL_EMULATION)	+= vsyscall_64.o vsyscall_emu_64.o
 obj-$(CONFIG_X86_ESPFIX64)	+= espfix_64.o
 obj-$(CONFIG_SYSFS)	+= ksysfs.o
diff --git a/arch/x86/kernel/syscall_32.c b/arch/x86/kernel/syscall_32.c
index e9bcd57d8a9e..3777189c4a19 100644
--- a/arch/x86/kernel/syscall_32.c
+++ b/arch/x86/kernel/syscall_32.c
@@ -5,21 +5,29 @@
 #include <linux/cache.h>
 #include <asm/asm-offsets.h>
 
-#define __SYSCALL_I386(nr, sym, compat) extern asmlinkage void sym(void) ;
+#ifdef CONFIG_IA32_EMULATION
+#define SYM(sym, compat) compat
+#else
+#define SYM(sym, compat) sym
+#define ia32_sys_call_table sys_call_table
+#define __NR_ia32_syscall_max __NR_syscall_max
+#endif
+
+#define __SYSCALL_I386(nr, sym, compat) extern asmlinkage void SYM(sym, compat)(void) ;
 #include <asm/syscalls_32.h>
 #undef __SYSCALL_I386
 
-#define __SYSCALL_I386(nr, sym, compat) [nr] = sym,
+#define __SYSCALL_I386(nr, sym, compat) [nr] = SYM(sym, compat),
 
 typedef asmlinkage void (*sys_call_ptr_t)(void);
 
 extern asmlinkage void sys_ni_syscall(void);
 
-__visible const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = {
+__visible const sys_call_ptr_t ia32_sys_call_table[__NR_ia32_syscall_max+1] = {
 	/*
 	 * Smells like a compiler bug -- it doesn't work
 	 * when the & below is removed.
 	 */
-	[0 ... __NR_syscall_max] = &sys_ni_syscall,
+	[0 ... __NR_ia32_syscall_max] = &sys_ni_syscall,
 #include <asm/syscalls_32.h>
 };
-- 
cgit v1.2.3


From 7e8e385aaf6ed5b64b5d9108081cfcdcdd021b78 Mon Sep 17 00:00:00 2001
From: Brian Gerst
Date: Tue, 3 Mar 2015 22:31:34 -0500
Subject: x86/compat: Remove sys32_vm86_warning

The check against lastcomm is racy, and the message it produces
isn't necessary.  vm86 support can be disabled on a 32-bit
kernel also, and doesn't have this message.  Switch to
sys_ni_syscall instead.

Signed-off-by: Brian Gerst <brgerst@gmail.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1425439896-8322-4-git-send-email-brgerst@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/ia32/sys_ia32.c         | 14 --------------
 arch/x86/syscalls/syscall_32.tbl |  4 ++--
 2 files changed, 2 insertions(+), 16 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c
index 8e0ceecdc957..719cd702b0a4 100644
--- a/arch/x86/ia32/sys_ia32.c
+++ b/arch/x86/ia32/sys_ia32.c
@@ -201,20 +201,6 @@ long sys32_fadvise64_64(int fd, __u32 offset_low, __u32 offset_high,
 				advice);
 }
 
-long sys32_vm86_warning(void)
-{
-	struct task_struct *me = current;
-	static char lastcomm[sizeof(me->comm)];
-
-	if (strncmp(lastcomm, me->comm, sizeof(lastcomm))) {
-		compat_printk(KERN_INFO
-			      "%s: vm86 mode not supported on 64 bit kernel\n",
-			      me->comm);
-		strncpy(lastcomm, me->comm, sizeof(lastcomm));
-	}
-	return -ENOSYS;
-}
-
 asmlinkage ssize_t sys32_readahead(int fd, unsigned off_lo, unsigned off_hi,
 				   size_t count)
 {
diff --git a/arch/x86/syscalls/syscall_32.tbl b/arch/x86/syscalls/syscall_32.tbl
index b3560ece1c9f..ef8187f9d28d 100644
--- a/arch/x86/syscalls/syscall_32.tbl
+++ b/arch/x86/syscalls/syscall_32.tbl
@@ -119,7 +119,7 @@
 110	i386	iopl			sys_iopl
 111	i386	vhangup			sys_vhangup
 112	i386	idle
-113	i386	vm86old			sys_vm86old			sys32_vm86_warning
+113	i386	vm86old			sys_vm86old			sys_ni_syscall
 114	i386	wait4			sys_wait4			compat_sys_wait4
 115	i386	swapoff			sys_swapoff
 116	i386	sysinfo			sys_sysinfo			compat_sys_sysinfo
@@ -172,7 +172,7 @@
 163	i386	mremap			sys_mremap
 164	i386	setresuid		sys_setresuid16
 165	i386	getresuid		sys_getresuid16
-166	i386	vm86			sys_vm86			sys32_vm86_warning
+166	i386	vm86			sys_vm86			sys_ni_syscall
 167	i386	query_module
 168	i386	poll			sys_poll
 169	i386	nfsservctl
-- 
cgit v1.2.3


From 63f1789ec71677dd285d43d6c79ca44808f16945 Mon Sep 17 00:00:00 2001
From: Jiang Liu
Date: Wed, 4 Mar 2015 16:47:11 +0800
Subject: x86/PCI/ACPI: Ignore resources consumed by host bridge itself

When parsing resources for PCI host bridge, we should ignore resources
consumed by host bridge itself and only report window resources available
to child PCI busses.

Fixes: 593669c2ac0f (x86/PCI/ACPI: Use common ACPI resource interfaces ...)
Signed-off-by: Jiang Liu <jiang.liu@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 arch/x86/pci/acpi.c | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c
index 6ac273832f28..e4695985f9de 100644
--- a/arch/x86/pci/acpi.c
+++ b/arch/x86/pci/acpi.c
@@ -331,7 +331,7 @@ static void probe_pci_root_info(struct pci_root_info *info,
 				struct list_head *list)
 {
 	int ret;
-	struct resource_entry *entry;
+	struct resource_entry *entry, *tmp;
 
 	sprintf(info->name, "PCI Bus %04x:%02x", domain, busnum);
 	info->bridge = device;
@@ -345,8 +345,13 @@ static void probe_pci_root_info(struct pci_root_info *info,
 		dev_dbg(&device->dev,
 			"no IO and memory resources present in _CRS\n");
 	else
-		resource_list_for_each_entry(entry, list)
-			entry->res->name = info->name;
+		resource_list_for_each_entry_safe(entry, tmp, list) {
+			if ((entry->res->flags & IORESOURCE_WINDOW) == 0 ||
+			    (entry->res->flags & IORESOURCE_DISABLED))
+				resource_list_destroy_entry(entry);
+			else
+				entry->res->name = info->name;
+		}
 }
 
 struct pci_bus *pci_acpi_scan_root(struct acpi_pci_root *root)
-- 
cgit v1.2.3


From 69e8544cd0056e02965ffb5e8414fb7501a2ee2e Mon Sep 17 00:00:00 2001
From: Denys Vlasenko
Date: Thu, 26 Feb 2015 14:40:24 -0800
Subject: x86/asm/64: Open-code register save/restore in trace_hardirqs*()
 thunks

This is a preparatory patch for change in "struct pt_regs"
handling in entry_64.S.

trace_hardirqs*() thunks were (ab)using a part of the
'pt_regs' handling code, namely the SAVE_ARGS/RESTORE_ARGS
macros, to save/restore registers across C function calls.

Since SAVE_ARGS is going to be changed, open-code
register saving/restoring here.

Incidentally, this removes a bit of dead code:
one SAVE_ARGS was used just to emit a CFI annotation,
but it also generated unreachable assembly instructions.

Take a page from thunk_32.S and use push/pop instructions
instead of movq, they are far shorter:
1 or 2 bytes versus 5, and no need for instructions to adjust %rsp:

   text	   data	    bss	    dec	    hex	filename
    333	     40	      0	    373	    175	thunk_64_movq.o
    104	     40	      0	    144	     90	thunk_64_push_pop.o

[ This is ugly as sin, but we'll fix up the ugliness in the next
  patch. I see no point in reordering patches just to avoid an
  ugly intermediate state.  --Andy ]

Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
Signed-off-by: Andy Lutomirski <luto@amacapital.net>
Cc: Alexei Starovoitov <ast@plumgrid.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Will Drewry <wad@chromium.org>
Link: http://lkml.kernel.org/r/1420927210-19738-4-git-send-email-dvlasenk@redhat.com
Link: http://lkml.kernel.org/r/4c979ad604f0f02c5ade3b3da308b53eabd5e198.1424989793.git.luto@amacapital.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/lib/thunk_64.S | 46 ++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 42 insertions(+), 4 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/lib/thunk_64.S b/arch/x86/lib/thunk_64.S
index b30b5ebd614a..8ec443a0777b 100644
--- a/arch/x86/lib/thunk_64.S
+++ b/arch/x86/lib/thunk_64.S
@@ -17,9 +17,27 @@
 	CFI_STARTPROC
 
 	/* this one pushes 9 elems, the next one would be %rIP */
-	SAVE_ARGS
+	pushq_cfi %rdi
+	CFI_REL_OFFSET rdi, 0
+	pushq_cfi %rsi
+	CFI_REL_OFFSET rsi, 0
+	pushq_cfi %rdx
+	CFI_REL_OFFSET rdx, 0
+	pushq_cfi %rcx
+	CFI_REL_OFFSET rcx, 0
+	pushq_cfi %rax
+	CFI_REL_OFFSET rax, 0
+	pushq_cfi %r8
+	CFI_REL_OFFSET r8, 0
+	pushq_cfi %r9
+	CFI_REL_OFFSET r9, 0
+	pushq_cfi %r10
+	CFI_REL_OFFSET r10, 0
+	pushq_cfi %r11
+	CFI_REL_OFFSET r11, 0
 
 	.if \put_ret_addr_in_rdi
+	/* 9*8(%rsp) is return addr on stack */
 	movq_cfi_restore 9*8, rdi
 	.endif
 
@@ -45,11 +63,31 @@
 #endif
 #endif
 
-	/* SAVE_ARGS below is used only for the .cfi directives it contains. */
+#if defined(CONFIG_TRACE_IRQFLAGS) \
+ || defined(CONFIG_DEBUG_LOCK_ALLOC) \
+ || defined(CONFIG_PREEMPT)
 	CFI_STARTPROC
-	SAVE_ARGS
+	CFI_ADJUST_CFA_OFFSET 9*8
 restore:
-	RESTORE_ARGS
+	popq_cfi %r11
+	CFI_RESTORE r11
+	popq_cfi %r10
+	CFI_RESTORE r10
+	popq_cfi %r9
+	CFI_RESTORE r9
+	popq_cfi %r8
+	CFI_RESTORE r8
+	popq_cfi %rax
+	CFI_RESTORE rax
+	popq_cfi %rcx
+	CFI_RESTORE rcx
+	popq_cfi %rdx
+	CFI_RESTORE rdx
+	popq_cfi %rsi
+	CFI_RESTORE rsi
+	popq_cfi %rdi
+	CFI_RESTORE rdi
 	ret
 	CFI_ENDPROC
 	_ASM_NOKPROBE(restore)
+#endif
-- 
cgit v1.2.3


From 49db46a67bec9ca9e29ece4729a876195877af50 Mon Sep 17 00:00:00 2001
From: Denys Vlasenko
Date: Thu, 26 Feb 2015 14:40:25 -0800
Subject: x86/asm: Introduce push/pop macros which generate CFI_REL_OFFSET and
 CFI_RESTORE

Sequences:

        pushl_cfi %reg
        CFI_REL_OFFSET reg, 0

and:

        popl_cfi %reg
        CFI_RESTORE reg

happen quite often. This patch adds macros which generate them.

No assembly changes (verified with objdump -dr vmlinux.o).

Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
Signed-off-by: Andy Lutomirski <luto@amacapital.net>
Cc: Alexei Starovoitov <ast@plumgrid.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Will Drewry <wad@chromium.org>
Link: http://lkml.kernel.org/r/1421017655-25561-1-git-send-email-dvlasenk@redhat.com
Link: http://lkml.kernel.org/r/2202eb90f175cf45d1b2d1c64dbb5676a8ad07ad.1424989793.git.luto@amacapital.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/calling.h | 42 ++++++++++-------------------
 arch/x86/include/asm/dwarf2.h  | 24 +++++++++++++++++
 arch/x86/kernel/entry_32.S     | 21 +++++----------
 arch/x86/lib/atomic64_cx8_32.S | 50 ++++++++++++++---------------------
 arch/x86/lib/checksum_32.S     | 60 ++++++++++++++----------------------------
 arch/x86/lib/msr-reg.S         | 24 ++++++++---------
 arch/x86/lib/rwsem.S           | 44 ++++++++++++++-----------------
 arch/x86/lib/thunk_32.S        | 18 +++++--------
 arch/x86/lib/thunk_64.S        | 54 +++++++++++++------------------------
 9 files changed, 141 insertions(+), 196 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/calling.h b/arch/x86/include/asm/calling.h
index 1f1297b46f83..3c711f2ab236 100644
--- a/arch/x86/include/asm/calling.h
+++ b/arch/x86/include/asm/calling.h
@@ -210,37 +210,23 @@ For 32-bit we have the following conventions - kernel is built with
  */
 
 	.macro SAVE_ALL
-	pushl_cfi %eax
-	CFI_REL_OFFSET eax, 0
-	pushl_cfi %ebp
-	CFI_REL_OFFSET ebp, 0
-	pushl_cfi %edi
-	CFI_REL_OFFSET edi, 0
-	pushl_cfi %esi
-	CFI_REL_OFFSET esi, 0
-	pushl_cfi %edx
-	CFI_REL_OFFSET edx, 0
-	pushl_cfi %ecx
-	CFI_REL_OFFSET ecx, 0
-	pushl_cfi %ebx
-	CFI_REL_OFFSET ebx, 0
+	pushl_cfi_reg eax
+	pushl_cfi_reg ebp
+	pushl_cfi_reg edi
+	pushl_cfi_reg esi
+	pushl_cfi_reg edx
+	pushl_cfi_reg ecx
+	pushl_cfi_reg ebx
 	.endm
 
 	.macro RESTORE_ALL
-	popl_cfi %ebx
-	CFI_RESTORE ebx
-	popl_cfi %ecx
-	CFI_RESTORE ecx
-	popl_cfi %edx
-	CFI_RESTORE edx
-	popl_cfi %esi
-	CFI_RESTORE esi
-	popl_cfi %edi
-	CFI_RESTORE edi
-	popl_cfi %ebp
-	CFI_RESTORE ebp
-	popl_cfi %eax
-	CFI_RESTORE eax
+	popl_cfi_reg ebx
+	popl_cfi_reg ecx
+	popl_cfi_reg edx
+	popl_cfi_reg esi
+	popl_cfi_reg edi
+	popl_cfi_reg ebp
+	popl_cfi_reg eax
 	.endm
 
 #endif /* CONFIG_X86_64 */
diff --git a/arch/x86/include/asm/dwarf2.h b/arch/x86/include/asm/dwarf2.h
index f6f15986df6c..de1cdaf4d743 100644
--- a/arch/x86/include/asm/dwarf2.h
+++ b/arch/x86/include/asm/dwarf2.h
@@ -86,11 +86,23 @@
 	CFI_ADJUST_CFA_OFFSET 8
 	.endm
 
+	.macro pushq_cfi_reg reg
+	pushq %\reg
+	CFI_ADJUST_CFA_OFFSET 8
+	CFI_REL_OFFSET \reg, 0
+	.endm
+
 	.macro popq_cfi reg
 	popq \reg
 	CFI_ADJUST_CFA_OFFSET -8
 	.endm
 
+	.macro popq_cfi_reg reg
+	popq %\reg
+	CFI_ADJUST_CFA_OFFSET -8
+	CFI_RESTORE \reg
+	.endm
+
 	.macro pushfq_cfi
 	pushfq
 	CFI_ADJUST_CFA_OFFSET 8
@@ -116,11 +128,23 @@
 	CFI_ADJUST_CFA_OFFSET 4
 	.endm
 
+	.macro pushl_cfi_reg reg
+	pushl %\reg
+	CFI_ADJUST_CFA_OFFSET 4
+	CFI_REL_OFFSET \reg, 0
+	.endm
+
 	.macro popl_cfi reg
 	popl \reg
 	CFI_ADJUST_CFA_OFFSET -4
 	.endm
 
+	.macro popl_cfi_reg reg
+	popl %\reg
+	CFI_ADJUST_CFA_OFFSET -4
+	CFI_RESTORE \reg
+	.endm
+
 	.macro pushfl_cfi
 	pushfl
 	CFI_ADJUST_CFA_OFFSET 4
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 7e0323cc0b7d..e33ba51b1069 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -1234,20 +1234,13 @@ error_code:
 	/*CFI_REL_OFFSET es, 0*/
 	pushl_cfi %ds
 	/*CFI_REL_OFFSET ds, 0*/
-	pushl_cfi %eax
-	CFI_REL_OFFSET eax, 0
-	pushl_cfi %ebp
-	CFI_REL_OFFSET ebp, 0
-	pushl_cfi %edi
-	CFI_REL_OFFSET edi, 0
-	pushl_cfi %esi
-	CFI_REL_OFFSET esi, 0
-	pushl_cfi %edx
-	CFI_REL_OFFSET edx, 0
-	pushl_cfi %ecx
-	CFI_REL_OFFSET ecx, 0
-	pushl_cfi %ebx
-	CFI_REL_OFFSET ebx, 0
+	pushl_cfi_reg eax
+	pushl_cfi_reg ebp
+	pushl_cfi_reg edi
+	pushl_cfi_reg esi
+	pushl_cfi_reg edx
+	pushl_cfi_reg ecx
+	pushl_cfi_reg ebx
 	cld
 	movl $(__KERNEL_PERCPU), %ecx
 	movl %ecx, %fs
diff --git a/arch/x86/lib/atomic64_cx8_32.S b/arch/x86/lib/atomic64_cx8_32.S
index f5cc9eb1d51b..082a85167a5b 100644
--- a/arch/x86/lib/atomic64_cx8_32.S
+++ b/arch/x86/lib/atomic64_cx8_32.S
@@ -13,16 +13,6 @@
 #include <asm/alternative-asm.h>
 #include <asm/dwarf2.h>
 
-.macro SAVE reg
-	pushl_cfi %\reg
-	CFI_REL_OFFSET \reg, 0
-.endm
-
-.macro RESTORE reg
-	popl_cfi %\reg
-	CFI_RESTORE \reg
-.endm
-
 .macro read64 reg
 	movl %ebx, %eax
 	movl %ecx, %edx
@@ -67,10 +57,10 @@ ENDPROC(atomic64_xchg_cx8)
 .macro addsub_return func ins insc
 ENTRY(atomic64_\func\()_return_cx8)
 	CFI_STARTPROC
-	SAVE ebp
-	SAVE ebx
-	SAVE esi
-	SAVE edi
+	pushl_cfi_reg ebp
+	pushl_cfi_reg ebx
+	pushl_cfi_reg esi
+	pushl_cfi_reg edi
 
 	movl %eax, %esi
 	movl %edx, %edi
@@ -89,10 +79,10 @@ ENTRY(atomic64_\func\()_return_cx8)
 10:
 	movl %ebx, %eax
 	movl %ecx, %edx
-	RESTORE edi
-	RESTORE esi
-	RESTORE ebx
-	RESTORE ebp
+	popl_cfi_reg edi
+	popl_cfi_reg esi
+	popl_cfi_reg ebx
+	popl_cfi_reg ebp
 	ret
 	CFI_ENDPROC
 ENDPROC(atomic64_\func\()_return_cx8)
@@ -104,7 +94,7 @@ addsub_return sub sub sbb
 .macro incdec_return func ins insc
 ENTRY(atomic64_\func\()_return_cx8)
 	CFI_STARTPROC
-	SAVE ebx
+	pushl_cfi_reg ebx
 
 	read64 %esi
 1:
@@ -119,7 +109,7 @@ ENTRY(atomic64_\func\()_return_cx8)
 10:
 	movl %ebx, %eax
 	movl %ecx, %edx
-	RESTORE ebx
+	popl_cfi_reg ebx
 	ret
 	CFI_ENDPROC
 ENDPROC(atomic64_\func\()_return_cx8)
@@ -130,7 +120,7 @@ incdec_return dec sub sbb
 
 ENTRY(atomic64_dec_if_positive_cx8)
 	CFI_STARTPROC
-	SAVE ebx
+	pushl_cfi_reg ebx
 
 	read64 %esi
 1:
@@ -146,18 +136,18 @@ ENTRY(atomic64_dec_if_positive_cx8)
 2:
 	movl %ebx, %eax
 	movl %ecx, %edx
-	RESTORE ebx
+	popl_cfi_reg ebx
 	ret
 	CFI_ENDPROC
 ENDPROC(atomic64_dec_if_positive_cx8)
 
 ENTRY(atomic64_add_unless_cx8)
 	CFI_STARTPROC
-	SAVE ebp
-	SAVE ebx
+	pushl_cfi_reg ebp
+	pushl_cfi_reg ebx
 /* these just push these two parameters on the stack */
-	SAVE edi
-	SAVE ecx
+	pushl_cfi_reg edi
+	pushl_cfi_reg ecx
 
 	movl %eax, %ebp
 	movl %edx, %edi
@@ -179,8 +169,8 @@ ENTRY(atomic64_add_unless_cx8)
 3:
 	addl $8, %esp
 	CFI_ADJUST_CFA_OFFSET -8
-	RESTORE ebx
-	RESTORE ebp
+	popl_cfi_reg ebx
+	popl_cfi_reg ebp
 	ret
 4:
 	cmpl %edx, 4(%esp)
@@ -192,7 +182,7 @@ ENDPROC(atomic64_add_unless_cx8)
 
 ENTRY(atomic64_inc_not_zero_cx8)
 	CFI_STARTPROC
-	SAVE ebx
+	pushl_cfi_reg ebx
 
 	read64 %esi
 1:
@@ -209,7 +199,7 @@ ENTRY(atomic64_inc_not_zero_cx8)
 
 	movl $1, %eax
 3:
-	RESTORE ebx
+	popl_cfi_reg ebx
 	ret
 	CFI_ENDPROC
 ENDPROC(atomic64_inc_not_zero_cx8)
diff --git a/arch/x86/lib/checksum_32.S b/arch/x86/lib/checksum_32.S
index e78b8eee6615..c3b9953d3fa0 100644
--- a/arch/x86/lib/checksum_32.S
+++ b/arch/x86/lib/checksum_32.S
@@ -51,10 +51,8 @@ unsigned int csum_partial(const unsigned char * buff, int len, unsigned int sum)
 	   */		
 ENTRY(csum_partial)
 	CFI_STARTPROC
-	pushl_cfi %esi
-	CFI_REL_OFFSET esi, 0
-	pushl_cfi %ebx
-	CFI_REL_OFFSET ebx, 0
+	pushl_cfi_reg esi
+	pushl_cfi_reg ebx
 	movl 20(%esp),%eax	# Function arg: unsigned int sum
 	movl 16(%esp),%ecx	# Function arg: int len
 	movl 12(%esp),%esi	# Function arg: unsigned char *buff
@@ -131,10 +129,8 @@ ENTRY(csum_partial)
 	jz 8f
 	roll $8, %eax
 8:
-	popl_cfi %ebx
-	CFI_RESTORE ebx
-	popl_cfi %esi
-	CFI_RESTORE esi
+	popl_cfi_reg ebx
+	popl_cfi_reg esi
 	ret
 	CFI_ENDPROC
 ENDPROC(csum_partial)
@@ -145,10 +141,8 @@ ENDPROC(csum_partial)
 
 ENTRY(csum_partial)
 	CFI_STARTPROC
-	pushl_cfi %esi
-	CFI_REL_OFFSET esi, 0
-	pushl_cfi %ebx
-	CFI_REL_OFFSET ebx, 0
+	pushl_cfi_reg esi
+	pushl_cfi_reg ebx
 	movl 20(%esp),%eax	# Function arg: unsigned int sum
 	movl 16(%esp),%ecx	# Function arg: int len
 	movl 12(%esp),%esi	# Function arg:	const unsigned char *buf
@@ -255,10 +249,8 @@ ENTRY(csum_partial)
 	jz 90f
 	roll $8, %eax
 90: 
-	popl_cfi %ebx
-	CFI_RESTORE ebx
-	popl_cfi %esi
-	CFI_RESTORE esi
+	popl_cfi_reg ebx
+	popl_cfi_reg esi
 	ret
 	CFI_ENDPROC
 ENDPROC(csum_partial)
@@ -298,12 +290,9 @@ ENTRY(csum_partial_copy_generic)
 	CFI_STARTPROC
 	subl  $4,%esp	
 	CFI_ADJUST_CFA_OFFSET 4
-	pushl_cfi %edi
-	CFI_REL_OFFSET edi, 0
-	pushl_cfi %esi
-	CFI_REL_OFFSET esi, 0
-	pushl_cfi %ebx
-	CFI_REL_OFFSET ebx, 0
+	pushl_cfi_reg edi
+	pushl_cfi_reg esi
+	pushl_cfi_reg ebx
 	movl ARGBASE+16(%esp),%eax	# sum
 	movl ARGBASE+12(%esp),%ecx	# len
 	movl ARGBASE+4(%esp),%esi	# src
@@ -412,12 +401,9 @@ DST(	movb %cl, (%edi)	)
 
 .previous
 
-	popl_cfi %ebx
-	CFI_RESTORE ebx
-	popl_cfi %esi
-	CFI_RESTORE esi
-	popl_cfi %edi
-	CFI_RESTORE edi
+	popl_cfi_reg ebx
+	popl_cfi_reg esi
+	popl_cfi_reg edi
 	popl_cfi %ecx			# equivalent to addl $4,%esp
 	ret	
 	CFI_ENDPROC
@@ -441,12 +427,9 @@ ENDPROC(csum_partial_copy_generic)
 		
 ENTRY(csum_partial_copy_generic)
 	CFI_STARTPROC
-	pushl_cfi %ebx
-	CFI_REL_OFFSET ebx, 0
-	pushl_cfi %edi
-	CFI_REL_OFFSET edi, 0
-	pushl_cfi %esi
-	CFI_REL_OFFSET esi, 0
+	pushl_cfi_reg ebx
+	pushl_cfi_reg edi
+	pushl_cfi_reg esi
 	movl ARGBASE+4(%esp),%esi	#src
 	movl ARGBASE+8(%esp),%edi	#dst	
 	movl ARGBASE+12(%esp),%ecx	#len
@@ -506,12 +489,9 @@ DST(	movb %dl, (%edi)         )
 	jmp  7b			
 .previous				
 
-	popl_cfi %esi
-	CFI_RESTORE esi
-	popl_cfi %edi
-	CFI_RESTORE edi
-	popl_cfi %ebx
-	CFI_RESTORE ebx
+	popl_cfi_reg esi
+	popl_cfi_reg edi
+	popl_cfi_reg ebx
 	ret
 	CFI_ENDPROC
 ENDPROC(csum_partial_copy_generic)
diff --git a/arch/x86/lib/msr-reg.S b/arch/x86/lib/msr-reg.S
index f6d13eefad10..3ca5218fbece 100644
--- a/arch/x86/lib/msr-reg.S
+++ b/arch/x86/lib/msr-reg.S
@@ -14,8 +14,8 @@
 .macro op_safe_regs op
 ENTRY(\op\()_safe_regs)
 	CFI_STARTPROC
-	pushq_cfi %rbx
-	pushq_cfi %rbp
+	pushq_cfi_reg rbx
+	pushq_cfi_reg rbp
 	movq	%rdi, %r10	/* Save pointer */
 	xorl	%r11d, %r11d	/* Return value */
 	movl    (%rdi), %eax
@@ -35,8 +35,8 @@ ENTRY(\op\()_safe_regs)
 	movl    %ebp, 20(%r10)
 	movl    %esi, 24(%r10)
 	movl    %edi, 28(%r10)
-	popq_cfi %rbp
-	popq_cfi %rbx
+	popq_cfi_reg rbp
+	popq_cfi_reg rbx
 	ret
 3:
 	CFI_RESTORE_STATE
@@ -53,10 +53,10 @@ ENDPROC(\op\()_safe_regs)
 .macro op_safe_regs op
 ENTRY(\op\()_safe_regs)
 	CFI_STARTPROC
-	pushl_cfi %ebx
-	pushl_cfi %ebp
-	pushl_cfi %esi
-	pushl_cfi %edi
+	pushl_cfi_reg ebx
+	pushl_cfi_reg ebp
+	pushl_cfi_reg esi
+	pushl_cfi_reg edi
 	pushl_cfi $0              /* Return value */
 	pushl_cfi %eax
 	movl    4(%eax), %ecx
@@ -80,10 +80,10 @@ ENTRY(\op\()_safe_regs)
 	movl    %esi, 24(%eax)
 	movl    %edi, 28(%eax)
 	popl_cfi %eax
-	popl_cfi %edi
-	popl_cfi %esi
-	popl_cfi %ebp
-	popl_cfi %ebx
+	popl_cfi_reg edi
+	popl_cfi_reg esi
+	popl_cfi_reg ebp
+	popl_cfi_reg ebx
 	ret
 3:
 	CFI_RESTORE_STATE
diff --git a/arch/x86/lib/rwsem.S b/arch/x86/lib/rwsem.S
index 5dff5f042468..2322abe4da3b 100644
--- a/arch/x86/lib/rwsem.S
+++ b/arch/x86/lib/rwsem.S
@@ -34,10 +34,10 @@
  */
 
 #define save_common_regs \
-	pushl_cfi %ecx; CFI_REL_OFFSET ecx, 0
+	pushl_cfi_reg ecx
 
 #define restore_common_regs \
-	popl_cfi %ecx; CFI_RESTORE ecx
+	popl_cfi_reg ecx
 
 	/* Avoid uglifying the argument copying x86-64 needs to do. */
 	.macro movq src, dst
@@ -64,22 +64,22 @@
  */
 
 #define save_common_regs \
-	pushq_cfi %rdi; CFI_REL_OFFSET rdi, 0; \
-	pushq_cfi %rsi; CFI_REL_OFFSET rsi, 0; \
-	pushq_cfi %rcx; CFI_REL_OFFSET rcx, 0; \
-	pushq_cfi %r8;  CFI_REL_OFFSET r8,  0; \
-	pushq_cfi %r9;  CFI_REL_OFFSET r9,  0; \
-	pushq_cfi %r10; CFI_REL_OFFSET r10, 0; \
-	pushq_cfi %r11; CFI_REL_OFFSET r11, 0
+	pushq_cfi_reg rdi; \
+	pushq_cfi_reg rsi; \
+	pushq_cfi_reg rcx; \
+	pushq_cfi_reg r8;  \
+	pushq_cfi_reg r9;  \
+	pushq_cfi_reg r10; \
+	pushq_cfi_reg r11
 
 #define restore_common_regs \
-	popq_cfi %r11; CFI_RESTORE r11; \
-	popq_cfi %r10; CFI_RESTORE r10; \
-	popq_cfi %r9;  CFI_RESTORE r9; \
-	popq_cfi %r8;  CFI_RESTORE r8; \
-	popq_cfi %rcx; CFI_RESTORE rcx; \
-	popq_cfi %rsi; CFI_RESTORE rsi; \
-	popq_cfi %rdi; CFI_RESTORE rdi
+	popq_cfi_reg r11; \
+	popq_cfi_reg r10; \
+	popq_cfi_reg r9; \
+	popq_cfi_reg r8; \
+	popq_cfi_reg rcx; \
+	popq_cfi_reg rsi; \
+	popq_cfi_reg rdi
 
 #endif
 
@@ -87,12 +87,10 @@
 ENTRY(call_rwsem_down_read_failed)
 	CFI_STARTPROC
 	save_common_regs
-	__ASM_SIZE(push,_cfi) %__ASM_REG(dx)
-	CFI_REL_OFFSET __ASM_REG(dx), 0
+	__ASM_SIZE(push,_cfi_reg) __ASM_REG(dx)
 	movq %rax,%rdi
 	call rwsem_down_read_failed
-	__ASM_SIZE(pop,_cfi) %__ASM_REG(dx)
-	CFI_RESTORE __ASM_REG(dx)
+	__ASM_SIZE(pop,_cfi_reg) __ASM_REG(dx)
 	restore_common_regs
 	ret
 	CFI_ENDPROC
@@ -124,12 +122,10 @@ ENDPROC(call_rwsem_wake)
 ENTRY(call_rwsem_downgrade_wake)
 	CFI_STARTPROC
 	save_common_regs
-	__ASM_SIZE(push,_cfi) %__ASM_REG(dx)
-	CFI_REL_OFFSET __ASM_REG(dx), 0
+	__ASM_SIZE(push,_cfi_reg) __ASM_REG(dx)
 	movq %rax,%rdi
 	call rwsem_downgrade_wake
-	__ASM_SIZE(pop,_cfi) %__ASM_REG(dx)
-	CFI_RESTORE __ASM_REG(dx)
+	__ASM_SIZE(pop,_cfi_reg) __ASM_REG(dx)
 	restore_common_regs
 	ret
 	CFI_ENDPROC
diff --git a/arch/x86/lib/thunk_32.S b/arch/x86/lib/thunk_32.S
index e28cdaf5ac2c..5eb715087b80 100644
--- a/arch/x86/lib/thunk_32.S
+++ b/arch/x86/lib/thunk_32.S
@@ -13,12 +13,9 @@
 	.globl \name
 \name:
 	CFI_STARTPROC
-	pushl_cfi %eax
-	CFI_REL_OFFSET eax, 0
-	pushl_cfi %ecx
-	CFI_REL_OFFSET ecx, 0
-	pushl_cfi %edx
-	CFI_REL_OFFSET edx, 0
+	pushl_cfi_reg eax
+	pushl_cfi_reg ecx
+	pushl_cfi_reg edx
 
 	.if \put_ret_addr_in_eax
 	/* Place EIP in the arg1 */
@@ -26,12 +23,9 @@
 	.endif
 
 	call \func
-	popl_cfi %edx
-	CFI_RESTORE edx
-	popl_cfi %ecx
-	CFI_RESTORE ecx
-	popl_cfi %eax
-	CFI_RESTORE eax
+	popl_cfi_reg edx
+	popl_cfi_reg ecx
+	popl_cfi_reg eax
 	ret
 	CFI_ENDPROC
 	_ASM_NOKPROBE(\name)
diff --git a/arch/x86/lib/thunk_64.S b/arch/x86/lib/thunk_64.S
index 8ec443a0777b..f89ba4e93025 100644
--- a/arch/x86/lib/thunk_64.S
+++ b/arch/x86/lib/thunk_64.S
@@ -17,24 +17,15 @@
 	CFI_STARTPROC
 
 	/* this one pushes 9 elems, the next one would be %rIP */
-	pushq_cfi %rdi
-	CFI_REL_OFFSET rdi, 0
-	pushq_cfi %rsi
-	CFI_REL_OFFSET rsi, 0
-	pushq_cfi %rdx
-	CFI_REL_OFFSET rdx, 0
-	pushq_cfi %rcx
-	CFI_REL_OFFSET rcx, 0
-	pushq_cfi %rax
-	CFI_REL_OFFSET rax, 0
-	pushq_cfi %r8
-	CFI_REL_OFFSET r8, 0
-	pushq_cfi %r9
-	CFI_REL_OFFSET r9, 0
-	pushq_cfi %r10
-	CFI_REL_OFFSET r10, 0
-	pushq_cfi %r11
-	CFI_REL_OFFSET r11, 0
+	pushq_cfi_reg rdi
+	pushq_cfi_reg rsi
+	pushq_cfi_reg rdx
+	pushq_cfi_reg rcx
+	pushq_cfi_reg rax
+	pushq_cfi_reg r8
+	pushq_cfi_reg r9
+	pushq_cfi_reg r10
+	pushq_cfi_reg r11
 
 	.if \put_ret_addr_in_rdi
 	/* 9*8(%rsp) is return addr on stack */
@@ -69,24 +60,15 @@
 	CFI_STARTPROC
 	CFI_ADJUST_CFA_OFFSET 9*8
 restore:
-	popq_cfi %r11
-	CFI_RESTORE r11
-	popq_cfi %r10
-	CFI_RESTORE r10
-	popq_cfi %r9
-	CFI_RESTORE r9
-	popq_cfi %r8
-	CFI_RESTORE r8
-	popq_cfi %rax
-	CFI_RESTORE rax
-	popq_cfi %rcx
-	CFI_RESTORE rcx
-	popq_cfi %rdx
-	CFI_RESTORE rdx
-	popq_cfi %rsi
-	CFI_RESTORE rsi
-	popq_cfi %rdi
-	CFI_RESTORE rdi
+	popq_cfi_reg r11
+	popq_cfi_reg r10
+	popq_cfi_reg r9
+	popq_cfi_reg r8
+	popq_cfi_reg rax
+	popq_cfi_reg rcx
+	popq_cfi_reg rdx
+	popq_cfi_reg rsi
+	popq_cfi_reg rdi
 	ret
 	CFI_ENDPROC
 	_ASM_NOKPROBE(restore)
-- 
cgit v1.2.3


From 6e1327bd2b20ccb387fcddc0caa605cb253cc458 Mon Sep 17 00:00:00 2001
From: Denys Vlasenko
Date: Thu, 26 Feb 2015 14:40:26 -0800
Subject: x86/asm/entry/64: Fix incorrect symbolic constant usage:
 R11->ARGOFFSET

Since the last fix of this nature, a few more instances have crept
in. Fix them up. No object code changes (constants have the same
value).

Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
Signed-off-by: Andy Lutomirski <luto@amacapital.net>
Cc: Alexei Starovoitov <ast@plumgrid.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Will Drewry <wad@chromium.org>
Link: http://lkml.kernel.org/r/1423778052-21038-1-git-send-email-dvlasenk@redhat.com
Link: http://lkml.kernel.org/r/f5e1c4084319a42e5f14d41e2d638949ce66bc08.1424989793.git.luto@amacapital.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/entry_64.S | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 10074ad9ebf8..a57b3387ec7d 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -757,8 +757,8 @@ retint_swapgs:		/* return to user-space */
 	 * Try to use SYSRET instead of IRET if we're returning to
 	 * a completely clean 64-bit userspace context.
 	 */
-	movq (RCX-R11)(%rsp), %rcx
-	cmpq %rcx,(RIP-R11)(%rsp)		/* RCX == RIP */
+	movq (RCX-ARGOFFSET)(%rsp), %rcx
+	cmpq %rcx,(RIP-ARGOFFSET)(%rsp)		/* RCX == RIP */
 	jne opportunistic_sysret_failed
 
 	/*
@@ -779,7 +779,7 @@ retint_swapgs:		/* return to user-space */
 	shr $__VIRTUAL_MASK_SHIFT, %rcx
 	jnz opportunistic_sysret_failed
 
-	cmpq $__USER_CS,(CS-R11)(%rsp)		/* CS must match SYSRET */
+	cmpq $__USER_CS,(CS-ARGOFFSET)(%rsp)	/* CS must match SYSRET */
 	jne opportunistic_sysret_failed
 
 	movq (R11-ARGOFFSET)(%rsp), %r11
-- 
cgit v1.2.3


From 76f5df43cab5e765c0bd42289103e8f625813ae1 Mon Sep 17 00:00:00 2001
From: Denys Vlasenko
Date: Thu, 26 Feb 2015 14:40:27 -0800
Subject: x86/asm/entry/64: Always allocate a complete "struct pt_regs" on the
 kernel stack

The 64-bit entry code was using six stack slots less by not
saving/restoring registers which are callee-preserved according
to the C ABI, and was not allocating space for them.

Only when syscalls needed a complete "struct pt_regs" was
the complete area allocated and filled in.

As an additional twist, on interrupt entry a "slightly less
truncated pt_regs" trick is used, to make nested interrupt
stacks easier to unwind.

This proved to be a source of significant obfuscation and subtle
bugs. For example, 'stub_fork' had to pop the return address,
extend the struct, save registers, and push return address back.
Ugly. 'ia32_ptregs_common' pops return address and "returns" via
jmp insn, throwing a wrench into CPU return stack cache.

This patch changes the code to always allocate a complete
"struct pt_regs" on the kernel stack. The saving of registers
is still done lazily.

"Partial pt_regs" trick on interrupt stack is retained.

Macros which manipulate "struct pt_regs" on stack are reworked:

 - ALLOC_PT_GPREGS_ON_STACK allocates the structure.

 - SAVE_C_REGS saves to it those registers which are clobbered
   by C code.

 - SAVE_EXTRA_REGS saves to it all other registers.

 - Corresponding RESTORE_* and REMOVE_PT_GPREGS_FROM_STACK macros
   reverse it.

'ia32_ptregs_common', 'stub_fork' and friends lost their ugly dance
with the return pointer.

LOAD_ARGS32 in ia32entry.S now uses symbolic stack offsets
instead of magic numbers.

'error_entry' and 'save_paranoid' now use SAVE_C_REGS +
SAVE_EXTRA_REGS instead of having it open-coded yet again.

Patch was run-tested: 64-bit executables, 32-bit executables,
strace works.

Timing tests did not show measurable difference in 32-bit
and 64-bit syscalls.

Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
Signed-off-by: Andy Lutomirski <luto@amacapital.net>
Cc: Alexei Starovoitov <ast@plumgrid.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Will Drewry <wad@chromium.org>
Link: http://lkml.kernel.org/r/1423778052-21038-2-git-send-email-dvlasenk@redhat.com
Link: http://lkml.kernel.org/r/b89763d354aa23e670b9bdf3a40ae320320a7c2e.1424989793.git.luto@amacapital.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/ia32/ia32entry.S              |  47 +++----
 arch/x86/include/asm/calling.h         | 222 ++++++++++++++++-----------------
 arch/x86/include/asm/irqflags.h        |   4 +-
 arch/x86/include/uapi/asm/ptrace-abi.h |   1 -
 arch/x86/kernel/entry_64.S             | 196 +++++++++++------------------
 5 files changed, 210 insertions(+), 260 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index 156ebcab4ada..f4bed4971673 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -62,12 +62,12 @@
 	 */
 	.macro LOAD_ARGS32 offset, _r9=0
 	.if \_r9
-	movl \offset+16(%rsp),%r9d
+	movl \offset+R9(%rsp),%r9d
 	.endif
-	movl \offset+40(%rsp),%ecx
-	movl \offset+48(%rsp),%edx
-	movl \offset+56(%rsp),%esi
-	movl \offset+64(%rsp),%edi
+	movl \offset+RCX(%rsp),%ecx
+	movl \offset+RDX(%rsp),%edx
+	movl \offset+RSI(%rsp),%esi
+	movl \offset+RDI(%rsp),%edi
 	movl %eax,%eax			/* zero extension */
 	.endm
 	
@@ -144,7 +144,8 @@ ENTRY(ia32_sysenter_target)
 	CFI_REL_OFFSET rip,0
 	pushq_cfi %rax
 	cld
-	SAVE_ARGS 0,1,0
+	ALLOC_PT_GPREGS_ON_STACK
+	SAVE_C_REGS_EXCEPT_R891011
  	/* no need to do an access_ok check here because rbp has been
  	   32bit zero extended */ 
 	ASM_STAC
@@ -182,7 +183,8 @@ sysexit_from_sys_call:
 	andl	$~0x200,EFLAGS-ARGOFFSET(%rsp)
 	movl	RIP-ARGOFFSET(%rsp),%edx		/* User %eip */
 	CFI_REGISTER rip,rdx
-	RESTORE_ARGS 0,24,0,0,0,0
+	RESTORE_RSI_RDI
+	REMOVE_PT_GPREGS_FROM_STACK 3*8
 	xorq	%r8,%r8
 	xorq	%r9,%r9
 	xorq	%r10,%r10
@@ -256,13 +258,13 @@ sysenter_tracesys:
 	testl	$(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
 	jz	sysenter_auditsys
 #endif
-	SAVE_REST
+	SAVE_EXTRA_REGS
 	CLEAR_RREGS
 	movq	$-ENOSYS,RAX(%rsp)/* ptrace can change this for a bad syscall */
 	movq	%rsp,%rdi        /* &pt_regs -> arg1 */
 	call	syscall_trace_enter
 	LOAD_ARGS32 ARGOFFSET  /* reload args from stack in case ptrace changed it */
-	RESTORE_REST
+	RESTORE_EXTRA_REGS
 	cmpq	$(IA32_NR_syscalls-1),%rax
 	ja	int_ret_from_sys_call /* sysenter_tracesys has set RAX(%rsp) */
 	jmp	sysenter_do_call
@@ -304,7 +306,8 @@ ENTRY(ia32_cstar_target)
 	 * disabled irqs and here we enable it straight after entry:
 	 */
 	ENABLE_INTERRUPTS(CLBR_NONE)
-	SAVE_ARGS 8,0,0
+	ALLOC_PT_GPREGS_ON_STACK 8
+	SAVE_C_REGS_EXCEPT_RCX_R891011
 	movl 	%eax,%eax	/* zero extension */
 	movq	%rax,ORIG_RAX-ARGOFFSET(%rsp)
 	movq	%rcx,RIP-ARGOFFSET(%rsp)
@@ -341,7 +344,7 @@ cstar_dispatch:
 	jnz sysretl_audit
 sysretl_from_sys_call:
 	andl $~TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET)
-	RESTORE_ARGS 0,-ARG_SKIP,0,0,0
+	RESTORE_RSI_RDI_RDX
 	movl RIP-ARGOFFSET(%rsp),%ecx
 	CFI_REGISTER rip,rcx
 	movl EFLAGS-ARGOFFSET(%rsp),%r11d	
@@ -372,13 +375,13 @@ cstar_tracesys:
 	jz cstar_auditsys
 #endif
 	xchgl %r9d,%ebp
-	SAVE_REST
+	SAVE_EXTRA_REGS
 	CLEAR_RREGS 0, r9
 	movq $-ENOSYS,RAX(%rsp)	/* ptrace can change this for a bad syscall */
 	movq %rsp,%rdi        /* &pt_regs -> arg1 */
 	call syscall_trace_enter
 	LOAD_ARGS32 ARGOFFSET, 1  /* reload args from stack in case ptrace changed it */
-	RESTORE_REST
+	RESTORE_EXTRA_REGS
 	xchgl %ebp,%r9d
 	cmpq $(IA32_NR_syscalls-1),%rax
 	ja int_ret_from_sys_call /* cstar_tracesys has set RAX(%rsp) */
@@ -433,7 +436,8 @@ ENTRY(ia32_syscall)
 	cld
 	/* note the registers are not zero extended to the sf.
 	   this could be a problem. */
-	SAVE_ARGS 0,1,0
+	ALLOC_PT_GPREGS_ON_STACK
+	SAVE_C_REGS_EXCEPT_R891011
 	orl $TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET)
 	testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
 	jnz ia32_tracesys
@@ -446,16 +450,16 @@ ia32_sysret:
 	movq %rax,RAX-ARGOFFSET(%rsp)
 ia32_ret_from_sys_call:
 	CLEAR_RREGS -ARGOFFSET
-	jmp int_ret_from_sys_call 
+	jmp int_ret_from_sys_call
 
-ia32_tracesys:			 
-	SAVE_REST
+ia32_tracesys:
+	SAVE_EXTRA_REGS
 	CLEAR_RREGS
 	movq $-ENOSYS,RAX(%rsp)	/* ptrace can change this for a bad syscall */
 	movq %rsp,%rdi        /* &pt_regs -> arg1 */
 	call syscall_trace_enter
 	LOAD_ARGS32 ARGOFFSET  /* reload args from stack in case ptrace changed it */
-	RESTORE_REST
+	RESTORE_EXTRA_REGS
 	cmpq $(IA32_NR_syscalls-1),%rax
 	ja  int_ret_from_sys_call	/* ia32_tracesys has set RAX(%rsp) */
 	jmp ia32_do_call
@@ -492,7 +496,6 @@ GLOBAL(stub32_clone)
 
 	ALIGN
 ia32_ptregs_common:
-	popq %r11
 	CFI_ENDPROC
 	CFI_STARTPROC32	simple
 	CFI_SIGNAL_FRAME
@@ -507,9 +510,9 @@ ia32_ptregs_common:
 /*	CFI_REL_OFFSET	rflags,EFLAGS-ARGOFFSET*/
 	CFI_REL_OFFSET	rsp,RSP-ARGOFFSET
 /*	CFI_REL_OFFSET	ss,SS-ARGOFFSET*/
-	SAVE_REST
+	SAVE_EXTRA_REGS 8
 	call *%rax
-	RESTORE_REST
-	jmp  ia32_sysret	/* misbalances the return cache */
+	RESTORE_EXTRA_REGS 8
+	ret
 	CFI_ENDPROC
 END(ia32_ptregs_common)
diff --git a/arch/x86/include/asm/calling.h b/arch/x86/include/asm/calling.h
index 3c711f2ab236..38356476b131 100644
--- a/arch/x86/include/asm/calling.h
+++ b/arch/x86/include/asm/calling.h
@@ -55,143 +55,137 @@ For 32-bit we have the following conventions - kernel is built with
  * for assembly code:
  */
 
-#define R15		  0
-#define R14		  8
-#define R13		 16
-#define R12		 24
-#define RBP		 32
-#define RBX		 40
-
-/* arguments: interrupts/non tracing syscalls only save up to here: */
-#define R11		 48
-#define R10		 56
-#define R9		 64
-#define R8		 72
-#define RAX		 80
-#define RCX		 88
-#define RDX		 96
-#define RSI		104
-#define RDI		112
-#define ORIG_RAX	120       /* + error_code */
-/* end of arguments */
-
-/* cpu exception frame or undefined in case of fast syscall: */
-#define RIP		128
-#define CS		136
-#define EFLAGS		144
-#define RSP		152
-#define SS		160
-
-#define ARGOFFSET	R11
-
-	.macro SAVE_ARGS addskip=0, save_rcx=1, save_r891011=1, rax_enosys=0
-	subq  $9*8+\addskip, %rsp
-	CFI_ADJUST_CFA_OFFSET	9*8+\addskip
-	movq_cfi rdi, 8*8
-	movq_cfi rsi, 7*8
-	movq_cfi rdx, 6*8
-
-	.if \save_rcx
-	movq_cfi rcx, 5*8
-	.endif
+/* The layout forms the "struct pt_regs" on the stack: */
+/*
+ * C ABI says these regs are callee-preserved. They aren't saved on kernel entry
+ * unless syscall needs a complete, fully filled "struct pt_regs".
+ */
+#define R15		0*8
+#define R14		1*8
+#define R13		2*8
+#define R12		3*8
+#define RBP		4*8
+#define RBX		5*8
+/* These regs are callee-clobbered. Always saved on kernel entry. */
+#define R11		6*8
+#define R10		7*8
+#define R9		8*8
+#define R8		9*8
+#define RAX		10*8
+#define RCX		11*8
+#define RDX		12*8
+#define RSI		13*8
+#define RDI		14*8
+/*
+ * On syscall entry, this is syscall#. On CPU exception, this is error code.
+ * On hw interrupt, it's IRQ number:
+ */
+#define ORIG_RAX	15*8
+/* Return frame for iretq */
+#define RIP		16*8
+#define CS		17*8
+#define EFLAGS		18*8
+#define RSP		19*8
+#define SS		20*8
+
+#define ARGOFFSET	0
+
+	.macro ALLOC_PT_GPREGS_ON_STACK addskip=0
+	subq	$15*8+\addskip, %rsp
+	CFI_ADJUST_CFA_OFFSET 15*8+\addskip
+	.endm
 
-	.if \rax_enosys
-	movq $-ENOSYS, 4*8(%rsp)
-	.else
-	movq_cfi rax, 4*8
+	.macro SAVE_C_REGS_HELPER offset=0 rax=1 rcx=1 r8plus=1
+	.if \r8plus
+	movq_cfi r11, 6*8+\offset
+	movq_cfi r10, 7*8+\offset
+	movq_cfi r9,  8*8+\offset
+	movq_cfi r8,  9*8+\offset
 	.endif
-
-	.if \save_r891011
-	movq_cfi r8,  3*8
-	movq_cfi r9,  2*8
-	movq_cfi r10, 1*8
-	movq_cfi r11, 0*8
+	.if \rax
+	movq_cfi rax, 10*8+\offset
+	.endif
+	.if \rcx
+	movq_cfi rcx, 11*8+\offset
 	.endif
+	movq_cfi rdx, 12*8+\offset
+	movq_cfi rsi, 13*8+\offset
+	movq_cfi rdi, 14*8+\offset
+	.endm
+	.macro SAVE_C_REGS offset=0
+	SAVE_C_REGS_HELPER \offset, 1, 1, 1
+	.endm
+	.macro SAVE_C_REGS_EXCEPT_RAX_RCX offset=0
+	SAVE_C_REGS_HELPER \offset, 0, 0, 1
+	.endm
+	.macro SAVE_C_REGS_EXCEPT_R891011
+	SAVE_C_REGS_HELPER 0, 1, 1, 0
+	.endm
+	.macro SAVE_C_REGS_EXCEPT_RCX_R891011
+	SAVE_C_REGS_HELPER 0, 1, 0, 0
+	.endm
 
+	.macro SAVE_EXTRA_REGS offset=0
+	movq_cfi r15, 0*8+\offset
+	movq_cfi r14, 1*8+\offset
+	movq_cfi r13, 2*8+\offset
+	movq_cfi r12, 3*8+\offset
+	movq_cfi rbp, 4*8+\offset
+	movq_cfi rbx, 5*8+\offset
+	.endm
+	.macro SAVE_EXTRA_REGS_RBP offset=0
+	movq_cfi rbp, 4*8+\offset
 	.endm
 
-#define ARG_SKIP	(9*8)
+	.macro RESTORE_EXTRA_REGS offset=0
+	movq_cfi_restore 0*8+\offset, r15
+	movq_cfi_restore 1*8+\offset, r14
+	movq_cfi_restore 2*8+\offset, r13
+	movq_cfi_restore 3*8+\offset, r12
+	movq_cfi_restore 4*8+\offset, rbp
+	movq_cfi_restore 5*8+\offset, rbx
+	.endm
 
-	.macro RESTORE_ARGS rstor_rax=1, addskip=0, rstor_rcx=1, rstor_r11=1, \
-			    rstor_r8910=1, rstor_rdx=1
+	.macro RESTORE_C_REGS_HELPER rstor_rax=1, rstor_rcx=1, rstor_r11=1, rstor_r8910=1, rstor_rdx=1
 	.if \rstor_r11
-	movq_cfi_restore 0*8, r11
+	movq_cfi_restore 6*8, r11
 	.endif
-
 	.if \rstor_r8910
-	movq_cfi_restore 1*8, r10
-	movq_cfi_restore 2*8, r9
-	movq_cfi_restore 3*8, r8
+	movq_cfi_restore 7*8, r10
+	movq_cfi_restore 8*8, r9
+	movq_cfi_restore 9*8, r8
 	.endif
-
 	.if \rstor_rax
-	movq_cfi_restore 4*8, rax
+	movq_cfi_restore 10*8, rax
 	.endif
-
 	.if \rstor_rcx
-	movq_cfi_restore 5*8, rcx
+	movq_cfi_restore 11*8, rcx
 	.endif
-
 	.if \rstor_rdx
-	movq_cfi_restore 6*8, rdx
-	.endif
-
-	movq_cfi_restore 7*8, rsi
-	movq_cfi_restore 8*8, rdi
-
-	.if ARG_SKIP+\addskip > 0
-	addq $ARG_SKIP+\addskip, %rsp
-	CFI_ADJUST_CFA_OFFSET	-(ARG_SKIP+\addskip)
+	movq_cfi_restore 12*8, rdx
 	.endif
+	movq_cfi_restore 13*8, rsi
+	movq_cfi_restore 14*8, rdi
 	.endm
-
-	.macro LOAD_ARGS offset, skiprax=0
-	movq \offset(%rsp),    %r11
-	movq \offset+8(%rsp),  %r10
-	movq \offset+16(%rsp), %r9
-	movq \offset+24(%rsp), %r8
-	movq \offset+40(%rsp), %rcx
-	movq \offset+48(%rsp), %rdx
-	movq \offset+56(%rsp), %rsi
-	movq \offset+64(%rsp), %rdi
-	.if \skiprax
-	.else
-	movq \offset+72(%rsp), %rax
-	.endif
+	.macro RESTORE_C_REGS
+	RESTORE_C_REGS_HELPER 1,1,1,1,1
 	.endm
-
-#define REST_SKIP	(6*8)
-
-	.macro SAVE_REST
-	subq $REST_SKIP, %rsp
-	CFI_ADJUST_CFA_OFFSET	REST_SKIP
-	movq_cfi rbx, 5*8
-	movq_cfi rbp, 4*8
-	movq_cfi r12, 3*8
-	movq_cfi r13, 2*8
-	movq_cfi r14, 1*8
-	movq_cfi r15, 0*8
+	.macro RESTORE_C_REGS_EXCEPT_RAX
+	RESTORE_C_REGS_HELPER 0,1,1,1,1
 	.endm
-
-	.macro RESTORE_REST
-	movq_cfi_restore 0*8, r15
-	movq_cfi_restore 1*8, r14
-	movq_cfi_restore 2*8, r13
-	movq_cfi_restore 3*8, r12
-	movq_cfi_restore 4*8, rbp
-	movq_cfi_restore 5*8, rbx
-	addq $REST_SKIP, %rsp
-	CFI_ADJUST_CFA_OFFSET	-(REST_SKIP)
+	.macro RESTORE_C_REGS_EXCEPT_RCX
+	RESTORE_C_REGS_HELPER 1,0,1,1,1
 	.endm
-
-	.macro SAVE_ALL
-	SAVE_ARGS
-	SAVE_REST
+	.macro RESTORE_RSI_RDI
+	RESTORE_C_REGS_HELPER 0,0,0,0,0
+	.endm
+	.macro RESTORE_RSI_RDI_RDX
+	RESTORE_C_REGS_HELPER 0,0,0,0,1
 	.endm
 
-	.macro RESTORE_ALL addskip=0
-	RESTORE_REST
-	RESTORE_ARGS 1, \addskip
+	.macro REMOVE_PT_GPREGS_FROM_STACK addskip=0
+	addq $15*8+\addskip, %rsp
+	CFI_ADJUST_CFA_OFFSET -(15*8+\addskip)
 	.endm
 
 	.macro icebp
diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h
index 0a8b519226b8..021bee9b86b6 100644
--- a/arch/x86/include/asm/irqflags.h
+++ b/arch/x86/include/asm/irqflags.h
@@ -171,9 +171,9 @@ static inline int arch_irqs_disabled(void)
 #define ARCH_LOCKDEP_SYS_EXIT_IRQ	\
 	TRACE_IRQS_ON; \
 	sti; \
-	SAVE_REST; \
+	SAVE_EXTRA_REGS; \
 	LOCKDEP_SYS_EXIT; \
-	RESTORE_REST; \
+	RESTORE_EXTRA_REGS; \
 	cli; \
 	TRACE_IRQS_OFF;
 
diff --git a/arch/x86/include/uapi/asm/ptrace-abi.h b/arch/x86/include/uapi/asm/ptrace-abi.h
index 7b0a55a88851..ad115bf779f3 100644
--- a/arch/x86/include/uapi/asm/ptrace-abi.h
+++ b/arch/x86/include/uapi/asm/ptrace-abi.h
@@ -49,7 +49,6 @@
 #define EFLAGS 144
 #define RSP 152
 #define SS 160
-#define ARGOFFSET R11
 #endif /* __ASSEMBLY__ */
 
 /* top of stack page */
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index a57b3387ec7d..e8372e08f8c2 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -26,12 +26,6 @@
  * Some macro usage:
  * - CFI macros are used to generate dwarf2 unwind information for better
  * backtraces. They don't change any code.
- * - SAVE_ALL/RESTORE_ALL - Save/restore all registers
- * - SAVE_ARGS/RESTORE_ARGS - Save/restore registers that C functions modify.
- * There are unfortunately lots of special cases where some registers
- * not touched. The macro is a big mess that should be cleaned up.
- * - SAVE_REST/RESTORE_REST - Handle the registers not saved by SAVE_ARGS.
- * Gives a full stack frame.
  * - ENTRY/END Define functions in the symbol table.
  * - FIXUP_TOP_OF_STACK/RESTORE_TOP_OF_STACK - Fix up the hardware stack
  * frame that is otherwise undefined after a SYSCALL
@@ -190,9 +184,9 @@ ENDPROC(native_usergs_sysret64)
 	.endm
 
 /*
- * frame that enables calling into C.
+ * frame that enables passing a complete pt_regs to a C function.
  */
-	.macro PARTIAL_FRAME start=1 offset=0
+	.macro DEFAULT_FRAME start=1 offset=0
 	XCPT_FRAME \start, ORIG_RAX+\offset-ARGOFFSET
 	CFI_REL_OFFSET rdi, RDI+\offset-ARGOFFSET
 	CFI_REL_OFFSET rsi, RSI+\offset-ARGOFFSET
@@ -203,13 +197,6 @@ ENDPROC(native_usergs_sysret64)
 	CFI_REL_OFFSET r9, R9+\offset-ARGOFFSET
 	CFI_REL_OFFSET r10, R10+\offset-ARGOFFSET
 	CFI_REL_OFFSET r11, R11+\offset-ARGOFFSET
-	.endm
-
-/*
- * frame that enables passing a complete pt_regs to a C function.
- */
-	.macro DEFAULT_FRAME start=1 offset=0
-	PARTIAL_FRAME \start, R11+\offset-R15
 	CFI_REL_OFFSET rbx, RBX+\offset
 	CFI_REL_OFFSET rbp, RBP+\offset
 	CFI_REL_OFFSET r12, R12+\offset
@@ -221,21 +208,8 @@ ENDPROC(native_usergs_sysret64)
 ENTRY(save_paranoid)
 	XCPT_FRAME 1 RDI+8
 	cld
-	movq %rdi, RDI+8(%rsp)
-	movq %rsi, RSI+8(%rsp)
-	movq_cfi rdx, RDX+8
-	movq_cfi rcx, RCX+8
-	movq_cfi rax, RAX+8
-	movq %r8, R8+8(%rsp)
-	movq %r9, R9+8(%rsp)
-	movq %r10, R10+8(%rsp)
-	movq %r11, R11+8(%rsp)
-	movq_cfi rbx, RBX+8
-	movq %rbp, RBP+8(%rsp)
-	movq %r12, R12+8(%rsp)
-	movq %r13, R13+8(%rsp)
-	movq %r14, R14+8(%rsp)
-	movq %r15, R15+8(%rsp)
+	SAVE_C_REGS 8
+	SAVE_EXTRA_REGS 8
 	movl $1,%ebx
 	movl $MSR_GS_BASE,%ecx
 	rdmsr
@@ -264,7 +238,7 @@ ENTRY(ret_from_fork)
 
 	GET_THREAD_INFO(%rcx)
 
-	RESTORE_REST
+	RESTORE_EXTRA_REGS
 
 	testl $3, CS-ARGOFFSET(%rsp)		# from kernel_thread?
 	jz   1f
@@ -276,12 +250,10 @@ ENTRY(ret_from_fork)
 	jmp ret_from_sys_call			# go to the SYSRET fastpath
 
 1:
-	subq $REST_SKIP, %rsp	# leave space for volatiles
-	CFI_ADJUST_CFA_OFFSET	REST_SKIP
 	movq %rbp, %rdi
 	call *%rbx
 	movl $0, RAX(%rsp)
-	RESTORE_REST
+	RESTORE_EXTRA_REGS
 	jmp int_ret_from_sys_call
 	CFI_ENDPROC
 END(ret_from_fork)
@@ -339,9 +311,11 @@ GLOBAL(system_call_after_swapgs)
 	 * and short:
 	 */
 	ENABLE_INTERRUPTS(CLBR_NONE)
-	SAVE_ARGS 8, 0, rax_enosys=1
+	ALLOC_PT_GPREGS_ON_STACK 8
+	SAVE_C_REGS_EXCEPT_RAX_RCX
+	movq	$-ENOSYS,RAX-ARGOFFSET(%rsp)
 	movq_cfi rax,(ORIG_RAX-ARGOFFSET)
-	movq  %rcx,RIP-ARGOFFSET(%rsp)
+	movq	%rcx,RIP-ARGOFFSET(%rsp)
 	CFI_REL_OFFSET rip,RIP-ARGOFFSET
 	testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
 	jnz tracesys
@@ -372,9 +346,9 @@ ret_from_sys_call:
 	 * sysretq will re-enable interrupts:
 	 */
 	TRACE_IRQS_ON
+	RESTORE_C_REGS_EXCEPT_RCX
 	movq RIP-ARGOFFSET(%rsp),%rcx
 	CFI_REGISTER	rip,rcx
-	RESTORE_ARGS 1,-ARG_SKIP,0
 	/*CFI_REGISTER	rflags,r11*/
 	movq	PER_CPU_VAR(old_rsp), %rsp
 	USERGS_SYSRET64
@@ -387,16 +361,17 @@ int_ret_from_sys_call_fixup:
 
 	/* Do syscall tracing */
 tracesys:
-	leaq -REST_SKIP(%rsp), %rdi
+	movq %rsp, %rdi
 	movq $AUDIT_ARCH_X86_64, %rsi
 	call syscall_trace_enter_phase1
 	test %rax, %rax
 	jnz tracesys_phase2		/* if needed, run the slow path */
-	LOAD_ARGS 0			/* else restore clobbered regs */
+	RESTORE_C_REGS_EXCEPT_RAX	/* else restore clobbered regs */
+	movq ORIG_RAX-ARGOFFSET(%rsp), %rax
 	jmp system_call_fastpath	/*      and return to the fast path */
 
 tracesys_phase2:
-	SAVE_REST
+	SAVE_EXTRA_REGS
 	FIXUP_TOP_OF_STACK %rdi
 	movq %rsp, %rdi
 	movq $AUDIT_ARCH_X86_64, %rsi
@@ -408,8 +383,8 @@ tracesys_phase2:
 	 * We don't reload %rax because syscall_trace_entry_phase2() returned
 	 * the value it wants us to use in the table lookup.
 	 */
-	LOAD_ARGS ARGOFFSET, 1
-	RESTORE_REST
+	RESTORE_C_REGS_EXCEPT_RAX
+	RESTORE_EXTRA_REGS
 #if __SYSCALL_MASK == ~0
 	cmpq $__NR_syscall_max,%rax
 #else
@@ -460,7 +435,7 @@ int_very_careful:
 	TRACE_IRQS_ON
 	ENABLE_INTERRUPTS(CLBR_NONE)
 int_check_syscall_exit_work:
-	SAVE_REST
+	SAVE_EXTRA_REGS
 	/* Check for syscall exit trace */
 	testl $_TIF_WORK_SYSCALL_EXIT,%edx
 	jz int_signal
@@ -479,7 +454,7 @@ int_signal:
 	call do_notify_resume
 1:	movl $_TIF_WORK_MASK,%edi
 int_restore_rest:
-	RESTORE_REST
+	RESTORE_EXTRA_REGS
 	DISABLE_INTERRUPTS(CLBR_NONE)
 	TRACE_IRQS_OFF
 	jmp int_with_check
@@ -489,15 +464,12 @@ END(system_call)
 	.macro FORK_LIKE func
 ENTRY(stub_\func)
 	CFI_STARTPROC
-	popq	%r11			/* save return address */
-	PARTIAL_FRAME 0
-	SAVE_REST
-	pushq	%r11			/* put it back on stack */
+	DEFAULT_FRAME 0, 8		/* offset 8: return address */
+	SAVE_EXTRA_REGS 8
 	FIXUP_TOP_OF_STACK %r11, 8
-	DEFAULT_FRAME 0 8		/* offset 8: return address */
 	call sys_\func
 	RESTORE_TOP_OF_STACK %r11, 8
-	ret $REST_SKIP		/* pop extended registers */
+	ret
 	CFI_ENDPROC
 END(stub_\func)
 	.endm
@@ -505,7 +477,7 @@ END(stub_\func)
 	.macro FIXED_FRAME label,func
 ENTRY(\label)
 	CFI_STARTPROC
-	PARTIAL_FRAME 0 8		/* offset 8: return address */
+	DEFAULT_FRAME 0, 8		/* offset 8: return address */
 	FIXUP_TOP_OF_STACK %r11, 8-ARGOFFSET
 	call \func
 	RESTORE_TOP_OF_STACK %r11, 8-ARGOFFSET
@@ -522,12 +494,12 @@ END(\label)
 ENTRY(stub_execve)
 	CFI_STARTPROC
 	addq $8, %rsp
-	PARTIAL_FRAME 0
-	SAVE_REST
+	DEFAULT_FRAME 0
+	SAVE_EXTRA_REGS
 	FIXUP_TOP_OF_STACK %r11
 	call sys_execve
 	movq %rax,RAX(%rsp)
-	RESTORE_REST
+	RESTORE_EXTRA_REGS
 	jmp int_ret_from_sys_call
 	CFI_ENDPROC
 END(stub_execve)
@@ -535,13 +507,13 @@ END(stub_execve)
 ENTRY(stub_execveat)
 	CFI_STARTPROC
 	addq $8, %rsp
-	PARTIAL_FRAME 0
-	SAVE_REST
+	DEFAULT_FRAME 0
+	SAVE_EXTRA_REGS
 	FIXUP_TOP_OF_STACK %r11
 	call sys_execveat
 	RESTORE_TOP_OF_STACK %r11
 	movq %rax,RAX(%rsp)
-	RESTORE_REST
+	RESTORE_EXTRA_REGS
 	jmp int_ret_from_sys_call
 	CFI_ENDPROC
 END(stub_execveat)
@@ -553,12 +525,12 @@ END(stub_execveat)
 ENTRY(stub_rt_sigreturn)
 	CFI_STARTPROC
 	addq $8, %rsp
-	PARTIAL_FRAME 0
-	SAVE_REST
+	DEFAULT_FRAME 0
+	SAVE_EXTRA_REGS
 	FIXUP_TOP_OF_STACK %r11
 	call sys_rt_sigreturn
 	movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer
-	RESTORE_REST
+	RESTORE_EXTRA_REGS
 	jmp int_ret_from_sys_call
 	CFI_ENDPROC
 END(stub_rt_sigreturn)
@@ -567,12 +539,12 @@ END(stub_rt_sigreturn)
 ENTRY(stub_x32_rt_sigreturn)
 	CFI_STARTPROC
 	addq $8, %rsp
-	PARTIAL_FRAME 0
-	SAVE_REST
+	DEFAULT_FRAME 0
+	SAVE_EXTRA_REGS
 	FIXUP_TOP_OF_STACK %r11
 	call sys32_x32_rt_sigreturn
 	movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer
-	RESTORE_REST
+	RESTORE_EXTRA_REGS
 	jmp int_ret_from_sys_call
 	CFI_ENDPROC
 END(stub_x32_rt_sigreturn)
@@ -580,13 +552,13 @@ END(stub_x32_rt_sigreturn)
 ENTRY(stub_x32_execve)
 	CFI_STARTPROC
 	addq $8, %rsp
-	PARTIAL_FRAME 0
-	SAVE_REST
+	DEFAULT_FRAME 0
+	SAVE_EXTRA_REGS
 	FIXUP_TOP_OF_STACK %r11
 	call compat_sys_execve
 	RESTORE_TOP_OF_STACK %r11
 	movq %rax,RAX(%rsp)
-	RESTORE_REST
+	RESTORE_EXTRA_REGS
 	jmp int_ret_from_sys_call
 	CFI_ENDPROC
 END(stub_x32_execve)
@@ -594,13 +566,13 @@ END(stub_x32_execve)
 ENTRY(stub_x32_execveat)
 	CFI_STARTPROC
 	addq $8, %rsp
-	PARTIAL_FRAME 0
-	SAVE_REST
+	DEFAULT_FRAME 0
+	SAVE_EXTRA_REGS
 	FIXUP_TOP_OF_STACK %r11
 	call compat_sys_execveat
 	RESTORE_TOP_OF_STACK %r11
 	movq %rax,RAX(%rsp)
-	RESTORE_REST
+	RESTORE_EXTRA_REGS
 	jmp int_ret_from_sys_call
 	CFI_ENDPROC
 END(stub_x32_execveat)
@@ -656,42 +628,28 @@ END(interrupt)
 
 /* 0(%rsp): ~(interrupt number) */
 	.macro interrupt func
-	/* reserve pt_regs for scratch regs and rbp */
-	subq $ORIG_RAX-RBP, %rsp
-	CFI_ADJUST_CFA_OFFSET ORIG_RAX-RBP
 	cld
-	/* start from rbp in pt_regs and jump over */
-	movq_cfi rdi, (RDI-RBP)
-	movq_cfi rsi, (RSI-RBP)
-	movq_cfi rdx, (RDX-RBP)
-	movq_cfi rcx, (RCX-RBP)
-	movq_cfi rax, (RAX-RBP)
-	movq_cfi  r8,  (R8-RBP)
-	movq_cfi  r9,  (R9-RBP)
-	movq_cfi r10, (R10-RBP)
-	movq_cfi r11, (R11-RBP)
-
-	/* Save rbp so that we can unwind from get_irq_regs() */
-	movq_cfi rbp, 0
-
-	/* Save previous stack value */
-	movq %rsp, %rsi
+	ALLOC_PT_GPREGS_ON_STACK -RBP
+	SAVE_C_REGS -RBP
+	/* this goes to 0(%rsp) for unwinder, not for saving the value: */
+	SAVE_EXTRA_REGS_RBP -RBP
+
+	leaq -RBP(%rsp),%rdi	/* arg1 for \func (pointer to pt_regs) */
 
-	leaq -RBP(%rsp),%rdi	/* arg1 for handler */
-	testl $3, CS-RBP(%rsi)
+	testl $3, CS-RBP(%rsp)
 	je 1f
 	SWAPGS
+1:
 	/*
 	 * irq_count is used to check if a CPU is already on an interrupt stack
 	 * or not. While this is essentially redundant with preempt_count it is
 	 * a little cheaper to use a separate counter in the PDA (short of
 	 * moving irq_enter into assembly, which would be too much work)
 	 */
-1:	incl PER_CPU_VAR(irq_count)
+	movq %rsp, %rsi
+	incl PER_CPU_VAR(irq_count)
 	cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp
 	CFI_DEF_CFA_REGISTER	rsi
-
-	/* Store previous stack value */
 	pushq %rsi
 	CFI_ESCAPE	0x0f /* DW_CFA_def_cfa_expression */, 6, \
 			0x77 /* DW_OP_breg7 */, 0, \
@@ -800,7 +758,8 @@ retint_swapgs:		/* return to user-space */
 	 */
 irq_return_via_sysret:
 	CFI_REMEMBER_STATE
-	RESTORE_ARGS 1,8,1
+	RESTORE_C_REGS
+	REMOVE_PT_GPREGS_FROM_STACK 8
 	movq (RSP-RIP)(%rsp),%rsp
 	USERGS_SYSRET64
 	CFI_RESTORE_STATE
@@ -816,7 +775,8 @@ retint_restore_args:	/* return to kernel space */
 	 */
 	TRACE_IRQS_IRETQ
 restore_args:
-	RESTORE_ARGS 1,8,1
+	RESTORE_C_REGS
+	REMOVE_PT_GPREGS_FROM_STACK 8
 
 irq_return:
 	INTERRUPT_RETURN
@@ -887,12 +847,12 @@ retint_signal:
 	jz    retint_swapgs
 	TRACE_IRQS_ON
 	ENABLE_INTERRUPTS(CLBR_NONE)
-	SAVE_REST
+	SAVE_EXTRA_REGS
 	movq $-1,ORIG_RAX(%rsp)
 	xorl %esi,%esi		# oldset
 	movq %rsp,%rdi		# &pt_regs
 	call do_notify_resume
-	RESTORE_REST
+	RESTORE_EXTRA_REGS
 	DISABLE_INTERRUPTS(CLBR_NONE)
 	TRACE_IRQS_OFF
 	GET_THREAD_INFO(%rcx)
@@ -1019,8 +979,7 @@ ENTRY(\sym)
 	pushq_cfi $-1			/* ORIG_RAX: no syscall to restart */
 	.endif
 
-	subq $ORIG_RAX-R15, %rsp
-	CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
+	ALLOC_PT_GPREGS_ON_STACK
 
 	.if \paranoid
 	.if \paranoid == 1
@@ -1269,7 +1228,9 @@ ENTRY(xen_failsafe_callback)
 	addq $0x30,%rsp
 	CFI_ADJUST_CFA_OFFSET -0x30
 	pushq_cfi $-1 /* orig_ax = -1 => not a system call */
-	SAVE_ALL
+	ALLOC_PT_GPREGS_ON_STACK
+	SAVE_C_REGS
+	SAVE_EXTRA_REGS
 	jmp error_exit
 	CFI_ENDPROC
 END(xen_failsafe_callback)
@@ -1321,11 +1282,15 @@ ENTRY(paranoid_exit)
 	jnz paranoid_restore
 	TRACE_IRQS_IRETQ 0
 	SWAPGS_UNSAFE_STACK
-	RESTORE_ALL 8
+	RESTORE_EXTRA_REGS
+	RESTORE_C_REGS
+	REMOVE_PT_GPREGS_FROM_STACK 8
 	INTERRUPT_RETURN
 paranoid_restore:
 	TRACE_IRQS_IRETQ_DEBUG 0
-	RESTORE_ALL 8
+	RESTORE_EXTRA_REGS
+	RESTORE_C_REGS
+	REMOVE_PT_GPREGS_FROM_STACK 8
 	INTERRUPT_RETURN
 	CFI_ENDPROC
 END(paranoid_exit)
@@ -1339,21 +1304,8 @@ ENTRY(error_entry)
 	CFI_ADJUST_CFA_OFFSET 15*8
 	/* oldrax contains error code */
 	cld
-	movq %rdi, RDI+8(%rsp)
-	movq %rsi, RSI+8(%rsp)
-	movq %rdx, RDX+8(%rsp)
-	movq %rcx, RCX+8(%rsp)
-	movq %rax, RAX+8(%rsp)
-	movq  %r8,  R8+8(%rsp)
-	movq  %r9,  R9+8(%rsp)
-	movq %r10, R10+8(%rsp)
-	movq %r11, R11+8(%rsp)
-	movq_cfi rbx, RBX+8
-	movq %rbp, RBP+8(%rsp)
-	movq %r12, R12+8(%rsp)
-	movq %r13, R13+8(%rsp)
-	movq %r14, R14+8(%rsp)
-	movq %r15, R15+8(%rsp)
+	SAVE_C_REGS 8
+	SAVE_EXTRA_REGS 8
 	xorl %ebx,%ebx
 	testl $3,CS+8(%rsp)
 	je error_kernelspace
@@ -1402,7 +1354,7 @@ END(error_entry)
 ENTRY(error_exit)
 	DEFAULT_FRAME
 	movl %ebx,%eax
-	RESTORE_REST
+	RESTORE_EXTRA_REGS
 	DISABLE_INTERRUPTS(CLBR_NONE)
 	TRACE_IRQS_OFF
 	GET_THREAD_INFO(%rcx)
@@ -1621,8 +1573,8 @@ end_repeat_nmi:
 	 * so that we repeat another NMI.
 	 */
 	pushq_cfi $-1		/* ORIG_RAX: no syscall to restart */
-	subq $ORIG_RAX-R15, %rsp
-	CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
+	ALLOC_PT_GPREGS_ON_STACK
+
 	/*
 	 * Use save_paranoid to handle SWAPGS, but no need to use paranoid_exit
 	 * as we should not be calling schedule in NMI context.
@@ -1661,8 +1613,10 @@ end_repeat_nmi:
 nmi_swapgs:
 	SWAPGS_UNSAFE_STACK
 nmi_restore:
+	RESTORE_EXTRA_REGS
+	RESTORE_C_REGS
 	/* Pop the extra iret frame at once */
-	RESTORE_ALL 6*8
+	REMOVE_PT_GPREGS_FROM_STACK 6*8
 
 	/* Clear the NMI executing stack variable */
 	movq $0, 5*8(%rsp)
-- 
cgit v1.2.3


From e90e147cbc0cbc8dcf48000e15190badf75250f4 Mon Sep 17 00:00:00 2001
From: Denys Vlasenko
Date: Thu, 26 Feb 2015 14:40:28 -0800
Subject: x86/asm/entry/64: Fix comments

 - Misleading and slightly incorrect comments in "struct pt_regs" are
   fixed (four instances).

 - Fix incorrect comment atop EMPTY_FRAME macro.

 - Explain in more detail what we do with stack layout during hw interrupt.

 - Correct comments about "partial stack frame" which are no longer
   true.

Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
Signed-off-by: Andy Lutomirski <luto@amacapital.net>
Cc: Alexei Starovoitov <ast@plumgrid.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Will Drewry <wad@chromium.org>
Link: http://lkml.kernel.org/r/1423778052-21038-3-git-send-email-dvlasenk@redhat.com
Link: http://lkml.kernel.org/r/e1f4429c491fe6ceeddb879dea2786e0f8920f9c.1424989793.git.luto@amacapital.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/ptrace.h          | 13 ++++++++++---
 arch/x86/include/uapi/asm/ptrace-abi.h | 15 +++++++++++----
 arch/x86/include/uapi/asm/ptrace.h     | 13 ++++++++++---
 arch/x86/kernel/entry_64.S             | 18 ++++++++++++------
 4 files changed, 43 insertions(+), 16 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index 86fc2bb82287..4077d963a1a0 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -31,13 +31,17 @@ struct pt_regs {
 #else /* __i386__ */
 
 struct pt_regs {
+/*
+ * C ABI says these regs are callee-preserved. They aren't saved on kernel entry
+ * unless syscall needs a complete, fully filled "struct pt_regs".
+ */
 	unsigned long r15;
 	unsigned long r14;
 	unsigned long r13;
 	unsigned long r12;
 	unsigned long bp;
 	unsigned long bx;
-/* arguments: non interrupts/non tracing syscalls only save up to here*/
+/* These regs are callee-clobbered. Always saved on kernel entry. */
 	unsigned long r11;
 	unsigned long r10;
 	unsigned long r9;
@@ -47,9 +51,12 @@ struct pt_regs {
 	unsigned long dx;
 	unsigned long si;
 	unsigned long di;
+/*
+ * On syscall entry, this is syscall#. On CPU exception, this is error code.
+ * On hw interrupt, it's IRQ number:
+ */
 	unsigned long orig_ax;
-/* end of arguments */
-/* cpu exception frame or undefined */
+/* Return frame for iretq */
 	unsigned long ip;
 	unsigned long cs;
 	unsigned long flags;
diff --git a/arch/x86/include/uapi/asm/ptrace-abi.h b/arch/x86/include/uapi/asm/ptrace-abi.h
index ad115bf779f3..580aee3072e0 100644
--- a/arch/x86/include/uapi/asm/ptrace-abi.h
+++ b/arch/x86/include/uapi/asm/ptrace-abi.h
@@ -25,13 +25,17 @@
 #else /* __i386__ */
 
 #if defined(__ASSEMBLY__) || defined(__FRAME_OFFSETS)
+/*
+ * C ABI says these regs are callee-preserved. They aren't saved on kernel entry
+ * unless syscall needs a complete, fully filled "struct pt_regs".
+ */
 #define R15 0
 #define R14 8
 #define R13 16
 #define R12 24
 #define RBP 32
 #define RBX 40
-/* arguments: interrupts/non tracing syscalls only save up to here*/
+/* These regs are callee-clobbered. Always saved on kernel entry. */
 #define R11 48
 #define R10 56
 #define R9 64
@@ -41,9 +45,12 @@
 #define RDX 96
 #define RSI 104
 #define RDI 112
-#define ORIG_RAX 120       /* = ERROR */
-/* end of arguments */
-/* cpu exception frame or undefined in case of fast syscall. */
+/*
+ * On syscall entry, this is syscall#. On CPU exception, this is error code.
+ * On hw interrupt, it's IRQ number:
+ */
+#define ORIG_RAX 120
+/* Return frame for iretq */
 #define RIP 128
 #define CS 136
 #define EFLAGS 144
diff --git a/arch/x86/include/uapi/asm/ptrace.h b/arch/x86/include/uapi/asm/ptrace.h
index ac4b9aa4d999..bc16115af39b 100644
--- a/arch/x86/include/uapi/asm/ptrace.h
+++ b/arch/x86/include/uapi/asm/ptrace.h
@@ -41,13 +41,17 @@ struct pt_regs {
 #ifndef __KERNEL__
 
 struct pt_regs {
+/*
+ * C ABI says these regs are callee-preserved. They aren't saved on kernel entry
+ * unless syscall needs a complete, fully filled "struct pt_regs".
+ */
 	unsigned long r15;
 	unsigned long r14;
 	unsigned long r13;
 	unsigned long r12;
 	unsigned long rbp;
 	unsigned long rbx;
-/* arguments: non interrupts/non tracing syscalls only save up to here*/
+/* These regs are callee-clobbered. Always saved on kernel entry. */
 	unsigned long r11;
 	unsigned long r10;
 	unsigned long r9;
@@ -57,9 +61,12 @@ struct pt_regs {
 	unsigned long rdx;
 	unsigned long rsi;
 	unsigned long rdi;
+/*
+ * On syscall entry, this is syscall#. On CPU exception, this is error code.
+ * On hw interrupt, it's IRQ number:
+ */
 	unsigned long orig_rax;
-/* end of arguments */
-/* cpu exception frame or undefined */
+/* Return frame for iretq */
 	unsigned long rip;
 	unsigned long cs;
 	unsigned long eflags;
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index e8372e08f8c2..695f4d434a84 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -14,9 +14,6 @@
  * NOTE: This code handles signal-recognition, which happens every time
  * after an interrupt and after each system call.
  *
- * Normal syscalls and interrupts don't save a full stack frame, this is
- * only done for syscall tracing, signals or fork/exec et.al.
- *
  * A note on terminology:
  * - top of stack: Architecture defined interrupt frame from SS to RIP
  * at the top of the kernel process stack.
@@ -151,7 +148,7 @@ ENDPROC(native_usergs_sysret64)
 	.endm
 
 /*
- * initial frame state for interrupts (and exceptions without error code)
+ * empty frame
  */
 	.macro EMPTY_FRAME start=1 offset=0
 	.if \start
@@ -379,7 +376,7 @@ tracesys_phase2:
 	call syscall_trace_enter_phase2
 
 	/*
-	 * Reload arg registers from stack in case ptrace changed them.
+	 * Reload registers from stack in case ptrace changed them.
 	 * We don't reload %rax because syscall_trace_entry_phase2() returned
 	 * the value it wants us to use in the table lookup.
 	 */
@@ -629,6 +626,13 @@ END(interrupt)
 /* 0(%rsp): ~(interrupt number) */
 	.macro interrupt func
 	cld
+	/*
+	 * Since nothing in interrupt handling code touches r12...r15 members
+	 * of "struct pt_regs", and since interrupts can nest, we can save
+	 * four stack slots and simultaneously provide
+	 * an unwind-friendly stack layout by saving "truncated" pt_regs
+	 * exactly up to rbp slot, without these members.
+	 */
 	ALLOC_PT_GPREGS_ON_STACK -RBP
 	SAVE_C_REGS -RBP
 	/* this goes to 0(%rsp) for unwinder, not for saving the value: */
@@ -641,6 +645,7 @@ END(interrupt)
 	SWAPGS
 1:
 	/*
+	 * Save previous stack pointer, optionally switch to interrupt stack.
 	 * irq_count is used to check if a CPU is already on an interrupt stack
 	 * or not. While this is essentially redundant with preempt_count it is
 	 * a little cheaper to use a separate counter in the PDA (short of
@@ -681,6 +686,7 @@ ret_from_intr:
 	/* Restore saved previous stack */
 	popq %rsi
 	CFI_DEF_CFA rsi,SS+8-RBP	/* reg/off reset after def_cfa_expr */
+	/* return code expects complete pt_regs - adjust rsp accordingly: */
 	leaq ARGOFFSET-RBP(%rsi), %rsp
 	CFI_DEF_CFA_REGISTER	rsp
 	CFI_ADJUST_CFA_OFFSET	RBP-ARGOFFSET
@@ -692,7 +698,7 @@ exit_intr:
 
 	/* Interrupt came from user space */
 	/*
-	 * Has a correct top of stack, but a partial stack frame
+	 * Has a correct top of stack.
 	 * %rcx: thread info. Interrupts off.
 	 */
 retint_with_reschedule:
-- 
cgit v1.2.3


From 0d55083698ed2f498e5682c5c252e6b7224890be Mon Sep 17 00:00:00 2001
From: Denys Vlasenko
Date: Thu, 26 Feb 2015 14:40:29 -0800
Subject: x86/asm/entry/64: Shrink code in 'paranoid_exit'

RESTORE_EXTRA_REGS + RESTORE_C_REGS looks small, but it's
a lot of instructions (fourteen). Let's reuse them.

Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
[ Cleaned up the labels. ]
Signed-off-by: Andy Lutomirski <luto@amacapital.net>
Cc: Alexei Starovoitov <ast@plumgrid.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Will Drewry <wad@chromium.org>
Link: http://lkml.kernel.org/r/1421272101-16847-2-git-send-email-dvlasenk@redhat.com
Link: http://lkml.kernel.org/r/59d71848cee3ec9eb48c0252e602efd6bd560e3c.1424989793.git.luto@amacapital.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/entry_64.S | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 695f4d434a84..8fafed9f462d 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1285,15 +1285,13 @@ ENTRY(paranoid_exit)
 	DISABLE_INTERRUPTS(CLBR_NONE)
 	TRACE_IRQS_OFF_DEBUG
 	testl %ebx,%ebx				/* swapgs needed? */
-	jnz paranoid_restore
+	jnz paranoid_exit_no_swapgs
 	TRACE_IRQS_IRETQ 0
 	SWAPGS_UNSAFE_STACK
-	RESTORE_EXTRA_REGS
-	RESTORE_C_REGS
-	REMOVE_PT_GPREGS_FROM_STACK 8
-	INTERRUPT_RETURN
-paranoid_restore:
+	jmp paranoid_exit_restore
+paranoid_exit_no_swapgs:
 	TRACE_IRQS_IRETQ_DEBUG 0
+paranoid_exit_restore:
 	RESTORE_EXTRA_REGS
 	RESTORE_C_REGS
 	REMOVE_PT_GPREGS_FROM_STACK 8
-- 
cgit v1.2.3


From f2db9382c1140914cfdef224ce907e443c9f9b81 Mon Sep 17 00:00:00 2001
From: Denys Vlasenko
Date: Thu, 26 Feb 2015 14:40:30 -0800
Subject: x86/asm/entry: Do mass removal of 'ARGOFFSET'

ARGOFFSET is zero now, removing it changes no code.

A few macros lost "offset" parameter, since it is always zero
now too.

No code changes - verified with objdump.

Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
Signed-off-by: Andy Lutomirski <luto@amacapital.net>
Cc: Alexei Starovoitov <ast@plumgrid.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Will Drewry <wad@chromium.org>
Link: http://lkml.kernel.org/r/8689f937622d9d2db0ab8be82331fa15e4ed4713.1424989793.git.luto@amacapital.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/ia32/ia32entry.S      | 142 ++++++++++++++++++++---------------------
 arch/x86/include/asm/calling.h |   2 -
 arch/x86/kernel/entry_64.S     |  86 ++++++++++++-------------
 3 files changed, 114 insertions(+), 116 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index f4bed4971673..e99f8a5be2df 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -41,13 +41,13 @@
 	movl	%edx,%edx	/* zero extension */
 	.endm 
 
-	/* clobbers %eax */	
-	.macro  CLEAR_RREGS offset=0, _r9=rax
+	/* clobbers %rax */
+	.macro  CLEAR_RREGS _r9=rax
 	xorl 	%eax,%eax
-	movq	%rax,\offset+R11(%rsp)
-	movq	%rax,\offset+R10(%rsp)
-	movq	%\_r9,\offset+R9(%rsp)
-	movq	%rax,\offset+R8(%rsp)
+	movq	%rax,R11(%rsp)
+	movq	%rax,R10(%rsp)
+	movq	%\_r9,R9(%rsp)
+	movq	%rax,R8(%rsp)
 	.endm
 
 	/*
@@ -60,14 +60,14 @@
 	 * If it's -1 to make us punt the syscall, then (u32)-1 is still
 	 * an appropriately invalid value.
 	 */
-	.macro LOAD_ARGS32 offset, _r9=0
+	.macro LOAD_ARGS32 _r9=0
 	.if \_r9
-	movl \offset+R9(%rsp),%r9d
+	movl R9(%rsp),%r9d
 	.endif
-	movl \offset+RCX(%rsp),%ecx
-	movl \offset+RDX(%rsp),%edx
-	movl \offset+RSI(%rsp),%esi
-	movl \offset+RDI(%rsp),%edi
+	movl RCX(%rsp),%ecx
+	movl RDX(%rsp),%edx
+	movl RSI(%rsp),%esi
+	movl RDI(%rsp),%edi
 	movl %eax,%eax			/* zero extension */
 	.endm
 	
@@ -158,12 +158,12 @@ ENTRY(ia32_sysenter_target)
 	 * ourselves.  To save a few cycles, we can check whether
 	 * NT was set instead of doing an unconditional popfq.
 	 */
-	testl $X86_EFLAGS_NT,EFLAGS-ARGOFFSET(%rsp)
+	testl $X86_EFLAGS_NT,EFLAGS(%rsp)
 	jnz sysenter_fix_flags
 sysenter_flags_fixed:
 
-	orl     $TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET)
-	testl   $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+	orl     $TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP)
+	testl   $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP)
 	CFI_REMEMBER_STATE
 	jnz  sysenter_tracesys
 	cmpq	$(IA32_NR_syscalls-1),%rax
@@ -172,16 +172,16 @@ sysenter_do_call:
 	IA32_ARG_FIXUP
 sysenter_dispatch:
 	call	*ia32_sys_call_table(,%rax,8)
-	movq	%rax,RAX-ARGOFFSET(%rsp)
+	movq	%rax,RAX(%rsp)
 	DISABLE_INTERRUPTS(CLBR_NONE)
 	TRACE_IRQS_OFF
-	testl	$_TIF_ALLWORK_MASK,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+	testl	$_TIF_ALLWORK_MASK,TI_flags+THREAD_INFO(%rsp,RIP)
 	jnz	sysexit_audit
 sysexit_from_sys_call:
-	andl    $~TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+	andl    $~TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP)
 	/* clear IF, that popfq doesn't enable interrupts early */
-	andl	$~0x200,EFLAGS-ARGOFFSET(%rsp)
-	movl	RIP-ARGOFFSET(%rsp),%edx		/* User %eip */
+	andl	$~0x200,EFLAGS(%rsp)
+	movl	RIP(%rsp),%edx		/* User %eip */
 	CFI_REGISTER rip,rdx
 	RESTORE_RSI_RDI
 	REMOVE_PT_GPREGS_FROM_STACK 3*8
@@ -207,18 +207,18 @@ sysexit_from_sys_call:
 	movl %ebx,%esi			/* 2nd arg: 1st syscall arg */
 	movl %eax,%edi			/* 1st arg: syscall number */
 	call __audit_syscall_entry
-	movl RAX-ARGOFFSET(%rsp),%eax	/* reload syscall number */
+	movl RAX(%rsp),%eax	/* reload syscall number */
 	cmpq $(IA32_NR_syscalls-1),%rax
 	ja ia32_badsys
 	movl %ebx,%edi			/* reload 1st syscall arg */
-	movl RCX-ARGOFFSET(%rsp),%esi	/* reload 2nd syscall arg */
-	movl RDX-ARGOFFSET(%rsp),%edx	/* reload 3rd syscall arg */
-	movl RSI-ARGOFFSET(%rsp),%ecx	/* reload 4th syscall arg */
-	movl RDI-ARGOFFSET(%rsp),%r8d	/* reload 5th syscall arg */
+	movl RCX(%rsp),%esi	/* reload 2nd syscall arg */
+	movl RDX(%rsp),%edx	/* reload 3rd syscall arg */
+	movl RSI(%rsp),%ecx	/* reload 4th syscall arg */
+	movl RDI(%rsp),%r8d	/* reload 5th syscall arg */
 	.endm
 
 	.macro auditsys_exit exit
-	testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+	testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP)
 	jnz ia32_ret_from_sys_call
 	TRACE_IRQS_ON
 	ENABLE_INTERRUPTS(CLBR_NONE)
@@ -229,13 +229,13 @@ sysexit_from_sys_call:
 1:	setbe %al		/* 1 if error, 0 if not */
 	movzbl %al,%edi		/* zero-extend that into %edi */
 	call __audit_syscall_exit
-	movq RAX-ARGOFFSET(%rsp),%rax	/* reload syscall return value */
+	movq RAX(%rsp),%rax	/* reload syscall return value */
 	movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi
 	DISABLE_INTERRUPTS(CLBR_NONE)
 	TRACE_IRQS_OFF
-	testl %edi,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+	testl %edi,TI_flags+THREAD_INFO(%rsp,RIP)
 	jz \exit
-	CLEAR_RREGS -ARGOFFSET
+	CLEAR_RREGS
 	jmp int_with_check
 	.endm
 
@@ -255,7 +255,7 @@ sysenter_fix_flags:
 
 sysenter_tracesys:
 #ifdef CONFIG_AUDITSYSCALL
-	testl	$(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+	testl	$(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP)
 	jz	sysenter_auditsys
 #endif
 	SAVE_EXTRA_REGS
@@ -263,7 +263,7 @@ sysenter_tracesys:
 	movq	$-ENOSYS,RAX(%rsp)/* ptrace can change this for a bad syscall */
 	movq	%rsp,%rdi        /* &pt_regs -> arg1 */
 	call	syscall_trace_enter
-	LOAD_ARGS32 ARGOFFSET  /* reload args from stack in case ptrace changed it */
+	LOAD_ARGS32  /* reload args from stack in case ptrace changed it */
 	RESTORE_EXTRA_REGS
 	cmpq	$(IA32_NR_syscalls-1),%rax
 	ja	int_ret_from_sys_call /* sysenter_tracesys has set RAX(%rsp) */
@@ -309,17 +309,17 @@ ENTRY(ia32_cstar_target)
 	ALLOC_PT_GPREGS_ON_STACK 8
 	SAVE_C_REGS_EXCEPT_RCX_R891011
 	movl 	%eax,%eax	/* zero extension */
-	movq	%rax,ORIG_RAX-ARGOFFSET(%rsp)
-	movq	%rcx,RIP-ARGOFFSET(%rsp)
-	CFI_REL_OFFSET rip,RIP-ARGOFFSET
-	movq	%rbp,RCX-ARGOFFSET(%rsp) /* this lies slightly to ptrace */
+	movq	%rax,ORIG_RAX(%rsp)
+	movq	%rcx,RIP(%rsp)
+	CFI_REL_OFFSET rip,RIP
+	movq	%rbp,RCX(%rsp) /* this lies slightly to ptrace */
 	movl	%ebp,%ecx
-	movq	$__USER32_CS,CS-ARGOFFSET(%rsp)
-	movq	$__USER32_DS,SS-ARGOFFSET(%rsp)
-	movq	%r11,EFLAGS-ARGOFFSET(%rsp)
-	/*CFI_REL_OFFSET rflags,EFLAGS-ARGOFFSET*/
-	movq	%r8,RSP-ARGOFFSET(%rsp)	
-	CFI_REL_OFFSET rsp,RSP-ARGOFFSET
+	movq	$__USER32_CS,CS(%rsp)
+	movq	$__USER32_DS,SS(%rsp)
+	movq	%r11,EFLAGS(%rsp)
+	/*CFI_REL_OFFSET rflags,EFLAGS*/
+	movq	%r8,RSP(%rsp)
+	CFI_REL_OFFSET rsp,RSP
 	/* no need to do an access_ok check here because r8 has been
 	   32bit zero extended */ 
 	/* hardware stack frame is complete now */	
@@ -327,8 +327,8 @@ ENTRY(ia32_cstar_target)
 1:	movl	(%r8),%r9d
 	_ASM_EXTABLE(1b,ia32_badarg)
 	ASM_CLAC
-	orl     $TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET)
-	testl   $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+	orl     $TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP)
+	testl   $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP)
 	CFI_REMEMBER_STATE
 	jnz   cstar_tracesys
 	cmpq $IA32_NR_syscalls-1,%rax
@@ -337,32 +337,32 @@ cstar_do_call:
 	IA32_ARG_FIXUP 1
 cstar_dispatch:
 	call *ia32_sys_call_table(,%rax,8)
-	movq %rax,RAX-ARGOFFSET(%rsp)
+	movq %rax,RAX(%rsp)
 	DISABLE_INTERRUPTS(CLBR_NONE)
 	TRACE_IRQS_OFF
-	testl $_TIF_ALLWORK_MASK,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+	testl $_TIF_ALLWORK_MASK,TI_flags+THREAD_INFO(%rsp,RIP)
 	jnz sysretl_audit
 sysretl_from_sys_call:
-	andl $~TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+	andl $~TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP)
 	RESTORE_RSI_RDI_RDX
-	movl RIP-ARGOFFSET(%rsp),%ecx
+	movl RIP(%rsp),%ecx
 	CFI_REGISTER rip,rcx
-	movl EFLAGS-ARGOFFSET(%rsp),%r11d	
+	movl EFLAGS(%rsp),%r11d
 	/*CFI_REGISTER rflags,r11*/
 	xorq	%r10,%r10
 	xorq	%r9,%r9
 	xorq	%r8,%r8
 	TRACE_IRQS_ON
-	movl RSP-ARGOFFSET(%rsp),%esp
+	movl RSP(%rsp),%esp
 	CFI_RESTORE rsp
 	USERGS_SYSRET32
 	
 #ifdef CONFIG_AUDITSYSCALL
 cstar_auditsys:
 	CFI_RESTORE_STATE
-	movl %r9d,R9-ARGOFFSET(%rsp)	/* register to be clobbered by call */
+	movl %r9d,R9(%rsp)	/* register to be clobbered by call */
 	auditsys_entry_common
-	movl R9-ARGOFFSET(%rsp),%r9d	/* reload 6th syscall arg */
+	movl R9(%rsp),%r9d	/* reload 6th syscall arg */
 	jmp cstar_dispatch
 
 sysretl_audit:
@@ -371,16 +371,16 @@ sysretl_audit:
 
 cstar_tracesys:
 #ifdef CONFIG_AUDITSYSCALL
-	testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+	testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP)
 	jz cstar_auditsys
 #endif
 	xchgl %r9d,%ebp
 	SAVE_EXTRA_REGS
-	CLEAR_RREGS 0, r9
+	CLEAR_RREGS r9
 	movq $-ENOSYS,RAX(%rsp)	/* ptrace can change this for a bad syscall */
 	movq %rsp,%rdi        /* &pt_regs -> arg1 */
 	call syscall_trace_enter
-	LOAD_ARGS32 ARGOFFSET, 1  /* reload args from stack in case ptrace changed it */
+	LOAD_ARGS32 1	/* reload args from stack in case ptrace changed it */
 	RESTORE_EXTRA_REGS
 	xchgl %ebp,%r9d
 	cmpq $(IA32_NR_syscalls-1),%rax
@@ -438,8 +438,8 @@ ENTRY(ia32_syscall)
 	   this could be a problem. */
 	ALLOC_PT_GPREGS_ON_STACK
 	SAVE_C_REGS_EXCEPT_R891011
-	orl $TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET)
-	testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+	orl $TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP)
+	testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP)
 	jnz ia32_tracesys
 	cmpq $(IA32_NR_syscalls-1),%rax
 	ja ia32_badsys
@@ -447,9 +447,9 @@ ia32_do_call:
 	IA32_ARG_FIXUP
 	call *ia32_sys_call_table(,%rax,8) # xxx: rip relative
 ia32_sysret:
-	movq %rax,RAX-ARGOFFSET(%rsp)
+	movq %rax,RAX(%rsp)
 ia32_ret_from_sys_call:
-	CLEAR_RREGS -ARGOFFSET
+	CLEAR_RREGS
 	jmp int_ret_from_sys_call
 
 ia32_tracesys:
@@ -458,7 +458,7 @@ ia32_tracesys:
 	movq $-ENOSYS,RAX(%rsp)	/* ptrace can change this for a bad syscall */
 	movq %rsp,%rdi        /* &pt_regs -> arg1 */
 	call syscall_trace_enter
-	LOAD_ARGS32 ARGOFFSET  /* reload args from stack in case ptrace changed it */
+	LOAD_ARGS32	/* reload args from stack in case ptrace changed it */
 	RESTORE_EXTRA_REGS
 	cmpq $(IA32_NR_syscalls-1),%rax
 	ja  int_ret_from_sys_call	/* ia32_tracesys has set RAX(%rsp) */
@@ -466,7 +466,7 @@ ia32_tracesys:
 END(ia32_syscall)
 
 ia32_badsys:
-	movq $0,ORIG_RAX-ARGOFFSET(%rsp)
+	movq $0,ORIG_RAX(%rsp)
 	movq $-ENOSYS,%rax
 	jmp ia32_sysret
 
@@ -499,17 +499,17 @@ ia32_ptregs_common:
 	CFI_ENDPROC
 	CFI_STARTPROC32	simple
 	CFI_SIGNAL_FRAME
-	CFI_DEF_CFA	rsp,SS+8-ARGOFFSET
-	CFI_REL_OFFSET	rax,RAX-ARGOFFSET
-	CFI_REL_OFFSET	rcx,RCX-ARGOFFSET
-	CFI_REL_OFFSET	rdx,RDX-ARGOFFSET
-	CFI_REL_OFFSET	rsi,RSI-ARGOFFSET
-	CFI_REL_OFFSET	rdi,RDI-ARGOFFSET
-	CFI_REL_OFFSET	rip,RIP-ARGOFFSET
-/*	CFI_REL_OFFSET	cs,CS-ARGOFFSET*/
-/*	CFI_REL_OFFSET	rflags,EFLAGS-ARGOFFSET*/
-	CFI_REL_OFFSET	rsp,RSP-ARGOFFSET
-/*	CFI_REL_OFFSET	ss,SS-ARGOFFSET*/
+	CFI_DEF_CFA	rsp,SS+8
+	CFI_REL_OFFSET	rax,RAX
+	CFI_REL_OFFSET	rcx,RCX
+	CFI_REL_OFFSET	rdx,RDX
+	CFI_REL_OFFSET	rsi,RSI
+	CFI_REL_OFFSET	rdi,RDI
+	CFI_REL_OFFSET	rip,RIP
+/*	CFI_REL_OFFSET	cs,CS*/
+/*	CFI_REL_OFFSET	rflags,EFLAGS*/
+	CFI_REL_OFFSET	rsp,RSP
+/*	CFI_REL_OFFSET	ss,SS*/
 	SAVE_EXTRA_REGS 8
 	call *%rax
 	RESTORE_EXTRA_REGS 8
diff --git a/arch/x86/include/asm/calling.h b/arch/x86/include/asm/calling.h
index 38356476b131..4a7ceb9789a5 100644
--- a/arch/x86/include/asm/calling.h
+++ b/arch/x86/include/asm/calling.h
@@ -88,8 +88,6 @@ For 32-bit we have the following conventions - kernel is built with
 #define RSP		19*8
 #define SS		20*8
 
-#define ARGOFFSET	0
-
 	.macro ALLOC_PT_GPREGS_ON_STACK addskip=0
 	subq	$15*8+\addskip, %rsp
 	CFI_ADJUST_CFA_OFFSET 15*8+\addskip
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 8fafed9f462d..06055f9578a8 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -73,9 +73,9 @@ ENDPROC(native_usergs_sysret64)
 #endif /* CONFIG_PARAVIRT */
 
 
-.macro TRACE_IRQS_IRETQ offset=ARGOFFSET
+.macro TRACE_IRQS_IRETQ
 #ifdef CONFIG_TRACE_IRQFLAGS
-	bt   $9,EFLAGS-\offset(%rsp)	/* interrupts off? */
+	bt   $9,EFLAGS(%rsp)	/* interrupts off? */
 	jnc  1f
 	TRACE_IRQS_ON
 1:
@@ -107,8 +107,8 @@ ENDPROC(native_usergs_sysret64)
 	call debug_stack_reset
 .endm
 
-.macro TRACE_IRQS_IRETQ_DEBUG offset=ARGOFFSET
-	bt   $9,EFLAGS-\offset(%rsp)	/* interrupts off? */
+.macro TRACE_IRQS_IRETQ_DEBUG
+	bt   $9,EFLAGS(%rsp)	/* interrupts off? */
 	jnc  1f
 	TRACE_IRQS_ON_DEBUG
 1:
@@ -184,16 +184,16 @@ ENDPROC(native_usergs_sysret64)
  * frame that enables passing a complete pt_regs to a C function.
  */
 	.macro DEFAULT_FRAME start=1 offset=0
-	XCPT_FRAME \start, ORIG_RAX+\offset-ARGOFFSET
-	CFI_REL_OFFSET rdi, RDI+\offset-ARGOFFSET
-	CFI_REL_OFFSET rsi, RSI+\offset-ARGOFFSET
-	CFI_REL_OFFSET rdx, RDX+\offset-ARGOFFSET
-	CFI_REL_OFFSET rcx, RCX+\offset-ARGOFFSET
-	CFI_REL_OFFSET rax, RAX+\offset-ARGOFFSET
-	CFI_REL_OFFSET r8, R8+\offset-ARGOFFSET
-	CFI_REL_OFFSET r9, R9+\offset-ARGOFFSET
-	CFI_REL_OFFSET r10, R10+\offset-ARGOFFSET
-	CFI_REL_OFFSET r11, R11+\offset-ARGOFFSET
+	XCPT_FRAME \start, ORIG_RAX+\offset
+	CFI_REL_OFFSET rdi, RDI+\offset
+	CFI_REL_OFFSET rsi, RSI+\offset
+	CFI_REL_OFFSET rdx, RDX+\offset
+	CFI_REL_OFFSET rcx, RCX+\offset
+	CFI_REL_OFFSET rax, RAX+\offset
+	CFI_REL_OFFSET r8, R8+\offset
+	CFI_REL_OFFSET r9, R9+\offset
+	CFI_REL_OFFSET r10, R10+\offset
+	CFI_REL_OFFSET r11, R11+\offset
 	CFI_REL_OFFSET rbx, RBX+\offset
 	CFI_REL_OFFSET rbp, RBP+\offset
 	CFI_REL_OFFSET r12, R12+\offset
@@ -237,13 +237,13 @@ ENTRY(ret_from_fork)
 
 	RESTORE_EXTRA_REGS
 
-	testl $3, CS-ARGOFFSET(%rsp)		# from kernel_thread?
+	testl $3,CS(%rsp)			# from kernel_thread?
 	jz   1f
 
 	testl $_TIF_IA32, TI_flags(%rcx)	# 32-bit compat task needs IRET
 	jnz  int_ret_from_sys_call
 
-	RESTORE_TOP_OF_STACK %rdi, -ARGOFFSET
+	RESTORE_TOP_OF_STACK %rdi
 	jmp ret_from_sys_call			# go to the SYSRET fastpath
 
 1:
@@ -310,11 +310,11 @@ GLOBAL(system_call_after_swapgs)
 	ENABLE_INTERRUPTS(CLBR_NONE)
 	ALLOC_PT_GPREGS_ON_STACK 8
 	SAVE_C_REGS_EXCEPT_RAX_RCX
-	movq	$-ENOSYS,RAX-ARGOFFSET(%rsp)
-	movq_cfi rax,(ORIG_RAX-ARGOFFSET)
-	movq	%rcx,RIP-ARGOFFSET(%rsp)
-	CFI_REL_OFFSET rip,RIP-ARGOFFSET
-	testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+	movq	$-ENOSYS,RAX(%rsp)
+	movq_cfi rax,ORIG_RAX
+	movq	%rcx,RIP(%rsp)
+	CFI_REL_OFFSET rip,RIP
+	testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP)
 	jnz tracesys
 system_call_fastpath:
 #if __SYSCALL_MASK == ~0
@@ -326,13 +326,13 @@ system_call_fastpath:
 	ja ret_from_sys_call  /* and return regs->ax */
 	movq %r10,%rcx
 	call *sys_call_table(,%rax,8)  # XXX:	 rip relative
-	movq %rax,RAX-ARGOFFSET(%rsp)
+	movq %rax,RAX(%rsp)
 /*
  * Syscall return path ending with SYSRET (fast path)
  * Has incomplete stack frame and undefined top of stack.
  */
 ret_from_sys_call:
-	testl $_TIF_ALLWORK_MASK,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+	testl $_TIF_ALLWORK_MASK,TI_flags+THREAD_INFO(%rsp,RIP)
 	jnz int_ret_from_sys_call_fixup	/* Go the the slow path */
 
 	LOCKDEP_SYS_EXIT
@@ -344,7 +344,7 @@ ret_from_sys_call:
 	 */
 	TRACE_IRQS_ON
 	RESTORE_C_REGS_EXCEPT_RCX
-	movq RIP-ARGOFFSET(%rsp),%rcx
+	movq RIP(%rsp),%rcx
 	CFI_REGISTER	rip,rcx
 	/*CFI_REGISTER	rflags,r11*/
 	movq	PER_CPU_VAR(old_rsp), %rsp
@@ -353,7 +353,7 @@ ret_from_sys_call:
 	CFI_RESTORE_STATE
 
 int_ret_from_sys_call_fixup:
-	FIXUP_TOP_OF_STACK %r11, -ARGOFFSET
+	FIXUP_TOP_OF_STACK %r11
 	jmp int_ret_from_sys_call
 
 	/* Do syscall tracing */
@@ -364,7 +364,7 @@ tracesys:
 	test %rax, %rax
 	jnz tracesys_phase2		/* if needed, run the slow path */
 	RESTORE_C_REGS_EXCEPT_RAX	/* else restore clobbered regs */
-	movq ORIG_RAX-ARGOFFSET(%rsp), %rax
+	movq ORIG_RAX(%rsp), %rax
 	jmp system_call_fastpath	/*      and return to the fast path */
 
 tracesys_phase2:
@@ -391,7 +391,7 @@ tracesys_phase2:
 	ja   int_ret_from_sys_call	/* RAX(%rsp) is already set */
 	movq %r10,%rcx	/* fixup for C */
 	call *sys_call_table(,%rax,8)
-	movq %rax,RAX-ARGOFFSET(%rsp)
+	movq %rax,RAX(%rsp)
 	/* Use IRET because user could have changed frame */
 
 /*
@@ -475,9 +475,9 @@ END(stub_\func)
 ENTRY(\label)
 	CFI_STARTPROC
 	DEFAULT_FRAME 0, 8		/* offset 8: return address */
-	FIXUP_TOP_OF_STACK %r11, 8-ARGOFFSET
+	FIXUP_TOP_OF_STACK %r11, 8
 	call \func
-	RESTORE_TOP_OF_STACK %r11, 8-ARGOFFSET
+	RESTORE_TOP_OF_STACK %r11, 8
 	ret
 	CFI_ENDPROC
 END(\label)
@@ -677,7 +677,7 @@ common_interrupt:
 	ASM_CLAC
 	addq $-0x80,(%rsp)		/* Adjust vector to [-256,-1] range */
 	interrupt do_IRQ
-	/* 0(%rsp): old_rsp-ARGOFFSET */
+	/* 0(%rsp): old_rsp */
 ret_from_intr:
 	DISABLE_INTERRUPTS(CLBR_NONE)
 	TRACE_IRQS_OFF
@@ -687,13 +687,13 @@ ret_from_intr:
 	popq %rsi
 	CFI_DEF_CFA rsi,SS+8-RBP	/* reg/off reset after def_cfa_expr */
 	/* return code expects complete pt_regs - adjust rsp accordingly: */
-	leaq ARGOFFSET-RBP(%rsi), %rsp
+	leaq -RBP(%rsi),%rsp
 	CFI_DEF_CFA_REGISTER	rsp
-	CFI_ADJUST_CFA_OFFSET	RBP-ARGOFFSET
+	CFI_ADJUST_CFA_OFFSET	RBP
 
 exit_intr:
 	GET_THREAD_INFO(%rcx)
-	testl $3,CS-ARGOFFSET(%rsp)
+	testl $3,CS(%rsp)
 	je retint_kernel
 
 	/* Interrupt came from user space */
@@ -721,8 +721,8 @@ retint_swapgs:		/* return to user-space */
 	 * Try to use SYSRET instead of IRET if we're returning to
 	 * a completely clean 64-bit userspace context.
 	 */
-	movq (RCX-ARGOFFSET)(%rsp), %rcx
-	cmpq %rcx,(RIP-ARGOFFSET)(%rsp)		/* RCX == RIP */
+	movq RCX(%rsp),%rcx
+	cmpq %rcx,RIP(%rsp)		/* RCX == RIP */
 	jne opportunistic_sysret_failed
 
 	/*
@@ -743,19 +743,19 @@ retint_swapgs:		/* return to user-space */
 	shr $__VIRTUAL_MASK_SHIFT, %rcx
 	jnz opportunistic_sysret_failed
 
-	cmpq $__USER_CS,(CS-ARGOFFSET)(%rsp)	/* CS must match SYSRET */
+	cmpq $__USER_CS,CS(%rsp)	/* CS must match SYSRET */
 	jne opportunistic_sysret_failed
 
-	movq (R11-ARGOFFSET)(%rsp), %r11
-	cmpq %r11,(EFLAGS-ARGOFFSET)(%rsp)	/* R11 == RFLAGS */
+	movq R11(%rsp),%r11
+	cmpq %r11,EFLAGS(%rsp)		/* R11 == RFLAGS */
 	jne opportunistic_sysret_failed
 
-	testq $X86_EFLAGS_RF,%r11		/* sysret can't restore RF */
+	testq $X86_EFLAGS_RF,%r11	/* sysret can't restore RF */
 	jnz opportunistic_sysret_failed
 
 	/* nothing to check for RSP */
 
-	cmpq $__USER_DS,(SS-ARGOFFSET)(%rsp)	/* SS must match SYSRET */
+	cmpq $__USER_DS,SS(%rsp)	/* SS must match SYSRET */
 	jne opportunistic_sysret_failed
 
 	/*
@@ -870,7 +870,7 @@ retint_signal:
 ENTRY(retint_kernel)
 	cmpl $0,PER_CPU_VAR(__preempt_count)
 	jnz  retint_restore_args
-	bt   $9,EFLAGS-ARGOFFSET(%rsp)	/* interrupts off? */
+	bt   $9,EFLAGS(%rsp)	/* interrupts off? */
 	jnc  retint_restore_args
 	call preempt_schedule_irq
 	jmp exit_intr
@@ -1286,11 +1286,11 @@ ENTRY(paranoid_exit)
 	TRACE_IRQS_OFF_DEBUG
 	testl %ebx,%ebx				/* swapgs needed? */
 	jnz paranoid_exit_no_swapgs
-	TRACE_IRQS_IRETQ 0
+	TRACE_IRQS_IRETQ
 	SWAPGS_UNSAFE_STACK
 	jmp paranoid_exit_restore
 paranoid_exit_no_swapgs:
-	TRACE_IRQS_IRETQ_DEBUG 0
+	TRACE_IRQS_IRETQ_DEBUG
 paranoid_exit_restore:
 	RESTORE_EXTRA_REGS
 	RESTORE_C_REGS
-- 
cgit v1.2.3


From 050273d19b94f2adf9d35979cee949d6b6a9df84 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski
Date: Thu, 26 Feb 2015 14:40:31 -0800
Subject: x86/asm/entry/64: Remove 'int_check_syscall_exit_work'

Nothing references it anymore.

Reported-by: Denys Vlasenko <vda.linux@googlemail.com>
Signed-off-by: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Fixes: 96b6352c1271 ("x86_64, entry: Remove the syscall exit audit and schedule optimizations")
Link: http://lkml.kernel.org/r/dd2a4d26ecc7a5db61b476727175cd99ae2b32a4.1424989793.git.luto@amacapital.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/entry_64.S | 1 -
 1 file changed, 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 06055f9578a8..b0fbde1876e1 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -431,7 +431,6 @@ int_careful:
 int_very_careful:
 	TRACE_IRQS_ON
 	ENABLE_INTERRUPTS(CLBR_NONE)
-int_check_syscall_exit_work:
 	SAVE_EXTRA_REGS
 	/* Check for syscall exit trace */
 	testl $_TIF_WORK_SYSCALL_EXIT,%edx
-- 
cgit v1.2.3


From b87cf63e2a5fbe3b368d5f5e5708e585b0fb3f84 Mon Sep 17 00:00:00 2001
From: Denys Vlasenko
Date: Thu, 26 Feb 2015 14:40:32 -0800
Subject: x86/asm/entry: Add comments about various syscall instructions

SYSCALL/SYSRET and SYSENTER/SYSEXIT have weird semantics.
Moreover, they differ in 32- and 64-bit mode.

What is saved? What is not? Is rsp set? Are interrupts disabled?
People tend to not remember these details well enough.

This patch adds comments which explain in detail
what registers are modified by each of these instructions.

The comments are placed immediately before corresponding
entry and exit points.

Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
Signed-off-by: Andy Lutomirski <luto@amacapital.net>
Cc: Alexei Starovoitov <ast@plumgrid.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Will Drewry <wad@chromium.org>
Link: http://lkml.kernel.org/r/a94b98b63527797c871a81402ff5060b18fa880a.1424989793.git.luto@amacapital.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/ia32/ia32entry.S  | 133 ++++++++++++++++++++++++++++-----------------
 arch/x86/kernel/entry_64.S |  32 ++++++-----
 2 files changed, 102 insertions(+), 63 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index e99f8a5be2df..b5670564a1fb 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -99,22 +99,25 @@ ENDPROC(native_irq_enable_sysexit)
 /*
  * 32bit SYSENTER instruction entry.
  *
+ * SYSENTER loads ss, rsp, cs, and rip from previously programmed MSRs.
+ * IF and VM in rflags are cleared (IOW: interrupts are off).
+ * SYSENTER does not save anything on the stack,
+ * and does not save old rip (!!!) and rflags.
+ *
  * Arguments:
- * %eax	System call number.
- * %ebx Arg1
- * %ecx Arg2
- * %edx Arg3
- * %esi Arg4
- * %edi Arg5
- * %ebp user stack
- * 0(%ebp) Arg6	
- * 	
- * Interrupts off.
- *	
+ * eax  system call number
+ * ebx  arg1
+ * ecx  arg2
+ * edx  arg3
+ * esi  arg4
+ * edi  arg5
+ * ebp  user stack
+ * 0(%ebp) arg6
+ *
  * This is purely a fast path. For anything complicated we use the int 0x80
- * path below.	Set up a complete hardware stack frame to share code
+ * path below. We set up a complete hardware stack frame to share code
  * with the int 0x80 path.
- */ 	
+ */
 ENTRY(ia32_sysenter_target)
 	CFI_STARTPROC32	simple
 	CFI_SIGNAL_FRAME
@@ -128,6 +131,7 @@ ENTRY(ia32_sysenter_target)
 	 * disabled irqs, here we enable it straight after entry:
 	 */
 	ENABLE_INTERRUPTS(CLBR_NONE)
+	/* Construct iret frame (ss,rsp,rflags,cs,rip) */
  	movl	%ebp,%ebp		/* zero extension */
 	pushq_cfi $__USER32_DS
 	/*CFI_REL_OFFSET ss,0*/
@@ -140,14 +144,19 @@ ENTRY(ia32_sysenter_target)
 	pushq_cfi $__USER32_CS
 	/*CFI_REL_OFFSET cs,0*/
 	movl	%eax, %eax
+	/* Store thread_info->sysenter_return in rip stack slot */
 	pushq_cfi %r10
 	CFI_REL_OFFSET rip,0
+	/* Store orig_ax */
 	pushq_cfi %rax
+	/* Construct the rest of "struct pt_regs" */
 	cld
 	ALLOC_PT_GPREGS_ON_STACK
 	SAVE_C_REGS_EXCEPT_R891011
- 	/* no need to do an access_ok check here because rbp has been
- 	   32bit zero extended */ 
+	/*
+	 * no need to do an access_ok check here because rbp has been
+	 * 32bit zero extended
+	 */
 	ASM_STAC
 1:	movl	(%rbp),%ebp
 	_ASM_EXTABLE(1b,ia32_badarg)
@@ -184,6 +193,7 @@ sysexit_from_sys_call:
 	movl	RIP(%rsp),%edx		/* User %eip */
 	CFI_REGISTER rip,rdx
 	RESTORE_RSI_RDI
+	/* pop everything except ss,rsp,rflags slots */
 	REMOVE_PT_GPREGS_FROM_STACK 3*8
 	xorq	%r8,%r8
 	xorq	%r9,%r9
@@ -194,6 +204,10 @@ sysexit_from_sys_call:
 	popq_cfi %rcx				/* User %esp */
 	CFI_REGISTER rsp,rcx
 	TRACE_IRQS_ON
+	/*
+	 * 32bit SYSEXIT restores eip from edx, esp from ecx.
+	 * cs and ss are loaded from MSRs.
+	 */
 	ENABLE_INTERRUPTS_SYSEXIT32
 
 	CFI_RESTORE_STATE
@@ -274,23 +288,33 @@ ENDPROC(ia32_sysenter_target)
 /*
  * 32bit SYSCALL instruction entry.
  *
+ * 32bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11,
+ * then loads new ss, cs, and rip from previously programmed MSRs.
+ * rflags gets masked by a value from another MSR (so CLD and CLAC
+ * are not needed). SYSCALL does not save anything on the stack
+ * and does not change rsp.
+ *
+ * Note: rflags saving+masking-with-MSR happens only in Long mode
+ * (in legacy 32bit mode, IF, RF and VM bits are cleared and that's it).
+ * Don't get confused: rflags saving+masking depends on Long Mode Active bit
+ * (EFER.LMA=1), NOT on bitness of userspace where SYSCALL executes
+ * or target CS descriptor's L bit (SYSCALL does not read segment descriptors).
+ *
  * Arguments:
- * %eax	System call number.
- * %ebx Arg1
- * %ecx return EIP 
- * %edx Arg3
- * %esi Arg4
- * %edi Arg5
- * %ebp Arg2    [note: not saved in the stack frame, should not be touched]
- * %esp user stack 
- * 0(%esp) Arg6
- * 	
- * Interrupts off.
- *	
+ * eax  system call number
+ * ecx  return address
+ * ebx  arg1
+ * ebp  arg2	(note: not saved in the stack frame, should not be touched)
+ * edx  arg3
+ * esi  arg4
+ * edi  arg5
+ * esp  user stack
+ * 0(%esp) arg6
+ *
  * This is purely a fast path. For anything complicated we use the int 0x80
- * path below.	Set up a complete hardware stack frame to share code
- * with the int 0x80 path.	
- */ 	
+ * path below. We set up a complete hardware stack frame to share code
+ * with the int 0x80 path.
+ */
 ENTRY(ia32_cstar_target)
 	CFI_STARTPROC32	simple
 	CFI_SIGNAL_FRAME
@@ -306,7 +330,7 @@ ENTRY(ia32_cstar_target)
 	 * disabled irqs and here we enable it straight after entry:
 	 */
 	ENABLE_INTERRUPTS(CLBR_NONE)
-	ALLOC_PT_GPREGS_ON_STACK 8
+	ALLOC_PT_GPREGS_ON_STACK 8	/* +8: space for orig_ax */
 	SAVE_C_REGS_EXCEPT_RCX_R891011
 	movl 	%eax,%eax	/* zero extension */
 	movq	%rax,ORIG_RAX(%rsp)
@@ -320,9 +344,11 @@ ENTRY(ia32_cstar_target)
 	/*CFI_REL_OFFSET rflags,EFLAGS*/
 	movq	%r8,RSP(%rsp)
 	CFI_REL_OFFSET rsp,RSP
-	/* no need to do an access_ok check here because r8 has been
-	   32bit zero extended */ 
-	/* hardware stack frame is complete now */	
+	/* iret stack frame is complete now */
+	/*
+	 * no need to do an access_ok check here because r8 has been
+	 * 32bit zero extended
+	 */
 	ASM_STAC
 1:	movl	(%r8),%r9d
 	_ASM_EXTABLE(1b,ia32_badarg)
@@ -355,8 +381,15 @@ sysretl_from_sys_call:
 	TRACE_IRQS_ON
 	movl RSP(%rsp),%esp
 	CFI_RESTORE rsp
+	/*
+	 * 64bit->32bit SYSRET restores eip from ecx,
+	 * eflags from r11 (but RF and VM bits are forced to 0),
+	 * cs and ss are loaded from MSRs.
+	 * (Note: 32bit->32bit SYSRET is different: since r11
+	 * does not exist, it merely sets eflags.IF=1).
+	 */
 	USERGS_SYSRET32
-	
+
 #ifdef CONFIG_AUDITSYSCALL
 cstar_auditsys:
 	CFI_RESTORE_STATE
@@ -394,26 +427,26 @@ ia32_badarg:
 	jmp ia32_sysret
 	CFI_ENDPROC
 
-/* 
- * Emulated IA32 system calls via int 0x80. 
+/*
+ * Emulated IA32 system calls via int 0x80.
  *
- * Arguments:	 
- * %eax	System call number.
- * %ebx Arg1
- * %ecx Arg2
- * %edx Arg3
- * %esi Arg4
- * %edi Arg5
- * %ebp Arg6    [note: not saved in the stack frame, should not be touched]
+ * Arguments:
+ * eax  system call number
+ * ebx  arg1
+ * ecx  arg2
+ * edx  arg3
+ * esi  arg4
+ * edi  arg5
+ * ebp  arg6	(note: not saved in the stack frame, should not be touched)
  *
  * Notes:
- * Uses the same stack frame as the x86-64 version.	
- * All registers except %eax must be saved (but ptrace may violate that)
+ * Uses the same stack frame as the x86-64 version.
+ * All registers except eax must be saved (but ptrace may violate that).
  * Arguments are zero extended. For system calls that want sign extension and
  * take long arguments a wrapper is needed. Most calls can just be called
  * directly.
- * Assumes it is only called from user space and entered with interrupts off.	
- */ 				
+ * Assumes it is only called from user space and entered with interrupts off.
+ */
 
 ENTRY(ia32_syscall)
 	CFI_STARTPROC32	simple
@@ -432,7 +465,7 @@ ENTRY(ia32_syscall)
 	 */
 	ENABLE_INTERRUPTS(CLBR_NONE)
 	movl %eax,%eax
-	pushq_cfi %rax
+	pushq_cfi %rax		/* store orig_ax */
 	cld
 	/* note the registers are not zero extended to the sf.
 	   this could be a problem. */
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index b0fbde1876e1..e5cbfbbf9479 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -256,25 +256,25 @@ ENTRY(ret_from_fork)
 END(ret_from_fork)
 
 /*
- * System call entry. Up to 6 arguments in registers are supported.
+ * 64bit SYSCALL instruction entry. Up to 6 arguments in registers.
  *
- * SYSCALL does not save anything on the stack and does not change the
- * stack pointer.  However, it does mask the flags register for us, so
- * CLD and CLAC are not needed.
- */
-
-/*
- * Register setup:
+ * 64bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11,
+ * then loads new ss, cs, and rip from previously programmed MSRs.
+ * rflags gets masked by a value from another MSR (so CLD and CLAC
+ * are not needed). SYSCALL does not save anything on the stack
+ * and does not change rsp.
+ *
+ * Registers on entry:
  * rax  system call number
+ * rcx  return address
+ * r11  saved rflags (note: r11 is callee-clobbered register in C ABI)
  * rdi  arg0
- * rcx  return address for syscall/sysret, C arg3
  * rsi  arg1
  * rdx  arg2
- * r10  arg3 	(--> moved to rcx for C)
+ * r10  arg3 (needs to be moved to rcx to conform to C ABI)
  * r8   arg4
  * r9   arg5
- * r11  eflags for syscall/sysret, temporary for C
- * r12-r15,rbp,rbx saved by C code, not touched.
+ * (note: r12-r15,rbp,rbx are callee-preserved in C ABI)
  *
  * Interrupts are off on entry.
  * Only called from user space.
@@ -302,13 +302,14 @@ ENTRY(system_call)
 GLOBAL(system_call_after_swapgs)
 
 	movq	%rsp,PER_CPU_VAR(old_rsp)
+	/* kernel_stack is set so that 5 slots (iret frame) are preallocated */
 	movq	PER_CPU_VAR(kernel_stack),%rsp
 	/*
 	 * No need to follow this irqs off/on section - it's straight
 	 * and short:
 	 */
 	ENABLE_INTERRUPTS(CLBR_NONE)
-	ALLOC_PT_GPREGS_ON_STACK 8
+	ALLOC_PT_GPREGS_ON_STACK 8		/* +8: space for orig_ax */
 	SAVE_C_REGS_EXCEPT_RAX_RCX
 	movq	$-ENOSYS,RAX(%rsp)
 	movq_cfi rax,ORIG_RAX
@@ -348,6 +349,11 @@ ret_from_sys_call:
 	CFI_REGISTER	rip,rcx
 	/*CFI_REGISTER	rflags,r11*/
 	movq	PER_CPU_VAR(old_rsp), %rsp
+	/*
+	 * 64bit SYSRET restores rip from rcx,
+	 * rflags from r11 (but RF and VM bits are forced to 0),
+	 * cs and ss are loaded from MSRs.
+	 */
 	USERGS_SYSRET64
 
 	CFI_RESTORE_STATE
-- 
cgit v1.2.3


From 1eeb207f870f746a863e5c59321d837d2d91c218 Mon Sep 17 00:00:00 2001
From: Denys Vlasenko
Date: Thu, 26 Feb 2015 14:40:33 -0800
Subject: x86/asm/entry/64: Move 'save_paranoid' and 'ret_from_fork' closer to
 their users

For some odd reason, these two functions are at the very top of
the file. "save_paranoid"'s caller is approximately in the middle
of it, move it there. Move 'ret_from_fork' to be right after
fork/exec helpers.

This is a pure block move, nothing is changed in the function
bodies.

Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
Signed-off-by: Andy Lutomirski <luto@amacapital.net>
Cc: Alexei Starovoitov <ast@plumgrid.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Will Drewry <wad@chromium.org>
Link: http://lkml.kernel.org/r/6446bbfe4094532623a5b83779b7015fec167a9d.1424989793.git.luto@amacapital.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/entry_64.S | 106 ++++++++++++++++++++++-----------------------
 1 file changed, 53 insertions(+), 53 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index e5cbfbbf9479..9e33d492ace3 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -202,59 +202,6 @@ ENDPROC(native_usergs_sysret64)
 	CFI_REL_OFFSET r15, R15+\offset
 	.endm
 
-ENTRY(save_paranoid)
-	XCPT_FRAME 1 RDI+8
-	cld
-	SAVE_C_REGS 8
-	SAVE_EXTRA_REGS 8
-	movl $1,%ebx
-	movl $MSR_GS_BASE,%ecx
-	rdmsr
-	testl %edx,%edx
-	js 1f	/* negative -> in kernel */
-	SWAPGS
-	xorl %ebx,%ebx
-1:	ret
-	CFI_ENDPROC
-END(save_paranoid)
-
-/*
- * A newly forked process directly context switches into this address.
- *
- * rdi: prev task we switched from
- */
-ENTRY(ret_from_fork)
-	DEFAULT_FRAME
-
-	LOCK ; btr $TIF_FORK,TI_flags(%r8)
-
-	pushq_cfi $0x0002
-	popfq_cfi				# reset kernel eflags
-
-	call schedule_tail			# rdi: 'prev' task parameter
-
-	GET_THREAD_INFO(%rcx)
-
-	RESTORE_EXTRA_REGS
-
-	testl $3,CS(%rsp)			# from kernel_thread?
-	jz   1f
-
-	testl $_TIF_IA32, TI_flags(%rcx)	# 32-bit compat task needs IRET
-	jnz  int_ret_from_sys_call
-
-	RESTORE_TOP_OF_STACK %rdi
-	jmp ret_from_sys_call			# go to the SYSRET fastpath
-
-1:
-	movq %rbp, %rdi
-	call *%rbx
-	movl $0, RAX(%rsp)
-	RESTORE_EXTRA_REGS
-	jmp int_ret_from_sys_call
-	CFI_ENDPROC
-END(ret_from_fork)
-
 /*
  * 64bit SYSCALL instruction entry. Up to 6 arguments in registers.
  *
@@ -581,6 +528,43 @@ END(stub_x32_execveat)
 
 #endif
 
+/*
+ * A newly forked process directly context switches into this address.
+ *
+ * rdi: prev task we switched from
+ */
+ENTRY(ret_from_fork)
+	DEFAULT_FRAME
+
+	LOCK ; btr $TIF_FORK,TI_flags(%r8)
+
+	pushq_cfi $0x0002
+	popfq_cfi				# reset kernel eflags
+
+	call schedule_tail			# rdi: 'prev' task parameter
+
+	GET_THREAD_INFO(%rcx)
+
+	RESTORE_EXTRA_REGS
+
+	testl $3,CS(%rsp)			# from kernel_thread?
+	jz   1f
+
+	testl $_TIF_IA32, TI_flags(%rcx)	# 32-bit compat task needs IRET
+	jnz  int_ret_from_sys_call
+
+	RESTORE_TOP_OF_STACK %rdi
+	jmp ret_from_sys_call			# go to the SYSRET fastpath
+
+1:
+	movq %rbp, %rdi
+	call *%rbx
+	movl $0, RAX(%rsp)
+	RESTORE_EXTRA_REGS
+	jmp int_ret_from_sys_call
+	CFI_ENDPROC
+END(ret_from_fork)
+
 /*
  * Build the entry stubs and pointer table with some assembler magic.
  * We pack 7 stubs into a single 32-byte chunk, which will fit in a
@@ -1273,6 +1257,22 @@ idtentry async_page_fault do_async_page_fault has_error_code=1
 idtentry machine_check has_error_code=0 paranoid=1 do_sym=*machine_check_vector(%rip)
 #endif
 
+ENTRY(save_paranoid)
+	XCPT_FRAME 1 RDI+8
+	cld
+	SAVE_C_REGS 8
+	SAVE_EXTRA_REGS 8
+	movl $1,%ebx
+	movl $MSR_GS_BASE,%ecx
+	rdmsr
+	testl %edx,%edx
+	js 1f	/* negative -> in kernel */
+	SWAPGS
+	xorl %ebx,%ebx
+1:	ret
+	CFI_ENDPROC
+END(save_paranoid)
+
 	/*
 	 * "Paranoid" exit path from exception stack.  This is invoked
 	 * only on return from non-NMI IST interrupts that came
-- 
cgit v1.2.3


From ebfc453e27c676e104378366a0b027e5c6918631 Mon Sep 17 00:00:00 2001
From: Denys Vlasenko
Date: Thu, 26 Feb 2015 14:40:34 -0800
Subject: x86/asm/entry/64: Clean up and document various entry code details

This patch does a lot of cleanup in comments and formatting,
but it does not change any code:

 - Rename 'save_paranoid' to 'paranoid_entry': this makes naming
   similar to its "non-paranoid" sibling, 'error_entry',
   and to its counterpart, 'paranoid_exit'.

 - Use the same CFI annotation atop 'paranoid_entry' and 'error_entry'.

 - Fix irregular indentation of assembler operands.

 - Add/fix comments on top of 'paranoid_entry' and 'error_entry'.

 - Remove stale comment about "oldrax".

 - Make comments about "no swapgs" flag in ebx more prominent.

 - Deindent wrongly indented top-level comment atop 'paranoid_exit'.

 - Indent wrongly deindented comment inside 'error_entry'.

Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
Signed-off-by: Andy Lutomirski <luto@amacapital.net>
Cc: Alexei Starovoitov <ast@plumgrid.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Will Drewry <wad@chromium.org>
Link: http://lkml.kernel.org/r/4640f9fcd5ea46eb299b1cd6d3f5da3167d2f78d.1424989793.git.luto@amacapital.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/entry_64.S | 68 ++++++++++++++++++++++++----------------------
 1 file changed, 36 insertions(+), 32 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 9e33d492ace3..466947770648 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -982,10 +982,11 @@ ENTRY(\sym)
 	testl $3, CS(%rsp)		/* If coming from userspace, switch */
 	jnz 1f				/* stacks. */
 	.endif
-	call save_paranoid
+	call paranoid_entry
 	.else
 	call error_entry
 	.endif
+	/* returned flag: ebx=0: need swapgs on exit, ebx=1: don't need it */
 
 	DEFAULT_FRAME 0
 
@@ -1016,10 +1017,11 @@ ENTRY(\sym)
 	addq $EXCEPTION_STKSZ, INIT_TSS_IST(\shift_ist)
 	.endif
 
+	/* these procedures expect "no swapgs" flag in ebx */
 	.if \paranoid
-	jmp paranoid_exit		/* %ebx: no swapgs flag */
+	jmp paranoid_exit
 	.else
-	jmp error_exit			/* %ebx: no swapgs flag */
+	jmp error_exit
 	.endif
 
 	.if \paranoid == 1
@@ -1257,8 +1259,13 @@ idtentry async_page_fault do_async_page_fault has_error_code=1
 idtentry machine_check has_error_code=0 paranoid=1 do_sym=*machine_check_vector(%rip)
 #endif
 
-ENTRY(save_paranoid)
-	XCPT_FRAME 1 RDI+8
+/*
+ * Save all registers in pt_regs, and switch gs if needed.
+ * Use slow, but surefire "are we in kernel?" check.
+ * Return: ebx=0: need swapgs on exit, ebx=1: otherwise
+ */
+ENTRY(paranoid_entry)
+	XCPT_FRAME 1 15*8
 	cld
 	SAVE_C_REGS 8
 	SAVE_EXTRA_REGS 8
@@ -1271,20 +1278,19 @@ ENTRY(save_paranoid)
 	xorl %ebx,%ebx
 1:	ret
 	CFI_ENDPROC
-END(save_paranoid)
-
-	/*
-	 * "Paranoid" exit path from exception stack.  This is invoked
-	 * only on return from non-NMI IST interrupts that came
-	 * from kernel space.
-	 *
-	 * We may be returning to very strange contexts (e.g. very early
-	 * in syscall entry), so checking for preemption here would
-	 * be complicated.  Fortunately, we there's no good reason
-	 * to try to handle preemption here.
-	 */
+END(paranoid_entry)
 
-	/* ebx:	no swapgs flag */
+/*
+ * "Paranoid" exit path from exception stack.  This is invoked
+ * only on return from non-NMI IST interrupts that came
+ * from kernel space.
+ *
+ * We may be returning to very strange contexts (e.g. very early
+ * in syscall entry), so checking for preemption here would
+ * be complicated.  Fortunately, we there's no good reason
+ * to try to handle preemption here.
+ */
+/* On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it) */
 ENTRY(paranoid_exit)
 	DEFAULT_FRAME
 	DISABLE_INTERRUPTS(CLBR_NONE)
@@ -1305,13 +1311,11 @@ paranoid_exit_restore:
 END(paranoid_exit)
 
 /*
- * Exception entry point. This expects an error code/orig_rax on the stack.
- * returns in "no swapgs flag" in %ebx.
+ * Save all registers in pt_regs, and switch gs if needed.
+ * Return: ebx=0: need swapgs on exit, ebx=1: otherwise
  */
 ENTRY(error_entry)
-	XCPT_FRAME
-	CFI_ADJUST_CFA_OFFSET 15*8
-	/* oldrax contains error code */
+	XCPT_FRAME 1 15*8
 	cld
 	SAVE_C_REGS 8
 	SAVE_EXTRA_REGS 8
@@ -1324,12 +1328,12 @@ error_sti:
 	TRACE_IRQS_OFF
 	ret
 
-/*
- * There are two places in the kernel that can potentially fault with
- * usergs. Handle them here.  B stepping K8s sometimes report a
- * truncated RIP for IRET exceptions returning to compat mode. Check
- * for these here too.
- */
+	/*
+	 * There are two places in the kernel that can potentially fault with
+	 * usergs. Handle them here.  B stepping K8s sometimes report a
+	 * truncated RIP for IRET exceptions returning to compat mode. Check
+	 * for these here too.
+	 */
 error_kernelspace:
 	CFI_REL_OFFSET rcx, RCX+8
 	incl %ebx
@@ -1359,7 +1363,7 @@ error_bad_iret:
 END(error_entry)
 
 
-/* ebx:	no swapgs flag (1: don't need swapgs, 0: need it) */
+/* On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it) */
 ENTRY(error_exit)
 	DEFAULT_FRAME
 	movl %ebx,%eax
@@ -1585,13 +1589,13 @@ end_repeat_nmi:
 	ALLOC_PT_GPREGS_ON_STACK
 
 	/*
-	 * Use save_paranoid to handle SWAPGS, but no need to use paranoid_exit
+	 * Use paranoid_entry to handle SWAPGS, but no need to use paranoid_exit
 	 * as we should not be calling schedule in NMI context.
 	 * Even with normal interrupts enabled. An NMI should not be
 	 * setting NEED_RESCHED or anything that normal interrupts and
 	 * exceptions might do.
 	 */
-	call save_paranoid
+	call paranoid_entry
 	DEFAULT_FRAME 0
 
 	/*
-- 
cgit v1.2.3


From 14f6e9532dda399a7b789f744dc045f8865a9e42 Mon Sep 17 00:00:00 2001
From: Denys Vlasenko
Date: Thu, 26 Feb 2015 14:40:35 -0800
Subject: x86/asm/entry/64/compat: Fold the IA32_ARG_FIXUP macro into its
 callers

Use of a small macro - one with conditional expansion - does
more harm than good. It obfuscates code, with minimal code
reuse.

For example, because of obfuscation it's not obvious that
in 'ia32_sysenter_target', we can optimize loading of r9 -
currently it is loaded with a detour through ebp.

This patch folds the IA32_ARG_FIXUP macro into its callers.

No code changes.

Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
Signed-off-by: Andy Lutomirski <luto@amacapital.net>
Cc: Alexei Starovoitov <ast@plumgrid.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Will Drewry <wad@chromium.org>
Link: http://lkml.kernel.org/r/4da092094cd78734384ac31e0d4ec1d8f69145a2.1424989793.git.luto@amacapital.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/ia32/ia32entry.S | 32 ++++++++++++++++++--------------
 1 file changed, 18 insertions(+), 14 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index b5670564a1fb..6dcd37256979 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -30,17 +30,6 @@
 
 	.section .entry.text, "ax"
 
-	.macro IA32_ARG_FIXUP noebp=0
-	movl	%edi,%r8d
-	.if \noebp
-	.else
-	movl	%ebp,%r9d
-	.endif
-	xchg	%ecx,%esi
-	movl	%ebx,%edi
-	movl	%edx,%edx	/* zero extension */
-	.endm 
-
 	/* clobbers %rax */
 	.macro  CLEAR_RREGS _r9=rax
 	xorl 	%eax,%eax
@@ -178,7 +167,12 @@ sysenter_flags_fixed:
 	cmpq	$(IA32_NR_syscalls-1),%rax
 	ja	ia32_badsys
 sysenter_do_call:
-	IA32_ARG_FIXUP
+	/* 32bit syscall -> 64bit C ABI argument conversion */
+	movl	%edi,%r8d	/* arg5 */
+	movl	%ebp,%r9d	/* arg6 */
+	xchg	%ecx,%esi	/* rsi:arg2, rcx:arg4 */
+	movl	%ebx,%edi	/* arg1 */
+	movl	%edx,%edx	/* arg3 (zero extension) */
 sysenter_dispatch:
 	call	*ia32_sys_call_table(,%rax,8)
 	movq	%rax,RAX(%rsp)
@@ -360,7 +354,12 @@ ENTRY(ia32_cstar_target)
 	cmpq $IA32_NR_syscalls-1,%rax
 	ja  ia32_badsys
 cstar_do_call:
-	IA32_ARG_FIXUP 1
+	/* 32bit syscall -> 64bit C ABI argument conversion */
+	movl	%edi,%r8d	/* arg5 */
+	/* r9 already loaded */	/* arg6 */
+	xchg	%ecx,%esi	/* rsi:arg2, rcx:arg4 */
+	movl	%ebx,%edi	/* arg1 */
+	movl	%edx,%edx	/* arg3 (zero extension) */
 cstar_dispatch:
 	call *ia32_sys_call_table(,%rax,8)
 	movq %rax,RAX(%rsp)
@@ -477,7 +476,12 @@ ENTRY(ia32_syscall)
 	cmpq $(IA32_NR_syscalls-1),%rax
 	ja ia32_badsys
 ia32_do_call:
-	IA32_ARG_FIXUP
+	/* 32bit syscall -> 64bit C ABI argument conversion */
+	movl %edi,%r8d	/* arg5 */
+	movl %ebp,%r9d	/* arg6 */
+	xchg %ecx,%esi	/* rsi:arg2, rcx:arg4 */
+	movl %ebx,%edi	/* arg1 */
+	movl %edx,%edx	/* arg3 (zero extension) */
 	call *ia32_sys_call_table(,%rax,8) # xxx: rip relative
 ia32_sysret:
 	movq %rax,RAX(%rsp)
-- 
cgit v1.2.3


From 911d2bb5ccaab102abbab2bb58438c75bc342ca9 Mon Sep 17 00:00:00 2001
From: Denys Vlasenko
Date: Thu, 26 Feb 2015 14:40:36 -0800
Subject: x86/asm/entry/64: Use more readable constants

Constants such as SS+8 or SS+8-RIP are mysterious.
In most cases, SS+8 is just meant to be SIZEOF_PTREGS,
SS+8-RIP is RIP's offset in the iret frame.

This patch changes some of these constants to be less
mysterious.

No code changes (verified with objdump).

Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
Signed-off-by: Andy Lutomirski <luto@amacapital.net>
Cc: Alexei Starovoitov <ast@plumgrid.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Will Drewry <wad@chromium.org>
Link: http://lkml.kernel.org/r/1d20491384773bd606e23a382fac23ddb49b5178.1424989793.git.luto@amacapital.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/calling.h |  2 ++
 arch/x86/kernel/entry_64.S     | 28 ++++++++++++++++------------
 2 files changed, 18 insertions(+), 12 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/calling.h b/arch/x86/include/asm/calling.h
index 4a7ceb9789a5..337423590b08 100644
--- a/arch/x86/include/asm/calling.h
+++ b/arch/x86/include/asm/calling.h
@@ -88,6 +88,8 @@ For 32-bit we have the following conventions - kernel is built with
 #define RSP		19*8
 #define SS		20*8
 
+#define SIZEOF_PTREGS	21*8
+
 	.macro ALLOC_PT_GPREGS_ON_STACK addskip=0
 	subq	$15*8+\addskip, %rsp
 	CFI_ADJUST_CFA_OFFSET 15*8+\addskip
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 466947770648..858e94e86f5e 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -164,12 +164,12 @@ ENDPROC(native_usergs_sysret64)
  * initial frame state for interrupts (and exceptions without error code)
  */
 	.macro INTR_FRAME start=1 offset=0
-	EMPTY_FRAME \start, SS+8+\offset-RIP
-	/*CFI_REL_OFFSET ss, SS+\offset-RIP*/
-	CFI_REL_OFFSET rsp, RSP+\offset-RIP
-	/*CFI_REL_OFFSET rflags, EFLAGS+\offset-RIP*/
-	/*CFI_REL_OFFSET cs, CS+\offset-RIP*/
-	CFI_REL_OFFSET rip, RIP+\offset-RIP
+	EMPTY_FRAME \start, 5*8+\offset
+	/*CFI_REL_OFFSET ss, 4*8+\offset*/
+	CFI_REL_OFFSET rsp, 3*8+\offset
+	/*CFI_REL_OFFSET rflags, 2*8+\offset*/
+	/*CFI_REL_OFFSET cs, 1*8+\offset*/
+	CFI_REL_OFFSET rip, 0*8+\offset
 	.endm
 
 /*
@@ -177,7 +177,7 @@ ENDPROC(native_usergs_sysret64)
  * with vector already pushed)
  */
 	.macro XCPT_FRAME start=1 offset=0
-	INTR_FRAME \start, RIP+\offset-ORIG_RAX
+	INTR_FRAME \start, 1*8+\offset
 	.endm
 
 /*
@@ -645,10 +645,14 @@ END(interrupt)
 	cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp
 	CFI_DEF_CFA_REGISTER	rsi
 	pushq %rsi
+	/*
+	 * For debugger:
+	 * "CFA (Current Frame Address) is the value on stack + offset"
+	 */
 	CFI_ESCAPE	0x0f /* DW_CFA_def_cfa_expression */, 6, \
-			0x77 /* DW_OP_breg7 */, 0, \
+			0x77 /* DW_OP_breg7 (rsp) */, 0, \
 			0x06 /* DW_OP_deref */, \
-			0x08 /* DW_OP_const1u */, SS+8-RBP, \
+			0x08 /* DW_OP_const1u */, SIZEOF_PTREGS-RBP, \
 			0x22 /* DW_OP_plus */
 	/* We entered an interrupt context - irqs are off: */
 	TRACE_IRQS_OFF
@@ -674,7 +678,7 @@ ret_from_intr:
 
 	/* Restore saved previous stack */
 	popq %rsi
-	CFI_DEF_CFA rsi,SS+8-RBP	/* reg/off reset after def_cfa_expr */
+	CFI_DEF_CFA rsi,SIZEOF_PTREGS-RBP /* reg/off reset after def_cfa_expr */
 	/* return code expects complete pt_regs - adjust rsp accordingly: */
 	leaq -RBP(%rsi),%rsp
 	CFI_DEF_CFA_REGISTER	rsp
@@ -1549,7 +1553,7 @@ first_nmi:
 	.rept 5
 	pushq_cfi 11*8(%rsp)
 	.endr
-	CFI_DEF_CFA_OFFSET SS+8-RIP
+	CFI_DEF_CFA_OFFSET 5*8
 
 	/* Everything up to here is safe from nested NMIs */
 
@@ -1577,7 +1581,7 @@ repeat_nmi:
 	pushq_cfi -6*8(%rsp)
 	.endr
 	subq $(5*8), %rsp
-	CFI_DEF_CFA_OFFSET SS+8-RIP
+	CFI_DEF_CFA_OFFSET 5*8
 end_repeat_nmi:
 
 	/*
-- 
cgit v1.2.3


From b3ab90b333e94659e7c351843ab41ec0004f73e8 Mon Sep 17 00:00:00 2001
From: Denys Vlasenko
Date: Thu, 26 Feb 2015 14:40:37 -0800
Subject: x86/asm/entry/64/compat: Use more readable constant

The last instance of "mysterious" SS+8 constant is replaced by
SIZEOF_PTREGS.

Message-Id: <1424822419-10267-1-git-send-email-dvlasenk@redhat.com>
Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
Signed-off-by: Andy Lutomirski <luto@amacapital.net>
Cc: Alexei Starovoitov <ast@plumgrid.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Will Drewry <wad@chromium.org>
Link: http://lkml.kernel.org/r/d35aeba3059407ac54f472ddcfbea767ff8916ac.1424989793.git.luto@amacapital.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/ia32/ia32entry.S | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index 6dcd37256979..ed9746340363 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -536,7 +536,7 @@ ia32_ptregs_common:
 	CFI_ENDPROC
 	CFI_STARTPROC32	simple
 	CFI_SIGNAL_FRAME
-	CFI_DEF_CFA	rsp,SS+8
+	CFI_DEF_CFA	rsp,SIZEOF_PTREGS
 	CFI_REL_OFFSET	rax,RAX
 	CFI_REL_OFFSET	rcx,RCX
 	CFI_REL_OFFSET	rdx,RDX
-- 
cgit v1.2.3


From d441c1f2b73ec742c2e55be804ebc6fee130c77f Mon Sep 17 00:00:00 2001
From: Denys Vlasenko
Date: Thu, 26 Feb 2015 14:40:38 -0800
Subject: x86/asm/entry/64: Simplify optimistic SYSRET

Avoid redundant load of %r11 (it is already loaded a few
instructions before).

Also simplify %rsp restoration, instead of two steps:

         add $0x80, %rsp
         mov 0x18(%rsp), %rsp

we can do a simplified single step to restore user-space RSP:

         mov 0x98(%rsp), %rsp

and get the same result.

Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
Signed-off-by: Andy Lutomirski <luto@amacapital.net>
[ Clarified the changelog. ]
Cc: Alexei Starovoitov <ast@plumgrid.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Will Drewry <wad@chromium.org>
Link: http://lkml.kernel.org/r/1aef69b346a6db0d99cdfb0f5ba83e8c985e27d7.1424989793.git.luto@amacapital.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/calling.h | 3 +++
 arch/x86/kernel/entry_64.S     | 6 +++---
 2 files changed, 6 insertions(+), 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/calling.h b/arch/x86/include/asm/calling.h
index 337423590b08..f1a962ff7ddf 100644
--- a/arch/x86/include/asm/calling.h
+++ b/arch/x86/include/asm/calling.h
@@ -176,6 +176,9 @@ For 32-bit we have the following conventions - kernel is built with
 	.macro RESTORE_C_REGS_EXCEPT_RCX
 	RESTORE_C_REGS_HELPER 1,0,1,1,1
 	.endm
+	.macro RESTORE_C_REGS_EXCEPT_R11
+	RESTORE_C_REGS_HELPER 1,1,0,1,1
+	.endm
 	.macro RESTORE_RSI_RDI
 	RESTORE_C_REGS_HELPER 0,0,0,0,0
 	.endm
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 858e94e86f5e..bc1527889c40 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -757,9 +757,9 @@ retint_swapgs:		/* return to user-space */
 	 */
 irq_return_via_sysret:
 	CFI_REMEMBER_STATE
-	RESTORE_C_REGS
-	REMOVE_PT_GPREGS_FROM_STACK 8
-	movq (RSP-RIP)(%rsp),%rsp
+	/* r11 is already restored (see code above) */
+	RESTORE_C_REGS_EXCEPT_R11
+	movq RSP(%rsp),%rsp
 	USERGS_SYSRET64
 	CFI_RESTORE_STATE
 
-- 
cgit v1.2.3


From 1e3fbb8a1d814f35e2e689cf87714d38d9f3564d Mon Sep 17 00:00:00 2001
From: Andy Lutomirski
Date: Thu, 26 Feb 2015 14:40:39 -0800
Subject: x86/asm/entry/64: Remove a bogus 'ret_from_fork' optimization

'ret_from_fork' checks TIF_IA32 to determine whether 'pt_regs' and
the related state make sense for 'ret_from_sys_call'.  This is
entirely the wrong check.  TS_COMPAT would make a little more
sense, but there's really no point in keeping this optimization
at all.

This fixes a return to the wrong user CS if we came from int
0x80 in a 64-bit task.

Signed-off-by: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/4710be56d76ef994ddf59087aad98c000fbab9a4.1424989793.git.luto@amacapital.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/entry_64.S | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index bc1527889c40..622ce4254893 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -550,11 +550,14 @@ ENTRY(ret_from_fork)
 	testl $3,CS(%rsp)			# from kernel_thread?
 	jz   1f
 
-	testl $_TIF_IA32, TI_flags(%rcx)	# 32-bit compat task needs IRET
-	jnz  int_ret_from_sys_call
-
-	RESTORE_TOP_OF_STACK %rdi
-	jmp ret_from_sys_call			# go to the SYSRET fastpath
+	/*
+	 * By the time we get here, we have no idea whether our pt_regs,
+	 * ti flags, and ti status came from the 64-bit SYSCALL fast path,
+	 * the slow path, or one of the ia32entry paths.
+	 * Use int_ret_from_sys_call to return, since it can safely handle
+	 * all of the above.
+	 */
+	jmp  int_ret_from_sys_call
 
 1:
 	movq %rbp, %rdi
-- 
cgit v1.2.3


From 5eca7453d61003bf886992388f8cb407e6f0d051 Mon Sep 17 00:00:00 2001
From: Wang Nan
Date: Fri, 27 Feb 2015 12:19:49 +0800
Subject: x86/traps: Separate set_intr_gate() and clean up early_trap_init()

As early_trap_init() doesn't use IST, replace
set_intr_gate_ist() and set_system_intr_gate_ist() with their
standard counterparts.

set_intr_gate() requires a trace_debug symbol which we don't
have and won't use. This patch separates set_intr_gate() into two
parts, and uses base version in early_trap_init().

Reported-by: Andy Lutomirski <luto@amacapital.net>
Signed-off-by: Wang Nan <wangnan0@huawei.com>
Acked-by: Andy Lutomirski <luto@amacapital.net>
Cc: <dave.hansen@linux.intel.com>
Cc: <lizefan@huawei.com>
Cc: <masami.hiramatsu.pt@hitachi.com>
Cc: <oleg@redhat.com>
Cc: <rostedt@goodmis.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1425010789-13714-1-git-send-email-wangnan0@huawei.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/desc.h |  7 ++++++-
 arch/x86/kernel/traps.c     | 20 ++++++++++++--------
 2 files changed, 18 insertions(+), 9 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
index a94b82e8f156..a0bf89fd2647 100644
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -376,11 +376,16 @@ static inline void _set_gate(int gate, unsigned type, void *addr,
  * Pentium F0 0F bugfix can have resulted in the mapped
  * IDT being write-protected.
  */
-#define set_intr_gate(n, addr)						\
+#define set_intr_gate_notrace(n, addr)					\
 	do {								\
 		BUG_ON((unsigned)n > 0xFF);				\
 		_set_gate(n, GATE_INTERRUPT, (void *)addr, 0, 0,	\
 			  __KERNEL_CS);					\
+	} while (0)
+
+#define set_intr_gate(n, addr)						\
+	do {								\
+		set_intr_gate_notrace(n, addr);				\
 		_trace_set_gate(n, GATE_INTERRUPT, (void *)trace_##addr,\
 				0, 0, __KERNEL_CS);			\
 	} while (0)
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 42819886be0c..9965bd1916db 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -926,16 +926,20 @@ dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code)
 void __init early_trap_init(void)
 {
 	/*
-	 * Don't set ist to DEBUG_STACK as it doesn't work until TSS is
-	 * ready in cpu_init() <-- trap_init(). Before trap_init(), CPU
-	 * runs at ring 0 so it is impossible to hit an invalid stack.
-	 * Using the original stack works well enough at this early
-	 * stage. DEBUG_STACK will be equipped after cpu_init() in
+	 * Don't use IST to set DEBUG_STACK as it doesn't work until TSS
+	 * is ready in cpu_init() <-- trap_init(). Before trap_init(),
+	 * CPU runs at ring 0 so it is impossible to hit an invalid
+	 * stack.  Using the original stack works well enough at this
+	 * early stage. DEBUG_STACK will be equipped after cpu_init() in
 	 * trap_init().
+	 *
+	 * We don't need to set trace_idt_table like set_intr_gate(),
+	 * since we don't have trace_debug and it will be reset to
+	 * 'debug' in trap_init() by set_intr_gate_ist().
 	 */
-	set_intr_gate_ist(X86_TRAP_DB, &debug, 0);
+	set_intr_gate_notrace(X86_TRAP_DB, debug);
 	/* int3 can be called from all */
-	set_system_intr_gate_ist(X86_TRAP_BP, &int3, 0);
+	set_system_intr_gate(X86_TRAP_BP, &int3);
 #ifdef CONFIG_X86_32
 	set_intr_gate(X86_TRAP_PF, page_fault);
 #endif
@@ -1015,7 +1019,7 @@ void __init trap_init(void)
 
 	/*
 	 * X86_TRAP_DB and X86_TRAP_BP have been set
-	 * in early_trap_init(). However, DEBUG_STACK works only after
+	 * in early_trap_init(). However, ITS works only after
 	 * cpu_init() loads TSS. See comments in early_trap_init().
 	 */
 	set_intr_gate_ist(X86_TRAP_DB, &debug, DEBUG_STACK);
-- 
cgit v1.2.3


From 956421fbb74c3a6261903f3836c0740187cf038b Mon Sep 17 00:00:00 2001
From: Andy Lutomirski
Date: Thu, 5 Mar 2015 01:09:44 +0100
Subject: x86/asm/entry/64: Remove a bogus 'ret_from_fork' optimization

'ret_from_fork' checks TIF_IA32 to determine whether 'pt_regs' and
the related state make sense for 'ret_from_sys_call'.  This is
entirely the wrong check.  TS_COMPAT would make a little more
sense, but there's really no point in keeping this optimization
at all.

This fixes a return to the wrong user CS if we came from int
0x80 in a 64-bit task.

Signed-off-by: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: <stable@vger.kernel.org>
Link: http://lkml.kernel.org/r/4710be56d76ef994ddf59087aad98c000fbab9a4.1424989793.git.luto@amacapital.net
[ Backported from tip:x86/asm. ]
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/entry_64.S | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 10074ad9ebf8..1d74d161687c 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -269,11 +269,14 @@ ENTRY(ret_from_fork)
 	testl $3, CS-ARGOFFSET(%rsp)		# from kernel_thread?
 	jz   1f
 
-	testl $_TIF_IA32, TI_flags(%rcx)	# 32-bit compat task needs IRET
-	jnz  int_ret_from_sys_call
-
-	RESTORE_TOP_OF_STACK %rdi, -ARGOFFSET
-	jmp ret_from_sys_call			# go to the SYSRET fastpath
+	/*
+	 * By the time we get here, we have no idea whether our pt_regs,
+	 * ti flags, and ti status came from the 64-bit SYSCALL fast path,
+	 * the slow path, or one of the ia32entry paths.
+	 * Use int_ret_from_sys_call to return, since it can safely handle
+	 * all of the above.
+	 */
+	jmp  int_ret_from_sys_call
 
 1:
 	subq $REST_SKIP, %rsp	# leave space for volatiles
-- 
cgit v1.2.3


From d9fd579c218e22c897f0f1b9e132af9b436cf445 Mon Sep 17 00:00:00 2001
From: Luis R. Rodriguez
Date: Wed, 4 Mar 2015 17:24:11 -0800
Subject: x86/mm: Use IS_ENABLED() for direct_gbpages

Replace #ifdef eyesore with IS_ENABLED() use.

Signed-off-by: Luis R. Rodriguez <mcgrof@suse.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Borislav Petkov <bp@suse.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Vrabel <david.vrabel@citrix.com>
Cc: Dexuan Cui <decui@microsoft.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: JBeulich@suse.com
Cc: Jan Beulich <JBeulich@suse.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Pavel Machek <pavel@ucw.cz>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tony Lindgren <tony@atomide.com>
Cc: Toshi Kani <toshi.kani@hp.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Xishi Qiu <qiuxishi@huawei.com>
Cc: julia.lawall@lip6.fr
Link: http://lkml.kernel.org/r/1425518654-3403-2-git-send-email-mcgrof@do-not-panic.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/mm/init.c | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index a110efca6d06..74f2b37fd073 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -131,11 +131,7 @@ void  __init early_alloc_pgt_buf(void)
 
 int after_bootmem;
 
-int direct_gbpages
-#ifdef CONFIG_DIRECT_GBPAGES
-				= 1
-#endif
-;
+int direct_gbpages = IS_ENABLED(CONFIG_DIRECT_GBPAGES);
 
 static void __init init_gbpages(void)
 {
-- 
cgit v1.2.3


From e5008abe929c160d36e44b8c2b644d4330d2e389 Mon Sep 17 00:00:00 2001
From: Luis R. Rodriguez
Date: Wed, 4 Mar 2015 17:24:12 -0800
Subject: x86/mm: Simplify enabling direct_gbpages

direct_gbpages can be force enabled as an early parameter
but not really have taken effect when DEBUG_PAGEALLOC
or KMEMCHECK is enabled. You can also enable direct_gbpages
right now if you have an x86_64 architecture but your CPU
doesn't really have support for this feature. In both cases
PG_LEVEL_1G won't actually be enabled but direct_gbpages is used
in other areas under the assumptions that PG_LEVEL_1G
was set. Fix this by putting together all requirements
which make this feature sensible to enable under, and only
enable both finally flipping on PG_LEVEL_1G and leaving
PG_LEVEL_1G set when this is true.

We only enable this feature then to be possible on sensible
builds defined by the new ENABLE_DIRECT_GBPAGES. If the
CPU has support for it you can either enable this by using
the DIRECT_GBPAGES option or using the early kernel parameter.
If a platform had support for this you can always force disable
it as well.

Signed-off-by: Luis R. Rodriguez <mcgrof@suse.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Borislav Petkov <bp@suse.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Vrabel <david.vrabel@citrix.com>
Cc: Dexuan Cui <decui@microsoft.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: JBeulich@suse.com
Cc: Jan Beulich <JBeulich@suse.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Pavel Machek <pavel@ucw.cz>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tony Lindgren <tony@atomide.com>
Cc: Toshi Kani <toshi.kani@hp.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Xishi Qiu <qiuxishi@huawei.com>
Cc: julia.lawall@lip6.fr
Link: http://lkml.kernel.org/r/1425518654-3403-3-git-send-email-mcgrof@do-not-panic.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/Kconfig       | 18 +++++++++++++-----
 arch/x86/mm/init.c     | 17 +++++++++--------
 arch/x86/mm/pageattr.c |  2 --
 3 files changed, 22 insertions(+), 15 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index c2fb8a87dccb..4d06e1c8294a 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1299,14 +1299,22 @@ config ARCH_DMA_ADDR_T_64BIT
 	def_bool y
 	depends on X86_64 || HIGHMEM64G
 
+config ENABLE_DIRECT_GBPAGES
+	def_bool y
+	depends on X86_64 && !DEBUG_PAGEALLOC && !KMEMCHECK
+
 config DIRECT_GBPAGES
 	bool "Enable 1GB pages for kernel pagetables" if EXPERT
 	default y
-	depends on X86_64
-	---help---
-	  Allow the kernel linear mapping to use 1GB pages on CPUs that
-	  support it. This can improve the kernel's performance a tiny bit by
-	  reducing TLB pressure. If in doubt, say "Y".
+	depends on ENABLE_DIRECT_GBPAGES
+	---help---
+	  Enable by default the kernel linear mapping to use 1GB pages on CPUs
+	  that support it. This can improve the kernel's performance a tiny bit
+	  by reducing TLB pressure. If in doubt, say "Y". If you've disabled
+	  option but your platform is capable of handling support for this
+	  you can use the gbpages kernel parameter. Likewise if you've enabled
+	  this but you'd like to force disable this option you can use the
+	  nogbpages kernel parameter.
 
 # Common NUMA Features
 config NUMA
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 74f2b37fd073..2ce2c8e8c99c 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -131,16 +131,21 @@ void  __init early_alloc_pgt_buf(void)
 
 int after_bootmem;
 
+static int page_size_mask;
+
 int direct_gbpages = IS_ENABLED(CONFIG_DIRECT_GBPAGES);
 
 static void __init init_gbpages(void)
 {
-#ifdef CONFIG_X86_64
-	if (direct_gbpages && cpu_has_gbpages)
+	if (!IS_ENABLED(CONFIG_ENABLE_DIRECT_GBPAGES)) {
+		direct_gbpages = 0;
+		return;
+	}
+	if (direct_gbpages && cpu_has_gbpages) {
 		printk(KERN_INFO "Using GB pages for direct mapping\n");
-	else
+		page_size_mask |= 1 << PG_LEVEL_1G;
+	} else
 		direct_gbpages = 0;
-#endif
 }
 
 struct map_range {
@@ -149,8 +154,6 @@ struct map_range {
 	unsigned page_size_mask;
 };
 
-static int page_size_mask;
-
 static void __init probe_page_size_mask(void)
 {
 	init_gbpages();
@@ -161,8 +164,6 @@ static void __init probe_page_size_mask(void)
 	 * This will simplify cpa(), which otherwise needs to support splitting
 	 * large pages into small in interrupt context, etc.
 	 */
-	if (direct_gbpages)
-		page_size_mask |= 1 << PG_LEVEL_1G;
 	if (cpu_has_pse)
 		page_size_mask |= 1 << PG_LEVEL_2M;
 #endif
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 81e8282d8c2f..89af288ec674 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -81,11 +81,9 @@ void arch_report_meminfo(struct seq_file *m)
 	seq_printf(m, "DirectMap4M:    %8lu kB\n",
 			direct_pages_count[PG_LEVEL_2M] << 12);
 #endif
-#ifdef CONFIG_X86_64
 	if (direct_gbpages)
 		seq_printf(m, "DirectMap1G:    %8lu kB\n",
 			direct_pages_count[PG_LEVEL_1G] << 20);
-#endif
 }
 #else
 static inline void split_page_count(int level) { }
-- 
cgit v1.2.3


From 73c8c861dc5bddf1b24c6aeffee2292c96cf8db2 Mon Sep 17 00:00:00 2001
From: Luis R. Rodriguez
Date: Wed, 4 Mar 2015 17:24:14 -0800
Subject: x86/mm: Use early_param_on_off() for direct_gbpages

The enabler / disabler is pretty simple, just use the
provided wrappers, this lets us easily relate the variable
to the associated Kconfig entry.

Signed-off-by: Luis R. Rodriguez <mcgrof@suse.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Borislav Petkov <bp@suse.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Vrabel <david.vrabel@citrix.com>
Cc: Dexuan Cui <decui@microsoft.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: JBeulich@suse.com
Cc: Jan Beulich <JBeulich@suse.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Pavel Machek <pavel@ucw.cz>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tony Lindgren <tony@atomide.com>
Cc: Toshi Kani <toshi.kani@hp.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Xishi Qiu <qiuxishi@huawei.com>
Cc: julia.lawall@lip6.fr
Link: http://lkml.kernel.org/r/1425518654-3403-5-git-send-email-mcgrof@do-not-panic.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/mm/init.c    |  3 ++-
 arch/x86/mm/init_64.c | 14 --------------
 2 files changed, 2 insertions(+), 15 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 2ce2c8e8c99c..c35ba8bce7cb 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -133,7 +133,8 @@ int after_bootmem;
 
 static int page_size_mask;
 
-int direct_gbpages = IS_ENABLED(CONFIG_DIRECT_GBPAGES);
+early_param_on_off("gbpages", "nogbpages",
+		   direct_gbpages, CONFIG_DIRECT_GBPAGES);
 
 static void __init init_gbpages(void)
 {
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 30eb05ae7061..3fba623e3ba5 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -130,20 +130,6 @@ int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page,
 	return 0;
 }
 
-static int __init parse_direct_gbpages_off(char *arg)
-{
-	direct_gbpages = 0;
-	return 0;
-}
-early_param("nogbpages", parse_direct_gbpages_off);
-
-static int __init parse_direct_gbpages_on(char *arg)
-{
-	direct_gbpages = 1;
-	return 0;
-}
-early_param("gbpages", parse_direct_gbpages_on);
-
 /*
  * NOTE: pagetable_init alloc all the fixmap pagetables contiguous on the
  * physical space so we can cache the place of the first one and move
-- 
cgit v1.2.3


From 10971ab269bbf22120edac95fcfa3c873a549bea Mon Sep 17 00:00:00 2001
From: Ingo Molnar
Date: Thu, 5 Mar 2015 08:18:23 +0100
Subject: x86/mm: Further simplify 1 GB kernel linear mappings handling

It's a bit pointless to allow Kconfig configuration for 1GB kernel
mappings, it's already hidden behind a 'default y' and CONFIG_EXPERT.

Remove this complication and simplify the code by renaming
CONFIG_ENABLE_DIRECT_GBPAGES to CONFIG_X86_DIRECT_GBPAGES and
document the DEBUG_PAGE_ALLOC and KMEMCHECK quirks.

Cc: Luis R. Rodriguez <mcgrof@suse.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Borislav Petkov <bp@suse.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Vrabel <david.vrabel@citrix.com>
Cc: Dexuan Cui <decui@microsoft.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: JBeulich@suse.com
Cc: Jan Beulich <JBeulich@suse.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Pavel Machek <pavel@ucw.cz>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tony Lindgren <tony@atomide.com>
Cc: Toshi Kani <toshi.kani@hp.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Xishi Qiu <qiuxishi@huawei.com>
Cc: julia.lawall@lip6.fr
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/Kconfig   | 20 ++++++--------------
 arch/x86/mm/init.c |  7 +------
 2 files changed, 7 insertions(+), 20 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 4d06e1c8294a..d03847513b6d 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1299,22 +1299,14 @@ config ARCH_DMA_ADDR_T_64BIT
 	def_bool y
 	depends on X86_64 || HIGHMEM64G
 
-config ENABLE_DIRECT_GBPAGES
+config X86_DIRECT_GBPAGES
 	def_bool y
 	depends on X86_64 && !DEBUG_PAGEALLOC && !KMEMCHECK
-
-config DIRECT_GBPAGES
-	bool "Enable 1GB pages for kernel pagetables" if EXPERT
-	default y
-	depends on ENABLE_DIRECT_GBPAGES
-	---help---
-	  Enable by default the kernel linear mapping to use 1GB pages on CPUs
-	  that support it. This can improve the kernel's performance a tiny bit
-	  by reducing TLB pressure. If in doubt, say "Y". If you've disabled
-	  option but your platform is capable of handling support for this
-	  you can use the gbpages kernel parameter. Likewise if you've enabled
-	  this but you'd like to force disable this option you can use the
-	  nogbpages kernel parameter.
+	---help---
+	  Certain kernel features effectively disable kernel
+	  linear 1 GB mappings (even if the CPU otherwise
+	  supports them), so don't confuse the user by printing
+	  that we have them enabled.
 
 # Common NUMA Features
 config NUMA
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index c35ba8bce7cb..8704153f2675 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -133,15 +133,10 @@ int after_bootmem;
 
 static int page_size_mask;
 
-early_param_on_off("gbpages", "nogbpages",
-		   direct_gbpages, CONFIG_DIRECT_GBPAGES);
+early_param_on_off("gbpages", "nogbpages", direct_gbpages, CONFIG_X86_DIRECT_GBPAGES);
 
 static void __init init_gbpages(void)
 {
-	if (!IS_ENABLED(CONFIG_ENABLE_DIRECT_GBPAGES)) {
-		direct_gbpages = 0;
-		return;
-	}
 	if (direct_gbpages && cpu_has_gbpages) {
 		printk(KERN_INFO "Using GB pages for direct mapping\n");
 		page_size_mask |= 1 << PG_LEVEL_1G;
-- 
cgit v1.2.3


From e61980a70245715ab39cbee2b9d6e6afc1ec37d4 Mon Sep 17 00:00:00 2001
From: Ingo Molnar
Date: Thu, 5 Mar 2015 08:25:01 +0100
Subject: x86/mm: Simplify probe_page_size_mask()

Now that we've simplified the gbpages config space, move the
'page_size_mask' initialization into probe_page_size_mask(),
right next to the PSE and PGE enablement lines.

Cc: Luis R. Rodriguez <mcgrof@suse.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Borislav Petkov <bp@suse.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: David Vrabel <david.vrabel@citrix.com>
Cc: Dexuan Cui <decui@microsoft.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: JBeulich@suse.com
Cc: Jan Beulich <JBeulich@suse.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Juergen Gross <jgross@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Pavel Machek <pavel@ucw.cz>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tony Lindgren <tony@atomide.com>
Cc: Toshi Kani <toshi.kani@hp.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Xishi Qiu <qiuxishi@huawei.com>
Cc: julia.lawall@lip6.fr
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/mm/init.c | 23 ++++++++++-------------
 1 file changed, 10 insertions(+), 13 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 8704153f2675..6dc85d51cd98 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -131,29 +131,18 @@ void  __init early_alloc_pgt_buf(void)
 
 int after_bootmem;
 
-static int page_size_mask;
-
 early_param_on_off("gbpages", "nogbpages", direct_gbpages, CONFIG_X86_DIRECT_GBPAGES);
 
-static void __init init_gbpages(void)
-{
-	if (direct_gbpages && cpu_has_gbpages) {
-		printk(KERN_INFO "Using GB pages for direct mapping\n");
-		page_size_mask |= 1 << PG_LEVEL_1G;
-	} else
-		direct_gbpages = 0;
-}
-
 struct map_range {
 	unsigned long start;
 	unsigned long end;
 	unsigned page_size_mask;
 };
 
+static int page_size_mask;
+
 static void __init probe_page_size_mask(void)
 {
-	init_gbpages();
-
 #if !defined(CONFIG_DEBUG_PAGEALLOC) && !defined(CONFIG_KMEMCHECK)
 	/*
 	 * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages.
@@ -173,6 +162,14 @@ static void __init probe_page_size_mask(void)
 		cr4_set_bits_and_update_boot(X86_CR4_PGE);
 		__supported_pte_mask |= _PAGE_GLOBAL;
 	}
+
+	/* Enable 1 GB linear kernel mappings if available: */
+	if (direct_gbpages && cpu_has_gbpages) {
+		printk(KERN_INFO "Using GB pages for direct mapping\n");
+		page_size_mask |= 1 << PG_LEVEL_1G;
+	} else {
+		direct_gbpages = 0;
+	}
 }
 
 #ifdef CONFIG_X86_32
-- 
cgit v1.2.3


From c709feda56886c38af3116254f84cbe6a78b3a5d Mon Sep 17 00:00:00 2001
From: Ingo Molnar
Date: Thu, 5 Mar 2015 08:58:44 +0100
Subject: x86/mm/pat: Initialize __cachemode2pte_tbl[] and
 __pte2cachemode_tbl[] in a bit more readable fashion

The initialization of these two arrays is a bit difficult to follow:
restructure it optically so that a 2D structure shows which bit in
the PTE is set and which not.

Also improve on comments a bit.

No code or data changed:

  # arch/x86/mm/init.o:

   text    data     bss     dec     hex filename
   4585     424   29776   34785    87e1 init.o.before
   4585     424   29776   34785    87e1 init.o.after

md5:
   a82e11ff58bcfd0af3a94662a701f65d  init.o.before.asm
   a82e11ff58bcfd0af3a94662a701f65d  init.o.after.asm

Reviewed-by: Juergen Gross <jgross@suse.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Jan Beulich <JBeulich@suse.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Luis R. Rodriguez <mcgrof@suse.com>
Cc: Toshi Kani <toshi.kani@hp.com>
Cc: linux-kernel@vger.kernel.org
Link: http://lkml.kernel.org/r/20150305082135.GB5969@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/mm/init.c | 40 ++++++++++++++++++++++------------------
 1 file changed, 22 insertions(+), 18 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 6dc85d51cd98..4469563f8c3b 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -29,29 +29,33 @@
 
 /*
  * Tables translating between page_cache_type_t and pte encoding.
- * Minimal supported modes are defined statically, modified if more supported
- * cache modes are available.
- * Index into __cachemode2pte_tbl is the cachemode.
- * Index into __pte2cachemode_tbl are the caching attribute bits of the pte
- * (_PAGE_PWT, _PAGE_PCD, _PAGE_PAT) at index bit positions 0, 1, 2.
+ *
+ * Minimal supported modes are defined statically, they are modified
+ * during bootup if more supported cache modes are available.
+ *
+ *   Index into __cachemode2pte_tbl[] is the cachemode.
+ *
+ *   Index into __pte2cachemode_tbl[] are the caching attribute bits of the pte
+ *   (_PAGE_PWT, _PAGE_PCD, _PAGE_PAT) at index bit positions 0, 1, 2.
  */
 uint16_t __cachemode2pte_tbl[_PAGE_CACHE_MODE_NUM] = {
-	[_PAGE_CACHE_MODE_WB]		= 0,
-	[_PAGE_CACHE_MODE_WC]		= _PAGE_PWT,
-	[_PAGE_CACHE_MODE_UC_MINUS]	= _PAGE_PCD,
-	[_PAGE_CACHE_MODE_UC]		= _PAGE_PCD | _PAGE_PWT,
-	[_PAGE_CACHE_MODE_WT]		= _PAGE_PCD,
-	[_PAGE_CACHE_MODE_WP]		= _PAGE_PCD,
+	[_PAGE_CACHE_MODE_WB      ]	= 0         | 0        ,
+	[_PAGE_CACHE_MODE_WC      ]	= _PAGE_PWT | 0        ,
+	[_PAGE_CACHE_MODE_UC_MINUS]	= 0         | _PAGE_PCD,
+	[_PAGE_CACHE_MODE_UC      ]	= _PAGE_PWT | _PAGE_PCD,
+	[_PAGE_CACHE_MODE_WT      ]	= 0         | _PAGE_PCD,
+	[_PAGE_CACHE_MODE_WP      ]	= 0         | _PAGE_PCD,
 };
 EXPORT_SYMBOL(__cachemode2pte_tbl);
+
 uint8_t __pte2cachemode_tbl[8] = {
-	[__pte2cm_idx(0)] = _PAGE_CACHE_MODE_WB,
-	[__pte2cm_idx(_PAGE_PWT)] = _PAGE_CACHE_MODE_WC,
-	[__pte2cm_idx(_PAGE_PCD)] = _PAGE_CACHE_MODE_UC_MINUS,
-	[__pte2cm_idx(_PAGE_PWT | _PAGE_PCD)] = _PAGE_CACHE_MODE_UC,
-	[__pte2cm_idx(_PAGE_PAT)] = _PAGE_CACHE_MODE_WB,
-	[__pte2cm_idx(_PAGE_PWT | _PAGE_PAT)] = _PAGE_CACHE_MODE_WC,
-	[__pte2cm_idx(_PAGE_PCD | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC_MINUS,
+	[__pte2cm_idx( 0        | 0         | 0        )] = _PAGE_CACHE_MODE_WB,
+	[__pte2cm_idx(_PAGE_PWT | 0         | 0        )] = _PAGE_CACHE_MODE_WC,
+	[__pte2cm_idx( 0        | _PAGE_PCD | 0        )] = _PAGE_CACHE_MODE_UC_MINUS,
+	[__pte2cm_idx(_PAGE_PWT | _PAGE_PCD | 0        )] = _PAGE_CACHE_MODE_UC,
+	[__pte2cm_idx( 0        | 0         | _PAGE_PAT)] = _PAGE_CACHE_MODE_WB,
+	[__pte2cm_idx(_PAGE_PWT | 0         | _PAGE_PAT)] = _PAGE_CACHE_MODE_WC,
+	[__pte2cm_idx(0         | _PAGE_PCD | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC_MINUS,
 	[__pte2cm_idx(_PAGE_PWT | _PAGE_PCD | _PAGE_PAT)] = _PAGE_CACHE_MODE_UC,
 };
 EXPORT_SYMBOL(__pte2cachemode_tbl);
-- 
cgit v1.2.3


From 9ab6eb51ef4ad63cb71533d3a4dfb09ea8f69b4c Mon Sep 17 00:00:00 2001
From: Andy Shevchenko
Date: Thu, 5 Mar 2015 17:24:04 +0200
Subject: x86/intel/quark: Select COMMON_CLK

The commit 8bbc2a135b63 ("x86/intel/quark: Add Intel Quark
platform support") introduced a minimal support of Intel Quark
SoC. That allows to use core parts of the SoC. However, the SPI,
I2C, and GPIO drivers can't be selected by kernel configuration
because they depend on COMMON_CLK. The patch adds a COMMON_CLK
selection to the platfrom definition to allow user choose the drivers.

Signed-off-by: Andy Shevchenko <andriy.shevchenko@linux.intel.com>
Acked-by: Ong, Boon Leong <boon.leong.ong@intel.com>
Cc: Bryan O'Donoghue <pure.logic@nexus-software.ie>
Cc: Darren Hart <dvhart@linux.intel.com>
Fixes: 8bbc2a135b63 ("x86/intel/quark: Add Intel Quark platform support")
Link: http://lkml.kernel.org/r/1425569044-2867-1-git-send-email-andriy.shevchenko@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/Kconfig | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch/x86')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index c2fb8a87dccb..b7d31ca55187 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -499,6 +499,7 @@ config X86_INTEL_QUARK
 	depends on X86_IO_APIC
 	select IOSF_MBI
 	select INTEL_IMR
+	select COMMON_CLK
 	---help---
 	  Select to include support for Quark X1000 SoC.
 	  Say Y here if you have a Quark based system such as the Arduino
-- 
cgit v1.2.3


From 06c8173eb92bbfc03a0fe8bb64315857d0badd06 Mon Sep 17 00:00:00 2001
From: Quentin Casasnovas
Date: Thu, 5 Mar 2015 13:19:22 +0100
Subject: x86/fpu/xsaves: Fix improper uses of __ex_table

Commit:

  f31a9f7c7169 ("x86/xsaves: Use xsaves/xrstors to save and restore xsave area")

introduced alternative instructions for XSAVES/XRSTORS and commit:

  adb9d526e982 ("x86/xsaves: Add xsaves and xrstors support for booting time")

added support for the XSAVES/XRSTORS instructions at boot time.

Unfortunately both failed to properly protect them against faulting:

The 'xstate_fault' macro will use the closest label named '1'
backward and that ends up in the .altinstr_replacement section
rather than in .text. This means that the kernel will never find
in the __ex_table the .text address where this instruction might
fault, leading to serious problems if userspace manages to
trigger the fault.

Signed-off-by: Quentin Casasnovas <quentin.casasnovas@oracle.com>
Signed-off-by: Jamie Iles <jamie.iles@oracle.com>
[ Improved the changelog, fixed some whitespace noise. ]
Acked-by: Borislav Petkov <bp@alien8.de>
Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: <stable@vger.kernel.org>
Cc: Allan Xavier <mr.a.xavier@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Fixes: adb9d526e982 ("x86/xsaves: Add xsaves and xrstors support for booting time")
Fixes: f31a9f7c7169 ("x86/xsaves: Use xsaves/xrstors to save and restore xsave area")
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/xsave.h | 28 +++++++++++-----------------
 1 file changed, 11 insertions(+), 17 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/xsave.h b/arch/x86/include/asm/xsave.h
index 5fa9770035dc..c9a6d68b8d62 100644
--- a/arch/x86/include/asm/xsave.h
+++ b/arch/x86/include/asm/xsave.h
@@ -82,18 +82,15 @@ static inline int xsave_state_booting(struct xsave_struct *fx, u64 mask)
 	if (boot_cpu_has(X86_FEATURE_XSAVES))
 		asm volatile("1:"XSAVES"\n\t"
 			"2:\n\t"
-			: : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask)
+			     xstate_fault
+			: "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask)
 			:   "memory");
 	else
 		asm volatile("1:"XSAVE"\n\t"
 			"2:\n\t"
-			: : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask)
+			     xstate_fault
+			: "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask)
 			:   "memory");
-
-	asm volatile(xstate_fault
-		     : "0" (0)
-		     : "memory");
-
 	return err;
 }
 
@@ -112,18 +109,15 @@ static inline int xrstor_state_booting(struct xsave_struct *fx, u64 mask)
 	if (boot_cpu_has(X86_FEATURE_XSAVES))
 		asm volatile("1:"XRSTORS"\n\t"
 			"2:\n\t"
-			: : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask)
+			     xstate_fault
+			: "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask)
 			:   "memory");
 	else
 		asm volatile("1:"XRSTOR"\n\t"
 			"2:\n\t"
-			: : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask)
+			     xstate_fault
+			: "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask)
 			:   "memory");
-
-	asm volatile(xstate_fault
-		     : "0" (0)
-		     : "memory");
-
 	return err;
 }
 
@@ -149,9 +143,9 @@ static inline int xsave_state(struct xsave_struct *fx, u64 mask)
 	 */
 	alternative_input_2(
 		"1:"XSAVE,
-		"1:"XSAVEOPT,
+		XSAVEOPT,
 		X86_FEATURE_XSAVEOPT,
-		"1:"XSAVES,
+		XSAVES,
 		X86_FEATURE_XSAVES,
 		[fx] "D" (fx), "a" (lmask), "d" (hmask) :
 		"memory");
@@ -178,7 +172,7 @@ static inline int xrstor_state(struct xsave_struct *fx, u64 mask)
 	 */
 	alternative_input(
 		"1: " XRSTOR,
-		"1: " XRSTORS,
+		XRSTORS,
 		X86_FEATURE_XSAVES,
 		"D" (fx), "m" (*fx), "a" (lmask), "d" (hmask)
 		: "memory");
-- 
cgit v1.2.3


From 8ef46a672a7d852709561d10672b6eaa8a4acd82 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski
Date: Thu, 5 Mar 2015 19:19:02 -0800
Subject: x86/asm/entry: Add this_cpu_sp0() to read sp0 for the current cpu

We currently store references to the top of the kernel stack in
multiple places: kernel_stack (with an offset) and
init_tss.x86_tss.sp0 (no offset).  The latter is defined by
hardware and is a clean canonical way to find the top of the
stack.  Add an accessor so we can start using it.

This needs minor paravirt tweaks.  On native, sp0 defines the
top of the kernel stack and is therefore always correct.  On Xen
and lguest, the hypervisor tracks the top of the stack, but we
want to start reading sp0 in the kernel.  Fixing this is simple:
just update our local copy of sp0 as well as the hypervisor's
copy on task switches.

Signed-off-by: Andy Lutomirski <luto@amacapital.net>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/8d675581859712bee09a055ed8f785d80dac1eca.1425611534.git.luto@amacapital.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/processor.h | 5 +++++
 arch/x86/kernel/process.c        | 1 +
 arch/x86/lguest/boot.c           | 1 +
 arch/x86/xen/enlighten.c         | 1 +
 4 files changed, 8 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 7be2c9a6caba..71c3a826a690 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -564,6 +564,11 @@ static inline void native_swapgs(void)
 #endif
 }
 
+static inline unsigned long this_cpu_sp0(void)
+{
+	return this_cpu_read_stable(init_tss.x86_tss.sp0);
+}
+
 #ifdef CONFIG_PARAVIRT
 #include <asm/paravirt.h>
 #else
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 046e2d620bbe..ff5c9088b1c5 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -38,6 +38,7 @@
  * on exact cacheline boundaries, to eliminate cacheline ping-pong.
  */
 __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss) = INIT_TSS;
+EXPORT_PER_CPU_SYMBOL_GPL(init_tss);
 
 #ifdef CONFIG_X86_64
 static DEFINE_PER_CPU(unsigned char, is_idle);
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index ac4453d8520e..8561585ee2c6 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -1076,6 +1076,7 @@ static void lguest_load_sp0(struct tss_struct *tss,
 {
 	lazy_hcall3(LHCALL_SET_STACK, __KERNEL_DS | 0x1, thread->sp0,
 		   THREAD_SIZE / PAGE_SIZE);
+	tss->x86_tss.sp0 = thread->sp0;
 }
 
 /* Let's just say, I wouldn't do debugging under a Guest. */
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 5240f563076d..81665c9f2132 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -912,6 +912,7 @@ static void xen_load_sp0(struct tss_struct *tss,
 	mcs = xen_mc_entry(0);
 	MULTI_stack_switch(mcs.mc, __KERNEL_DS, thread->sp0);
 	xen_mc_issue(PARAVIRT_LAZY_CPU);
+	tss->x86_tss.sp0 = thread->sp0;
 }
 
 static void xen_set_iopl_mask(unsigned mask)
-- 
cgit v1.2.3


From 75182b1632a89f12540baa1806a7c5c180db620c Mon Sep 17 00:00:00 2001
From: Andy Lutomirski
Date: Thu, 5 Mar 2015 19:19:03 -0800
Subject: x86/asm/entry: Switch all C consumers of kernel_stack to
 this_cpu_sp0()

This will make modifying the semantics of kernel_stack easier.

The change to ist_begin_non_atomic() is necessary because sp0 no
longer points to the same THREAD_SIZE-aligned region as RSP;
it's one byte too high for that.  At Denys' suggestion, rather
than offsetting it, just check explicitly that we're in the
correct range ending at sp0.  This has the added benefit that we
no longer assume that the thread stack is aligned to
THREAD_SIZE.

Suggested-by: Denys Vlasenko <dvlasenk@redhat.com>
Signed-off-by: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/ef8254ad414cbb8034c9a56396eeb24f5dd5b0de.1425611534.git.luto@amacapital.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/thread_info.h | 3 +--
 arch/x86/kernel/traps.c            | 4 ++--
 2 files changed, 3 insertions(+), 4 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 1d4e4f279a32..a2fa1899494e 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -159,8 +159,7 @@ DECLARE_PER_CPU(unsigned long, kernel_stack);
 static inline struct thread_info *current_thread_info(void)
 {
 	struct thread_info *ti;
-	ti = (void *)(this_cpu_read_stable(kernel_stack) +
-		      KERNEL_STACK_OFFSET - THREAD_SIZE);
+	ti = (void *)(this_cpu_sp0() - THREAD_SIZE);
 	return ti;
 }
 
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 9965bd1916db..fa290586ed37 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -174,8 +174,8 @@ void ist_begin_non_atomic(struct pt_regs *regs)
 	 * will catch asm bugs and any attempt to use ist_preempt_enable
 	 * from double_fault.
 	 */
-	BUG_ON(((current_stack_pointer() ^ this_cpu_read_stable(kernel_stack))
-		& ~(THREAD_SIZE - 1)) != 0);
+	BUG_ON((unsigned long)(this_cpu_sp0() - current_stack_pointer()) >=
+	       THREAD_SIZE);
 
 	preempt_count_sub(HARDIRQ_OFFSET);
 }
-- 
cgit v1.2.3


From 9d0c914c60f4d3123debb653340dc1f7cf44939d Mon Sep 17 00:00:00 2001
From: Andy Lutomirski
Date: Thu, 5 Mar 2015 19:19:04 -0800
Subject: x86/asm/entry/64/compat: Change the 32-bit sysenter code to use sp0

The ia32 sysenter code loaded the top of the kernel stack into
rsp by loading kernel_stack and then adjusting it.  It can be
simplified to just read sp0 directly.

This requires the addition of a new asm-offsets entry for sp0.

Signed-off-by: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/88ff9006163d296a0665338585c36d9bfb85235d.1425611534.git.luto@amacapital.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/ia32/ia32entry.S        | 3 +--
 arch/x86/kernel/asm-offsets_64.c | 1 +
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index ed9746340363..719db63b35c4 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -113,8 +113,7 @@ ENTRY(ia32_sysenter_target)
 	CFI_DEF_CFA	rsp,0
 	CFI_REGISTER	rsp,rbp
 	SWAPGS_UNSAFE_STACK
-	movq	PER_CPU_VAR(kernel_stack), %rsp
-	addq	$(KERNEL_STACK_OFFSET),%rsp
+	movq	PER_CPU_VAR(init_tss + TSS_sp0), %rsp
 	/*
 	 * No need to follow this irqs on/off section: the syscall
 	 * disabled irqs, here we enable it straight after entry:
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index fdcbb4d27c9f..5ce6f2da8763 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -81,6 +81,7 @@ int main(void)
 #undef ENTRY
 
 	OFFSET(TSS_ist, tss_struct, x86_tss.ist);
+	OFFSET(TSS_sp0, tss_struct, x86_tss.sp0);
 	BLANK();
 
 	DEFINE(__NR_syscall_max, sizeof(syscalls_64) - 1);
-- 
cgit v1.2.3


From 24933b82c0d9a711475a5ef7904eb733f561e637 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski
Date: Thu, 5 Mar 2015 19:19:05 -0800
Subject: x86/asm/entry: Rename 'init_tss' to 'cpu_tss'

It has nothing to do with init -- there's only one TSS per cpu.

Other names considered include:

 - current_tss: Confusing because we never switch the tss.
 - singleton_tss: Too long.

This patch was generated with 's/init_tss/cpu_tss/g'.  Followup
patches will fix INIT_TSS and INIT_TSS_IST by hand.

Signed-off-by: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/da29fb2a793e4f649d93ce2d1ed320ebe8516262.1425611534.git.luto@amacapital.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/ia32/ia32entry.S        | 2 +-
 arch/x86/include/asm/processor.h | 4 ++--
 arch/x86/kernel/cpu/common.c     | 6 +++---
 arch/x86/kernel/entry_64.S       | 2 +-
 arch/x86/kernel/ioport.c         | 2 +-
 arch/x86/kernel/process.c        | 6 +++---
 arch/x86/kernel/process_32.c     | 2 +-
 arch/x86/kernel/process_64.c     | 2 +-
 arch/x86/kernel/vm86_32.c        | 4 ++--
 arch/x86/power/cpu.c             | 2 +-
 10 files changed, 16 insertions(+), 16 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index 719db63b35c4..ad9efef65a6b 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -113,7 +113,7 @@ ENTRY(ia32_sysenter_target)
 	CFI_DEF_CFA	rsp,0
 	CFI_REGISTER	rsp,rbp
 	SWAPGS_UNSAFE_STACK
-	movq	PER_CPU_VAR(init_tss + TSS_sp0), %rsp
+	movq	PER_CPU_VAR(cpu_tss + TSS_sp0), %rsp
 	/*
 	 * No need to follow this irqs on/off section: the syscall
 	 * disabled irqs, here we enable it straight after entry:
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 71c3a826a690..117ee65473e2 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -282,7 +282,7 @@ struct tss_struct {
 
 } ____cacheline_aligned;
 
-DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss);
+DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss);
 
 /*
  * Save the original ist values for checking stack pointers during debugging
@@ -566,7 +566,7 @@ static inline void native_swapgs(void)
 
 static inline unsigned long this_cpu_sp0(void)
 {
-	return this_cpu_read_stable(init_tss.x86_tss.sp0);
+	return this_cpu_read_stable(cpu_tss.x86_tss.sp0);
 }
 
 #ifdef CONFIG_PARAVIRT
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 2346c95c6ab1..5d0f0cc7ea26 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -979,7 +979,7 @@ static void syscall32_cpu_init(void)
 void enable_sep_cpu(void)
 {
 	int cpu = get_cpu();
-	struct tss_struct *tss = &per_cpu(init_tss, cpu);
+	struct tss_struct *tss = &per_cpu(cpu_tss, cpu);
 
 	if (!boot_cpu_has(X86_FEATURE_SEP)) {
 		put_cpu();
@@ -1307,7 +1307,7 @@ void cpu_init(void)
 	 */
 	load_ucode_ap();
 
-	t = &per_cpu(init_tss, cpu);
+	t = &per_cpu(cpu_tss, cpu);
 	oist = &per_cpu(orig_ist, cpu);
 
 #ifdef CONFIG_NUMA
@@ -1391,7 +1391,7 @@ void cpu_init(void)
 {
 	int cpu = smp_processor_id();
 	struct task_struct *curr = current;
-	struct tss_struct *t = &per_cpu(init_tss, cpu);
+	struct tss_struct *t = &per_cpu(cpu_tss, cpu);
 	struct thread_struct *thread = &curr->thread;
 
 	wait_for_master_cpu(cpu);
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 622ce4254893..0c00fd80249a 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -959,7 +959,7 @@ apicinterrupt IRQ_WORK_VECTOR \
 /*
  * Exception entry points.
  */
-#define INIT_TSS_IST(x) PER_CPU_VAR(init_tss) + (TSS_ist + ((x) - 1) * 8)
+#define INIT_TSS_IST(x) PER_CPU_VAR(cpu_tss) + (TSS_ist + ((x) - 1) * 8)
 
 .macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1
 ENTRY(\sym)
diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c
index 4ddaf66ea35f..37dae792dbbe 100644
--- a/arch/x86/kernel/ioport.c
+++ b/arch/x86/kernel/ioport.c
@@ -54,7 +54,7 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
 	 * because the ->io_bitmap_max value must match the bitmap
 	 * contents:
 	 */
-	tss = &per_cpu(init_tss, get_cpu());
+	tss = &per_cpu(cpu_tss, get_cpu());
 
 	if (turn_on)
 		bitmap_clear(t->io_bitmap_ptr, from, num);
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index ff5c9088b1c5..6f6087349231 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -37,8 +37,8 @@
  * section. Since TSS's are completely CPU-local, we want them
  * on exact cacheline boundaries, to eliminate cacheline ping-pong.
  */
-__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss) = INIT_TSS;
-EXPORT_PER_CPU_SYMBOL_GPL(init_tss);
+__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = INIT_TSS;
+EXPORT_PER_CPU_SYMBOL_GPL(cpu_tss);
 
 #ifdef CONFIG_X86_64
 static DEFINE_PER_CPU(unsigned char, is_idle);
@@ -110,7 +110,7 @@ void exit_thread(void)
 	unsigned long *bp = t->io_bitmap_ptr;
 
 	if (bp) {
-		struct tss_struct *tss = &per_cpu(init_tss, get_cpu());
+		struct tss_struct *tss = &per_cpu(cpu_tss, get_cpu());
 
 		t->io_bitmap_ptr = NULL;
 		clear_thread_flag(TIF_IO_BITMAP);
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 603c4f99cb5a..d3460af3d27a 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -248,7 +248,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 	struct thread_struct *prev = &prev_p->thread,
 				 *next = &next_p->thread;
 	int cpu = smp_processor_id();
-	struct tss_struct *tss = &per_cpu(init_tss, cpu);
+	struct tss_struct *tss = &per_cpu(cpu_tss, cpu);
 	fpu_switch_t fpu;
 
 	/* never put a printk in __switch_to... printk() calls wake_up*() indirectly */
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 854b5981b327..2cd562f96c1f 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -277,7 +277,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 	struct thread_struct *prev = &prev_p->thread;
 	struct thread_struct *next = &next_p->thread;
 	int cpu = smp_processor_id();
-	struct tss_struct *tss = &per_cpu(init_tss, cpu);
+	struct tss_struct *tss = &per_cpu(cpu_tss, cpu);
 	unsigned fsindex, gsindex;
 	fpu_switch_t fpu;
 
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index e8edcf52e069..fc9db6ef2a95 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -150,7 +150,7 @@ struct pt_regs *save_v86_state(struct kernel_vm86_regs *regs)
 		do_exit(SIGSEGV);
 	}
 
-	tss = &per_cpu(init_tss, get_cpu());
+	tss = &per_cpu(cpu_tss, get_cpu());
 	current->thread.sp0 = current->thread.saved_sp0;
 	current->thread.sysenter_cs = __KERNEL_CS;
 	load_sp0(tss, &current->thread);
@@ -318,7 +318,7 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk
 	tsk->thread.saved_fs = info->regs32->fs;
 	tsk->thread.saved_gs = get_user_gs(info->regs32);
 
-	tss = &per_cpu(init_tss, get_cpu());
+	tss = &per_cpu(cpu_tss, get_cpu());
 	tsk->thread.sp0 = (unsigned long) &info->VM86_TSS_ESP0;
 	if (cpu_has_sep)
 		tsk->thread.sysenter_cs = 0;
diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c
index 3e32ed5648a0..757678fb26e1 100644
--- a/arch/x86/power/cpu.c
+++ b/arch/x86/power/cpu.c
@@ -134,7 +134,7 @@ static void do_fpu_end(void)
 static void fix_processor_context(void)
 {
 	int cpu = smp_processor_id();
-	struct tss_struct *t = &per_cpu(init_tss, cpu);
+	struct tss_struct *t = &per_cpu(cpu_tss, cpu);
 #ifdef CONFIG_X86_64
 	struct desc_struct *desc = get_cpu_gdt_table(cpu);
 	tss_desc tss;
-- 
cgit v1.2.3


From d0a0de21f82bbc1737ea3c831f018d0c2bc6b9c2 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski
Date: Thu, 5 Mar 2015 19:19:06 -0800
Subject: x86/asm/entry: Remove INIT_TSS and fold the definitions into
 'cpu_tss'

The INIT_TSS is unnecessary.  Just define the initial TSS where
'cpu_tss' is defined.

While we're at it, merge the 32-bit and 64-bit definitions.  The
only syntactic change is that 32-bit kernels were computing sp0
as long, but now they compute it as unsigned long.

Verified by objdump: the contents and relocations of
.data..percpu..shared_aligned are unchanged on 32-bit and 64-bit
kernels.

Signed-off-by: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/8fc39fa3f6c5d635e93afbdd1a0fe0678a6d7913.1425611534.git.luto@amacapital.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/processor.h | 20 --------------------
 arch/x86/kernel/process.c        | 20 +++++++++++++++++++-
 2 files changed, 19 insertions(+), 21 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 117ee65473e2..f5e3ec63767d 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -818,22 +818,6 @@ static inline void spin_lock_prefetch(const void *x)
 	.io_bitmap_ptr		= NULL,					  \
 }
 
-/*
- * Note that the .io_bitmap member must be extra-big. This is because
- * the CPU will access an additional byte beyond the end of the IO
- * permission bitmap. The extra byte must be all 1 bits, and must
- * be within the limit.
- */
-#define INIT_TSS  {							  \
-	.x86_tss = {							  \
-		.sp0		= sizeof(init_stack) + (long)&init_stack, \
-		.ss0		= __KERNEL_DS,				  \
-		.ss1		= __KERNEL_CS,				  \
-		.io_bitmap_base	= INVALID_IO_BITMAP_OFFSET,		  \
-	 },								  \
-	.io_bitmap		= { [0 ... IO_BITMAP_LONGS] = ~0 },	  \
-}
-
 extern unsigned long thread_saved_pc(struct task_struct *tsk);
 
 #define THREAD_SIZE_LONGS      (THREAD_SIZE/sizeof(unsigned long))
@@ -892,10 +876,6 @@ extern unsigned long thread_saved_pc(struct task_struct *tsk);
 	.sp0 = (unsigned long)&init_stack + sizeof(init_stack) \
 }
 
-#define INIT_TSS  { \
-	.x86_tss.sp0 = (unsigned long)&init_stack + sizeof(init_stack) \
-}
-
 /*
  * Return saved PC of a blocked thread.
  * What is this good for? it will be always the scheduler or ret_from_fork.
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 6f6087349231..f4c0af7fc3a0 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -37,7 +37,25 @@
  * section. Since TSS's are completely CPU-local, we want them
  * on exact cacheline boundaries, to eliminate cacheline ping-pong.
  */
-__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = INIT_TSS;
+__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = {
+	.x86_tss = {
+		.sp0 = (unsigned long)&init_stack + sizeof(init_stack),
+#ifdef CONFIG_X86_32
+		.ss0 = __KERNEL_DS,
+		.ss1 = __KERNEL_CS,
+		.io_bitmap_base	= INVALID_IO_BITMAP_OFFSET,
+#endif
+	 },
+#ifdef CONFIG_X86_32
+	 /*
+	  * Note that the .io_bitmap member must be extra-big. This is because
+	  * the CPU will access an additional byte beyond the end of the IO
+	  * permission bitmap. The extra byte must be all 1 bits, and must
+	  * be within the limit.
+	  */
+	.io_bitmap		= { [0 ... IO_BITMAP_LONGS] = ~0 },
+#endif
+};
 EXPORT_PER_CPU_SYMBOL_GPL(cpu_tss);
 
 #ifdef CONFIG_X86_64
-- 
cgit v1.2.3


From 9b47668843d800ed57f6f6bfd6f5c4cffdf201c6 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski
Date: Thu, 5 Mar 2015 19:19:07 -0800
Subject: x86/asm/entry: Rename 'INIT_TSS_IST' to 'CPU_TSS_IST'

This has nothing to do with the init thread or the initial
anything. It's just the CPU's TSS.

Signed-off-by: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/a0bd5e26b32a2e1f08ff99017d0997118fbb2485.1425611534.git.luto@amacapital.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/entry_64.S | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 0c00fd80249a..5117a2baefe9 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -959,7 +959,7 @@ apicinterrupt IRQ_WORK_VECTOR \
 /*
  * Exception entry points.
  */
-#define INIT_TSS_IST(x) PER_CPU_VAR(cpu_tss) + (TSS_ist + ((x) - 1) * 8)
+#define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss) + (TSS_ist + ((x) - 1) * 8)
 
 .macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1
 ENTRY(\sym)
@@ -1015,13 +1015,13 @@ ENTRY(\sym)
 	.endif
 
 	.if \shift_ist != -1
-	subq $EXCEPTION_STKSZ, INIT_TSS_IST(\shift_ist)
+	subq $EXCEPTION_STKSZ, CPU_TSS_IST(\shift_ist)
 	.endif
 
 	call \do_sym
 
 	.if \shift_ist != -1
-	addq $EXCEPTION_STKSZ, INIT_TSS_IST(\shift_ist)
+	addq $EXCEPTION_STKSZ, CPU_TSS_IST(\shift_ist)
 	.endif
 
 	/* these procedures expect "no swapgs" flag in ebx */
-- 
cgit v1.2.3


From e893286918d2cde3a94850d8f7101cd1039e0c62 Mon Sep 17 00:00:00 2001
From: Jiri Slaby
Date: Thu, 5 Mar 2015 09:13:31 +0100
Subject: x86/vdso: Fix the build on GCC5

On gcc5 the kernel does not link:

  ld: .eh_frame_hdr table[4] FDE at 0000000000000648 overlaps table[5] FDE at 0000000000000670.

Because prior GCC versions always emitted NOPs on ALIGN directives, but
gcc5 started omitting them.

.LSTARTFDEDLSI1 says:

        /* HACK: The dwarf2 unwind routines will subtract 1 from the
           return address to get an address in the middle of the
           presumed call instruction.  Since we didn't get here via
           a call, we need to include the nop before the real start
           to make up for it.  */
        .long .LSTART_sigreturn-1-.     /* PC-relative start address */

But commit 69d0627a7f6e ("x86 vDSO: reorder vdso32 code") from 2.6.25
replaced .org __kernel_vsyscall+32,0x90 by ALIGN right before
__kernel_sigreturn.

Of course, ALIGN need not generate any NOP in there. Esp. gcc5 collapses
vclock_gettime.o and int80.o together with no generated NOPs as "ALIGN".

So fix this by adding to that point at least a single NOP and make the
function ALIGN possibly with more NOPs then.

Kudos for reporting and diagnosing should go to Richard.

Reported-by: Richard Biener <rguenther@suse.de>
Signed-off-by: Jiri Slaby <jslaby@suse.cz>
Acked-by: Andy Lutomirski <luto@amacapital.net>
Cc: <stable@vger.kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1425543211-12542-1-git-send-email-jslaby@suse.cz
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/vdso/vdso32/sigreturn.S | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch/x86')

diff --git a/arch/x86/vdso/vdso32/sigreturn.S b/arch/x86/vdso/vdso32/sigreturn.S
index 31776d0efc8c..d7ec4e251c0a 100644
--- a/arch/x86/vdso/vdso32/sigreturn.S
+++ b/arch/x86/vdso/vdso32/sigreturn.S
@@ -17,6 +17,7 @@
 	.text
 	.globl __kernel_sigreturn
 	.type __kernel_sigreturn,@function
+	nop /* this guy is needed for .LSTARTFDEDLSI1 below (watch for HACK) */
 	ALIGN
 __kernel_sigreturn:
 .LSTART_sigreturn:
-- 
cgit v1.2.3


From d939be3add4f1410079dad2755d4936cdb70903b Mon Sep 17 00:00:00 2001
From: Masanari Iida
Date: Fri, 27 Feb 2015 23:52:31 +0900
Subject: treewide: Fix typo in printk messages

This patch fix spelling typo in printk messages.

Signed-off-by: Masanari Iida <standby24x7@gmail.com>
Acked-by: Randy Dunlap <rdunlap@infradead.org>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 arch/arc/kernel/unwind.c                          | 2 +-
 arch/powerpc/kernel/prom.c                        | 2 +-
 arch/powerpc/platforms/85xx/p1022_rdk.c           | 4 ++--
 arch/x86/kernel/test_rodata.c                     | 2 +-
 drivers/iio/adc/max1027.c                         | 2 +-
 drivers/isdn/hardware/mISDN/mISDNinfineon.c       | 2 +-
 drivers/isdn/mISDN/dsp_cmx.c                      | 2 +-
 drivers/isdn/mISDN/dsp_core.c                     | 4 ++--
 drivers/media/tuners/msi001.c                     | 2 +-
 drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.c | 2 +-
 drivers/net/ethernet/qlogic/qlcnic/qlcnic_hw.c    | 2 +-
 drivers/net/wireless/ath/ath10k/mac.c             | 2 +-
 drivers/net/wireless/rtlwifi/rtl8723be/dm.c       | 2 +-
 drivers/net/wireless/rtlwifi/rtl8821ae/dm.c       | 2 +-
 drivers/parisc/eisa_enumerator.c                  | 2 +-
 drivers/scsi/be2iscsi/be_cmds.c                   | 2 +-
 drivers/scsi/osd/osd_initiator.c                  | 2 +-
 drivers/scsi/qla2xxx/qla_nx.c                     | 2 +-
 drivers/scsi/qla2xxx/qla_os.c                     | 4 ++--
 net/sunrpc/xprtrdma/verbs.c                       | 4 ++--
 tools/perf/util/probe-finder.c                    | 2 +-
 21 files changed, 25 insertions(+), 25 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/arc/kernel/unwind.c b/arch/arc/kernel/unwind.c
index e550b117ec4f..93c6ea52b671 100644
--- a/arch/arc/kernel/unwind.c
+++ b/arch/arc/kernel/unwind.c
@@ -841,7 +841,7 @@ static int processCFI(const u8 *start, const u8 *end, unsigned long targetLoc,
 				break;
 			case DW_CFA_GNU_window_save:
 			default:
-				unw_debug("UNKNOW OPCODE 0x%x\n", opcode);
+				unw_debug("UNKNOWN OPCODE 0x%x\n", opcode);
 				result = 0;
 				break;
 			}
diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c
index 099f27e6d1b0..20ce051312fc 100644
--- a/arch/powerpc/kernel/prom.c
+++ b/arch/powerpc/kernel/prom.c
@@ -721,7 +721,7 @@ void __init early_init_devtree(void *params)
 	 */
 	of_scan_flat_dt(early_init_dt_scan_cpus, NULL);
 	if (boot_cpuid < 0) {
-		printk("Failed to indentify boot CPU !\n");
+		printk("Failed to identify boot CPU !\n");
 		BUG();
 	}
 
diff --git a/arch/powerpc/platforms/85xx/p1022_rdk.c b/arch/powerpc/platforms/85xx/p1022_rdk.c
index 7a180f0308d5..680232d6ba48 100644
--- a/arch/powerpc/platforms/85xx/p1022_rdk.c
+++ b/arch/powerpc/platforms/85xx/p1022_rdk.c
@@ -50,14 +50,14 @@ void p1022rdk_set_pixel_clock(unsigned int pixclock)
 	/* Map the global utilities registers. */
 	guts_np = of_find_compatible_node(NULL, NULL, "fsl,p1022-guts");
 	if (!guts_np) {
-		pr_err("p1022rdk: missing global utilties device node\n");
+		pr_err("p1022rdk: missing global utilities device node\n");
 		return;
 	}
 
 	guts = of_iomap(guts_np, 0);
 	of_node_put(guts_np);
 	if (!guts) {
-		pr_err("p1022rdk: could not map global utilties device\n");
+		pr_err("p1022rdk: could not map global utilities device\n");
 		return;
 	}
 
diff --git a/arch/x86/kernel/test_rodata.c b/arch/x86/kernel/test_rodata.c
index b79133abda48..5ecbfe5099da 100644
--- a/arch/x86/kernel/test_rodata.c
+++ b/arch/x86/kernel/test_rodata.c
@@ -57,7 +57,7 @@ int rodata_test(void)
 	/* test 3: check the value hasn't changed */
 	/* If this test fails, we managed to overwrite the data */
 	if (!rodata_test_data) {
-		printk(KERN_ERR "rodata_test: Test 3 failes (end data)\n");
+		printk(KERN_ERR "rodata_test: Test 3 fails (end data)\n");
 		return -ENODEV;
 	}
 	/* test 4: check if the rodata section is 4Kb aligned */
diff --git a/drivers/iio/adc/max1027.c b/drivers/iio/adc/max1027.c
index 87ee1c7d0b54..44bf815adb6c 100644
--- a/drivers/iio/adc/max1027.c
+++ b/drivers/iio/adc/max1027.c
@@ -436,7 +436,7 @@ static int max1027_probe(struct spi_device *spi)
 				  indio_dev->num_channels * 2,
 				  GFP_KERNEL);
 	if (st->buffer == NULL) {
-		dev_err(&indio_dev->dev, "Can't allocate bufffer\n");
+		dev_err(&indio_dev->dev, "Can't allocate buffer\n");
 		return -ENOMEM;
 	}
 
diff --git a/drivers/isdn/hardware/mISDN/mISDNinfineon.c b/drivers/isdn/hardware/mISDN/mISDNinfineon.c
index c1493f4162fb..d5bdbaf93a1a 100644
--- a/drivers/isdn/hardware/mISDN/mISDNinfineon.c
+++ b/drivers/isdn/hardware/mISDN/mISDNinfineon.c
@@ -1092,7 +1092,7 @@ inf_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
 	}
 	card->ci = get_card_info(ent->driver_data);
 	if (!card->ci) {
-		pr_info("mISDN: do not have informations about adapter at %s\n",
+		pr_info("mISDN: do not have information about adapter at %s\n",
 			pci_name(pdev));
 		kfree(card);
 		pci_disable_device(pdev);
diff --git a/drivers/isdn/mISDN/dsp_cmx.c b/drivers/isdn/mISDN/dsp_cmx.c
index 87f7dff20ff6..52c43821f746 100644
--- a/drivers/isdn/mISDN/dsp_cmx.c
+++ b/drivers/isdn/mISDN/dsp_cmx.c
@@ -295,7 +295,7 @@ dsp_cmx_del_conf_member(struct dsp *dsp)
 		}
 	}
 	printk(KERN_WARNING
-	       "%s: dsp is not present in its own conf_meber list.\n",
+	       "%s: dsp is not present in its own conf_member list.\n",
 	       __func__);
 
 	return -EINVAL;
diff --git a/drivers/isdn/mISDN/dsp_core.c b/drivers/isdn/mISDN/dsp_core.c
index 77025f5cb57d..0222b1a35a2d 100644
--- a/drivers/isdn/mISDN/dsp_core.c
+++ b/drivers/isdn/mISDN/dsp_core.c
@@ -460,7 +460,7 @@ dsp_control_req(struct dsp *dsp, struct mISDNhead *hh, struct sk_buff *skb)
 		}
 		if (dsp_debug & DEBUG_DSP_CORE)
 			printk(KERN_DEBUG "%s: enable mixing of "
-			       "tx-data with conf mebers\n", __func__);
+			       "tx-data with conf members\n", __func__);
 		dsp->tx_mix = 1;
 		dsp_cmx_hardware(dsp->conf, dsp);
 		dsp_rx_off(dsp);
@@ -474,7 +474,7 @@ dsp_control_req(struct dsp *dsp, struct mISDNhead *hh, struct sk_buff *skb)
 		}
 		if (dsp_debug & DEBUG_DSP_CORE)
 			printk(KERN_DEBUG "%s: disable mixing of "
-			       "tx-data with conf mebers\n", __func__);
+			       "tx-data with conf members\n", __func__);
 		dsp->tx_mix = 0;
 		dsp_cmx_hardware(dsp->conf, dsp);
 		dsp_rx_off(dsp);
diff --git a/drivers/media/tuners/msi001.c b/drivers/media/tuners/msi001.c
index 26019e731993..74cfc3c98edb 100644
--- a/drivers/media/tuners/msi001.c
+++ b/drivers/media/tuners/msi001.c
@@ -408,7 +408,7 @@ static int msi001_s_ctrl(struct v4l2_ctrl *ctrl)
 				s->mixer_gain->cur.val, s->if_gain->val);
 		break;
 	default:
-		dev_dbg(&s->spi->dev, "unkown control %d\n", ctrl->id);
+		dev_dbg(&s->spi->dev, "unknown control %d\n", ctrl->id);
 		ret = -EINVAL;
 	}
 
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.c
index c88b20af87df..bb2af7207826 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.c
@@ -591,7 +591,7 @@ int bnx2x_vf_mcast(struct bnx2x *bp, struct bnx2x_virtf *vf,
 		mc = kzalloc(mc_num * sizeof(struct bnx2x_mcast_list_elem),
 			     GFP_KERNEL);
 		if (!mc) {
-			BNX2X_ERR("Cannot Configure mulicasts due to lack of memory\n");
+			BNX2X_ERR("Cannot Configure multicasts due to lack of memory\n");
 			return -ENOMEM;
 		}
 	}
diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_hw.c b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_hw.c
index 69b46c051cc0..a5a482946679 100644
--- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_hw.c
+++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_hw.c
@@ -794,7 +794,7 @@ int qlcnic_82xx_config_intr_coalesce(struct qlcnic_adapter *adapter,
 
 	if (rv)
 		netdev_err(adapter->netdev,
-			   "Failed to set Rx coalescing parametrs\n");
+			   "Failed to set Rx coalescing parameters\n");
 
 	return rv;
 }
diff --git a/drivers/net/wireless/ath/ath10k/mac.c b/drivers/net/wireless/ath/ath10k/mac.c
index 46709301a51e..8698af91eaca 100644
--- a/drivers/net/wireless/ath/ath10k/mac.c
+++ b/drivers/net/wireless/ath/ath10k/mac.c
@@ -4342,7 +4342,7 @@ static void ath10k_sta_rc_update(struct ieee80211_hw *hw,
 			bw = WMI_PEER_CHWIDTH_80MHZ;
 			break;
 		case IEEE80211_STA_RX_BW_160:
-			ath10k_warn(ar, "Invalid bandwith %d in rc update for %pM\n",
+			ath10k_warn(ar, "Invalid bandwidth %d in rc update for %pM\n",
 				    sta->bandwidth, sta->addr);
 			bw = WMI_PEER_CHWIDTH_20MHZ;
 			break;
diff --git a/drivers/net/wireless/rtlwifi/rtl8723be/dm.c b/drivers/net/wireless/rtlwifi/rtl8723be/dm.c
index dd7eb4371f49..13430b53f7ca 100644
--- a/drivers/net/wireless/rtlwifi/rtl8723be/dm.c
+++ b/drivers/net/wireless/rtlwifi/rtl8723be/dm.c
@@ -336,7 +336,7 @@ static void rtl8723be_dm_find_minimum_rssi(struct ieee80211_hw *hw)
 		rtl_dm_dig->min_undec_pwdb_for_dm =
 				rtlpriv->dm.entry_min_undec_sm_pwdb;
 		RT_TRACE(rtlpriv, COMP_BB_POWERSAVING, DBG_LOUD,
-			 "AP Ext Port or disconnet PWDB = 0x%x\n",
+			 "AP Ext Port or disconnect PWDB = 0x%x\n",
 			  rtl_dm_dig->min_undec_pwdb_for_dm);
 	}
 	RT_TRACE(rtlpriv, COMP_DIG, DBG_LOUD, "MinUndecoratedPWDBForDM =%d\n",
diff --git a/drivers/net/wireless/rtlwifi/rtl8821ae/dm.c b/drivers/net/wireless/rtlwifi/rtl8821ae/dm.c
index 9be106109921..cb20e23744e3 100644
--- a/drivers/net/wireless/rtlwifi/rtl8821ae/dm.c
+++ b/drivers/net/wireless/rtlwifi/rtl8821ae/dm.c
@@ -899,7 +899,7 @@ static void rtl8821ae_dm_dig(struct ieee80211_hw *hw)
 
 	if (rtlpriv->falsealm_cnt.cnt_all > 10000) {
 		RT_TRACE(rtlpriv, COMP_DIG, DBG_LOUD,
-			 "Abnornally false alarm case.\n");
+			 "Abnormally false alarm case.\n");
 
 		if (dm_digtable->large_fa_hit != 3)
 			dm_digtable->large_fa_hit++;
diff --git a/drivers/parisc/eisa_enumerator.c b/drivers/parisc/eisa_enumerator.c
index caa153133754..a656d9e83343 100644
--- a/drivers/parisc/eisa_enumerator.c
+++ b/drivers/parisc/eisa_enumerator.c
@@ -357,7 +357,7 @@ static int parse_slot_config(int slot,
 		}
 		if (flags & HPEE_FUNCTION_INFO_CFG_FREE_FORM) {
 			/* I have no idea how to handle this */
-			printk("function %d have free-form confgiuration, skipping ",
+			printk("function %d have free-form configuration, skipping ",
 				num_func);
 			pos = p0 + function_len;
 			continue;
diff --git a/drivers/scsi/be2iscsi/be_cmds.c b/drivers/scsi/be2iscsi/be_cmds.c
index 80d97f3d2ed9..1028760b8a22 100644
--- a/drivers/scsi/be2iscsi/be_cmds.c
+++ b/drivers/scsi/be2iscsi/be_cmds.c
@@ -237,7 +237,7 @@ int beiscsi_mccq_compl(struct beiscsi_hba *phba,
 			beiscsi_log(phba, KERN_WARNING,
 				    BEISCSI_LOG_INIT | BEISCSI_LOG_EH |
 				    BEISCSI_LOG_CONFIG,
-				    "BC_%d : Insufficent Buffer Error "
+				    "BC_%d : Insufficient Buffer Error "
 				    "Resp_Len : %d Actual_Resp_Len : %d\n",
 				    mbx_resp_hdr->response_length,
 				    mbx_resp_hdr->actual_resp_len);
diff --git a/drivers/scsi/osd/osd_initiator.c b/drivers/scsi/osd/osd_initiator.c
index 488c3929f19a..0cccd6033feb 100644
--- a/drivers/scsi/osd/osd_initiator.c
+++ b/drivers/scsi/osd/osd_initiator.c
@@ -186,7 +186,7 @@ static int _osd_get_print_system_info(struct osd_dev *od,
 
 		if (unlikely(len > sizeof(odi->systemid))) {
 			OSD_ERR("OSD Target error: OSD_SYSTEM_ID too long(%d). "
-				"device idetification might not work\n", len);
+				"device identification might not work\n", len);
 			len = sizeof(odi->systemid);
 		}
 		odi->systemid_len = len;
diff --git a/drivers/scsi/qla2xxx/qla_nx.c b/drivers/scsi/qla2xxx/qla_nx.c
index 54cb2ac9339b..7d2b18f2675c 100644
--- a/drivers/scsi/qla2xxx/qla_nx.c
+++ b/drivers/scsi/qla2xxx/qla_nx.c
@@ -1274,7 +1274,7 @@ qla82xx_pinit_from_rom(scsi_qla_host_t *vha)
 
 		if (off == ADDR_ERROR) {
 			ql_log(ql_log_fatal, vha, 0x0116,
-			    "Unknow addr: 0x%08lx.\n", buf[i].addr);
+			    "Unknown addr: 0x%08lx.\n", buf[i].addr);
 			continue;
 		}
 
diff --git a/drivers/scsi/qla2xxx/qla_os.c b/drivers/scsi/qla2xxx/qla_os.c
index db3dbd999cb6..45bdc4d558c0 100644
--- a/drivers/scsi/qla2xxx/qla_os.c
+++ b/drivers/scsi/qla2xxx/qla_os.c
@@ -4174,7 +4174,7 @@ qla83xx_schedule_work(scsi_qla_host_t *base_vha, int work_code)
 		break;
 	default:
 		ql_log(ql_log_warn, base_vha, 0xb05f,
-		    "Unknow work-code=0x%x.\n", work_code);
+		    "Unknown work-code=0x%x.\n", work_code);
 	}
 
 	return;
@@ -4774,7 +4774,7 @@ qla83xx_idc_state_handler(scsi_qla_host_t *base_vha)
 			break;
 		default:
 			ql_log(ql_log_warn, base_vha, 0xb071,
-			    "Unknow Device State: %x.\n", dev_state);
+			    "Unknown Device State: %x.\n", dev_state);
 			qla83xx_idc_unlock(base_vha, 0);
 			qla8xxx_dev_failed_handler(base_vha);
 			rval = QLA_FUNCTION_FAILED;
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 61c41298b4ea..3b91aedf439e 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -1005,7 +1005,7 @@ rpcrdma_init_fmrs(struct rpcrdma_ia *ia, struct rpcrdma_buffer *buf)
 	int i, rc;
 
 	i = (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS;
-	dprintk("RPC:       %s: initalizing %d FMRs\n", __func__, i);
+	dprintk("RPC:       %s: initializing %d FMRs\n", __func__, i);
 
 	while (i--) {
 		r = kzalloc(sizeof(*r), GFP_KERNEL);
@@ -1038,7 +1038,7 @@ rpcrdma_init_frmrs(struct rpcrdma_ia *ia, struct rpcrdma_buffer *buf)
 	int i, rc;
 
 	i = (buf->rb_max_requests + 1) * RPCRDMA_MAX_SEGS;
-	dprintk("RPC:       %s: initalizing %d FRMRs\n", __func__, i);
+	dprintk("RPC:       %s: initializing %d FRMRs\n", __func__, i);
 
 	while (i--) {
 		r = kzalloc(sizeof(*r), GFP_KERNEL);
diff --git a/tools/perf/util/probe-finder.c b/tools/perf/util/probe-finder.c
index c7918f83b300..1cb1ef765ca0 100644
--- a/tools/perf/util/probe-finder.c
+++ b/tools/perf/util/probe-finder.c
@@ -456,7 +456,7 @@ static int convert_variable_fields(Dwarf_Die *vr_die, const char *varname,
 			return -EINVAL;
 		}
 		if (field->name[0] == '[') {
-			pr_err("Semantic error: %s is not a pointor"
+			pr_err("Semantic error: %s is not a pointer"
 			       " nor array.\n", varname);
 			return -EINVAL;
 		}
-- 
cgit v1.2.3


From b27559a433bb6080d95c2593d4a2b81401197911 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski
Date: Fri, 6 Mar 2015 17:50:18 -0800
Subject: x86/asm/entry: Delay loading sp0 slightly on task switch

The change:

  75182b1632a8 ("x86/asm/entry: Switch all C consumers of kernel_stack to this_cpu_sp0()")

had the unintended side effect of changing the return value of
current_thread_info() during part of the context switch process.
Change it back.

This has no effect as far as I can tell -- it's just for
consistency.

Signed-off-by: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/9fcaa47dd8487db59eed7a3911b6ae409476763e.1425692936.git.luto@amacapital.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/process_32.c | 10 +++++-----
 arch/x86/kernel/process_64.c |  6 +++---
 2 files changed, 8 insertions(+), 8 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index d3460af3d27a..0405cab6634d 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -255,11 +255,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 
 	fpu = switch_fpu_prepare(prev_p, next_p, cpu);
 
-	/*
-	 * Reload esp0.
-	 */
-	load_sp0(tss, next);
-
 	/*
 	 * Save away %gs. No need to save %fs, as it was saved on the
 	 * stack on entry.  No need to save %es and %ds, as those are
@@ -310,6 +305,11 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 	 */
 	arch_end_context_switch(next_p);
 
+	/*
+	 * Reload esp0.  This changes current_thread_info().
+	 */
+	load_sp0(tss, next);
+
 	this_cpu_write(kernel_stack,
 		  (unsigned long)task_stack_page(next_p) +
 		  THREAD_SIZE - KERNEL_STACK_OFFSET);
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 2cd562f96c1f..1e393d27d701 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -283,9 +283,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 
 	fpu = switch_fpu_prepare(prev_p, next_p, cpu);
 
-	/* Reload esp0 and ss1. */
-	load_sp0(tss, next);
-
 	/* We must save %fs and %gs before load_TLS() because
 	 * %fs and %gs may be cleared by load_TLS().
 	 *
@@ -413,6 +410,9 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 	task_thread_info(prev_p)->saved_preempt_count = this_cpu_read(__preempt_count);
 	this_cpu_write(__preempt_count, task_thread_info(next_p)->saved_preempt_count);
 
+	/* Reload esp0 and ss1.  This changes current_thread_info(). */
+	load_sp0(tss, next);
+
 	this_cpu_write(kernel_stack,
 		  (unsigned long)task_stack_page(next_p) +
 		  THREAD_SIZE - KERNEL_STACK_OFFSET);
-- 
cgit v1.2.3


From a7fcf28d431ef70afaa91496e64e16dc51dccec4 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski
Date: Fri, 6 Mar 2015 17:50:19 -0800
Subject: x86/asm/entry: Replace this_cpu_sp0() with current_top_of_stack() and
 fix it on x86_32

I broke 32-bit kernels.  The implementation of sp0 was correct
as far as I can tell, but sp0 was much weirder on x86_32 than I
realized.  It has the following issues:

 - Init's sp0 is inconsistent with everything else's: non-init tasks
   are offset by 8 bytes.  (I have no idea why, and the comment is unhelpful.)

 - vm86 does crazy things to sp0.

Fix it up by replacing this_cpu_sp0() with
current_top_of_stack() and using a new percpu variable to track
the top of the stack on x86_32.

Signed-off-by: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Fixes: 75182b1632a8 ("x86/asm/entry: Switch all C consumers of kernel_stack to this_cpu_sp0()")
Link: http://lkml.kernel.org/r/d09dbe270883433776e0cbee3c7079433349e96d.1425692936.git.luto@amacapital.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/processor.h   | 11 ++++++++++-
 arch/x86/include/asm/thread_info.h |  4 +---
 arch/x86/kernel/cpu/common.c       | 13 +++++++++++--
 arch/x86/kernel/process_32.c       | 11 +++++++----
 arch/x86/kernel/smpboot.c          |  2 ++
 arch/x86/kernel/traps.c            |  4 ++--
 6 files changed, 33 insertions(+), 12 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index f5e3ec63767d..48a61c1c626e 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -284,6 +284,10 @@ struct tss_struct {
 
 DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss);
 
+#ifdef CONFIG_X86_32
+DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack);
+#endif
+
 /*
  * Save the original ist values for checking stack pointers during debugging
  */
@@ -564,9 +568,14 @@ static inline void native_swapgs(void)
 #endif
 }
 
-static inline unsigned long this_cpu_sp0(void)
+static inline unsigned long current_top_of_stack(void)
 {
+#ifdef CONFIG_X86_64
 	return this_cpu_read_stable(cpu_tss.x86_tss.sp0);
+#else
+	/* sp0 on x86_32 is special in and around vm86 mode. */
+	return this_cpu_read_stable(cpu_current_top_of_stack);
+#endif
 }
 
 #ifdef CONFIG_PARAVIRT
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index a2fa1899494e..7740edd56fed 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -158,9 +158,7 @@ DECLARE_PER_CPU(unsigned long, kernel_stack);
 
 static inline struct thread_info *current_thread_info(void)
 {
-	struct thread_info *ti;
-	ti = (void *)(this_cpu_sp0() - THREAD_SIZE);
-	return ti;
+	return (struct thread_info *)(current_top_of_stack() - THREAD_SIZE);
 }
 
 static inline unsigned long current_stack_pointer(void)
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 5d0f0cc7ea26..76348334b934 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1130,8 +1130,8 @@ DEFINE_PER_CPU_FIRST(union irq_stack_union,
 		     irq_stack_union) __aligned(PAGE_SIZE) __visible;
 
 /*
- * The following four percpu variables are hot.  Align current_task to
- * cacheline size such that all four fall in the same cacheline.
+ * The following percpu variables are hot.  Align current_task to
+ * cacheline size such that they fall in the same cacheline.
  */
 DEFINE_PER_CPU(struct task_struct *, current_task) ____cacheline_aligned =
 	&init_task;
@@ -1226,6 +1226,15 @@ DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT;
 EXPORT_PER_CPU_SYMBOL(__preempt_count);
 DEFINE_PER_CPU(struct task_struct *, fpu_owner_task);
 
+/*
+ * On x86_32, vm86 modifies tss.sp0, so sp0 isn't a reliable way to find
+ * the top of the kernel stack.  Use an extra percpu variable to track the
+ * top of the kernel stack directly.
+ */
+DEFINE_PER_CPU(unsigned long, cpu_current_top_of_stack) =
+	(unsigned long)&init_thread_union + THREAD_SIZE;
+EXPORT_PER_CPU_SYMBOL(cpu_current_top_of_stack);
+
 #ifdef CONFIG_CC_STACKPROTECTOR
 DEFINE_PER_CPU_ALIGNED(struct stack_canary, stack_canary);
 #endif
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 0405cab6634d..1b9963faf4eb 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -306,13 +306,16 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 	arch_end_context_switch(next_p);
 
 	/*
-	 * Reload esp0.  This changes current_thread_info().
+	 * Reload esp0, kernel_stack, and current_top_of_stack.  This changes
+	 * current_thread_info().
 	 */
 	load_sp0(tss, next);
-
 	this_cpu_write(kernel_stack,
-		  (unsigned long)task_stack_page(next_p) +
-		  THREAD_SIZE - KERNEL_STACK_OFFSET);
+		       (unsigned long)task_stack_page(next_p) +
+		       THREAD_SIZE - KERNEL_STACK_OFFSET);
+	this_cpu_write(cpu_current_top_of_stack,
+		       (unsigned long)task_stack_page(next_p) +
+		       THREAD_SIZE);
 
 	/*
 	 * Restore %gs if needed (which is common)
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index febc6aabc72e..759388c538cf 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -806,6 +806,8 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
 #ifdef CONFIG_X86_32
 	/* Stack for startup_32 can be just as for start_secondary onwards */
 	irq_ctx_init(cpu);
+	per_cpu(cpu_current_top_of_stack, cpu) =
+		(unsigned long)task_stack_page(idle) + THREAD_SIZE;
 #else
 	clear_tsk_thread_flag(idle, TIF_FORK);
 	initial_gs = per_cpu_offset(cpu);
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index fa290586ed37..081252c44cde 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -174,8 +174,8 @@ void ist_begin_non_atomic(struct pt_regs *regs)
 	 * will catch asm bugs and any attempt to use ist_preempt_enable
 	 * from double_fault.
 	 */
-	BUG_ON((unsigned long)(this_cpu_sp0() - current_stack_pointer()) >=
-	       THREAD_SIZE);
+	BUG_ON((unsigned long)(current_top_of_stack() -
+			       current_stack_pointer()) >= THREAD_SIZE);
 
 	preempt_count_sub(HARDIRQ_OFFSET);
 }
-- 
cgit v1.2.3


From 3e1aa7cb59aff4b245b45e326fcdba1bf7f105c6 Mon Sep 17 00:00:00 2001
From: Denys Vlasenko
Date: Fri, 6 Mar 2015 21:55:32 +0100
Subject: x86/asm: Optimize unnecessarily wide TEST instructions

By the nature of the TEST operation, it is often possible to test
a narrower part of the operand:

    "testl $3,  mem"  ->  "testb $3, mem",
    "testq $3, %rcx"  ->  "testb $3, %cl"

This results in shorter instructions, because the TEST instruction
has no sign-entending byte-immediate forms unlike other ALU ops.

Note that this change does not create any LCP (Length-Changing Prefix)
stalls, which happen when adding a 0x66 prefix, which happens when
16-bit immediates are used, which changes such TEST instructions:

  [test_opcode] [modrm] [imm32]

to:

  [0x66] [test_opcode] [modrm] [imm16]

where [imm16] has a *different length* now: 2 bytes instead of 4.
This confuses the decoder and slows down execution.

REX prefixes were carefully designed to almost never hit this case:
adding REX prefix does not change instruction length except MOVABS
and MOV [addr],RAX instruction.

This patch does not add instructions which would use a 0x66 prefix,
code changes in assembly are:

    -48 f7 07 01 00 00 00 	testq  $0x1,(%rdi)
    +f6 07 01             	testb  $0x1,(%rdi)
    -48 f7 c1 01 00 00 00 	test   $0x1,%rcx
    +f6 c1 01             	test   $0x1,%cl
    -48 f7 c1 02 00 00 00 	test   $0x2,%rcx
    +f6 c1 02             	test   $0x2,%cl
    -41 f7 c2 01 00 00 00 	test   $0x1,%r10d
    +41 f6 c2 01          	test   $0x1,%r10b
    -48 f7 c1 04 00 00 00 	test   $0x4,%rcx
    +f6 c1 04             	test   $0x4,%cl
    -48 f7 c1 08 00 00 00 	test   $0x8,%rcx
    +f6 c1 08             	test   $0x8,%cl

Linus further notes:

   "There are no stalls from using 8-bit instruction forms.

    Now, changing from 64-bit or 32-bit 'test' instructions to 8-bit ones
    *could* cause problems if it ends up having forwarding issues, so that
    instead of just forwarding the result, you end up having to wait for
    it to be stable in the L1 cache (or possibly the register file). The
    forwarding from the store buffer is simplest and most reliable if the
    read is done at the exact same address and the exact same size as the
    write that gets forwarded.

    But that's true only if:

     (a) the write was very recent and is still in the write queue. I'm
         not sure that's the case here anyway.

     (b) on at least most Intel microarchitectures, you have to test a
         different byte than the lowest one (so forwarding a 64-bit write
         to a 8-bit read ends up working fine, as long as the 8-bit read
         is of the low 8 bits of the written data).

    A very similar issue *might* show up for registers too, not just
    memory writes, if you use 'testb' with a high-byte register (where
    instead of forwarding the value from the original producer it needs to
    go through the register file and then shifted). But it's mainly a
    problem for store buffers.

    But afaik, the way Denys changed the test instructions, neither of the
    above issues should be true.

    The real problem for store buffer forwarding tends to be "write 8
    bits, read 32 bits". That can be really surprisingly expensive,
    because the read ends up having to wait until the write has hit the
    cacheline, and we might talk tens of cycles of latency here. But
    "write 32 bits, read the low 8 bits" *should* be fast on pretty much
    all x86 chips, afaik."

Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
Acked-by: Andy Lutomirski <luto@amacapital.net>
Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@linux.intel.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Will Drewry <wad@chromium.org>
Link: http://lkml.kernel.org/r/1425675332-31576-1-git-send-email-dvlasenk@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/head_64.S            | 2 +-
 arch/x86/kernel/relocate_kernel_32.S | 8 ++++----
 arch/x86/kernel/relocate_kernel_64.S | 8 ++++----
 arch/x86/lib/checksum_32.S           | 4 ++--
 arch/x86/lib/csum-copy_64.S          | 2 +-
 5 files changed, 12 insertions(+), 12 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 9a0919678f97..ae6588b301c2 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -146,7 +146,7 @@ startup_64:
 	leaq	level2_kernel_pgt(%rip), %rdi
 	leaq	4096(%rdi), %r8
 	/* See if it is a valid page table entry */
-1:	testq	$1, 0(%rdi)
+1:	testb	$1, 0(%rdi)
 	jz	2f
 	addq	%rbp, 0(%rdi)
 	/* Go to the next page */
diff --git a/arch/x86/kernel/relocate_kernel_32.S b/arch/x86/kernel/relocate_kernel_32.S
index e13f8e7c22a6..77630d57e7bf 100644
--- a/arch/x86/kernel/relocate_kernel_32.S
+++ b/arch/x86/kernel/relocate_kernel_32.S
@@ -226,23 +226,23 @@ swap_pages:
 	movl	(%ebx), %ecx
 	addl	$4, %ebx
 1:
-	testl	$0x1,   %ecx  /* is it a destination page */
+	testb	$0x1, %cl     /* is it a destination page */
 	jz	2f
 	movl	%ecx,	%edi
 	andl	$0xfffff000, %edi
 	jmp     0b
 2:
-	testl	$0x2,	%ecx  /* is it an indirection page */
+	testb	$0x2, %cl    /* is it an indirection page */
 	jz	2f
 	movl	%ecx,	%ebx
 	andl	$0xfffff000, %ebx
 	jmp     0b
 2:
-	testl   $0x4,   %ecx /* is it the done indicator */
+	testb   $0x4, %cl    /* is it the done indicator */
 	jz      2f
 	jmp     3f
 2:
-	testl   $0x8,   %ecx /* is it the source indicator */
+	testb   $0x8, %cl    /* is it the source indicator */
 	jz      0b	     /* Ignore it otherwise */
 	movl    %ecx,   %esi /* For every source page do a copy */
 	andl    $0xfffff000, %esi
diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S
index 3fd2c693e475..04cb1790a596 100644
--- a/arch/x86/kernel/relocate_kernel_64.S
+++ b/arch/x86/kernel/relocate_kernel_64.S
@@ -221,23 +221,23 @@ swap_pages:
 	movq	(%rbx), %rcx
 	addq	$8,	%rbx
 1:
-	testq	$0x1,	%rcx  /* is it a destination page? */
+	testb	$0x1,	%cl   /* is it a destination page? */
 	jz	2f
 	movq	%rcx,	%rdi
 	andq	$0xfffffffffffff000, %rdi
 	jmp	0b
 2:
-	testq	$0x2,	%rcx  /* is it an indirection page? */
+	testb	$0x2,	%cl   /* is it an indirection page? */
 	jz	2f
 	movq	%rcx,   %rbx
 	andq	$0xfffffffffffff000, %rbx
 	jmp	0b
 2:
-	testq	$0x4,	%rcx  /* is it the done indicator? */
+	testb	$0x4,	%cl   /* is it the done indicator? */
 	jz	2f
 	jmp	3f
 2:
-	testq	$0x8,	%rcx  /* is it the source indicator? */
+	testb	$0x8,	%cl   /* is it the source indicator? */
 	jz	0b	      /* Ignore it otherwise */
 	movq	%rcx,   %rsi  /* For ever source page do a copy */
 	andq	$0xfffffffffffff000, %rsi
diff --git a/arch/x86/lib/checksum_32.S b/arch/x86/lib/checksum_32.S
index c3b9953d3fa0..9bc944a91274 100644
--- a/arch/x86/lib/checksum_32.S
+++ b/arch/x86/lib/checksum_32.S
@@ -125,7 +125,7 @@ ENTRY(csum_partial)
 6:	addl %ecx,%eax
 	adcl $0, %eax 
 7:	
-	testl $1, 12(%esp)
+	testb $1, 12(%esp)
 	jz 8f
 	roll $8, %eax
 8:
@@ -245,7 +245,7 @@ ENTRY(csum_partial)
 	addl %ebx,%eax
 	adcl $0,%eax
 80: 
-	testl $1, 12(%esp)
+	testb $1, 12(%esp)
 	jz 90f
 	roll $8, %eax
 90: 
diff --git a/arch/x86/lib/csum-copy_64.S b/arch/x86/lib/csum-copy_64.S
index 2419d5fefae3..9734182966f3 100644
--- a/arch/x86/lib/csum-copy_64.S
+++ b/arch/x86/lib/csum-copy_64.S
@@ -196,7 +196,7 @@ ENTRY(csum_partial_copy_generic)
 
 	/* handle last odd byte */
 .Lhandle_1:
-	testl $1, %r10d
+	testb $1, %r10b
 	jz    .Lende
 	xorl  %ebx, %ebx
 	source
-- 
cgit v1.2.3


From 0d55ba46bfbee64fd2b492b87bfe2ec172e7b056 Mon Sep 17 00:00:00 2001
From: Sudeep Holla
Date: Wed, 4 Mar 2015 12:00:16 +0000
Subject: x86/cacheinfo: Move cacheinfo sysfs code to generic infrastructure

This patch removes the redundant sysfs cacheinfo code by reusing
the newly introduced generic cacheinfo infrastructure through the
commit

  246246cbde5e ("drivers: base: support cpu cache information
		 interface to userspace via sysfs")

The private pointer provided by the cacheinfo is used to implement
the AMD L3 cache-specific attributes.

Note that with v4.0-rc1, commit

  513e3d2d11c9 ("cpumask: always use nr_cpu_ids in formatting and parsing
		 functions")

in particular changes from long format to shorter one for all cpumasks
sysfs entries. As the consequence of the same, even the shared_cpu_map
in the cacheinfo sysfs was also changed.

This patch neither alters any existing sysfs entries nor their
formating, however since the generic cacheinfo has switched to use the
device attributes instead of the traditional raw kobjects, a directory
named "power" along with its standard attributes are added similar to
any other device.

Signed-off-by: Sudeep Holla <sudeep.holla@arm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Andre Przywara <andre.przywara@arm.com>
Link: http://lkml.kernel.org/r/1425470416-20691-1-git-send-email-sudeep.holla@arm.com
[ Add a check for uninitialized this_cpu_ci for the cpu_has_topoext case too
  in __cache_amd_cpumap_setup() ]
Signed-off-by: Borislav Petkov <bp@suse.de>
---
 arch/x86/kernel/cpu/intel_cacheinfo.c | 715 ++++++++++------------------------
 1 file changed, 198 insertions(+), 517 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 659643376dbf..8008bc2dd2d0 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -7,16 +7,14 @@
  *	Andi Kleen / Andreas Herrmann	: CPUID4 emulation on AMD.
  */
 
-#include <linux/init.h>
 #include <linux/slab.h>
-#include <linux/device.h>
-#include <linux/compiler.h>
+#include <linux/cacheinfo.h>
 #include <linux/cpu.h>
 #include <linux/sched.h>
+#include <linux/sysfs.h>
 #include <linux/pci.h>
 
 #include <asm/processor.h>
-#include <linux/smp.h>
 #include <asm/amd_nb.h>
 #include <asm/smp.h>
 
@@ -116,10 +114,10 @@ static const struct _cache_table cache_table[] =
 
 
 enum _cache_type {
-	CACHE_TYPE_NULL	= 0,
-	CACHE_TYPE_DATA = 1,
-	CACHE_TYPE_INST = 2,
-	CACHE_TYPE_UNIFIED = 3
+	CTYPE_NULL = 0,
+	CTYPE_DATA = 1,
+	CTYPE_INST = 2,
+	CTYPE_UNIFIED = 3
 };
 
 union _cpuid4_leaf_eax {
@@ -159,11 +157,6 @@ struct _cpuid4_info_regs {
 	struct amd_northbridge *nb;
 };
 
-struct _cpuid4_info {
-	struct _cpuid4_info_regs base;
-	DECLARE_BITMAP(shared_cpu_map, NR_CPUS);
-};
-
 unsigned short			num_cache_leaves;
 
 /* AMD doesn't have CPUID4. Emulate it here to report the same
@@ -220,6 +213,13 @@ static const unsigned short assocs[] = {
 static const unsigned char levels[] = { 1, 1, 2, 3 };
 static const unsigned char types[] = { 1, 2, 3, 3 };
 
+static const enum cache_type cache_type_map[] = {
+	[CTYPE_NULL] = CACHE_TYPE_NOCACHE,
+	[CTYPE_DATA] = CACHE_TYPE_DATA,
+	[CTYPE_INST] = CACHE_TYPE_INST,
+	[CTYPE_UNIFIED] = CACHE_TYPE_UNIFIED,
+};
+
 static void
 amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,
 		     union _cpuid4_leaf_ebx *ebx,
@@ -291,14 +291,8 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,
 		(ebx->split.ways_of_associativity + 1) - 1;
 }
 
-struct _cache_attr {
-	struct attribute attr;
-	ssize_t (*show)(struct _cpuid4_info *, char *, unsigned int);
-	ssize_t (*store)(struct _cpuid4_info *, const char *, size_t count,
-			 unsigned int);
-};
-
 #if defined(CONFIG_AMD_NB) && defined(CONFIG_SYSFS)
+
 /*
  * L3 cache descriptors
  */
@@ -325,20 +319,6 @@ static void amd_calc_l3_indices(struct amd_northbridge *nb)
 	l3->indices = (max(max3(sc0, sc1, sc2), sc3) << 10) - 1;
 }
 
-static void amd_init_l3_cache(struct _cpuid4_info_regs *this_leaf, int index)
-{
-	int node;
-
-	/* only for L3, and not in virtualized environments */
-	if (index < 3)
-		return;
-
-	node = amd_get_nb_id(smp_processor_id());
-	this_leaf->nb = node_to_amd_nb(node);
-	if (this_leaf->nb && !this_leaf->nb->l3_cache.indices)
-		amd_calc_l3_indices(this_leaf->nb);
-}
-
 /*
  * check whether a slot used for disabling an L3 index is occupied.
  * @l3: L3 cache descriptor
@@ -359,15 +339,13 @@ int amd_get_l3_disable_slot(struct amd_northbridge *nb, unsigned slot)
 	return -1;
 }
 
-static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf,
+static ssize_t show_cache_disable(struct cacheinfo *this_leaf, char *buf,
 				  unsigned int slot)
 {
 	int index;
+	struct amd_northbridge *nb = this_leaf->priv;
 
-	if (!this_leaf->base.nb || !amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE))
-		return -EINVAL;
-
-	index = amd_get_l3_disable_slot(this_leaf->base.nb, slot);
+	index = amd_get_l3_disable_slot(nb, slot);
 	if (index >= 0)
 		return sprintf(buf, "%d\n", index);
 
@@ -376,9 +354,10 @@ static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf,
 
 #define SHOW_CACHE_DISABLE(slot)					\
 static ssize_t								\
-show_cache_disable_##slot(struct _cpuid4_info *this_leaf, char *buf,	\
-			  unsigned int cpu)				\
+cache_disable_##slot##_show(struct device *dev,				\
+			    struct device_attribute *attr, char *buf)	\
 {									\
+	struct cacheinfo *this_leaf = dev_get_drvdata(dev);		\
 	return show_cache_disable(this_leaf, buf, slot);		\
 }
 SHOW_CACHE_DISABLE(0)
@@ -446,25 +425,23 @@ int amd_set_l3_disable_slot(struct amd_northbridge *nb, int cpu, unsigned slot,
 	return 0;
 }
 
-static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf,
-				  const char *buf, size_t count,
-				  unsigned int slot)
+static ssize_t store_cache_disable(struct cacheinfo *this_leaf,
+				   const char *buf, size_t count,
+				   unsigned int slot)
 {
 	unsigned long val = 0;
 	int cpu, err = 0;
+	struct amd_northbridge *nb = this_leaf->priv;
 
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
-	if (!this_leaf->base.nb || !amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE))
-		return -EINVAL;
-
-	cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map));
+	cpu = cpumask_first(&this_leaf->shared_cpu_map);
 
 	if (kstrtoul(buf, 10, &val) < 0)
 		return -EINVAL;
 
-	err = amd_set_l3_disable_slot(this_leaf->base.nb, cpu, slot, val);
+	err = amd_set_l3_disable_slot(nb, cpu, slot, val);
 	if (err) {
 		if (err == -EEXIST)
 			pr_warning("L3 slot %d in use/index already disabled!\n",
@@ -476,41 +453,36 @@ static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf,
 
 #define STORE_CACHE_DISABLE(slot)					\
 static ssize_t								\
-store_cache_disable_##slot(struct _cpuid4_info *this_leaf,		\
-			   const char *buf, size_t count,		\
-			   unsigned int cpu)				\
+cache_disable_##slot##_store(struct device *dev,			\
+			     struct device_attribute *attr,		\
+			     const char *buf, size_t count)		\
 {									\
+	struct cacheinfo *this_leaf = dev_get_drvdata(dev);		\
 	return store_cache_disable(this_leaf, buf, count, slot);	\
 }
 STORE_CACHE_DISABLE(0)
 STORE_CACHE_DISABLE(1)
 
-static struct _cache_attr cache_disable_0 = __ATTR(cache_disable_0, 0644,
-		show_cache_disable_0, store_cache_disable_0);
-static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644,
-		show_cache_disable_1, store_cache_disable_1);
-
-static ssize_t
-show_subcaches(struct _cpuid4_info *this_leaf, char *buf, unsigned int cpu)
+static ssize_t subcaches_show(struct device *dev,
+			      struct device_attribute *attr, char *buf)
 {
-	if (!this_leaf->base.nb || !amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
-		return -EINVAL;
+	struct cacheinfo *this_leaf = dev_get_drvdata(dev);
+	int cpu = cpumask_first(&this_leaf->shared_cpu_map);
 
 	return sprintf(buf, "%x\n", amd_get_subcaches(cpu));
 }
 
-static ssize_t
-store_subcaches(struct _cpuid4_info *this_leaf, const char *buf, size_t count,
-		unsigned int cpu)
+static ssize_t subcaches_store(struct device *dev,
+			       struct device_attribute *attr,
+			       const char *buf, size_t count)
 {
+	struct cacheinfo *this_leaf = dev_get_drvdata(dev);
+	int cpu = cpumask_first(&this_leaf->shared_cpu_map);
 	unsigned long val;
 
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
-	if (!this_leaf->base.nb || !amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
-		return -EINVAL;
-
 	if (kstrtoul(buf, 16, &val) < 0)
 		return -EINVAL;
 
@@ -520,9 +492,92 @@ store_subcaches(struct _cpuid4_info *this_leaf, const char *buf, size_t count,
 	return count;
 }
 
-static struct _cache_attr subcaches =
-	__ATTR(subcaches, 0644, show_subcaches, store_subcaches);
+static DEVICE_ATTR_RW(cache_disable_0);
+static DEVICE_ATTR_RW(cache_disable_1);
+static DEVICE_ATTR_RW(subcaches);
+
+static umode_t
+cache_private_attrs_is_visible(struct kobject *kobj,
+			       struct attribute *attr, int unused)
+{
+	struct device *dev = kobj_to_dev(kobj);
+	struct cacheinfo *this_leaf = dev_get_drvdata(dev);
+	umode_t mode = attr->mode;
+
+	if (!this_leaf->priv)
+		return 0;
+
+	if ((attr == &dev_attr_subcaches.attr) &&
+	    amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
+		return mode;
+
+	if ((attr == &dev_attr_cache_disable_0.attr ||
+	     attr == &dev_attr_cache_disable_1.attr) &&
+	    amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE))
+		return mode;
+
+	return 0;
+}
+
+static struct attribute_group cache_private_group = {
+	.is_visible = cache_private_attrs_is_visible,
+};
+
+static void init_amd_l3_attrs(void)
+{
+	int n = 1;
+	static struct attribute **amd_l3_attrs;
+
+	if (amd_l3_attrs) /* already initialized */
+		return;
+
+	if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE))
+		n += 2;
+	if (amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
+		n += 1;
+
+	amd_l3_attrs = kcalloc(n, sizeof(*amd_l3_attrs), GFP_KERNEL);
+	if (!amd_l3_attrs)
+		return;
+
+	n = 0;
+	if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) {
+		amd_l3_attrs[n++] = &dev_attr_cache_disable_0.attr;
+		amd_l3_attrs[n++] = &dev_attr_cache_disable_1.attr;
+	}
+	if (amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
+		amd_l3_attrs[n++] = &dev_attr_subcaches.attr;
 
+	cache_private_group.attrs = amd_l3_attrs;
+}
+
+const struct attribute_group *
+cache_get_priv_group(struct cacheinfo *this_leaf)
+{
+	struct amd_northbridge *nb = this_leaf->priv;
+
+	if (this_leaf->level < 3)
+		return NULL;
+
+	if (nb && nb->l3_cache.indices)
+		init_amd_l3_attrs();
+
+	return &cache_private_group;
+}
+
+static void amd_init_l3_cache(struct _cpuid4_info_regs *this_leaf, int index)
+{
+	int node;
+
+	/* only for L3, and not in virtualized environments */
+	if (index < 3)
+		return;
+
+	node = amd_get_nb_id(smp_processor_id());
+	this_leaf->nb = node_to_amd_nb(node);
+	if (this_leaf->nb && !this_leaf->nb->l3_cache.indices)
+		amd_calc_l3_indices(this_leaf->nb);
+}
 #else
 #define amd_init_l3_cache(x, y)
 #endif  /* CONFIG_AMD_NB && CONFIG_SYSFS */
@@ -546,7 +601,7 @@ cpuid4_cache_lookup_regs(int index, struct _cpuid4_info_regs *this_leaf)
 		cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx);
 	}
 
-	if (eax.split.type == CACHE_TYPE_NULL)
+	if (eax.split.type == CTYPE_NULL)
 		return -EIO; /* better error ? */
 
 	this_leaf->eax = eax;
@@ -575,7 +630,7 @@ static int find_num_cache_leaves(struct cpuinfo_x86 *c)
 		/* Do cpuid(op) loop to find out num_cache_leaves */
 		cpuid_count(op, i, &eax, &ebx, &ecx, &edx);
 		cache_eax.full = eax;
-	} while (cache_eax.split.type != CACHE_TYPE_NULL);
+	} while (cache_eax.split.type != CTYPE_NULL);
 	return i;
 }
 
@@ -626,9 +681,9 @@ unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c)
 
 			switch (this_leaf.eax.split.level) {
 			case 1:
-				if (this_leaf.eax.split.type == CACHE_TYPE_DATA)
+				if (this_leaf.eax.split.type == CTYPE_DATA)
 					new_l1d = this_leaf.size/1024;
-				else if (this_leaf.eax.split.type == CACHE_TYPE_INST)
+				else if (this_leaf.eax.split.type == CTYPE_INST)
 					new_l1i = this_leaf.size/1024;
 				break;
 			case 2:
@@ -747,55 +802,52 @@ unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c)
 	return l2;
 }
 
-#ifdef CONFIG_SYSFS
-
-/* pointer to _cpuid4_info array (for each cache leaf) */
-static DEFINE_PER_CPU(struct _cpuid4_info *, ici_cpuid4_info);
-#define CPUID4_INFO_IDX(x, y)	(&((per_cpu(ici_cpuid4_info, x))[y]))
-
-#ifdef CONFIG_SMP
-
-static int cache_shared_amd_cpu_map_setup(unsigned int cpu, int index)
+static int __cache_amd_cpumap_setup(unsigned int cpu, int index,
+				    struct _cpuid4_info_regs *base)
 {
-	struct _cpuid4_info *this_leaf;
+	struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu);
+	struct cacheinfo *this_leaf;
 	int i, sibling;
 
 	if (cpu_has_topoext) {
 		unsigned int apicid, nshared, first, last;
 
-		if (!per_cpu(ici_cpuid4_info, cpu))
-			return 0;
-
-		this_leaf = CPUID4_INFO_IDX(cpu, index);
-		nshared = this_leaf->base.eax.split.num_threads_sharing + 1;
+		this_leaf = this_cpu_ci->info_list + index;
+		nshared = base->eax.split.num_threads_sharing + 1;
 		apicid = cpu_data(cpu).apicid;
 		first = apicid - (apicid % nshared);
 		last = first + nshared - 1;
 
 		for_each_online_cpu(i) {
+			this_cpu_ci = get_cpu_cacheinfo(i);
+			if (!this_cpu_ci->info_list)
+				continue;
+
 			apicid = cpu_data(i).apicid;
 			if ((apicid < first) || (apicid > last))
 				continue;
-			if (!per_cpu(ici_cpuid4_info, i))
-				continue;
-			this_leaf = CPUID4_INFO_IDX(i, index);
+
+			this_leaf = this_cpu_ci->info_list + index;
 
 			for_each_online_cpu(sibling) {
 				apicid = cpu_data(sibling).apicid;
 				if ((apicid < first) || (apicid > last))
 					continue;
-				set_bit(sibling, this_leaf->shared_cpu_map);
+				cpumask_set_cpu(sibling,
+						&this_leaf->shared_cpu_map);
 			}
 		}
 	} else if (index == 3) {
 		for_each_cpu(i, cpu_llc_shared_mask(cpu)) {
-			if (!per_cpu(ici_cpuid4_info, i))
+			this_cpu_ci = get_cpu_cacheinfo(i);
+			if (!this_cpu_ci->info_list)
 				continue;
-			this_leaf = CPUID4_INFO_IDX(i, index);
+			this_leaf = this_cpu_ci->info_list + index;
 			for_each_cpu(sibling, cpu_llc_shared_mask(cpu)) {
 				if (!cpu_online(sibling))
 					continue;
-				set_bit(sibling, this_leaf->shared_cpu_map);
+				cpumask_set_cpu(sibling,
+						&this_leaf->shared_cpu_map);
 			}
 		}
 	} else
@@ -804,457 +856,86 @@ static int cache_shared_amd_cpu_map_setup(unsigned int cpu, int index)
 	return 1;
 }
 
-static void cache_shared_cpu_map_setup(unsigned int cpu, int index)
+static void __cache_cpumap_setup(unsigned int cpu, int index,
+				 struct _cpuid4_info_regs *base)
 {
-	struct _cpuid4_info *this_leaf, *sibling_leaf;
+	struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu);
+	struct cacheinfo *this_leaf, *sibling_leaf;
 	unsigned long num_threads_sharing;
 	int index_msb, i;
 	struct cpuinfo_x86 *c = &cpu_data(cpu);
 
 	if (c->x86_vendor == X86_VENDOR_AMD) {
-		if (cache_shared_amd_cpu_map_setup(cpu, index))
+		if (__cache_amd_cpumap_setup(cpu, index, base))
 			return;
 	}
 
-	this_leaf = CPUID4_INFO_IDX(cpu, index);
-	num_threads_sharing = 1 + this_leaf->base.eax.split.num_threads_sharing;
+	this_leaf = this_cpu_ci->info_list + index;
+	num_threads_sharing = 1 + base->eax.split.num_threads_sharing;
 
+	cpumask_set_cpu(cpu, &this_leaf->shared_cpu_map);
 	if (num_threads_sharing == 1)
-		cpumask_set_cpu(cpu, to_cpumask(this_leaf->shared_cpu_map));
-	else {
-		index_msb = get_count_order(num_threads_sharing);
-
-		for_each_online_cpu(i) {
-			if (cpu_data(i).apicid >> index_msb ==
-			    c->apicid >> index_msb) {
-				cpumask_set_cpu(i,
-					to_cpumask(this_leaf->shared_cpu_map));
-				if (i != cpu && per_cpu(ici_cpuid4_info, i))  {
-					sibling_leaf =
-						CPUID4_INFO_IDX(i, index);
-					cpumask_set_cpu(cpu, to_cpumask(
-						sibling_leaf->shared_cpu_map));
-				}
-			}
-		}
-	}
-}
-static void cache_remove_shared_cpu_map(unsigned int cpu, int index)
-{
-	struct _cpuid4_info	*this_leaf, *sibling_leaf;
-	int sibling;
-
-	this_leaf = CPUID4_INFO_IDX(cpu, index);
-	for_each_cpu(sibling, to_cpumask(this_leaf->shared_cpu_map)) {
-		sibling_leaf = CPUID4_INFO_IDX(sibling, index);
-		cpumask_clear_cpu(cpu,
-				  to_cpumask(sibling_leaf->shared_cpu_map));
-	}
-}
-#else
-static void cache_shared_cpu_map_setup(unsigned int cpu, int index)
-{
-}
-
-static void cache_remove_shared_cpu_map(unsigned int cpu, int index)
-{
-}
-#endif
-
-static void free_cache_attributes(unsigned int cpu)
-{
-	int i;
-
-	for (i = 0; i < num_cache_leaves; i++)
-		cache_remove_shared_cpu_map(cpu, i);
-
-	kfree(per_cpu(ici_cpuid4_info, cpu));
-	per_cpu(ici_cpuid4_info, cpu) = NULL;
-}
-
-static void get_cpu_leaves(void *_retval)
-{
-	int j, *retval = _retval, cpu = smp_processor_id();
+		return;
 
-	/* Do cpuid and store the results */
-	for (j = 0; j < num_cache_leaves; j++) {
-		struct _cpuid4_info *this_leaf = CPUID4_INFO_IDX(cpu, j);
+	index_msb = get_count_order(num_threads_sharing);
 
-		*retval = cpuid4_cache_lookup_regs(j, &this_leaf->base);
-		if (unlikely(*retval < 0)) {
-			int i;
+	for_each_online_cpu(i)
+		if (cpu_data(i).apicid >> index_msb == c->apicid >> index_msb) {
+			struct cpu_cacheinfo *sib_cpu_ci = get_cpu_cacheinfo(i);
 
-			for (i = 0; i < j; i++)
-				cache_remove_shared_cpu_map(cpu, i);
-			break;
+			if (i == cpu || !sib_cpu_ci->info_list)
+				continue;/* skip if itself or no cacheinfo */
+			sibling_leaf = sib_cpu_ci->info_list + index;
+			cpumask_set_cpu(i, &this_leaf->shared_cpu_map);
+			cpumask_set_cpu(cpu, &sibling_leaf->shared_cpu_map);
 		}
-		cache_shared_cpu_map_setup(cpu, j);
-	}
 }
 
-static int detect_cache_attributes(unsigned int cpu)
+static void ci_leaf_init(struct cacheinfo *this_leaf,
+			 struct _cpuid4_info_regs *base)
 {
-	int			retval;
-
-	if (num_cache_leaves == 0)
-		return -ENOENT;
-
-	per_cpu(ici_cpuid4_info, cpu) = kzalloc(
-	    sizeof(struct _cpuid4_info) * num_cache_leaves, GFP_KERNEL);
-	if (per_cpu(ici_cpuid4_info, cpu) == NULL)
-		return -ENOMEM;
-
-	smp_call_function_single(cpu, get_cpu_leaves, &retval, true);
-	if (retval) {
-		kfree(per_cpu(ici_cpuid4_info, cpu));
-		per_cpu(ici_cpuid4_info, cpu) = NULL;
-	}
-
-	return retval;
+	this_leaf->level = base->eax.split.level;
+	this_leaf->type = cache_type_map[base->eax.split.type];
+	this_leaf->coherency_line_size =
+				base->ebx.split.coherency_line_size + 1;
+	this_leaf->ways_of_associativity =
+				base->ebx.split.ways_of_associativity + 1;
+	this_leaf->size = base->size;
+	this_leaf->number_of_sets = base->ecx.split.number_of_sets + 1;
+	this_leaf->physical_line_partition =
+				base->ebx.split.physical_line_partition + 1;
+	this_leaf->priv = base->nb;
 }
 
-#include <linux/kobject.h>
-#include <linux/sysfs.h>
-#include <linux/cpu.h>
-
-/* pointer to kobject for cpuX/cache */
-static DEFINE_PER_CPU(struct kobject *, ici_cache_kobject);
-
-struct _index_kobject {
-	struct kobject kobj;
-	unsigned int cpu;
-	unsigned short index;
-};
-
-/* pointer to array of kobjects for cpuX/cache/indexY */
-static DEFINE_PER_CPU(struct _index_kobject *, ici_index_kobject);
-#define INDEX_KOBJECT_PTR(x, y)		(&((per_cpu(ici_index_kobject, x))[y]))
-
-#define show_one_plus(file_name, object, val)				\
-static ssize_t show_##file_name(struct _cpuid4_info *this_leaf, char *buf, \
-				unsigned int cpu)			\
-{									\
-	return sprintf(buf, "%lu\n", (unsigned long)this_leaf->object + val); \
-}
-
-show_one_plus(level, base.eax.split.level, 0);
-show_one_plus(coherency_line_size, base.ebx.split.coherency_line_size, 1);
-show_one_plus(physical_line_partition, base.ebx.split.physical_line_partition, 1);
-show_one_plus(ways_of_associativity, base.ebx.split.ways_of_associativity, 1);
-show_one_plus(number_of_sets, base.ecx.split.number_of_sets, 1);
-
-static ssize_t show_size(struct _cpuid4_info *this_leaf, char *buf,
-			 unsigned int cpu)
-{
-	return sprintf(buf, "%luK\n", this_leaf->base.size / 1024);
-}
-
-static ssize_t show_shared_cpu_map_func(struct _cpuid4_info *this_leaf,
-					int type, char *buf)
-{
-	const struct cpumask *mask = to_cpumask(this_leaf->shared_cpu_map);
-	int ret;
-
-	if (type)
-		ret = scnprintf(buf, PAGE_SIZE - 1, "%*pbl",
-				cpumask_pr_args(mask));
-	else
-		ret = scnprintf(buf, PAGE_SIZE - 1, "%*pb",
-				cpumask_pr_args(mask));
-	buf[ret++] = '\n';
-	buf[ret] = '\0';
-	return ret;
-}
-
-static inline ssize_t show_shared_cpu_map(struct _cpuid4_info *leaf, char *buf,
-					  unsigned int cpu)
+static int __init_cache_level(unsigned int cpu)
 {
-	return show_shared_cpu_map_func(leaf, 0, buf);
-}
-
-static inline ssize_t show_shared_cpu_list(struct _cpuid4_info *leaf, char *buf,
-					   unsigned int cpu)
-{
-	return show_shared_cpu_map_func(leaf, 1, buf);
-}
+	struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu);
 
-static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf,
-			 unsigned int cpu)
-{
-	switch (this_leaf->base.eax.split.type) {
-	case CACHE_TYPE_DATA:
-		return sprintf(buf, "Data\n");
-	case CACHE_TYPE_INST:
-		return sprintf(buf, "Instruction\n");
-	case CACHE_TYPE_UNIFIED:
-		return sprintf(buf, "Unified\n");
-	default:
-		return sprintf(buf, "Unknown\n");
-	}
-}
-
-#define to_object(k)	container_of(k, struct _index_kobject, kobj)
-#define to_attr(a)	container_of(a, struct _cache_attr, attr)
-
-#define define_one_ro(_name) \
-static struct _cache_attr _name = \
-	__ATTR(_name, 0444, show_##_name, NULL)
-
-define_one_ro(level);
-define_one_ro(type);
-define_one_ro(coherency_line_size);
-define_one_ro(physical_line_partition);
-define_one_ro(ways_of_associativity);
-define_one_ro(number_of_sets);
-define_one_ro(size);
-define_one_ro(shared_cpu_map);
-define_one_ro(shared_cpu_list);
-
-static struct attribute *default_attrs[] = {
-	&type.attr,
-	&level.attr,
-	&coherency_line_size.attr,
-	&physical_line_partition.attr,
-	&ways_of_associativity.attr,
-	&number_of_sets.attr,
-	&size.attr,
-	&shared_cpu_map.attr,
-	&shared_cpu_list.attr,
-	NULL
-};
-
-#ifdef CONFIG_AMD_NB
-static struct attribute **amd_l3_attrs(void)
-{
-	static struct attribute **attrs;
-	int n;
-
-	if (attrs)
-		return attrs;
-
-	n = ARRAY_SIZE(default_attrs);
-
-	if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE))
-		n += 2;
-
-	if (amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
-		n += 1;
-
-	attrs = kzalloc(n * sizeof (struct attribute *), GFP_KERNEL);
-	if (attrs == NULL)
-		return attrs = default_attrs;
-
-	for (n = 0; default_attrs[n]; n++)
-		attrs[n] = default_attrs[n];
-
-	if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) {
-		attrs[n++] = &cache_disable_0.attr;
-		attrs[n++] = &cache_disable_1.attr;
-	}
-
-	if (amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
-		attrs[n++] = &subcaches.attr;
-
-	return attrs;
-}
-#endif
-
-static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf)
-{
-	struct _cache_attr *fattr = to_attr(attr);
-	struct _index_kobject *this_leaf = to_object(kobj);
-	ssize_t ret;
-
-	ret = fattr->show ?
-		fattr->show(CPUID4_INFO_IDX(this_leaf->cpu, this_leaf->index),
-			buf, this_leaf->cpu) :
-		0;
-	return ret;
-}
-
-static ssize_t store(struct kobject *kobj, struct attribute *attr,
-		     const char *buf, size_t count)
-{
-	struct _cache_attr *fattr = to_attr(attr);
-	struct _index_kobject *this_leaf = to_object(kobj);
-	ssize_t ret;
-
-	ret = fattr->store ?
-		fattr->store(CPUID4_INFO_IDX(this_leaf->cpu, this_leaf->index),
-			buf, count, this_leaf->cpu) :
-		0;
-	return ret;
-}
-
-static const struct sysfs_ops sysfs_ops = {
-	.show   = show,
-	.store  = store,
-};
-
-static struct kobj_type ktype_cache = {
-	.sysfs_ops	= &sysfs_ops,
-	.default_attrs	= default_attrs,
-};
-
-static struct kobj_type ktype_percpu_entry = {
-	.sysfs_ops	= &sysfs_ops,
-};
-
-static void cpuid4_cache_sysfs_exit(unsigned int cpu)
-{
-	kfree(per_cpu(ici_cache_kobject, cpu));
-	kfree(per_cpu(ici_index_kobject, cpu));
-	per_cpu(ici_cache_kobject, cpu) = NULL;
-	per_cpu(ici_index_kobject, cpu) = NULL;
-	free_cache_attributes(cpu);
-}
-
-static int cpuid4_cache_sysfs_init(unsigned int cpu)
-{
-	int err;
-
-	if (num_cache_leaves == 0)
+	if (!num_cache_leaves)
 		return -ENOENT;
-
-	err = detect_cache_attributes(cpu);
-	if (err)
-		return err;
-
-	/* Allocate all required memory */
-	per_cpu(ici_cache_kobject, cpu) =
-		kzalloc(sizeof(struct kobject), GFP_KERNEL);
-	if (unlikely(per_cpu(ici_cache_kobject, cpu) == NULL))
-		goto err_out;
-
-	per_cpu(ici_index_kobject, cpu) = kzalloc(
-	    sizeof(struct _index_kobject) * num_cache_leaves, GFP_KERNEL);
-	if (unlikely(per_cpu(ici_index_kobject, cpu) == NULL))
-		goto err_out;
-
+	if (!this_cpu_ci)
+		return -EINVAL;
+	this_cpu_ci->num_levels = 3;
+	this_cpu_ci->num_leaves = num_cache_leaves;
 	return 0;
-
-err_out:
-	cpuid4_cache_sysfs_exit(cpu);
-	return -ENOMEM;
 }
 
-static DECLARE_BITMAP(cache_dev_map, NR_CPUS);
-
-/* Add/Remove cache interface for CPU device */
-static int cache_add_dev(struct device *dev)
+static int __populate_cache_leaves(unsigned int cpu)
 {
-	unsigned int cpu = dev->id;
-	unsigned long i, j;
-	struct _index_kobject *this_object;
-	struct _cpuid4_info   *this_leaf;
-	int retval;
-
-	retval = cpuid4_cache_sysfs_init(cpu);
-	if (unlikely(retval < 0))
-		return retval;
-
-	retval = kobject_init_and_add(per_cpu(ici_cache_kobject, cpu),
-				      &ktype_percpu_entry,
-				      &dev->kobj, "%s", "cache");
-	if (retval < 0) {
-		cpuid4_cache_sysfs_exit(cpu);
-		return retval;
-	}
+	unsigned int idx, ret;
+	struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu);
+	struct cacheinfo *this_leaf = this_cpu_ci->info_list;
+	struct _cpuid4_info_regs id4_regs = {};
 
-	for (i = 0; i < num_cache_leaves; i++) {
-		this_object = INDEX_KOBJECT_PTR(cpu, i);
-		this_object->cpu = cpu;
-		this_object->index = i;
-
-		this_leaf = CPUID4_INFO_IDX(cpu, i);
-
-		ktype_cache.default_attrs = default_attrs;
-#ifdef CONFIG_AMD_NB
-		if (this_leaf->base.nb)
-			ktype_cache.default_attrs = amd_l3_attrs();
-#endif
-		retval = kobject_init_and_add(&(this_object->kobj),
-					      &ktype_cache,
-					      per_cpu(ici_cache_kobject, cpu),
-					      "index%1lu", i);
-		if (unlikely(retval)) {
-			for (j = 0; j < i; j++)
-				kobject_put(&(INDEX_KOBJECT_PTR(cpu, j)->kobj));
-			kobject_put(per_cpu(ici_cache_kobject, cpu));
-			cpuid4_cache_sysfs_exit(cpu);
-			return retval;
-		}
-		kobject_uevent(&(this_object->kobj), KOBJ_ADD);
+	for (idx = 0; idx < this_cpu_ci->num_leaves; idx++) {
+		ret = cpuid4_cache_lookup_regs(idx, &id4_regs);
+		if (ret)
+			return ret;
+		ci_leaf_init(this_leaf++, &id4_regs);
+		__cache_cpumap_setup(cpu, idx, &id4_regs);
 	}
-	cpumask_set_cpu(cpu, to_cpumask(cache_dev_map));
-
-	kobject_uevent(per_cpu(ici_cache_kobject, cpu), KOBJ_ADD);
 	return 0;
 }
 
-static void cache_remove_dev(struct device *dev)
-{
-	unsigned int cpu = dev->id;
-	unsigned long i;
-
-	if (per_cpu(ici_cpuid4_info, cpu) == NULL)
-		return;
-	if (!cpumask_test_cpu(cpu, to_cpumask(cache_dev_map)))
-		return;
-	cpumask_clear_cpu(cpu, to_cpumask(cache_dev_map));
-
-	for (i = 0; i < num_cache_leaves; i++)
-		kobject_put(&(INDEX_KOBJECT_PTR(cpu, i)->kobj));
-	kobject_put(per_cpu(ici_cache_kobject, cpu));
-	cpuid4_cache_sysfs_exit(cpu);
-}
-
-static int cacheinfo_cpu_callback(struct notifier_block *nfb,
-				  unsigned long action, void *hcpu)
-{
-	unsigned int cpu = (unsigned long)hcpu;
-	struct device *dev;
-
-	dev = get_cpu_device(cpu);
-	switch (action) {
-	case CPU_ONLINE:
-	case CPU_ONLINE_FROZEN:
-		cache_add_dev(dev);
-		break;
-	case CPU_DEAD:
-	case CPU_DEAD_FROZEN:
-		cache_remove_dev(dev);
-		break;
-	}
-	return NOTIFY_OK;
-}
-
-static struct notifier_block cacheinfo_cpu_notifier = {
-	.notifier_call = cacheinfo_cpu_callback,
-};
-
-static int __init cache_sysfs_init(void)
-{
-	int i, err = 0;
-
-	if (num_cache_leaves == 0)
-		return 0;
-
-	cpu_notifier_register_begin();
-	for_each_online_cpu(i) {
-		struct device *dev = get_cpu_device(i);
-
-		err = cache_add_dev(dev);
-		if (err)
-			goto out;
-	}
-	__register_hotcpu_notifier(&cacheinfo_cpu_notifier);
-
-out:
-	cpu_notifier_register_done();
-	return err;
-}
-
-device_initcall(cache_sysfs_init);
-
-#endif
+DEFINE_SMP_CALL_CACHE_FUNCTION(init_cache_level)
+DEFINE_SMP_CALL_CACHE_FUNCTION(populate_cache_leaves)
-- 
cgit v1.2.3


From c467ea763fd5d8795b7d1b5a78eb94b6ad8f66ad Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker
Date: Wed, 4 Mar 2015 18:06:33 +0100
Subject: context_tracking: Rename context symbols to prepare for transition
 state

Current context tracking symbols are designed to express living state.
As such they are prefixed with "IN_": IN_USER, IN_KERNEL.

Now we are going to use these symbols to also express state transitions
such as context_tracking_enter(IN_USER) or context_tracking_exit(IN_USER).
But while the "IN_" prefix works well to express entering a context, it's
confusing to depict a context exit: context_tracking_exit(IN_USER)
could mean two things:
	1) We are exiting the current context to enter user context.
	2) We are exiting the user context
We want 2) but the reviewer may be confused and understand 1)

So lets disambiguate these symbols and rename them to CONTEXT_USER and
CONTEXT_KERNEL.

Acked-by: Rik van Riel <riel@redhat.com>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Will deacon <will.deacon@arm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
Cc: Luiz Capitulino <lcapitulino@redhat.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>---
 arch/x86/kernel/traps.c                | 2 +-
 include/linux/context_tracking.h       | 2 +-
 include/linux/context_tracking_state.h | 6 +++---
 kernel/context_tracking.c              | 8 ++++----
 kernel/sched/core.c                    | 2 +-
 5 files changed, 10 insertions(+), 10 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 9d2073e2ecc9..756f74eed35d 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -123,7 +123,7 @@ enum ctx_state ist_enter(struct pt_regs *regs)
 		 * but we need to notify RCU.
 		 */
 		rcu_nmi_enter();
-		prev_state = IN_KERNEL;  /* the value is irrelevant. */
+		prev_state = CONTEXT_KERNEL;  /* the value is irrelevant. */
 	}
 
 	/*
diff --git a/include/linux/context_tracking.h b/include/linux/context_tracking.h
index 37b81bd51ec0..427b056dfd3d 100644
--- a/include/linux/context_tracking.h
+++ b/include/linux/context_tracking.h
@@ -43,7 +43,7 @@ static inline enum ctx_state exception_enter(void)
 static inline void exception_exit(enum ctx_state prev_ctx)
 {
 	if (context_tracking_is_enabled()) {
-		if (prev_ctx == IN_USER)
+		if (prev_ctx == CONTEXT_USER)
 			context_tracking_user_enter();
 	}
 }
diff --git a/include/linux/context_tracking_state.h b/include/linux/context_tracking_state.h
index 97a81225d037..8076f875c324 100644
--- a/include/linux/context_tracking_state.h
+++ b/include/linux/context_tracking_state.h
@@ -13,8 +13,8 @@ struct context_tracking {
 	 */
 	bool active;
 	enum ctx_state {
-		IN_KERNEL = 0,
-		IN_USER,
+		CONTEXT_KERNEL = 0,
+		CONTEXT_USER,
 	} state;
 };
 
@@ -34,7 +34,7 @@ static inline bool context_tracking_cpu_is_enabled(void)
 
 static inline bool context_tracking_in_user(void)
 {
-	return __this_cpu_read(context_tracking.state) == IN_USER;
+	return __this_cpu_read(context_tracking.state) == CONTEXT_USER;
 }
 #else
 static inline bool context_tracking_in_user(void) { return false; }
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index 937ecdfdf258..8ad53c9d38b6 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -75,7 +75,7 @@ void context_tracking_user_enter(void)
 	WARN_ON_ONCE(!current->mm);
 
 	local_irq_save(flags);
-	if ( __this_cpu_read(context_tracking.state) != IN_USER) {
+	if ( __this_cpu_read(context_tracking.state) != CONTEXT_USER) {
 		if (__this_cpu_read(context_tracking.active)) {
 			trace_user_enter(0);
 			/*
@@ -101,7 +101,7 @@ void context_tracking_user_enter(void)
 		 * OTOH we can spare the calls to vtime and RCU when context_tracking.active
 		 * is false because we know that CPU is not tickless.
 		 */
-		__this_cpu_write(context_tracking.state, IN_USER);
+		__this_cpu_write(context_tracking.state, CONTEXT_USER);
 	}
 	local_irq_restore(flags);
 }
@@ -129,7 +129,7 @@ void context_tracking_user_exit(void)
 		return;
 
 	local_irq_save(flags);
-	if (__this_cpu_read(context_tracking.state) == IN_USER) {
+	if (__this_cpu_read(context_tracking.state) == CONTEXT_USER) {
 		if (__this_cpu_read(context_tracking.active)) {
 			/*
 			 * We are going to run code that may use RCU. Inform
@@ -139,7 +139,7 @@ void context_tracking_user_exit(void)
 			vtime_user_exit(current);
 			trace_user_exit(0);
 		}
-		__this_cpu_write(context_tracking.state, IN_KERNEL);
+		__this_cpu_write(context_tracking.state, CONTEXT_KERNEL);
 	}
 	local_irq_restore(flags);
 }
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index f0f831e8a345..06b9a00871e0 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2818,7 +2818,7 @@ asmlinkage __visible void __sched schedule_user(void)
 	 * we find a better solution.
 	 *
 	 * NB: There are buggy callers of this function.  Ideally we
-	 * should warn if prev_state != IN_USER, but that will trigger
+	 * should warn if prev_state != CONTEXT_USER, but that will trigger
 	 * too frequently to make sense yet.
 	 */
 	enum ctx_state prev_state = exception_enter();
-- 
cgit v1.2.3


From 394838c96013ba414a24ffe7a2a593a9154daadf Mon Sep 17 00:00:00 2001
From: Andy Lutomirski
Date: Mon, 9 Mar 2015 17:42:31 -0700
Subject: x86/asm/entry/32: Fix user_mode() misuses

The one in do_debug() is probably harmless, but better safe than sorry.

Signed-off-by: Andy Lutomirski <luto@amacapital.net>
Cc: <stable@vger.kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/d67deaa9df5458363623001f252d1aee3215d014.1425948056.git.luto@amacapital.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/traps.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 9d2073e2ecc9..4ff5d162ff9f 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -384,7 +384,7 @@ dotraplinkage void do_bounds(struct pt_regs *regs, long error_code)
 		goto exit;
 	conditional_sti(regs);
 
-	if (!user_mode(regs))
+	if (!user_mode_vm(regs))
 		die("bounds", regs, error_code);
 
 	if (!cpu_feature_enabled(X86_FEATURE_MPX)) {
@@ -637,7 +637,7 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code)
 	 * then it's very likely the result of an icebp/int01 trap.
 	 * User wants a sigtrap for that.
 	 */
-	if (!dr6 && user_mode(regs))
+	if (!dr6 && user_mode_vm(regs))
 		user_icebp = 1;
 
 	/* Catch kmemcheck conditions first of all! */
-- 
cgit v1.2.3


From e7f180dcd8ab48f18b20d7e8a7e9b39192bdf8e0 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov
Date: Tue, 10 Mar 2015 07:06:24 +0100
Subject: x86/fpu: Change xstateregs_get()/set() to use ->xsave.i387 rather
 than ->fxsave

This is a cosmetic change: xstateregs_get() and xstateregs_set()
abuse ->fxsave to access xsave->i387.sw_reserved.

This practice is correct, ->fxsave and xsave->i387 share the same memory,
but IMHO this looks confusing.

And we can make this code more readable if we add a
"struct xsave_struct *" local variable as well.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Rik van Riel <riel@redhat.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Tavis Ormandy <taviso@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1425967585-4725-1-git-send-email-bp@alien8.de
Link: http://lkml.kernel.org/r/20150302183237.GB23085@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/i387.c | 25 +++++++++----------------
 1 file changed, 9 insertions(+), 16 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index 8416b5f85806..03cc0add8694 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -339,6 +339,7 @@ int xstateregs_get(struct task_struct *target, const struct user_regset *regset,
 		unsigned int pos, unsigned int count,
 		void *kbuf, void __user *ubuf)
 {
+	struct xsave_struct *xsave = &target->thread.fpu.state->xsave;
 	int ret;
 
 	if (!cpu_has_xsave)
@@ -353,14 +354,12 @@ int xstateregs_get(struct task_struct *target, const struct user_regset *regset,
 	 * memory layout in the thread struct, so that we can copy the entire
 	 * xstateregs to the user using one user_regset_copyout().
 	 */
-	memcpy(&target->thread.fpu.state->fxsave.sw_reserved,
-	       xstate_fx_sw_bytes, sizeof(xstate_fx_sw_bytes));
-
+	memcpy(&xsave->i387.sw_reserved,
+		xstate_fx_sw_bytes, sizeof(xstate_fx_sw_bytes));
 	/*
 	 * Copy the xstate memory layout.
 	 */
-	ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf,
-				  &target->thread.fpu.state->xsave, 0, -1);
+	ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, xsave, 0, -1);
 	return ret;
 }
 
@@ -368,8 +367,8 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset,
 		  unsigned int pos, unsigned int count,
 		  const void *kbuf, const void __user *ubuf)
 {
+	struct xsave_struct *xsave = &target->thread.fpu.state->xsave;
 	int ret;
-	struct xsave_hdr_struct *xsave_hdr;
 
 	if (!cpu_has_xsave)
 		return -ENODEV;
@@ -378,22 +377,16 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset,
 	if (ret)
 		return ret;
 
-	ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
-				 &target->thread.fpu.state->xsave, 0, -1);
-
+	ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, xsave, 0, -1);
 	/*
 	 * mxcsr reserved bits must be masked to zero for security reasons.
 	 */
-	target->thread.fpu.state->fxsave.mxcsr &= mxcsr_feature_mask;
-
-	xsave_hdr = &target->thread.fpu.state->xsave.xsave_hdr;
-
-	xsave_hdr->xstate_bv &= pcntxt_mask;
+	xsave->i387.mxcsr &= mxcsr_feature_mask;
+	xsave->xsave_hdr.xstate_bv &= pcntxt_mask;
 	/*
 	 * These bits must be zero.
 	 */
-	memset(xsave_hdr->reserved, 0, 48);
-
+	memset(&xsave->xsave_hdr.reserved, 0, 48);
 	return ret;
 }
 
-- 
cgit v1.2.3


From 1d23c4518b1f3a03c278f23333149245c178d2a6 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov
Date: Tue, 10 Mar 2015 07:06:25 +0100
Subject: x86/fpu: Factor out memset(xstate, 0) in fpu_finit() paths

fx_finit() has two users but only fpu_finit() needs to clear
xstate, alloc_bootmem_align() in setup_init_fpu_buf() returns
zero-filled memory.

And note that both memset()'s look confusing. Yes, offsetof() is
0 for ->fxsave or ->fsave, but it would be cleaner to turn
them into a single memset() which zeroes fpu->state.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Acked-by: Rik van Riel <riel@redhat.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Tavis Ormandy <taviso@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1425967585-4725-2-git-send-email-bp@alien8.de
Link: http://lkml.kernel.org/r/20150302183257.GC23085@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/fpu-internal.h | 1 -
 arch/x86/kernel/i387.c              | 3 ++-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/fpu-internal.h b/arch/x86/include/asm/fpu-internal.h
index 61609b963eab..5fa1be21ac2a 100644
--- a/arch/x86/include/asm/fpu-internal.h
+++ b/arch/x86/include/asm/fpu-internal.h
@@ -135,7 +135,6 @@ static __always_inline __pure bool use_fxsr(void)
 
 static inline void fx_finit(struct i387_fxsave_struct *fx)
 {
-	memset(fx, 0, xstate_size);
 	fx->cwd = 0x37f;
 	fx->mxcsr = MXCSR_DEFAULT;
 }
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index 03cc0add8694..0f3de6674ae3 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -224,11 +224,12 @@ void fpu_finit(struct fpu *fpu)
 		return;
 	}
 
+	memset(fpu->state, 0, xstate_size);
+
 	if (cpu_has_fxsr) {
 		fx_finit(&fpu->state->fxsave);
 	} else {
 		struct i387_fsave_struct *fp = &fpu->state->fsave;
-		memset(fp, 0, xstate_size);
 		fp->cwd = 0xffff037fu;
 		fp->swd = 0xffff0000u;
 		fp->twd = 0xffffffffu;
-- 
cgit v1.2.3


From 29722cd4ef666705b2eda1c3ba44435488e509eb Mon Sep 17 00:00:00 2001
From: Denys Vlasenko
Date: Mon, 9 Mar 2015 19:39:21 +0100
Subject: x86/asm/entry/64: Save R11 into pt_regs->flags on SYSCALL64 fastpath

Before this patch, R11 was saved in pt_regs->r11.

Which looks natural, but requires messy shuffling to/from iret
frame whenever ptrace or e.g. sys_iopl() wants to modify flags -
because that's how this register is used by SYSCALL/SYSRET.

This patch saves R11 in pt_regs->flags, and uses that value for
the SYSRET64 instruction. Shuffling is eliminated.

FIXUP/RESTORE_TOP_OF_STACK are simplified.

stub_iopl is no longer needed: pt_regs->flags needs no fixing up.

Testing shows that syscall fast path is ~54.3 ns before
and after the patch (on 2.7 GHz Sandy Bridge CPU).

Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Alexei Starovoitov <ast@plumgrid.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Will Drewry <wad@chromium.org>
Link: http://lkml.kernel.org/r/1425926364-9526-2-git-send-email-dvlasenk@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/calling.h | 20 ++++++++++++++------
 arch/x86/kernel/entry_64.S     | 24 +++++++++++-------------
 2 files changed, 25 insertions(+), 19 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/calling.h b/arch/x86/include/asm/calling.h
index f1a962ff7ddf..4b5f7bf2b780 100644
--- a/arch/x86/include/asm/calling.h
+++ b/arch/x86/include/asm/calling.h
@@ -95,9 +95,11 @@ For 32-bit we have the following conventions - kernel is built with
 	CFI_ADJUST_CFA_OFFSET 15*8+\addskip
 	.endm
 
-	.macro SAVE_C_REGS_HELPER offset=0 rax=1 rcx=1 r8plus=1
-	.if \r8plus
+	.macro SAVE_C_REGS_HELPER offset=0 rax=1 rcx=1 r8910=1 r11=1
+	.if \r11
 	movq_cfi r11, 6*8+\offset
+	.endif
+	.if \r8910
 	movq_cfi r10, 7*8+\offset
 	movq_cfi r9,  8*8+\offset
 	movq_cfi r8,  9*8+\offset
@@ -113,16 +115,19 @@ For 32-bit we have the following conventions - kernel is built with
 	movq_cfi rdi, 14*8+\offset
 	.endm
 	.macro SAVE_C_REGS offset=0
-	SAVE_C_REGS_HELPER \offset, 1, 1, 1
+	SAVE_C_REGS_HELPER \offset, 1, 1, 1, 1
 	.endm
 	.macro SAVE_C_REGS_EXCEPT_RAX_RCX offset=0
-	SAVE_C_REGS_HELPER \offset, 0, 0, 1
+	SAVE_C_REGS_HELPER \offset, 0, 0, 1, 1
 	.endm
 	.macro SAVE_C_REGS_EXCEPT_R891011
-	SAVE_C_REGS_HELPER 0, 1, 1, 0
+	SAVE_C_REGS_HELPER 0, 1, 1, 0, 0
 	.endm
 	.macro SAVE_C_REGS_EXCEPT_RCX_R891011
-	SAVE_C_REGS_HELPER 0, 1, 0, 0
+	SAVE_C_REGS_HELPER 0, 1, 0, 0, 0
+	.endm
+	.macro SAVE_C_REGS_EXCEPT_RAX_RCX_R11
+	SAVE_C_REGS_HELPER 0, 0, 0, 1, 0
 	.endm
 
 	.macro SAVE_EXTRA_REGS offset=0
@@ -179,6 +184,9 @@ For 32-bit we have the following conventions - kernel is built with
 	.macro RESTORE_C_REGS_EXCEPT_R11
 	RESTORE_C_REGS_HELPER 1,1,0,1,1
 	.endm
+	.macro RESTORE_C_REGS_EXCEPT_RCX_R11
+	RESTORE_C_REGS_HELPER 1,0,0,1,1
+	.endm
 	.macro RESTORE_RSI_RDI
 	RESTORE_C_REGS_HELPER 0,0,0,0,0
 	.endm
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 5117a2baefe9..324200aca431 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -121,14 +121,12 @@ ENDPROC(native_usergs_sysret64)
 #endif
 
 /*
- * C code is not supposed to know about undefined top of stack. Every time
- * a C function with an pt_regs argument is called from the SYSCALL based
- * fast path FIXUP_TOP_OF_STACK is needed.
+ * C code is not supposed to know that the iret frame is not populated.
+ * Every time a C function with an pt_regs argument is called from
+ * the SYSCALL based fast path FIXUP_TOP_OF_STACK is needed.
  * RESTORE_TOP_OF_STACK syncs the syscall state after any possible ptregs
  * manipulation.
  */
-
-	/* %rsp:at FRAMEEND */
 	.macro FIXUP_TOP_OF_STACK tmp offset=0
 	movq PER_CPU_VAR(old_rsp),\tmp
 	movq \tmp,RSP+\offset(%rsp)
@@ -136,15 +134,13 @@ ENDPROC(native_usergs_sysret64)
 	movq $__USER_CS,CS+\offset(%rsp)
 	movq RIP+\offset(%rsp),\tmp  /* get rip */
 	movq \tmp,RCX+\offset(%rsp)  /* copy it to rcx as sysret would do */
-	movq R11+\offset(%rsp),\tmp  /* get eflags */
-	movq \tmp,EFLAGS+\offset(%rsp)
+	movq EFLAGS+\offset(%rsp),\tmp /* ditto for rflags->r11 */
+	movq \tmp,R11+\offset(%rsp)
 	.endm
 
 	.macro RESTORE_TOP_OF_STACK tmp offset=0
 	movq RSP+\offset(%rsp),\tmp
 	movq \tmp,PER_CPU_VAR(old_rsp)
-	movq EFLAGS+\offset(%rsp),\tmp
-	movq \tmp,R11+\offset(%rsp)
 	.endm
 
 /*
@@ -257,9 +253,10 @@ GLOBAL(system_call_after_swapgs)
 	 */
 	ENABLE_INTERRUPTS(CLBR_NONE)
 	ALLOC_PT_GPREGS_ON_STACK 8		/* +8: space for orig_ax */
-	SAVE_C_REGS_EXCEPT_RAX_RCX
+	SAVE_C_REGS_EXCEPT_RAX_RCX_R11
 	movq	$-ENOSYS,RAX(%rsp)
 	movq_cfi rax,ORIG_RAX
+	movq	%r11,EFLAGS(%rsp)
 	movq	%rcx,RIP(%rsp)
 	CFI_REL_OFFSET rip,RIP
 	testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP)
@@ -277,7 +274,7 @@ system_call_fastpath:
 	movq %rax,RAX(%rsp)
 /*
  * Syscall return path ending with SYSRET (fast path)
- * Has incomplete stack frame and undefined top of stack.
+ * Has incompletely filled pt_regs, iret frame is also incomplete.
  */
 ret_from_sys_call:
 	testl $_TIF_ALLWORK_MASK,TI_flags+THREAD_INFO(%rsp,RIP)
@@ -291,9 +288,10 @@ ret_from_sys_call:
 	 * sysretq will re-enable interrupts:
 	 */
 	TRACE_IRQS_ON
-	RESTORE_C_REGS_EXCEPT_RCX
-	movq RIP(%rsp),%rcx
+	RESTORE_C_REGS_EXCEPT_RCX_R11
+	movq	RIP(%rsp),%rcx
 	CFI_REGISTER	rip,rcx
+	movq	EFLAGS(%rsp),%r11
 	/*CFI_REGISTER	rflags,r11*/
 	movq	PER_CPU_VAR(old_rsp), %rsp
 	/*
-- 
cgit v1.2.3


From 616ab249f1e42f6135642183529f910fcedc2642 Mon Sep 17 00:00:00 2001
From: Denys Vlasenko
Date: Tue, 10 Mar 2015 11:45:06 +0100
Subject: x86/asm/entry/64: Remove stub_iopl

stub_iopl is no longer needed: pt_regs->flags needs no fixing up
after previous change. Remove it.

Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Alexei Starovoitov <ast@plumgrid.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Will Drewry <wad@chromium.org>
Link: http://lkml.kernel.org/r/1425984307-2143-1-git-send-email-dvlasenk@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/entry_64.S       | 13 -------------
 arch/x86/syscalls/syscall_64.tbl |  2 +-
 arch/x86/um/sys_call_table_64.c  |  2 +-
 3 files changed, 2 insertions(+), 15 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 324200aca431..703ced057199 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -421,22 +421,9 @@ ENTRY(stub_\func)
 END(stub_\func)
 	.endm
 
-	.macro FIXED_FRAME label,func
-ENTRY(\label)
-	CFI_STARTPROC
-	DEFAULT_FRAME 0, 8		/* offset 8: return address */
-	FIXUP_TOP_OF_STACK %r11, 8
-	call \func
-	RESTORE_TOP_OF_STACK %r11, 8
-	ret
-	CFI_ENDPROC
-END(\label)
-	.endm
-
 	FORK_LIKE  clone
 	FORK_LIKE  fork
 	FORK_LIKE  vfork
-	FIXED_FRAME stub_iopl, sys_iopl
 
 ENTRY(stub_execve)
 	CFI_STARTPROC
diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl
index 8d656fbb57aa..9ef32d5f1b19 100644
--- a/arch/x86/syscalls/syscall_64.tbl
+++ b/arch/x86/syscalls/syscall_64.tbl
@@ -178,7 +178,7 @@
 169	common	reboot			sys_reboot
 170	common	sethostname		sys_sethostname
 171	common	setdomainname		sys_setdomainname
-172	common	iopl			stub_iopl
+172	common	iopl			sys_iopl
 173	common	ioperm			sys_ioperm
 174	64	create_module
 175	common	init_module		sys_init_module
diff --git a/arch/x86/um/sys_call_table_64.c b/arch/x86/um/sys_call_table_64.c
index 5cdfa9db2217..a75d8700472a 100644
--- a/arch/x86/um/sys_call_table_64.c
+++ b/arch/x86/um/sys_call_table_64.c
@@ -16,7 +16,7 @@
  */
 
 /* Not going to be implemented by UML, since we have no hardware. */
-#define stub_iopl sys_ni_syscall
+#define sys_iopl sys_ni_syscall
 #define sys_ioperm sys_ni_syscall
 
 /*
-- 
cgit v1.2.3


From 263042e4630a85e856b4a8cd72f28dab33ef4741 Mon Sep 17 00:00:00 2001
From: Denys Vlasenko
Date: Mon, 9 Mar 2015 19:39:23 +0100
Subject: x86/asm/entry/64: Save user RSP in pt_regs->sp on SYSCALL64 fastpath

Prepare for the removal of 'usersp', by simplifying PER_CPU(old_rsp) usage:

  - use it only as temp storage

  - store the userspace stack pointer immediately in pt_regs->sp
    on syscall entry, instead of using it later, on syscall exit.

  - change C code to use pt_regs->sp only, instead of PER_CPU(old_rsp)
    and task->thread.usersp.

FIXUP/RESTORE_TOP_OF_STACK are simplified as well.

Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Alexei Starovoitov <ast@plumgrid.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Will Drewry <wad@chromium.org>
Link: http://lkml.kernel.org/r/1425926364-9526-4-git-send-email-dvlasenk@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/compat.h |  2 +-
 arch/x86/include/asm/ptrace.h |  8 ++------
 arch/x86/kernel/entry_64.S    | 18 +++++++-----------
 arch/x86/kernel/perf_regs.c   |  2 +-
 arch/x86/kernel/process_64.c  |  3 +--
 5 files changed, 12 insertions(+), 21 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h
index 59c6c401f79f..acdee09228b3 100644
--- a/arch/x86/include/asm/compat.h
+++ b/arch/x86/include/asm/compat.h
@@ -301,7 +301,7 @@ static inline void __user *arch_compat_alloc_user_space(long len)
 		sp = task_pt_regs(current)->sp;
 	} else {
 		/* -128 for the x32 ABI redzone */
-		sp = this_cpu_read(old_rsp) - 128;
+		sp = task_pt_regs(current)->sp - 128;
 	}
 
 	return (void __user *)round_down(sp - len, 16);
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index 4077d963a1a0..74bb2e0f3030 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -145,12 +145,8 @@ static inline bool user_64bit_mode(struct pt_regs *regs)
 #endif
 }
 
-#define current_user_stack_pointer()	this_cpu_read(old_rsp)
-/* ia32 vs. x32 difference */
-#define compat_user_stack_pointer()	\
-	(test_thread_flag(TIF_IA32) 	\
-	 ? current_pt_regs()->sp 	\
-	 : this_cpu_read(old_rsp))
+#define current_user_stack_pointer()	current_pt_regs()->sp
+#define compat_user_stack_pointer()	current_pt_regs()->sp
 #endif
 
 #ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 703ced057199..d86788c3257b 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -128,8 +128,6 @@ ENDPROC(native_usergs_sysret64)
  * manipulation.
  */
 	.macro FIXUP_TOP_OF_STACK tmp offset=0
-	movq PER_CPU_VAR(old_rsp),\tmp
-	movq \tmp,RSP+\offset(%rsp)
 	movq $__USER_DS,SS+\offset(%rsp)
 	movq $__USER_CS,CS+\offset(%rsp)
 	movq RIP+\offset(%rsp),\tmp  /* get rip */
@@ -139,8 +137,7 @@ ENDPROC(native_usergs_sysret64)
 	.endm
 
 	.macro RESTORE_TOP_OF_STACK tmp offset=0
-	movq RSP+\offset(%rsp),\tmp
-	movq \tmp,PER_CPU_VAR(old_rsp)
+	/* nothing to do */
 	.endm
 
 /*
@@ -222,9 +219,6 @@ ENDPROC(native_usergs_sysret64)
  * Interrupts are off on entry.
  * Only called from user space.
  *
- * XXX	if we had a free scratch register we could save the RSP into the stack frame
- *      and report it properly in ps. Unfortunately we haven't.
- *
  * When user can change the frames always force IRET. That is because
  * it deals with uncanonical addresses better. SYSRET has trouble
  * with them due to bugs in both AMD and Intel CPUs.
@@ -253,11 +247,13 @@ GLOBAL(system_call_after_swapgs)
 	 */
 	ENABLE_INTERRUPTS(CLBR_NONE)
 	ALLOC_PT_GPREGS_ON_STACK 8		/* +8: space for orig_ax */
+	movq	%rcx,RIP(%rsp)
+	movq	PER_CPU_VAR(old_rsp),%rcx
+	movq	%r11,EFLAGS(%rsp)
+	movq	%rcx,RSP(%rsp)
+	movq_cfi rax,ORIG_RAX
 	SAVE_C_REGS_EXCEPT_RAX_RCX_R11
 	movq	$-ENOSYS,RAX(%rsp)
-	movq_cfi rax,ORIG_RAX
-	movq	%r11,EFLAGS(%rsp)
-	movq	%rcx,RIP(%rsp)
 	CFI_REL_OFFSET rip,RIP
 	testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP)
 	jnz tracesys
@@ -293,7 +289,7 @@ ret_from_sys_call:
 	CFI_REGISTER	rip,rcx
 	movq	EFLAGS(%rsp),%r11
 	/*CFI_REGISTER	rflags,r11*/
-	movq	PER_CPU_VAR(old_rsp), %rsp
+	movq	RSP(%rsp),%rsp
 	/*
 	 * 64bit SYSRET restores rip from rcx,
 	 * rflags from r11 (but RF and VM bits are forced to 0),
diff --git a/arch/x86/kernel/perf_regs.c b/arch/x86/kernel/perf_regs.c
index 781861cc5ee8..02a8720414c0 100644
--- a/arch/x86/kernel/perf_regs.c
+++ b/arch/x86/kernel/perf_regs.c
@@ -177,7 +177,7 @@ void perf_get_regs_user(struct perf_regs *regs_user,
 		 * than just blindly copying user_regs.
 		 */
 		regs_user->abi = PERF_SAMPLE_REGS_ABI_64;
-		regs_user_copy->sp = this_cpu_read(old_rsp);
+		regs_user_copy->sp = user_regs->sp;
 		regs_user_copy->cs = __USER_CS;
 		regs_user_copy->ss = __USER_DS;
 		regs_user_copy->cx = -1;  /* usually contains garbage */
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 1e393d27d701..e8c124a1f885 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -602,6 +602,5 @@ long sys_arch_prctl(int code, unsigned long addr)
 
 unsigned long KSTK_ESP(struct task_struct *task)
 {
-	return (test_tsk_thread_flag(task, TIF_IA32)) ?
-			(task_pt_regs(task)->sp) : ((task)->thread.usersp);
+	return task_pt_regs(task)->sp;
 }
-- 
cgit v1.2.3


From 668f198f40d1cc89c2330c6ad56f3b397b05a0bc Mon Sep 17 00:00:00 2001
From: David Kaplan
Date: Fri, 20 Feb 2015 16:02:10 -0600
Subject: KVM: SVM: use kvm_register_write()/read()

KVM has nice wrappers to access the register values, clean up a few places
that should use them but currently do not.

Signed-off-by: David Kaplan <david.kaplan@amd.com>
[forward port and testing]
Signed-off-by: Joel Schopp <joel.schopp@amd.com>
Acked-by: Borislav Petkov <bp@suse.de>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/svm.c | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index cc618c882f90..93dda3ccff03 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -2757,11 +2757,11 @@ static int invlpga_interception(struct vcpu_svm *svm)
 {
 	struct kvm_vcpu *vcpu = &svm->vcpu;
 
-	trace_kvm_invlpga(svm->vmcb->save.rip, vcpu->arch.regs[VCPU_REGS_RCX],
-			  vcpu->arch.regs[VCPU_REGS_RAX]);
+	trace_kvm_invlpga(svm->vmcb->save.rip, kvm_register_read(&svm->vcpu, VCPU_REGS_RCX),
+			  kvm_register_read(&svm->vcpu, VCPU_REGS_RAX));
 
 	/* Let's treat INVLPGA the same as INVLPG (can be optimized!) */
-	kvm_mmu_invlpg(vcpu, vcpu->arch.regs[VCPU_REGS_RAX]);
+	kvm_mmu_invlpg(vcpu, kvm_register_read(&svm->vcpu, VCPU_REGS_RAX));
 
 	svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
 	skip_emulated_instruction(&svm->vcpu);
@@ -2770,7 +2770,7 @@ static int invlpga_interception(struct vcpu_svm *svm)
 
 static int skinit_interception(struct vcpu_svm *svm)
 {
-	trace_kvm_skinit(svm->vmcb->save.rip, svm->vcpu.arch.regs[VCPU_REGS_RAX]);
+	trace_kvm_skinit(svm->vmcb->save.rip, kvm_register_read(&svm->vcpu, VCPU_REGS_RAX));
 
 	kvm_queue_exception(&svm->vcpu, UD_VECTOR);
 	return 1;
@@ -3133,7 +3133,7 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
 
 static int rdmsr_interception(struct vcpu_svm *svm)
 {
-	u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX];
+	u32 ecx = kvm_register_read(&svm->vcpu, VCPU_REGS_RCX);
 	u64 data;
 
 	if (svm_get_msr(&svm->vcpu, ecx, &data)) {
@@ -3142,8 +3142,8 @@ static int rdmsr_interception(struct vcpu_svm *svm)
 	} else {
 		trace_kvm_msr_read(ecx, data);
 
-		svm->vcpu.arch.regs[VCPU_REGS_RAX] = data & 0xffffffff;
-		svm->vcpu.arch.regs[VCPU_REGS_RDX] = data >> 32;
+		kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, data & 0xffffffff);
+		kvm_register_write(&svm->vcpu, VCPU_REGS_RDX, data >> 32);
 		svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
 		skip_emulated_instruction(&svm->vcpu);
 	}
@@ -3246,9 +3246,8 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
 static int wrmsr_interception(struct vcpu_svm *svm)
 {
 	struct msr_data msr;
-	u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX];
-	u64 data = (svm->vcpu.arch.regs[VCPU_REGS_RAX] & -1u)
-		| ((u64)(svm->vcpu.arch.regs[VCPU_REGS_RDX] & -1u) << 32);
+	u32 ecx = kvm_register_read(&svm->vcpu, VCPU_REGS_RCX);
+	u64 data = kvm_read_edx_eax(&svm->vcpu);
 
 	msr.data = data;
 	msr.index = ecx;
-- 
cgit v1.2.3


From 5bda6eed2e3626f40f2602a8fed72007f1fafaf8 Mon Sep 17 00:00:00 2001
From: Wincy Van
Date: Wed, 24 Dec 2014 11:14:29 +0800
Subject: KVM: ioapic: Record edge-triggered interrupts delivery status

This patch fixes the bug discussed in
https://www.mail-archive.com/kvm@vger.kernel.org/msg109813.html

This patch uses a new field named irr_delivered to record the
delivery status of edge-triggered interrupts, and clears the
delivered interrupts in kvm_get_ioapic. So it has the same effect
of commit 0bc830b05c667218d703f2026ec866c49df974fc
("KVM: ioapic: clear IRR for edge-triggered interrupts at delivery")
while avoids the bug of Windows guests.

Signed-off-by: Wincy Van <fanwenyi0529@gmail.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/ioapic.c | 7 ++++++-
 arch/x86/kvm/ioapic.h | 1 +
 2 files changed, 7 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c
index b1947e0f3e10..a2e9d961c7fe 100644
--- a/arch/x86/kvm/ioapic.c
+++ b/arch/x86/kvm/ioapic.c
@@ -206,6 +206,8 @@ static int ioapic_set_irq(struct kvm_ioapic *ioapic, unsigned int irq,
 
 	old_irr = ioapic->irr;
 	ioapic->irr |= mask;
+	if (edge)
+		ioapic->irr_delivered &= ~mask;
 	if ((edge && old_irr == ioapic->irr) ||
 	    (!edge && entry.fields.remote_irr)) {
 		ret = 0;
@@ -349,7 +351,7 @@ static int ioapic_service(struct kvm_ioapic *ioapic, int irq, bool line_status)
 	irqe.shorthand = 0;
 
 	if (irqe.trig_mode == IOAPIC_EDGE_TRIG)
-		ioapic->irr &= ~(1 << irq);
+		ioapic->irr_delivered |= 1 << irq;
 
 	if (irq == RTC_GSI && line_status) {
 		/*
@@ -597,6 +599,7 @@ static void kvm_ioapic_reset(struct kvm_ioapic *ioapic)
 	ioapic->base_address = IOAPIC_DEFAULT_BASE_ADDRESS;
 	ioapic->ioregsel = 0;
 	ioapic->irr = 0;
+	ioapic->irr_delivered = 0;
 	ioapic->id = 0;
 	memset(ioapic->irq_eoi, 0x00, IOAPIC_NUM_PINS);
 	rtc_irq_eoi_tracking_reset(ioapic);
@@ -654,6 +657,7 @@ int kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state)
 
 	spin_lock(&ioapic->lock);
 	memcpy(state, ioapic, sizeof(struct kvm_ioapic_state));
+	state->irr &= ~ioapic->irr_delivered;
 	spin_unlock(&ioapic->lock);
 	return 0;
 }
@@ -667,6 +671,7 @@ int kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state)
 	spin_lock(&ioapic->lock);
 	memcpy(ioapic, state, sizeof(struct kvm_ioapic_state));
 	ioapic->irr = 0;
+	ioapic->irr_delivered = 0;
 	update_handled_vectors(ioapic);
 	kvm_vcpu_request_scan_ioapic(kvm);
 	kvm_ioapic_inject_all(ioapic, state->irr);
diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h
index c2e36d934af4..38d8402ea65c 100644
--- a/arch/x86/kvm/ioapic.h
+++ b/arch/x86/kvm/ioapic.h
@@ -77,6 +77,7 @@ struct kvm_ioapic {
 	struct rtc_status rtc_status;
 	struct delayed_work eoi_inject;
 	u32 irq_eoi[IOAPIC_NUM_PINS];
+	u32 irr_delivered;
 };
 
 #ifdef DEBUG
-- 
cgit v1.2.3


From 548ef28449c0c06f92194c40ff0eaed248cb4b75 Mon Sep 17 00:00:00 2001
From: Thomas Huth
Date: Tue, 24 Feb 2015 21:29:25 +0100
Subject: KVM: Get rid of kvm_kvfree()

kvm_kvfree() provides exactly the same functionality as the
new common kvfree() function - so let's simply replace the
kvm function with the common function.

Signed-off-by: Thomas Huth <thuth@linux.vnet.ibm.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/x86.c       |  8 ++++----
 include/linux/kvm_host.h |  1 -
 virt/kvm/kvm_main.c      | 10 +---------
 3 files changed, 5 insertions(+), 14 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index bd7a70be41b3..c5f7e035e0f1 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -7429,7 +7429,7 @@ void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free,
 
 	for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
 		if (!dont || free->arch.rmap[i] != dont->arch.rmap[i]) {
-			kvm_kvfree(free->arch.rmap[i]);
+			kvfree(free->arch.rmap[i]);
 			free->arch.rmap[i] = NULL;
 		}
 		if (i == 0)
@@ -7437,7 +7437,7 @@ void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free,
 
 		if (!dont || free->arch.lpage_info[i - 1] !=
 			     dont->arch.lpage_info[i - 1]) {
-			kvm_kvfree(free->arch.lpage_info[i - 1]);
+			kvfree(free->arch.lpage_info[i - 1]);
 			free->arch.lpage_info[i - 1] = NULL;
 		}
 	}
@@ -7491,12 +7491,12 @@ int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
 
 out_free:
 	for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
-		kvm_kvfree(slot->arch.rmap[i]);
+		kvfree(slot->arch.rmap[i]);
 		slot->arch.rmap[i] = NULL;
 		if (i == 0)
 			continue;
 
-		kvm_kvfree(slot->arch.lpage_info[i - 1]);
+		kvfree(slot->arch.lpage_info[i - 1]);
 		slot->arch.lpage_info[i - 1] = NULL;
 	}
 	return -ENOMEM;
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index d12b2104d19b..0f574ebc82f4 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -658,7 +658,6 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu);
 int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu);
 
 void *kvm_kvzalloc(unsigned long size);
-void kvm_kvfree(const void *addr);
 
 #ifndef __KVM_HAVE_ARCH_VM_ALLOC
 static inline struct kvm *kvm_arch_alloc_vm(void)
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index b1d6a161eafa..07064dc18f97 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -539,20 +539,12 @@ void *kvm_kvzalloc(unsigned long size)
 		return kzalloc(size, GFP_KERNEL);
 }
 
-void kvm_kvfree(const void *addr)
-{
-	if (is_vmalloc_addr(addr))
-		vfree(addr);
-	else
-		kfree(addr);
-}
-
 static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot)
 {
 	if (!memslot->dirty_bitmap)
 		return;
 
-	kvm_kvfree(memslot->dirty_bitmap);
+	kvfree(memslot->dirty_bitmap);
 	memslot->dirty_bitmap = NULL;
 }
 
-- 
cgit v1.2.3


From 5cb56059c94ddfaf92567a1c6443deec8363ae1c Mon Sep 17 00:00:00 2001
From: Joel Schopp
Date: Mon, 2 Mar 2015 13:43:31 -0600
Subject: kvm: x86: make kvm_emulate_* consistant

Currently kvm_emulate() skips the instruction but kvm_emulate_* sometimes
don't.  The end reult is the caller ends up doing the skip themselves.
Let's make them consistant.

Signed-off-by: Joel Schopp <joel.schopp@amd.com>
Reviewed-by: Radim Krčmář <rkrcmar@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  1 +
 arch/x86/kvm/svm.c              |  2 --
 arch/x86/kvm/vmx.c              |  9 +++------
 arch/x86/kvm/x86.c              | 23 ++++++++++++++++++++---
 4 files changed, 24 insertions(+), 11 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index a236e39cc385..bf5a1606ccd5 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -933,6 +933,7 @@ struct x86_emulate_ctxt;
 int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port);
 void kvm_emulate_cpuid(struct kvm_vcpu *vcpu);
 int kvm_emulate_halt(struct kvm_vcpu *vcpu);
+int kvm_vcpu_halt(struct kvm_vcpu *vcpu);
 int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu);
 
 void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 93dda3ccff03..16d6e5ca4c03 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1929,14 +1929,12 @@ static int nop_on_interception(struct vcpu_svm *svm)
 static int halt_interception(struct vcpu_svm *svm)
 {
 	svm->next_rip = kvm_rip_read(&svm->vcpu) + 1;
-	skip_emulated_instruction(&svm->vcpu);
 	return kvm_emulate_halt(&svm->vcpu);
 }
 
 static int vmmcall_interception(struct vcpu_svm *svm)
 {
 	svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
-	skip_emulated_instruction(&svm->vcpu);
 	kvm_emulate_hypercall(&svm->vcpu);
 	return 1;
 }
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index f7b20b417a3a..fbd949909628 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -5000,7 +5000,7 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu,
 		if (emulate_instruction(vcpu, 0) == EMULATE_DONE) {
 			if (vcpu->arch.halt_request) {
 				vcpu->arch.halt_request = 0;
-				return kvm_emulate_halt(vcpu);
+				return kvm_vcpu_halt(vcpu);
 			}
 			return 1;
 		}
@@ -5527,13 +5527,11 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu)
 
 static int handle_halt(struct kvm_vcpu *vcpu)
 {
-	skip_emulated_instruction(vcpu);
 	return kvm_emulate_halt(vcpu);
 }
 
 static int handle_vmcall(struct kvm_vcpu *vcpu)
 {
-	skip_emulated_instruction(vcpu);
 	kvm_emulate_hypercall(vcpu);
 	return 1;
 }
@@ -5564,7 +5562,6 @@ static int handle_rdpmc(struct kvm_vcpu *vcpu)
 
 static int handle_wbinvd(struct kvm_vcpu *vcpu)
 {
-	skip_emulated_instruction(vcpu);
 	kvm_emulate_wbinvd(vcpu);
 	return 1;
 }
@@ -5903,7 +5900,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
 
 		if (vcpu->arch.halt_request) {
 			vcpu->arch.halt_request = 0;
-			ret = kvm_emulate_halt(vcpu);
+			ret = kvm_vcpu_halt(vcpu);
 			goto out;
 		}
 
@@ -9518,7 +9515,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
 	vmcs12->launch_state = 1;
 
 	if (vmcs12->guest_activity_state == GUEST_ACTIVITY_HLT)
-		return kvm_emulate_halt(vcpu);
+		return kvm_vcpu_halt(vcpu);
 
 	vmx->nested.nested_run_pending = 1;
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index c5f7e035e0f1..d1a1feaa522b 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4706,7 +4706,7 @@ static void emulator_invlpg(struct x86_emulate_ctxt *ctxt, ulong address)
 	kvm_mmu_invlpg(emul_to_vcpu(ctxt), address);
 }
 
-int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu)
+int kvm_emulate_wbinvd_noskip(struct kvm_vcpu *vcpu)
 {
 	if (!need_emulate_wbinvd(vcpu))
 		return X86EMUL_CONTINUE;
@@ -4723,11 +4723,19 @@ int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu)
 		wbinvd();
 	return X86EMUL_CONTINUE;
 }
+
+int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu)
+{
+	kvm_x86_ops->skip_emulated_instruction(vcpu);
+	return kvm_emulate_wbinvd_noskip(vcpu);
+}
 EXPORT_SYMBOL_GPL(kvm_emulate_wbinvd);
 
+
+
 static void emulator_wbinvd(struct x86_emulate_ctxt *ctxt)
 {
-	kvm_emulate_wbinvd(emul_to_vcpu(ctxt));
+	kvm_emulate_wbinvd_noskip(emul_to_vcpu(ctxt));
 }
 
 int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest)
@@ -5817,7 +5825,7 @@ void kvm_arch_exit(void)
 	free_percpu(shared_msrs);
 }
 
-int kvm_emulate_halt(struct kvm_vcpu *vcpu)
+int kvm_vcpu_halt(struct kvm_vcpu *vcpu)
 {
 	++vcpu->stat.halt_exits;
 	if (irqchip_in_kernel(vcpu->kvm)) {
@@ -5828,6 +5836,13 @@ int kvm_emulate_halt(struct kvm_vcpu *vcpu)
 		return 0;
 	}
 }
+EXPORT_SYMBOL_GPL(kvm_vcpu_halt);
+
+int kvm_emulate_halt(struct kvm_vcpu *vcpu)
+{
+	kvm_x86_ops->skip_emulated_instruction(vcpu);
+	return kvm_vcpu_halt(vcpu);
+}
 EXPORT_SYMBOL_GPL(kvm_emulate_halt);
 
 int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
@@ -5912,6 +5927,8 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
 	unsigned long nr, a0, a1, a2, a3, ret;
 	int op_64_bit, r = 1;
 
+	kvm_x86_ops->skip_emulated_instruction(vcpu);
+
 	if (kvm_hv_hypercall_enabled(vcpu->kvm))
 		return kvm_hv_hypercall(vcpu);
 
-- 
cgit v1.2.3


From dab429a798a8ab3377136e09dda55ea75a41648d Mon Sep 17 00:00:00 2001
From: David Kaplan
Date: Mon, 2 Mar 2015 13:43:37 -0600
Subject: kvm: svm: make wbinvd faster

No need to re-decode WBINVD since we know what it is from the intercept.

Signed-off-by: David Kaplan <David.Kaplan@amd.com>
[extracted from larger unlrelated patch, forward ported, tested,style cleanup]
Signed-off-by: Joel Schopp <joel.schopp@amd.com>
Reviewed-by: Radim Krčmář <rkrcmar@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/svm.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 16d6e5ca4c03..8f665851724f 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -2774,6 +2774,12 @@ static int skinit_interception(struct vcpu_svm *svm)
 	return 1;
 }
 
+static int wbinvd_interception(struct vcpu_svm *svm)
+{
+	kvm_emulate_wbinvd(&svm->vcpu);
+	return 1;
+}
+
 static int xsetbv_interception(struct vcpu_svm *svm)
 {
 	u64 new_bv = kvm_read_edx_eax(&svm->vcpu);
@@ -3373,7 +3379,7 @@ static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = {
 	[SVM_EXIT_STGI]				= stgi_interception,
 	[SVM_EXIT_CLGI]				= clgi_interception,
 	[SVM_EXIT_SKINIT]			= skinit_interception,
-	[SVM_EXIT_WBINVD]                       = emulate_on_interception,
+	[SVM_EXIT_WBINVD]                       = wbinvd_interception,
 	[SVM_EXIT_MONITOR]			= monitor_interception,
 	[SVM_EXIT_MWAIT]			= mwait_interception,
 	[SVM_EXIT_XSETBV]			= xsetbv_interception,
-- 
cgit v1.2.3


From dc9be0fac70a2ad86e31a81372bb0bdfb6945353 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini
Date: Thu, 5 Mar 2015 11:54:46 +0100
Subject: kvm: move advertising of KVM_CAP_IRQFD to common code

POWER supports irqfds but forgot to advertise them.  Some userspace does
not check for the capability, but others check it---thus they work on
x86 and s390 but not POWER.

To avoid that other architectures in the future make the same mistake, let
common code handle KVM_CAP_IRQFD the same way as KVM_CAP_IRQFD_RESAMPLE.

Reported-and-tested-by: Greg Kurz <gkurz@linux.vnet.ibm.com>
Cc: stable@vger.kernel.org
Fixes: 297e21053a52f060944e9f0de4c64fad9bcd72fc
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/s390/kvm/kvm-s390.c | 1 -
 arch/x86/kvm/x86.c       | 1 -
 virt/kvm/kvm_main.c      | 1 +
 3 files changed, 1 insertion(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index f6579cfde2df..19e17bd7aec0 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -165,7 +165,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 	case KVM_CAP_ONE_REG:
 	case KVM_CAP_ENABLE_CAP:
 	case KVM_CAP_S390_CSS_SUPPORT:
-	case KVM_CAP_IRQFD:
 	case KVM_CAP_IOEVENTFD:
 	case KVM_CAP_DEVICE_CTRL:
 	case KVM_CAP_ENABLE_CAP_VM:
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index bd7a70be41b3..32bf19ef3115 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2744,7 +2744,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 	case KVM_CAP_USER_NMI:
 	case KVM_CAP_REINJECT_CONTROL:
 	case KVM_CAP_IRQ_INJECT_STATUS:
-	case KVM_CAP_IRQFD:
 	case KVM_CAP_IOEVENTFD:
 	case KVM_CAP_IOEVENTFD_NO_LENGTH:
 	case KVM_CAP_PIT2:
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index a1093700f3a4..a2214d9609bd 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2492,6 +2492,7 @@ static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
 	case KVM_CAP_SIGNAL_MSI:
 #endif
 #ifdef CONFIG_HAVE_KVM_IRQFD
+	case KVM_CAP_IRQFD:
 	case KVM_CAP_IRQFD_RESAMPLE:
 #endif
 	case KVM_CAP_CHECK_EXTENSION_VM:
-- 
cgit v1.2.3


From 2a442c9c6453d3d043dfd89f2e03a1deff8a6f06 Mon Sep 17 00:00:00 2001
From: Paul E. McKenney
Date: Wed, 25 Feb 2015 11:42:15 -0800
Subject: x86: Use common outgoing-CPU-notification code

This commit removes the open-coded CPU-offline notification with new
common code.  Among other things, this change avoids calling scheduler
code using RCU from an offline CPU that RCU is ignoring.  It also allows
Xen to notice at online time that the CPU did not go offline correctly.
Note that Xen has the surviving CPU carry out some cleanup operations,
so if the surviving CPU times out, these cleanup operations might have
been carried out while the outgoing CPU was still running.  It might
therefore be unwise to bring this CPU back online, and this commit
avoids doing so.

Signed-off-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: <x86@kernel.org>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: David Vrabel <david.vrabel@citrix.com>
Cc: <xen-devel@lists.xenproject.org>
---
 arch/x86/include/asm/cpu.h |  2 --
 arch/x86/include/asm/smp.h |  2 +-
 arch/x86/kernel/smpboot.c  | 39 ++++++++++++++++++---------------------
 arch/x86/xen/smp.c         | 46 +++++++++++++++++++++++++---------------------
 4 files changed, 44 insertions(+), 45 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/cpu.h b/arch/x86/include/asm/cpu.h
index d2b12988d2ed..bf2caa1dedc5 100644
--- a/arch/x86/include/asm/cpu.h
+++ b/arch/x86/include/asm/cpu.h
@@ -34,8 +34,6 @@ extern int _debug_hotplug_cpu(int cpu, int action);
 #endif
 #endif
 
-DECLARE_PER_CPU(int, cpu_state);
-
 int mwait_usable(const struct cpuinfo_x86 *);
 
 #endif /* _ASM_X86_CPU_H */
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index 8cd1cc3bc835..a5cb4f6e9492 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -150,12 +150,12 @@ static inline void arch_send_call_function_ipi_mask(const struct cpumask *mask)
 }
 
 void cpu_disable_common(void);
-void cpu_die_common(unsigned int cpu);
 void native_smp_prepare_boot_cpu(void);
 void native_smp_prepare_cpus(unsigned int max_cpus);
 void native_smp_cpus_done(unsigned int max_cpus);
 int native_cpu_up(unsigned int cpunum, struct task_struct *tidle);
 int native_cpu_disable(void);
+int common_cpu_die(unsigned int cpu);
 void native_cpu_die(unsigned int cpu);
 void native_play_dead(void);
 void play_dead_common(void);
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index febc6aabc72e..c8fa34963ead 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -77,9 +77,6 @@
 #include <asm/realmode.h>
 #include <asm/misc.h>
 
-/* State of each CPU */
-DEFINE_PER_CPU(int, cpu_state) = { 0 };
-
 /* Number of siblings per CPU package */
 int smp_num_siblings = 1;
 EXPORT_SYMBOL(smp_num_siblings);
@@ -257,7 +254,7 @@ static void notrace start_secondary(void *unused)
 	lock_vector_lock();
 	set_cpu_online(smp_processor_id(), true);
 	unlock_vector_lock();
-	per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
+	cpu_set_state_online(smp_processor_id());
 	x86_platform.nmi_init();
 
 	/* enable local interrupts */
@@ -948,7 +945,10 @@ int native_cpu_up(unsigned int cpu, struct task_struct *tidle)
 	 */
 	mtrr_save_state();
 
-	per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
+	/* x86 CPUs take themselves offline, so delayed offline is OK. */
+	err = cpu_check_up_prepare(cpu);
+	if (err && err != -EBUSY)
+		return err;
 
 	/* the FPU context is blank, nobody can own it */
 	__cpu_disable_lazy_restore(cpu);
@@ -1191,7 +1191,7 @@ void __init native_smp_prepare_boot_cpu(void)
 	switch_to_new_gdt(me);
 	/* already set me in cpu_online_mask in boot_cpu_init() */
 	cpumask_set_cpu(me, cpu_callout_mask);
-	per_cpu(cpu_state, me) = CPU_ONLINE;
+	cpu_set_state_online(me);
 }
 
 void __init native_smp_cpus_done(unsigned int max_cpus)
@@ -1318,14 +1318,10 @@ static void __ref remove_cpu_from_maps(int cpu)
 	numa_remove_cpu(cpu);
 }
 
-static DEFINE_PER_CPU(struct completion, die_complete);
-
 void cpu_disable_common(void)
 {
 	int cpu = smp_processor_id();
 
-	init_completion(&per_cpu(die_complete, smp_processor_id()));
-
 	remove_siblinginfo(cpu);
 
 	/* It's now safe to remove this processor from the online map */
@@ -1349,24 +1345,27 @@ int native_cpu_disable(void)
 	return 0;
 }
 
-void cpu_die_common(unsigned int cpu)
+int common_cpu_die(unsigned int cpu)
 {
-	wait_for_completion_timeout(&per_cpu(die_complete, cpu), HZ);
-}
+	int ret = 0;
 
-void native_cpu_die(unsigned int cpu)
-{
 	/* We don't do anything here: idle task is faking death itself. */
 
-	cpu_die_common(cpu);
-
 	/* They ack this in play_dead() by setting CPU_DEAD */
-	if (per_cpu(cpu_state, cpu) == CPU_DEAD) {
+	if (cpu_wait_death(cpu, 5)) {
 		if (system_state == SYSTEM_RUNNING)
 			pr_info("CPU %u is now offline\n", cpu);
 	} else {
 		pr_err("CPU %u didn't die...\n", cpu);
+		ret = -1;
 	}
+
+	return ret;
+}
+
+void native_cpu_die(unsigned int cpu)
+{
+	common_cpu_die(cpu);
 }
 
 void play_dead_common(void)
@@ -1375,10 +1374,8 @@ void play_dead_common(void)
 	reset_lazy_tlbstate();
 	amd_e400_remove_cpu(raw_smp_processor_id());
 
-	mb();
 	/* Ack it */
-	__this_cpu_write(cpu_state, CPU_DEAD);
-	complete(&per_cpu(die_complete, smp_processor_id()));
+	(void)cpu_report_death();
 
 	/*
 	 * With physical CPU hotplug, we should halt the cpu
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index 08e8489c47f1..1c5e760f34ca 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -90,14 +90,10 @@ static void cpu_bringup(void)
 
 	set_cpu_online(cpu, true);
 
-	this_cpu_write(cpu_state, CPU_ONLINE);
-
-	wmb();
+	cpu_set_state_online(cpu);  /* Implies full memory barrier. */
 
 	/* We can take interrupts now: we're officially "up". */
 	local_irq_enable();
-
-	wmb();			/* make sure everything is out */
 }
 
 /*
@@ -459,7 +455,13 @@ static int xen_cpu_up(unsigned int cpu, struct task_struct *idle)
 	xen_setup_timer(cpu);
 	xen_init_lock_cpu(cpu);
 
-	per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
+	/*
+	 * PV VCPUs are always successfully taken down (see 'while' loop
+	 * in xen_cpu_die()), so -EBUSY is an error.
+	 */
+	rc = cpu_check_up_prepare(cpu);
+	if (rc)
+		return rc;
 
 	/* make sure interrupts start blocked */
 	per_cpu(xen_vcpu, cpu)->evtchn_upcall_mask = 1;
@@ -479,10 +481,8 @@ static int xen_cpu_up(unsigned int cpu, struct task_struct *idle)
 	rc = HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL);
 	BUG_ON(rc);
 
-	while(per_cpu(cpu_state, cpu) != CPU_ONLINE) {
+	while (cpu_report_state(cpu) != CPU_ONLINE)
 		HYPERVISOR_sched_op(SCHEDOP_yield, NULL);
-		barrier();
-	}
 
 	return 0;
 }
@@ -511,11 +511,11 @@ static void xen_cpu_die(unsigned int cpu)
 		schedule_timeout(HZ/10);
 	}
 
-	cpu_die_common(cpu);
-
-	xen_smp_intr_free(cpu);
-	xen_uninit_lock_cpu(cpu);
-	xen_teardown_timer(cpu);
+	if (common_cpu_die(cpu) == 0) {
+		xen_smp_intr_free(cpu);
+		xen_uninit_lock_cpu(cpu);
+		xen_teardown_timer(cpu);
+	}
 }
 
 static void xen_play_dead(void) /* used only with HOTPLUG_CPU */
@@ -747,6 +747,16 @@ static void __init xen_hvm_smp_prepare_cpus(unsigned int max_cpus)
 static int xen_hvm_cpu_up(unsigned int cpu, struct task_struct *tidle)
 {
 	int rc;
+
+	/*
+	 * This can happen if CPU was offlined earlier and
+	 * offlining timed out in common_cpu_die().
+	 */
+	if (cpu_report_state(cpu) == CPU_DEAD_FROZEN) {
+		xen_smp_intr_free(cpu);
+		xen_uninit_lock_cpu(cpu);
+	}
+
 	/*
 	 * xen_smp_intr_init() needs to run before native_cpu_up()
 	 * so that IPI vectors are set up on the booting CPU before
@@ -768,12 +778,6 @@ static int xen_hvm_cpu_up(unsigned int cpu, struct task_struct *tidle)
 	return rc;
 }
 
-static void xen_hvm_cpu_die(unsigned int cpu)
-{
-	xen_cpu_die(cpu);
-	native_cpu_die(cpu);
-}
-
 void __init xen_hvm_smp_init(void)
 {
 	if (!xen_have_vector_callback)
@@ -781,7 +785,7 @@ void __init xen_hvm_smp_init(void)
 	smp_ops.smp_prepare_cpus = xen_hvm_smp_prepare_cpus;
 	smp_ops.smp_send_reschedule = xen_smp_send_reschedule;
 	smp_ops.cpu_up = xen_hvm_cpu_up;
-	smp_ops.cpu_die = xen_hvm_cpu_die;
+	smp_ops.cpu_die = xen_cpu_die;
 	smp_ops.send_call_func_ipi = xen_smp_send_call_function_ipi;
 	smp_ops.send_call_func_single_ipi = xen_smp_send_call_function_single_ipi;
 	smp_ops.smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu;
-- 
cgit v1.2.3


From 7486341a98f26857f383aec88ffa10950087c3a1 Mon Sep 17 00:00:00 2001
From: Li, Aubrey
Date: Wed, 11 Mar 2015 16:09:00 +0800
Subject: x86/platform, acpi: Bypass legacy PIC and PIT in ACPI hardware
 reduced mode

On a platform in ACPI Hardware-reduced mode, the legacy PIC and
PIT may not be initialized even though they may be present in
silicon. Touching these legacy components causes unexpected
results on the system.

On the Bay Trail-T(ASUS-T100) platform, touching these legacy
components blocks platform hardware low idle power state(S0ix)
during system suspend. So we should bypass them in ACPI hardware
reduced mode.

Suggested-by: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Li Aubrey <aubrey.li@linux.intel.com>
Cc: <alan@linux.intel.com>
Cc: Alan Cox <alan@linux.intel.com>
Cc: H. Peter Anvin <hpa@linux.intel.com>
Cc: Len Brown <len.brown@intel.com>
Cc: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Cc: Rafael J. Wysocki <rjw@rjwysocki.net>
Link: http://lkml.kernel.org/r/54FFF81C.20703@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/acpi/boot.c | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 3d525c6124f6..803b684676ff 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -1337,6 +1337,26 @@ static int __init dmi_ignore_irq0_timer_override(const struct dmi_system_id *d)
 	return 0;
 }
 
+/*
+ * ACPI offers an alternative platform interface model that removes
+ * ACPI hardware requirements for platforms that do not implement
+ * the PC Architecture.
+ *
+ * We initialize the Hardware-reduced ACPI model here:
+ */
+static void __init acpi_reduced_hw_init(void)
+{
+	if (acpi_gbl_reduced_hardware) {
+		/*
+		 * Override x86_init functions and bypass legacy pic
+		 * in Hardware-reduced ACPI mode
+		 */
+		x86_init.timers.timer_init	= x86_init_noop;
+		x86_init.irqs.pre_vector_init	= x86_init_noop;
+		legacy_pic			= &null_legacy_pic;
+	}
+}
+
 /*
  * If your system is blacklisted here, but you find that acpi=force
  * works for you, please contact linux-acpi@vger.kernel.org
@@ -1536,6 +1556,11 @@ int __init early_acpi_boot_init(void)
 	 */
 	early_acpi_process_madt();
 
+	/*
+	 * Hardware-reduced ACPI mode initialization:
+	 */
+	acpi_reduced_hw_init();
+
 	return 0;
 }
 
-- 
cgit v1.2.3


From c8a470cab030bae8f9e6e5cfff72b047b7c627a7 Mon Sep 17 00:00:00 2001
From: Daniel J Blueman
Date: Thu, 12 Mar 2015 16:55:13 +0100
Subject: x86/apic/numachip: Fix sibling map with NumaChip

On NumaChip systems, the physical processor ID assignment wasn't
accounting for the number of nodes in AMD multi-module
processors, giving an incorrect sibling map:

  $ cd /sys/devices/system/cpu/cpu29/topology
  $ grep . *
  core_id:5
  core_siblings:00000000,ff000000
  core_siblings_list:24-31
  physical_package_id:3
  thread_siblings:00000000,30000000
  thread_siblings_list:28-29

This fixes it:

  $ cd /sys/devices/system/cpu/cpu29/topology
  $ grep . *
  core_id:5
  core_siblings:00000000,ffff0000
  core_siblings_list:16-31
  physical_package_id:1
  thread_siblings:00000000,30000000
  thread_siblings_list:28-29

Signed-off-by: Daniel J Blueman <daniel@numascale.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: <stable@vger.kernel.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Steffen Persvold <sp@numascale.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1426135950-10110-1-git-send-email-daniel@numascale.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/apic/apic_numachip.c | 22 ++++++++++++++++------
 1 file changed, 16 insertions(+), 6 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c
index c2fd21fed002..017149cded07 100644
--- a/arch/x86/kernel/apic/apic_numachip.c
+++ b/arch/x86/kernel/apic/apic_numachip.c
@@ -37,10 +37,12 @@ static const struct apic apic_numachip;
 static unsigned int get_apic_id(unsigned long x)
 {
 	unsigned long value;
-	unsigned int id;
+	unsigned int id = (x >> 24) & 0xff;
 
-	rdmsrl(MSR_FAM10H_NODE_ID, value);
-	id = ((x >> 24) & 0xffU) | ((value << 2) & 0xff00U);
+	if (static_cpu_has_safe(X86_FEATURE_NODEID_MSR)) {
+		rdmsrl(MSR_FAM10H_NODE_ID, value);
+		id |= (value << 2) & 0xff00;
+	}
 
 	return id;
 }
@@ -155,10 +157,18 @@ static int __init numachip_probe(void)
 
 static void fixup_cpu_id(struct cpuinfo_x86 *c, int node)
 {
-	if (c->phys_proc_id != node) {
-		c->phys_proc_id = node;
-		per_cpu(cpu_llc_id, smp_processor_id()) = node;
+	u64 val;
+	u32 nodes = 1;
+
+	this_cpu_write(cpu_llc_id, node);
+
+	/* Account for nodes per socket in multi-core-module processors */
+	if (static_cpu_has_safe(X86_FEATURE_NODEID_MSR)) {
+		rdmsrl(MSR_FAM10H_NODE_ID, val);
+		nodes = ((val >> 3) & 7) + 1;
 	}
+
+	c->phys_proc_id = node / nodes;
 }
 
 static int __init numachip_system_init(void)
-- 
cgit v1.2.3


From c1a6bff28cbff796bf6e7db5cf42ec9244911be2 Mon Sep 17 00:00:00 2001
From: Petr Matousek
Date: Wed, 11 Mar 2015 12:16:09 +0100
Subject: kvm: x86: i8259: return initialized data on invalid-size read

If data is read from PIC with invalid access size, the return data stays
uninitialized even though success is returned.

Fix this by always initializing the data.

Signed-off-by: Petr Matousek <pmatouse@redhat.com>
Reported-by: Nadav Amit <nadav.amit@gmail.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/i8259.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index cc31f7c06d3d..9541ba34126b 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -507,6 +507,7 @@ static int picdev_read(struct kvm_pic *s,
 		return -EOPNOTSUPP;
 
 	if (len != 1) {
+		memset(val, 0, len);
 		pr_pic_unimpl("non byte read\n");
 		return 0;
 	}
-- 
cgit v1.2.3


From ccfe8c3f7e52ae83155cb038753f4c75b774ca8a Mon Sep 17 00:00:00 2001
From: Stephan Mueller
Date: Thu, 12 Mar 2015 09:17:51 +0100
Subject: crypto: aesni - fix memory usage in GCM decryption

The kernel crypto API logic requires the caller to provide the
length of (ciphertext || authentication tag) as cryptlen for the
AEAD decryption operation. Thus, the cipher implementation must
calculate the size of the plaintext output itself and cannot simply use
cryptlen.

The RFC4106 GCM decryption operation tries to overwrite cryptlen memory
in req->dst. As the destination buffer for decryption only needs to hold
the plaintext memory but cryptlen references the input buffer holding
(ciphertext || authentication tag), the assumption of the destination
buffer length in RFC4106 GCM operation leads to a too large size. This
patch simply uses the already calculated plaintext size.

In addition, this patch fixes the offset calculation of the AAD buffer
pointer: as mentioned before, cryptlen already includes the size of the
tag. Thus, the tag does not need to be added. With the addition, the AAD
will be written beyond the already allocated buffer.

Note, this fixes a kernel crash that can be triggered from user space
via AF_ALG(aead) -- simply use the libkcapi test application
from [1] and update it to use rfc4106-gcm-aes.

Using [1], the changes were tested using CAVS vectors to demonstrate
that the crypto operation still delivers the right results.

[1] http://www.chronox.de/libkcapi.html

CC: Tadeusz Struk <tadeusz.struk@intel.com>
Cc: stable@vger.kernel.org
Signed-off-by: Stephan Mueller <smueller@chronox.de>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/aesni-intel_glue.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index 947c6bf52c33..54f60ab41c63 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -1155,7 +1155,7 @@ static int __driver_rfc4106_decrypt(struct aead_request *req)
 		src = kmalloc(req->cryptlen + req->assoclen, GFP_ATOMIC);
 		if (!src)
 			return -ENOMEM;
-		assoc = (src + req->cryptlen + auth_tag_len);
+		assoc = (src + req->cryptlen);
 		scatterwalk_map_and_copy(src, req->src, 0, req->cryptlen, 0);
 		scatterwalk_map_and_copy(assoc, req->assoc, 0,
 			req->assoclen, 0);
@@ -1180,7 +1180,7 @@ static int __driver_rfc4106_decrypt(struct aead_request *req)
 		scatterwalk_done(&src_sg_walk, 0, 0);
 		scatterwalk_done(&assoc_sg_walk, 0, 0);
 	} else {
-		scatterwalk_map_and_copy(dst, req->dst, 0, req->cryptlen, 1);
+		scatterwalk_map_and_copy(dst, req->dst, 0, tempCipherLen, 1);
 		kfree(src);
 	}
 	return retval;
-- 
cgit v1.2.3


From 05713ba9055f1a209d50e480626f36c401bda3ad Mon Sep 17 00:00:00 2001
From: Julia Lawall
Date: Wed, 11 Mar 2015 17:56:26 +0100
Subject: crypto: don't export static symbol

The semantic patch that fixes this problem is as follows:
(http://coccinelle.lip6.fr/)

// <smpl>
@r@
type T;
identifier f;
@@

static T f (...) { ... }

@@
identifier r.f;
declarer name EXPORT_SYMBOL_GPL;
@@

-EXPORT_SYMBOL_GPL(f);
// </smpl>

Signed-off-by: Julia Lawall <Julia.Lawall@lip6.fr>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/glue_helper.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/crypto/glue_helper.c b/arch/x86/crypto/glue_helper.c
index 432f1d76ceb8..6a85598931b5 100644
--- a/arch/x86/crypto/glue_helper.c
+++ b/arch/x86/crypto/glue_helper.c
@@ -232,7 +232,6 @@ static void glue_ctr_crypt_final_128bit(const common_glue_ctr_func_t fn_ctr,
 
 	le128_to_be128((be128 *)walk->iv, &ctrblk);
 }
-EXPORT_SYMBOL_GPL(glue_ctr_crypt_final_128bit);
 
 static unsigned int __glue_ctr_crypt_128bit(const struct common_glue_ctx *gctx,
 					    struct blkcipher_desc *desc,
-- 
cgit v1.2.3


From a7c80ebcac3068b1c3cb27d538d29558c30010c8 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov
Date: Fri, 13 Mar 2015 09:53:09 +0100
Subject: x86/fpu: Avoid math_state_restore() without used_math() in
 __restore_xstate_sig()

math_state_restore() assumes it is called with irqs disabled,
but this is not true if the caller is __restore_xstate_sig().

This means that if ia32_fxstate == T and __copy_from_user()
fails, __restore_xstate_sig() returns with irqs disabled too.

This triggers:

  BUG: sleeping function called from invalid context at kernel/locking/rwsem.c:41
   dump_stack
   ___might_sleep
   ? _raw_spin_unlock_irqrestore
   __might_sleep
   down_read
   ? _raw_spin_unlock_irqrestore
   print_vma_addr
   signal_fault
   sys32_rt_sigreturn

Change __restore_xstate_sig() to call set_used_math()
unconditionally. This avoids enabling and disabling interrupts
in math_state_restore(). If copy_from_user() fails, we can
simply do fpu_finit() by hand.

[ Note: this is only the first step. math_state_restore() should
        not check used_math(), it should set this flag. While
	init_fpu() should simply die. ]

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: <stable@vger.kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Pekka Riikonen <priikone@iki.fi>
Cc: Quentin Casasnovas <quentin.casasnovas@oracle.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Suresh Siddha <sbsiddha@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20150307153844.GB25954@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/xsave.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
index 34f66e58a896..cdc6cf903078 100644
--- a/arch/x86/kernel/xsave.c
+++ b/arch/x86/kernel/xsave.c
@@ -379,7 +379,7 @@ int __restore_xstate_sig(void __user *buf, void __user *buf_fx, int size)
 		 * thread's fpu state, reconstruct fxstate from the fsave
 		 * header. Sanitize the copied state etc.
 		 */
-		struct xsave_struct *xsave = &tsk->thread.fpu.state->xsave;
+		struct fpu *fpu = &tsk->thread.fpu;
 		struct user_i387_ia32_struct env;
 		int err = 0;
 
@@ -393,14 +393,15 @@ int __restore_xstate_sig(void __user *buf, void __user *buf_fx, int size)
 		 */
 		drop_fpu(tsk);
 
-		if (__copy_from_user(xsave, buf_fx, state_size) ||
+		if (__copy_from_user(&fpu->state->xsave, buf_fx, state_size) ||
 		    __copy_from_user(&env, buf, sizeof(env))) {
+			fpu_finit(fpu);
 			err = -1;
 		} else {
 			sanitize_restored_xstate(tsk, &env, xstate_bv, fx_only);
-			set_used_math();
 		}
 
+		set_used_math();
 		if (use_eager_fpu()) {
 			preempt_disable();
 			math_state_restore();
-- 
cgit v1.2.3


From f4c3686386393c120710dd34df2a74183ab805fd Mon Sep 17 00:00:00 2001
From: Oleg Nesterov
Date: Fri, 13 Mar 2015 09:53:10 +0100
Subject: x86/fpu: Drop_fpu() should not assume that tsk equals current

drop_fpu() does clear_used_math() and usually this is correct
because tsk == current.

However switch_fpu_finish()->restore_fpu_checking() is called before
__switch_to() updates the "current_task" variable. If it fails,
we will wrongly clear the PF_USED_MATH flag of the previous task.

So use clear_stopped_child_used_math() instead.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Rik van Riel <riel@redhat.com>
Cc: <stable@vger.kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Pekka Riikonen <priikone@iki.fi>
Cc: Quentin Casasnovas <quentin.casasnovas@oracle.com>
Cc: Suresh Siddha <sbsiddha@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20150309171041.GB11388@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/fpu-internal.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/fpu-internal.h b/arch/x86/include/asm/fpu-internal.h
index 0dbc08282291..72ba21a8b5fc 100644
--- a/arch/x86/include/asm/fpu-internal.h
+++ b/arch/x86/include/asm/fpu-internal.h
@@ -370,7 +370,7 @@ static inline void drop_fpu(struct task_struct *tsk)
 	preempt_disable();
 	tsk->thread.fpu_counter = 0;
 	__drop_fpu(tsk);
-	clear_used_math();
+	clear_stopped_child_used_math(tsk);
 	preempt_enable();
 }
 
-- 
cgit v1.2.3


From 670125bda1d86edfadf81dc56a87582ac7fbd47b Mon Sep 17 00:00:00 2001
From: Wincy Van
Date: Wed, 4 Mar 2015 14:31:56 +0800
Subject: KVM: VMX: Set msr bitmap correctly if vcpu is in guest mode

In commit 3af18d9c5fe9 ("KVM: nVMX: Prepare for using hardware MSR bitmap"),
we are setting MSR_BITMAP in prepare_vmcs02 if we should use hardware. This
is not enough since the field will be modified by following vmx_set_efer.

Fix this by setting vmx_msr_bitmap_nested in vmx_set_msr_bitmap if vcpu is
in guest mode.

Signed-off-by: Wincy Van <fanwenyi0529@gmail.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/vmx.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index f7b20b417a3a..10a481b7674d 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2168,7 +2168,10 @@ static void vmx_set_msr_bitmap(struct kvm_vcpu *vcpu)
 {
 	unsigned long *msr_bitmap;
 
-	if (irqchip_in_kernel(vcpu->kvm) && apic_x2apic_mode(vcpu->arch.apic)) {
+	if (is_guest_mode(vcpu))
+		msr_bitmap = vmx_msr_bitmap_nested;
+	else if (irqchip_in_kernel(vcpu->kvm) &&
+		apic_x2apic_mode(vcpu->arch.apic)) {
 		if (is_long_mode(vcpu))
 			msr_bitmap = vmx_msr_bitmap_longmode_x2apic;
 		else
@@ -9218,9 +9221,9 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 	}
 
 	if (cpu_has_vmx_msr_bitmap() &&
-	    exec_control & CPU_BASED_USE_MSR_BITMAPS &&
-	    nested_vmx_merge_msr_bitmap(vcpu, vmcs12)) {
-		vmcs_write64(MSR_BITMAP, __pa(vmx_msr_bitmap_nested));
+	    exec_control & CPU_BASED_USE_MSR_BITMAPS) {
+		nested_vmx_merge_msr_bitmap(vcpu, vmcs12);
+		/* MSR_BITMAP will be set by following vmx_set_efer. */
 	} else
 		exec_control &= ~CPU_BASED_USE_MSR_BITMAPS;
 
-- 
cgit v1.2.3


From d8eb8940417559808fdd0180a4d50f8f0281b822 Mon Sep 17 00:00:00 2001
From: Borislav Petkov
Date: Fri, 13 Mar 2015 14:04:37 +0100
Subject: x86/kexec: Cleanup KEXEC_VERIFY_SIG Kconfig help text

Fix typos and also make it simpler without losing the gist of what it says.

Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vivek Goyal <vgoyal@redhat.com>
Link: http://lkml.kernel.org/r/1426251877-11415-1-git-send-email-bp@alien8.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/Kconfig | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 78a3f674c3eb..867bc5bea8dc 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1742,14 +1742,11 @@ config KEXEC_VERIFY_SIG
 	depends on KEXEC_FILE
 	---help---
 	  This option makes kernel signature verification mandatory for
-	  kexec_file_load() syscall. If kernel is signature can not be
-	  verified, kexec_file_load() will fail.
-
-	  This option enforces signature verification at generic level.
-	  One needs to enable signature verification for type of kernel
-	  image being loaded to make sure it works. For example, enable
-	  bzImage signature verification option to be able to load and
-	  verify signatures of bzImage. Otherwise kernel loading will fail.
+	  the kexec_file_load() syscall.
+
+	  In addition to that option, you need to enable signature
+	  verification for the corresponding kernel image type being
+	  loaded in order for this to work.
 
 config KEXEC_BZIMAGE_VERIFY_SIG
 	bool "Enable bzImage signature verification support"
-- 
cgit v1.2.3


From 5e57518d99725e8b4ee34cc94669afb79e4cfe4e Mon Sep 17 00:00:00 2001
From: David Kaplan
Date: Fri, 6 Mar 2015 14:44:35 -0600
Subject: x86: svm: use cr_interception for SVM_EXIT_CR0_SEL_WRITE

Another patch in my war on emulate_on_interception() use as a svm exit handler.

These were pulled out of a larger patch at the suggestion of Radim Krcmar, see
https://lkml.org/lkml/2015/2/25/559

Changes since v1:
	* fixed typo introduced after test, retested

Signed-off-by: David Kaplan <david.kaplan@amd.com>
[separated out just cr_interception part from larger removal of
INTERCEPT_CR0_WRITE, forward ported, tested]
Signed-off-by: Joel Schopp <joel.schopp@amd.com>
Reviewed-by: Radim Krčmář <rkrcmar@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/svm.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 8f665851724f..001e630ea7c6 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -2944,7 +2944,10 @@ static int cr_interception(struct vcpu_svm *svm)
 		return emulate_on_interception(svm);
 
 	reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK;
-	cr = svm->vmcb->control.exit_code - SVM_EXIT_READ_CR0;
+	if (svm->vmcb->control.exit_code == SVM_EXIT_CR0_SEL_WRITE)
+		cr = SVM_EXIT_WRITE_CR0 - SVM_EXIT_READ_CR0;
+	else
+		cr = svm->vmcb->control.exit_code - SVM_EXIT_READ_CR0;
 
 	err = 0;
 	if (cr >= 16) { /* mov to cr */
@@ -3328,7 +3331,7 @@ static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = {
 	[SVM_EXIT_READ_CR3]			= cr_interception,
 	[SVM_EXIT_READ_CR4]			= cr_interception,
 	[SVM_EXIT_READ_CR8]			= cr_interception,
-	[SVM_EXIT_CR0_SEL_WRITE]		= emulate_on_interception,
+	[SVM_EXIT_CR0_SEL_WRITE]		= cr_interception,
 	[SVM_EXIT_WRITE_CR0]			= cr_interception,
 	[SVM_EXIT_WRITE_CR3]			= cr_interception,
 	[SVM_EXIT_WRITE_CR4]			= cr_interception,
-- 
cgit v1.2.3


From b34a80517bfcd917bc59d9670d8f465a564af3b9 Mon Sep 17 00:00:00 2001
From: Jan Kiszka
Date: Mon, 9 Mar 2015 20:27:43 +0100
Subject: KVM: x86: Fix re-execution of patched vmmcall

For a very long time (since 2b3d2a20), the path handling a vmmcall
instruction of the guest on an Intel host only applied the patch but no
longer handled the hypercall. The reverse case, vmcall on AMD hosts, is
fine. As both em_vmcall and em_vmmcall actually have to do the same, we
can fix the issue by consolidating both into the same handler.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/emulate.c | 17 +++--------------
 1 file changed, 3 insertions(+), 14 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 106c01557f2b..c941abe800ef 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -3323,7 +3323,7 @@ static int em_clts(struct x86_emulate_ctxt *ctxt)
 	return X86EMUL_CONTINUE;
 }
 
-static int em_vmcall(struct x86_emulate_ctxt *ctxt)
+static int em_hypercall(struct x86_emulate_ctxt *ctxt)
 {
 	int rc = ctxt->ops->fix_hypercall(ctxt);
 
@@ -3395,17 +3395,6 @@ static int em_lgdt(struct x86_emulate_ctxt *ctxt)
 	return em_lgdt_lidt(ctxt, true);
 }
 
-static int em_vmmcall(struct x86_emulate_ctxt *ctxt)
-{
-	int rc;
-
-	rc = ctxt->ops->fix_hypercall(ctxt);
-
-	/* Disable writeback. */
-	ctxt->dst.type = OP_NONE;
-	return rc;
-}
-
 static int em_lidt(struct x86_emulate_ctxt *ctxt)
 {
 	return em_lgdt_lidt(ctxt, false);
@@ -3769,7 +3758,7 @@ static int check_perm_out(struct x86_emulate_ctxt *ctxt)
 
 static const struct opcode group7_rm0[] = {
 	N,
-	I(SrcNone | Priv | EmulateOnUD,	em_vmcall),
+	I(SrcNone | Priv | EmulateOnUD,	em_hypercall),
 	N, N, N, N, N, N,
 };
 
@@ -3781,7 +3770,7 @@ static const struct opcode group7_rm1[] = {
 
 static const struct opcode group7_rm3[] = {
 	DIP(SrcNone | Prot | Priv,		vmrun,		check_svme_pa),
-	II(SrcNone  | Prot | EmulateOnUD,	em_vmmcall,	vmmcall),
+	II(SrcNone  | Prot | EmulateOnUD,	em_hypercall,	vmmcall),
 	DIP(SrcNone | Prot | Priv,		vmload,		check_svme_pa),
 	DIP(SrcNone | Prot | Priv,		vmsave,		check_svme_pa),
 	DIP(SrcNone | Prot | Priv,		stgi,		check_svme),
-- 
cgit v1.2.3


From ae1f57670703656cc9f293722c3b8b6782f8ab3f Mon Sep 17 00:00:00 2001
From: Jan Kiszka
Date: Mon, 9 Mar 2015 20:56:43 +0100
Subject: KVM: nVMX: Do not emulate #UD while in guest mode

While in L2, leave all #UD to L2 and do not try to emulate it. If L1 is
interested in doing this, it reports its interest via the exception
bitmap, and we never get into handle_exception of L0 anyway.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/vmx.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index fbd949909628..50c675b46901 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -5065,6 +5065,10 @@ static int handle_exception(struct kvm_vcpu *vcpu)
 	}
 
 	if (is_invalid_opcode(intr_info)) {
+		if (is_guest_mode(vcpu)) {
+			kvm_queue_exception(vcpu, UD_VECTOR);
+			return 1;
+		}
 		er = emulate_instruction(vcpu, EMULTYPE_TRAP_UD);
 		if (er != EMULATE_DONE)
 			kvm_queue_exception(vcpu, UD_VECTOR);
-- 
cgit v1.2.3


From 297d716f6260cc9421d971b124ca196b957ee458 Mon Sep 17 00:00:00 2001
From: Krzysztof Kozlowski
Date: Thu, 12 Mar 2015 08:44:11 +0100
Subject: power_supply: Change ownership from driver to core

Change the ownership of power_supply structure from each driver
implementing the class to the power supply core.

The patch changes power_supply_register() function thus all drivers
implementing power supply class are adjusted.

Each driver provides the implementation of power supply. However it
should not be the owner of power supply class instance because it is
exposed by core to other subsystems with power_supply_get_by_name().
These other subsystems have no knowledge when the driver will unregister
the power supply. This leads to several issues when driver is unbound -
mostly because user of power supply accesses freed memory.

Instead let the core own the instance of struct 'power_supply'.  Other
users of this power supply will still access valid memory because it
will be freed when device reference count reaches 0. Currently this
means "it will leak" but power_supply_put() call in next patches will
solve it.

This solves invalid memory references in following race condition
scenario:

Thread 1: charger manager
Thread 2: power supply driver, used by charger manager

THREAD 1 (charger manager)         THREAD 2 (power supply driver)
==========================         ==============================
psy = power_supply_get_by_name()
                                   Driver unbind, .remove
                                     power_supply_unregister()
                                     Device fully removed
psy->get_property()

The 'get_property' call is executed in invalid context because the driver was
unbound and struct 'power_supply' memory was freed.

This could be observed easily with charger manager driver (here compiled
with max17040 fuel gauge):

$ cat /sys/devices/virtual/power_supply/cm-battery/capacity &
$ echo "1-0036" > /sys/bus/i2c/drivers/max17040/unbind
[   55.725123] Unable to handle kernel NULL pointer dereference at virtual address 00000000
[   55.732584] pgd = d98d4000
[   55.734060] [00000000] *pgd=5afa2831, *pte=00000000, *ppte=00000000
[   55.740318] Internal error: Oops: 80000007 [#1] PREEMPT SMP ARM
[   55.746210] Modules linked in:
[   55.749259] CPU: 1 PID: 2936 Comm: cat Tainted: G        W       3.19.0-rc1-next-20141226-00048-gf79f475f3c44-dirty #1496
[   55.760190] Hardware name: SAMSUNG EXYNOS (Flattened Device Tree)
[   55.766270] task: d9b76f00 ti: daf54000 task.ti: daf54000
[   55.771647] PC is at 0x0
[   55.774182] LR is at charger_get_property+0x2f4/0x36c
[   55.779201] pc : [<00000000>]    lr : [<c034b0b4>]    psr: 60000013
[   55.779201] sp : daf55e90  ip : 00000003  fp : 00000000
[   55.790657] r10: 00000000  r9 : c06e2878  r8 : d9b26c68
[   55.795865] r7 : dad81610  r6 : daec7410  r5 : daf55ebc  r4 : 00000000
[   55.802367] r3 : 00000000  r2 : daf55ebc  r1 : 0000002a  r0 : d9b26c68
[   55.808879] Flags: nZCv  IRQs on  FIQs on  Mode SVC_32  ISA ARM  Segment user
[   55.815994] Control: 10c5387d  Table: 598d406a  DAC: 00000015
[   55.821723] Process cat (pid: 2936, stack limit = 0xdaf54210)
[   55.827451] Stack: (0xdaf55e90 to 0xdaf56000)
[   55.831795] 5e80:                                     60000013 c01459c4 0000002a c06f8ef8
[   55.839956] 5ea0: db651000 c06f8ef8 daebac00 c04cb668 daebac08 c0346864 00000000 c01459c4
[   55.848115] 5ec0: d99eaa80 c06f8ef8 00000fff 00001000 db651000 c027f25c c027f240 d99eaa80
[   55.856274] 5ee0: d9a06c00 c0146218 daf55f18 00001000 d99eaa80 db4c18c0 00000001 00000001
[   55.864468] 5f00: daf55f80 c0144c78 c0144c54 c0107f90 00015000 d99eaab0 00000000 00000000
[   55.872603] 5f20: 000051c7 00000000 db4c18c0 c04a9370 00015000 00001000 daf55f80 00001000
[   55.880763] 5f40: daf54000 00015000 00000000 c00e53dc db4c18c0 c00e548c 0000000d 00008124
[   55.888937] 5f60: 00000001 00000000 00000000 db4c18c0 db4c18c0 00001000 00015000 c00e5550
[   55.897099] 5f80: 00000000 00000000 00001000 00001000 00015000 00000003 00000003 c000f364
[   55.905239] 5fa0: 00000000 c000f1a0 00001000 00015000 00000003 00015000 00001000 0001333c
[   55.913399] 5fc0: 00001000 00015000 00000003 00000003 00000002 00000000 00000000 00000000
[   55.921560] 5fe0: 7fffe000 be999850 0000a225 b6f3c19c 60000010 00000003 00000000 00000000
[   55.929744] [<c034b0b4>] (charger_get_property) from [<c0346864>] (power_supply_show_property+0x48/0x20c)
[   55.939286] [<c0346864>] (power_supply_show_property) from [<c027f25c>] (dev_attr_show+0x1c/0x48)
[   55.948130] [<c027f25c>] (dev_attr_show) from [<c0146218>] (sysfs_kf_seq_show+0x84/0x104)
[   55.956298] [<c0146218>] (sysfs_kf_seq_show) from [<c0144c78>] (kernfs_seq_show+0x24/0x28)
[   55.964536] [<c0144c78>] (kernfs_seq_show) from [<c0107f90>] (seq_read+0x1b0/0x484)
[   55.972172] [<c0107f90>] (seq_read) from [<c00e53dc>] (__vfs_read+0x18/0x4c)
[   55.979188] [<c00e53dc>] (__vfs_read) from [<c00e548c>] (vfs_read+0x7c/0x100)
[   55.986304] [<c00e548c>] (vfs_read) from [<c00e5550>] (SyS_read+0x40/0x8c)
[   55.993164] [<c00e5550>] (SyS_read) from [<c000f1a0>] (ret_fast_syscall+0x0/0x48)
[   56.000626] Code: bad PC value
[   56.011652] ---[ end trace 7b64343fbdae8ef1 ]---

Signed-off-by: Krzysztof Kozlowski <k.kozlowski@samsung.com>
Reviewed-by: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>

[for the nvec part]
Reviewed-by: Marc Dietrich <marvin24@gmx.de>

[for compal-laptop.c]
Acked-by: Darren Hart <dvhart@linux.intel.com>

[for the mfd part]
Acked-by: Lee Jones <lee.jones@linaro.org>

[for the hid part]
Acked-by: Jiri Kosina <jkosina@suse.cz>

[for the acpi part]
Acked-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>

Signed-off-by: Sebastian Reichel <sre@kernel.org>
---
 arch/x86/platform/olpc/olpc-xo1-sci.c     |   4 +-
 arch/x86/platform/olpc/olpc-xo15-sci.c    |   4 +-
 drivers/acpi/ac.c                         |  32 ++--
 drivers/acpi/battery.c                    |  55 ++++---
 drivers/acpi/sbs.c                        |  68 +++++----
 drivers/hid/hid-input.c                   |  51 ++++---
 drivers/hid/hid-sony.c                    |  43 +++---
 drivers/hid/hid-wiimote-modules.c         |  41 ++---
 drivers/hid/hid-wiimote.h                 |   3 +-
 drivers/hid/wacom.h                       |   8 +-
 drivers/hid/wacom_sys.c                   |  71 ++++-----
 drivers/platform/x86/compal-laptop.c      |  29 ++--
 drivers/power/88pm860x_battery.c          |  40 ++---
 drivers/power/88pm860x_charger.c          |  32 ++--
 drivers/power/ab8500_btemp.c              |  68 ++++-----
 drivers/power/ab8500_charger.c            | 136 +++++++++--------
 drivers/power/ab8500_fg.c                 | 124 ++++++---------
 drivers/power/abx500_chargalg.c           |  90 +++++------
 drivers/power/apm_power.c                 |   2 +-
 drivers/power/axp288_fuel_gauge.c         |  47 +++---
 drivers/power/bq2415x_charger.c           |  73 ++++-----
 drivers/power/bq24190_charger.c           | 103 ++++++-------
 drivers/power/bq24735-charger.c           |  45 +++---
 drivers/power/bq27x00_battery.c           |  74 ++++-----
 drivers/power/charger-manager.c           |  54 +++----
 drivers/power/collie_battery.c            |  75 +++++----
 drivers/power/da9030_battery.c            |  33 ++--
 drivers/power/da9052-battery.c            |  25 +--
 drivers/power/da9150-charger.c            |  80 +++++-----
 drivers/power/ds2760_battery.c            |  56 +++----
 drivers/power/ds2780_battery.c            |  45 +++---
 drivers/power/ds2781_battery.c            |  47 +++---
 drivers/power/ds2782_battery.c            |  30 ++--
 drivers/power/generic-adc-battery.c       |  54 ++++---
 drivers/power/goldfish_battery.c          |  63 ++++----
 drivers/power/gpio-charger.c              |  34 +++--
 drivers/power/intel_mid_battery.c         |  57 ++++---
 drivers/power/ipaq_micro_battery.c        |  34 +++--
 drivers/power/isp1704_charger.c           |  49 +++---
 drivers/power/jz4740-battery.c            |  37 +++--
 drivers/power/lp8727_charger.c            |  88 ++++++-----
 drivers/power/lp8788-charger.c            |  55 ++++---
 drivers/power/ltc2941-battery-gauge.c     |  51 ++++---
 drivers/power/max14577_charger.c          |  34 +++--
 drivers/power/max17040_battery.c          |  31 ++--
 drivers/power/max17042_battery.c          |  45 ++++--
 drivers/power/max77693_charger.c          |  32 ++--
 drivers/power/max8903_charger.c           |  52 ++++---
 drivers/power/max8925_power.c             |  88 ++++++-----
 drivers/power/max8997_charger.c           |  31 ++--
 drivers/power/max8998_charger.c           |  32 ++--
 drivers/power/olpc_battery.c              |  54 ++++---
 drivers/power/pcf50633-charger.c          |  95 +++++++-----
 drivers/power/pda_power.c                 |  52 ++++---
 drivers/power/pm2301_charger.c            |  42 +++---
 drivers/power/pm2301_charger.h            |   1 +
 drivers/power/pmu_battery.c               |  42 ++++--
 drivers/power/power_supply_core.c         | 242 ++++++++++++++++++++----------
 drivers/power/power_supply_leds.c         |  25 +--
 drivers/power/power_supply_sysfs.c        |  20 +--
 drivers/power/rt5033_battery.c            |  27 ++--
 drivers/power/rx51_battery.c              |  27 ++--
 drivers/power/s3c_adc_battery.c           |  77 +++++-----
 drivers/power/sbs-battery.c               |  69 +++++----
 drivers/power/smb347-charger.c            | 107 +++++++------
 drivers/power/test_power.c                |  34 +++--
 drivers/power/tosa_battery.c              | 112 ++++++++------
 drivers/power/tps65090-charger.c          |  35 +++--
 drivers/power/twl4030_charger.c           |  65 ++++----
 drivers/power/twl4030_madc_battery.c      |  41 ++---
 drivers/power/wm831x_backup.c             |  26 ++--
 drivers/power/wm831x_power.c              |  95 ++++++------
 drivers/power/wm8350_power.c              |  89 ++++++-----
 drivers/power/wm97xx_battery.c            |  37 ++---
 drivers/power/z2_battery.c                |  60 ++++----
 drivers/staging/nvec/nvec_power.c         |  29 ++--
 include/linux/hid.h                       |   6 +-
 include/linux/mfd/abx500/ux500_chargalg.h |  11 +-
 include/linux/mfd/rt5033.h                |   2 +-
 include/linux/mfd/wm8350/supply.h         |   6 +-
 include/linux/power/charger-manager.h     |   3 +-
 include/linux/power_supply.h              |  49 +++---
 82 files changed, 2296 insertions(+), 1839 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/platform/olpc/olpc-xo1-sci.c b/arch/x86/platform/olpc/olpc-xo1-sci.c
index 9a2e590dd202..e4ed28bbf79d 100644
--- a/arch/x86/platform/olpc/olpc-xo1-sci.c
+++ b/arch/x86/platform/olpc/olpc-xo1-sci.c
@@ -61,7 +61,7 @@ static void battery_status_changed(void)
 
 	if (psy) {
 		power_supply_changed(psy);
-		put_device(psy->dev);
+		put_device(&psy->dev);
 	}
 }
 
@@ -71,7 +71,7 @@ static void ac_status_changed(void)
 
 	if (psy) {
 		power_supply_changed(psy);
-		put_device(psy->dev);
+		put_device(&psy->dev);
 	}
 }
 
diff --git a/arch/x86/platform/olpc/olpc-xo15-sci.c b/arch/x86/platform/olpc/olpc-xo15-sci.c
index 08e350e757dc..186634e9021d 100644
--- a/arch/x86/platform/olpc/olpc-xo15-sci.c
+++ b/arch/x86/platform/olpc/olpc-xo15-sci.c
@@ -83,7 +83,7 @@ static void battery_status_changed(void)
 
 	if (psy) {
 		power_supply_changed(psy);
-		put_device(psy->dev);
+		put_device(&psy->dev);
 	}
 }
 
@@ -93,7 +93,7 @@ static void ac_status_changed(void)
 
 	if (psy) {
 		power_supply_changed(psy);
-		put_device(psy->dev);
+		put_device(&psy->dev);
 	}
 }
 
diff --git a/drivers/acpi/ac.c b/drivers/acpi/ac.c
index 8bf516885ede..bbcc2b5a70d4 100644
--- a/drivers/acpi/ac.c
+++ b/drivers/acpi/ac.c
@@ -95,13 +95,14 @@ static struct acpi_driver acpi_ac_driver = {
 };
 
 struct acpi_ac {
-	struct power_supply charger;
+	struct power_supply *charger;
+	struct power_supply_desc charger_desc;
 	struct acpi_device * device;
 	unsigned long long state;
 	struct notifier_block battery_nb;
 };
 
-#define to_acpi_ac(x) container_of(x, struct acpi_ac, charger)
+#define to_acpi_ac(x) power_supply_get_drvdata(x)
 
 #ifdef CONFIG_ACPI_PROCFS_POWER
 static const struct file_operations acpi_ac_fops = {
@@ -275,7 +276,7 @@ static void acpi_ac_notify(struct acpi_device *device, u32 event)
 						  dev_name(&device->dev), event,
 						  (u32) ac->state);
 		acpi_notifier_call_chain(device, event, (u32) ac->state);
-		kobject_uevent(&ac->charger.dev->kobj, KOBJ_CHANGE);
+		kobject_uevent(&ac->charger->dev.kobj, KOBJ_CHANGE);
 	}
 
 	return;
@@ -321,6 +322,7 @@ static struct dmi_system_id ac_dmi_table[] = {
 
 static int acpi_ac_add(struct acpi_device *device)
 {
+	struct power_supply_config psy_cfg = {};
 	int result = 0;
 	struct acpi_ac *ac = NULL;
 
@@ -341,19 +343,24 @@ static int acpi_ac_add(struct acpi_device *device)
 	if (result)
 		goto end;
 
-	ac->charger.name = acpi_device_bid(device);
+	psy_cfg.drv_data = ac;
+
+	ac->charger_desc.name = acpi_device_bid(device);
 #ifdef CONFIG_ACPI_PROCFS_POWER
 	result = acpi_ac_add_fs(ac);
 	if (result)
 		goto end;
 #endif
-	ac->charger.type = POWER_SUPPLY_TYPE_MAINS;
-	ac->charger.properties = ac_props;
-	ac->charger.num_properties = ARRAY_SIZE(ac_props);
-	ac->charger.get_property = get_ac_property;
-	result = power_supply_register(&ac->device->dev, &ac->charger, NULL);
-	if (result)
+	ac->charger_desc.type = POWER_SUPPLY_TYPE_MAINS;
+	ac->charger_desc.properties = ac_props;
+	ac->charger_desc.num_properties = ARRAY_SIZE(ac_props);
+	ac->charger_desc.get_property = get_ac_property;
+	ac->charger = power_supply_register(&ac->device->dev,
+					    &ac->charger_desc, &psy_cfg);
+	if (IS_ERR(ac->charger)) {
+		result = PTR_ERR(ac->charger);
 		goto end;
+	}
 
 	printk(KERN_INFO PREFIX "%s [%s] (%s)\n",
 	       acpi_device_name(device), acpi_device_bid(device),
@@ -390,7 +397,7 @@ static int acpi_ac_resume(struct device *dev)
 	if (acpi_ac_get_state(ac))
 		return 0;
 	if (old_state != ac->state)
-		kobject_uevent(&ac->charger.dev->kobj, KOBJ_CHANGE);
+		kobject_uevent(&ac->charger->dev.kobj, KOBJ_CHANGE);
 	return 0;
 }
 #else
@@ -407,8 +414,7 @@ static int acpi_ac_remove(struct acpi_device *device)
 
 	ac = acpi_driver_data(device);
 
-	if (ac->charger.dev)
-		power_supply_unregister(&ac->charger);
+	power_supply_unregister(ac->charger);
 	unregister_acpi_notifier(&ac->battery_nb);
 
 #ifdef CONFIG_ACPI_PROCFS_POWER
diff --git a/drivers/acpi/battery.c b/drivers/acpi/battery.c
index fd8c06f492a1..fdc16ce9d272 100644
--- a/drivers/acpi/battery.c
+++ b/drivers/acpi/battery.c
@@ -117,7 +117,8 @@ enum {
 struct acpi_battery {
 	struct mutex lock;
 	struct mutex sysfs_lock;
-	struct power_supply bat;
+	struct power_supply *bat;
+	struct power_supply_desc bat_desc;
 	struct acpi_device *device;
 	struct notifier_block pm_nb;
 	unsigned long update_time;
@@ -149,7 +150,7 @@ struct acpi_battery {
 	unsigned long flags;
 };
 
-#define to_acpi_battery(x) container_of(x, struct acpi_battery, bat)
+#define to_acpi_battery(x) power_supply_get_drvdata(x)
 
 static inline int acpi_battery_present(struct acpi_battery *battery)
 {
@@ -608,41 +609,45 @@ static struct device_attribute alarm_attr = {
 
 static int sysfs_add_battery(struct acpi_battery *battery)
 {
-	int result;
+	struct power_supply_config psy_cfg = { .drv_data = battery, };
 
 	if (battery->power_unit == ACPI_BATTERY_POWER_UNIT_MA) {
-		battery->bat.properties = charge_battery_props;
-		battery->bat.num_properties =
+		battery->bat_desc.properties = charge_battery_props;
+		battery->bat_desc.num_properties =
 			ARRAY_SIZE(charge_battery_props);
 	} else {
-		battery->bat.properties = energy_battery_props;
-		battery->bat.num_properties =
+		battery->bat_desc.properties = energy_battery_props;
+		battery->bat_desc.num_properties =
 			ARRAY_SIZE(energy_battery_props);
 	}
 
-	battery->bat.name = acpi_device_bid(battery->device);
-	battery->bat.type = POWER_SUPPLY_TYPE_BATTERY;
-	battery->bat.get_property = acpi_battery_get_property;
+	battery->bat_desc.name = acpi_device_bid(battery->device);
+	battery->bat_desc.type = POWER_SUPPLY_TYPE_BATTERY;
+	battery->bat_desc.get_property = acpi_battery_get_property;
 
-	result = power_supply_register_no_ws(&battery->device->dev,
-			&battery->bat, NULL);
+	battery->bat = power_supply_register_no_ws(&battery->device->dev,
+				&battery->bat_desc, &psy_cfg);
 
-	if (result)
+	if (IS_ERR(battery->bat)) {
+		int result = PTR_ERR(battery->bat);
+
+		battery->bat = NULL;
 		return result;
-	return device_create_file(battery->bat.dev, &alarm_attr);
+	}
+	return device_create_file(&battery->bat->dev, &alarm_attr);
 }
 
 static void sysfs_remove_battery(struct acpi_battery *battery)
 {
 	mutex_lock(&battery->sysfs_lock);
-	if (!battery->bat.dev) {
+	if (!battery->bat) {
 		mutex_unlock(&battery->sysfs_lock);
 		return;
 	}
 
-	device_remove_file(battery->bat.dev, &alarm_attr);
-	power_supply_unregister(&battery->bat);
-	battery->bat.dev = NULL;
+	device_remove_file(&battery->bat->dev, &alarm_attr);
+	power_supply_unregister(battery->bat);
+	battery->bat = NULL;
 	mutex_unlock(&battery->sysfs_lock);
 }
 
@@ -739,7 +744,7 @@ static int acpi_battery_update(struct acpi_battery *battery, bool resume)
 			return result;
 		acpi_battery_init_alarm(battery);
 	}
-	if (!battery->bat.dev) {
+	if (!battery->bat) {
 		result = sysfs_add_battery(battery);
 		if (result)
 			return result;
@@ -765,7 +770,7 @@ static void acpi_battery_refresh(struct acpi_battery *battery)
 {
 	int power_unit;
 
-	if (!battery->bat.dev)
+	if (!battery->bat)
 		return;
 
 	power_unit = battery->power_unit;
@@ -1063,11 +1068,11 @@ static void acpi_battery_remove_fs(struct acpi_device *device)
 static void acpi_battery_notify(struct acpi_device *device, u32 event)
 {
 	struct acpi_battery *battery = acpi_driver_data(device);
-	struct device *old;
+	struct power_supply *old;
 
 	if (!battery)
 		return;
-	old = battery->bat.dev;
+	old = battery->bat;
 	/*
 	* On Acer Aspire V5-573G notifications are sometimes triggered too
 	* early. For example, when AC is unplugged and notification is
@@ -1084,8 +1089,8 @@ static void acpi_battery_notify(struct acpi_device *device, u32 event)
 					acpi_battery_present(battery));
 	acpi_notifier_call_chain(device, event, acpi_battery_present(battery));
 	/* acpi_battery_update could remove power_supply object */
-	if (old && battery->bat.dev)
-		power_supply_changed(&battery->bat);
+	if (old && battery->bat)
+		power_supply_changed(battery->bat);
 }
 
 static int battery_notify(struct notifier_block *nb,
@@ -1101,7 +1106,7 @@ static int battery_notify(struct notifier_block *nb,
 		if (!acpi_battery_present(battery))
 			return 0;
 
-		if (!battery->bat.dev) {
+		if (battery->bat) {
 			result = acpi_battery_get_info(battery);
 			if (result)
 				return result;
diff --git a/drivers/acpi/sbs.c b/drivers/acpi/sbs.c
index 2038ec1d021d..cd827625cf07 100644
--- a/drivers/acpi/sbs.c
+++ b/drivers/acpi/sbs.c
@@ -74,7 +74,8 @@ static const struct acpi_device_id sbs_device_ids[] = {
 MODULE_DEVICE_TABLE(acpi, sbs_device_ids);
 
 struct acpi_battery {
-	struct power_supply bat;
+	struct power_supply *bat;
+	struct power_supply_desc bat_desc;
 	struct acpi_sbs *sbs;
 	unsigned long update_time;
 	char name[8];
@@ -101,10 +102,10 @@ struct acpi_battery {
 	u8 have_sysfs_alarm:1;
 };
 
-#define to_acpi_battery(x) container_of(x, struct acpi_battery, bat)
+#define to_acpi_battery(x) power_supply_get_drvdata(x)
 
 struct acpi_sbs {
-	struct power_supply charger;
+	struct power_supply *charger;
 	struct acpi_device *device;
 	struct acpi_smb_hc *hc;
 	struct mutex lock;
@@ -115,7 +116,7 @@ struct acpi_sbs {
 	u8 charger_exists:1;
 };
 
-#define to_acpi_sbs(x) container_of(x, struct acpi_sbs, charger)
+#define to_acpi_sbs(x) power_supply_get_drvdata(x)
 
 static int acpi_sbs_remove(struct acpi_device *device);
 static int acpi_battery_get_state(struct acpi_battery *battery);
@@ -303,6 +304,13 @@ static enum power_supply_property sbs_energy_battery_props[] = {
 	POWER_SUPPLY_PROP_MANUFACTURER,
 };
 
+static const struct power_supply_desc acpi_sbs_charger_desc = {
+	.name		= "sbs-charger",
+	.type		= POWER_SUPPLY_TYPE_MAINS,
+	.properties	= sbs_ac_props,
+	.num_properties	= ARRAY_SIZE(sbs_ac_props),
+	.get_property	= sbs_get_ac_property,
+};
 
 /* --------------------------------------------------------------------------
                             Smart Battery System Management
@@ -519,6 +527,7 @@ static int acpi_battery_read(struct acpi_battery *battery)
 static int acpi_battery_add(struct acpi_sbs *sbs, int id)
 {
 	struct acpi_battery *battery = &sbs->battery[id];
+	struct power_supply_config psy_cfg = { .drv_data = battery, };
 	int result;
 
 	battery->id = id;
@@ -528,23 +537,27 @@ static int acpi_battery_add(struct acpi_sbs *sbs, int id)
 		return result;
 
 	sprintf(battery->name, ACPI_BATTERY_DIR_NAME, id);
-	battery->bat.name = battery->name;
-	battery->bat.type = POWER_SUPPLY_TYPE_BATTERY;
+	battery->bat_desc.name = battery->name;
+	battery->bat_desc.type = POWER_SUPPLY_TYPE_BATTERY;
 	if (!acpi_battery_mode(battery)) {
-		battery->bat.properties = sbs_charge_battery_props;
-		battery->bat.num_properties =
+		battery->bat_desc.properties = sbs_charge_battery_props;
+		battery->bat_desc.num_properties =
 		    ARRAY_SIZE(sbs_charge_battery_props);
 	} else {
-		battery->bat.properties = sbs_energy_battery_props;
-		battery->bat.num_properties =
+		battery->bat_desc.properties = sbs_energy_battery_props;
+		battery->bat_desc.num_properties =
 		    ARRAY_SIZE(sbs_energy_battery_props);
 	}
-	battery->bat.get_property = acpi_sbs_battery_get_property;
-	result = power_supply_register(&sbs->device->dev, &battery->bat, NULL);
-	if (result)
+	battery->bat_desc.get_property = acpi_sbs_battery_get_property;
+	battery->bat = power_supply_register(&sbs->device->dev,
+					&battery->bat_desc, &psy_cfg);
+	if (IS_ERR(battery->bat)) {
+		result = PTR_ERR(battery->bat);
+		battery->bat = NULL;
 		goto end;
+	}
 
-	result = device_create_file(battery->bat.dev, &alarm_attr);
+	result = device_create_file(&battery->bat->dev, &alarm_attr);
 	if (result)
 		goto end;
 	battery->have_sysfs_alarm = 1;
@@ -559,28 +572,29 @@ static void acpi_battery_remove(struct acpi_sbs *sbs, int id)
 {
 	struct acpi_battery *battery = &sbs->battery[id];
 
-	if (battery->bat.dev) {
+	if (battery->bat) {
 		if (battery->have_sysfs_alarm)
-			device_remove_file(battery->bat.dev, &alarm_attr);
-		power_supply_unregister(&battery->bat);
+			device_remove_file(&battery->bat->dev, &alarm_attr);
+		power_supply_unregister(battery->bat);
 	}
 }
 
 static int acpi_charger_add(struct acpi_sbs *sbs)
 {
 	int result;
+	struct power_supply_config psy_cfg = { .drv_data = sbs, };
 
 	result = acpi_ac_get_present(sbs);
 	if (result)
 		goto end;
 
 	sbs->charger_exists = 1;
-	sbs->charger.name = "sbs-charger";
-	sbs->charger.type = POWER_SUPPLY_TYPE_MAINS;
-	sbs->charger.properties = sbs_ac_props;
-	sbs->charger.num_properties = ARRAY_SIZE(sbs_ac_props);
-	sbs->charger.get_property = sbs_get_ac_property;
-	power_supply_register(&sbs->device->dev, &sbs->charger, NULL);
+	sbs->charger = power_supply_register(&sbs->device->dev,
+					&acpi_sbs_charger_desc, &psy_cfg);
+	if (IS_ERR(sbs->charger)) {
+		result = PTR_ERR(sbs->charger);
+		sbs->charger = NULL;
+	}
 	printk(KERN_INFO PREFIX "%s [%s]: AC Adapter [%s] (%s)\n",
 	       ACPI_SBS_DEVICE_NAME, acpi_device_bid(sbs->device),
 	       ACPI_AC_DIR_NAME, sbs->charger_present ? "on-line" : "off-line");
@@ -590,8 +604,8 @@ static int acpi_charger_add(struct acpi_sbs *sbs)
 
 static void acpi_charger_remove(struct acpi_sbs *sbs)
 {
-	if (sbs->charger.dev)
-		power_supply_unregister(&sbs->charger);
+	if (sbs->charger)
+		power_supply_unregister(sbs->charger);
 }
 
 static void acpi_sbs_callback(void *context)
@@ -605,7 +619,7 @@ static void acpi_sbs_callback(void *context)
 	if (sbs->charger_exists) {
 		acpi_ac_get_present(sbs);
 		if (sbs->charger_present != saved_charger_state)
-			kobject_uevent(&sbs->charger.dev->kobj, KOBJ_CHANGE);
+			kobject_uevent(&sbs->charger->dev.kobj, KOBJ_CHANGE);
 	}
 
 	if (sbs->manager_present) {
@@ -617,7 +631,7 @@ static void acpi_sbs_callback(void *context)
 			acpi_battery_read(bat);
 			if (saved_battery_state == bat->present)
 				continue;
-			kobject_uevent(&bat->bat.dev->kobj, KOBJ_CHANGE);
+			kobject_uevent(&bat->bat->dev.kobj, KOBJ_CHANGE);
 		}
 	}
 }
diff --git a/drivers/hid/hid-input.c b/drivers/hid/hid-input.c
index 6182265a6571..5d5a8c42645f 100644
--- a/drivers/hid/hid-input.c
+++ b/drivers/hid/hid-input.c
@@ -339,7 +339,7 @@ static int hidinput_get_battery_property(struct power_supply *psy,
 					 enum power_supply_property prop,
 					 union power_supply_propval *val)
 {
-	struct hid_device *dev = container_of(psy, struct hid_device, battery);
+	struct hid_device *dev = power_supply_get_drvdata(psy);
 	int ret = 0;
 	__u8 *buf;
 
@@ -397,26 +397,32 @@ static int hidinput_get_battery_property(struct power_supply *psy,
 
 static bool hidinput_setup_battery(struct hid_device *dev, unsigned report_type, struct hid_field *field)
 {
-	struct power_supply *battery = &dev->battery;
-	int ret;
+	struct power_supply_desc *psy_desc = NULL;
+	struct power_supply_config psy_cfg = { .drv_data = dev, };
 	unsigned quirks;
 	s32 min, max;
 
 	if (field->usage->hid != HID_DC_BATTERYSTRENGTH)
 		return false;	/* no match */
 
-	if (battery->name != NULL)
+	if (dev->battery != NULL)
 		goto out;	/* already initialized? */
 
-	battery->name = kasprintf(GFP_KERNEL, "hid-%s-battery", dev->uniq);
-	if (battery->name == NULL)
+	psy_desc = kzalloc(sizeof(*psy_desc), GFP_KERNEL);
+	if (psy_desc == NULL)
 		goto out;
 
-	battery->type = POWER_SUPPLY_TYPE_BATTERY;
-	battery->properties = hidinput_battery_props;
-	battery->num_properties = ARRAY_SIZE(hidinput_battery_props);
-	battery->use_for_apm = 0;
-	battery->get_property = hidinput_get_battery_property;
+	psy_desc->name = kasprintf(GFP_KERNEL, "hid-%s-battery", dev->uniq);
+	if (psy_desc->name == NULL) {
+		kfree(psy_desc);
+		goto out;
+	}
+
+	psy_desc->type = POWER_SUPPLY_TYPE_BATTERY;
+	psy_desc->properties = hidinput_battery_props;
+	psy_desc->num_properties = ARRAY_SIZE(hidinput_battery_props);
+	psy_desc->use_for_apm = 0;
+	psy_desc->get_property = hidinput_get_battery_property;
 
 	quirks = find_battery_quirk(dev);
 
@@ -439,14 +445,16 @@ static bool hidinput_setup_battery(struct hid_device *dev, unsigned report_type,
 	dev->battery_report_type = report_type;
 	dev->battery_report_id = field->report->id;
 
-	ret = power_supply_register(&dev->dev, battery, NULL);
-	if (ret != 0) {
-		hid_warn(dev, "can't register power supply: %d\n", ret);
-		kfree(battery->name);
-		battery->name = NULL;
+	dev->battery = power_supply_register(&dev->dev, psy_desc, &psy_cfg);
+	if (IS_ERR(dev->battery)) {
+		hid_warn(dev, "can't register power supply: %ld\n",
+				PTR_ERR(dev->battery));
+		kfree(psy_desc->name);
+		kfree(psy_desc);
+		dev->battery = NULL;
 	}
 
-	power_supply_powers(battery, &dev->dev);
+	power_supply_powers(dev->battery, &dev->dev);
 
 out:
 	return true;
@@ -454,12 +462,13 @@ out:
 
 static void hidinput_cleanup_battery(struct hid_device *dev)
 {
-	if (!dev->battery.name)
+	if (!dev->battery)
 		return;
 
-	power_supply_unregister(&dev->battery);
-	kfree(dev->battery.name);
-	dev->battery.name = NULL;
+	power_supply_unregister(dev->battery);
+	kfree(dev->battery->desc->name);
+	kfree(dev->battery->desc);
+	dev->battery = NULL;
 }
 #else  /* !CONFIG_HID_BATTERY_STRENGTH */
 static bool hidinput_setup_battery(struct hid_device *dev, unsigned report_type,
diff --git a/drivers/hid/hid-sony.c b/drivers/hid/hid-sony.c
index 16fc6a17ac63..65007ec2d96e 100644
--- a/drivers/hid/hid-sony.c
+++ b/drivers/hid/hid-sony.c
@@ -815,7 +815,8 @@ struct sony_sc {
 	struct led_classdev *leds[MAX_LEDS];
 	unsigned long quirks;
 	struct work_struct state_worker;
-	struct power_supply battery;
+	struct power_supply *battery;
+	struct power_supply_desc battery_desc;
 	int device_id;
 	__u8 *output_report_dmabuf;
 
@@ -1660,7 +1661,7 @@ static int sony_battery_get_property(struct power_supply *psy,
 				     enum power_supply_property psp,
 				     union power_supply_propval *val)
 {
-	struct sony_sc *sc = container_of(psy, struct sony_sc, battery);
+	struct sony_sc *sc = power_supply_get_drvdata(psy);
 	unsigned long flags;
 	int ret = 0;
 	u8 battery_charging, battery_capacity, cable_state;
@@ -1699,6 +1700,7 @@ static int sony_battery_get_property(struct power_supply *psy,
 
 static int sony_battery_probe(struct sony_sc *sc)
 {
+	struct power_supply_config psy_cfg = { .drv_data = sc, };
 	struct hid_device *hdev = sc->hdev;
 	int ret;
 
@@ -1708,39 +1710,42 @@ static int sony_battery_probe(struct sony_sc *sc)
 	 */
 	sc->battery_capacity = 100;
 
-	sc->battery.properties = sony_battery_props;
-	sc->battery.num_properties = ARRAY_SIZE(sony_battery_props);
-	sc->battery.get_property = sony_battery_get_property;
-	sc->battery.type = POWER_SUPPLY_TYPE_BATTERY;
-	sc->battery.use_for_apm = 0;
-	sc->battery.name = kasprintf(GFP_KERNEL, "sony_controller_battery_%pMR",
-				     sc->mac_address);
-	if (!sc->battery.name)
+	sc->battery_desc.properties = sony_battery_props;
+	sc->battery_desc.num_properties = ARRAY_SIZE(sony_battery_props);
+	sc->battery_desc.get_property = sony_battery_get_property;
+	sc->battery_desc.type = POWER_SUPPLY_TYPE_BATTERY;
+	sc->battery_desc.use_for_apm = 0;
+	sc->battery_desc.name = kasprintf(GFP_KERNEL,
+					  "sony_controller_battery_%pMR",
+					  sc->mac_address);
+	if (!sc->battery_desc.name)
 		return -ENOMEM;
 
-	ret = power_supply_register(&hdev->dev, &sc->battery, NULL);
-	if (ret) {
+	sc->battery = power_supply_register(&hdev->dev, &sc->battery_desc,
+					    &psy_cfg);
+	if (IS_ERR(sc->battery)) {
+		ret = PTR_ERR(sc->battery);
 		hid_err(hdev, "Unable to register battery device\n");
 		goto err_free;
 	}
 
-	power_supply_powers(&sc->battery, &hdev->dev);
+	power_supply_powers(sc->battery, &hdev->dev);
 	return 0;
 
 err_free:
-	kfree(sc->battery.name);
-	sc->battery.name = NULL;
+	kfree(sc->battery_desc.name);
+	sc->battery_desc.name = NULL;
 	return ret;
 }
 
 static void sony_battery_remove(struct sony_sc *sc)
 {
-	if (!sc->battery.name)
+	if (!sc->battery_desc.name)
 		return;
 
-	power_supply_unregister(&sc->battery);
-	kfree(sc->battery.name);
-	sc->battery.name = NULL;
+	power_supply_unregister(sc->battery);
+	kfree(sc->battery_desc.name);
+	sc->battery_desc.name = NULL;
 }
 
 /*
diff --git a/drivers/hid/hid-wiimote-modules.c b/drivers/hid/hid-wiimote-modules.c
index 91cb00abcb8e..05e23c417d50 100644
--- a/drivers/hid/hid-wiimote-modules.c
+++ b/drivers/hid/hid-wiimote-modules.c
@@ -203,8 +203,7 @@ static int wiimod_battery_get_property(struct power_supply *psy,
 				       enum power_supply_property psp,
 				       union power_supply_propval *val)
 {
-	struct wiimote_data *wdata = container_of(psy, struct wiimote_data,
-						  battery);
+	struct wiimote_data *wdata = power_supply_get_drvdata(psy);
 	int ret = 0, state;
 	unsigned long flags;
 
@@ -238,42 +237,46 @@ static int wiimod_battery_get_property(struct power_supply *psy,
 static int wiimod_battery_probe(const struct wiimod_ops *ops,
 				struct wiimote_data *wdata)
 {
+	struct power_supply_config psy_cfg = { .drv_data = wdata, };
 	int ret;
 
-	wdata->battery.properties = wiimod_battery_props;
-	wdata->battery.num_properties = ARRAY_SIZE(wiimod_battery_props);
-	wdata->battery.get_property = wiimod_battery_get_property;
-	wdata->battery.type = POWER_SUPPLY_TYPE_BATTERY;
-	wdata->battery.use_for_apm = 0;
-	wdata->battery.name = kasprintf(GFP_KERNEL, "wiimote_battery_%s",
-					wdata->hdev->uniq);
-	if (!wdata->battery.name)
+	wdata->battery_desc.properties = wiimod_battery_props;
+	wdata->battery_desc.num_properties = ARRAY_SIZE(wiimod_battery_props);
+	wdata->battery_desc.get_property = wiimod_battery_get_property;
+	wdata->battery_desc.type = POWER_SUPPLY_TYPE_BATTERY;
+	wdata->battery_desc.use_for_apm = 0;
+	wdata->battery_desc.name = kasprintf(GFP_KERNEL, "wiimote_battery_%s",
+					     wdata->hdev->uniq);
+	if (!wdata->battery_desc.name)
 		return -ENOMEM;
 
-	ret = power_supply_register(&wdata->hdev->dev, &wdata->battery, NULL);
-	if (ret) {
+	wdata->battery = power_supply_register(&wdata->hdev->dev,
+					       &wdata->battery_desc,
+					       &psy_cfg);
+	if (IS_ERR(wdata->battery)) {
 		hid_err(wdata->hdev, "cannot register battery device\n");
+		ret = PTR_ERR(wdata->battery);
 		goto err_free;
 	}
 
-	power_supply_powers(&wdata->battery, &wdata->hdev->dev);
+	power_supply_powers(wdata->battery, &wdata->hdev->dev);
 	return 0;
 
 err_free:
-	kfree(wdata->battery.name);
-	wdata->battery.name = NULL;
+	kfree(wdata->battery_desc.name);
+	wdata->battery_desc.name = NULL;
 	return ret;
 }
 
 static void wiimod_battery_remove(const struct wiimod_ops *ops,
 				  struct wiimote_data *wdata)
 {
-	if (!wdata->battery.name)
+	if (!wdata->battery_desc.name)
 		return;
 
-	power_supply_unregister(&wdata->battery);
-	kfree(wdata->battery.name);
-	wdata->battery.name = NULL;
+	power_supply_unregister(wdata->battery);
+	kfree(wdata->battery_desc.name);
+	wdata->battery_desc.name = NULL;
 }
 
 static const struct wiimod_ops wiimod_battery = {
diff --git a/drivers/hid/hid-wiimote.h b/drivers/hid/hid-wiimote.h
index 10934aa129fb..875694d43e4d 100644
--- a/drivers/hid/hid-wiimote.h
+++ b/drivers/hid/hid-wiimote.h
@@ -147,7 +147,8 @@ struct wiimote_data {
 	struct led_classdev *leds[4];
 	struct input_dev *accel;
 	struct input_dev *ir;
-	struct power_supply battery;
+	struct power_supply *battery;
+	struct power_supply_desc battery_desc;
 	struct input_dev *mp;
 	struct timer_list timer;
 	struct wiimote_debug *debug;
diff --git a/drivers/hid/wacom.h b/drivers/hid/wacom.h
index 7db432809e9e..0d0d0dd89d17 100644
--- a/drivers/hid/wacom.h
+++ b/drivers/hid/wacom.h
@@ -119,8 +119,10 @@ struct wacom {
 		u8 img_lum;   /* OLED matrix display brightness */
 	} led;
 	bool led_initialized;
-	struct power_supply battery;
-	struct power_supply ac;
+	struct power_supply *battery;
+	struct power_supply *ac;
+	struct power_supply_desc battery_desc;
+	struct power_supply_desc ac_desc;
 };
 
 static inline void wacom_schedule_work(struct wacom_wac *wacom_wac)
@@ -133,7 +135,7 @@ static inline void wacom_notify_battery(struct wacom_wac *wacom_wac)
 {
 	struct wacom *wacom = container_of(wacom_wac, struct wacom, wacom_wac);
 
-	power_supply_changed(&wacom->battery);
+	power_supply_changed(wacom->battery);
 }
 
 extern const struct hid_device_id wacom_ids[];
diff --git a/drivers/hid/wacom_sys.c b/drivers/hid/wacom_sys.c
index 148949c0e039..ba9af470bea0 100644
--- a/drivers/hid/wacom_sys.c
+++ b/drivers/hid/wacom_sys.c
@@ -944,7 +944,7 @@ static int wacom_battery_get_property(struct power_supply *psy,
 				      enum power_supply_property psp,
 				      union power_supply_propval *val)
 {
-	struct wacom *wacom = container_of(psy, struct wacom, battery);
+	struct wacom *wacom = power_supply_get_drvdata(psy);
 	int ret = 0;
 
 	switch (psp) {
@@ -976,7 +976,7 @@ static int wacom_ac_get_property(struct power_supply *psy,
 				enum power_supply_property psp,
 				union power_supply_propval *val)
 {
-	struct wacom *wacom = container_of(psy, struct wacom, ac);
+	struct wacom *wacom = power_supply_get_drvdata(psy);
 	int ret = 0;
 
 	switch (psp) {
@@ -998,43 +998,46 @@ static int wacom_ac_get_property(struct power_supply *psy,
 static int wacom_initialize_battery(struct wacom *wacom)
 {
 	static atomic_t battery_no = ATOMIC_INIT(0);
-	int error;
+	struct power_supply_config psy_cfg = { .drv_data = wacom, };
 	unsigned long n;
 
 	if (wacom->wacom_wac.features.quirks & WACOM_QUIRK_BATTERY) {
+		struct power_supply_desc *bat_desc = &wacom->battery_desc;
+		struct power_supply_desc *ac_desc = &wacom->ac_desc;
 		n = atomic_inc_return(&battery_no) - 1;
 
-		wacom->battery.properties = wacom_battery_props;
-		wacom->battery.num_properties = ARRAY_SIZE(wacom_battery_props);
-		wacom->battery.get_property = wacom_battery_get_property;
+		bat_desc->properties = wacom_battery_props;
+		bat_desc->num_properties = ARRAY_SIZE(wacom_battery_props);
+		bat_desc->get_property = wacom_battery_get_property;
 		sprintf(wacom->wacom_wac.bat_name, "wacom_battery_%ld", n);
-		wacom->battery.name = wacom->wacom_wac.bat_name;
-		wacom->battery.type = POWER_SUPPLY_TYPE_BATTERY;
-		wacom->battery.use_for_apm = 0;
+		bat_desc->name = wacom->wacom_wac.bat_name;
+		bat_desc->type = POWER_SUPPLY_TYPE_BATTERY;
+		bat_desc->use_for_apm = 0;
 
-		wacom->ac.properties = wacom_ac_props;
-		wacom->ac.num_properties = ARRAY_SIZE(wacom_ac_props);
-		wacom->ac.get_property = wacom_ac_get_property;
+		ac_desc->properties = wacom_ac_props;
+		ac_desc->num_properties = ARRAY_SIZE(wacom_ac_props);
+		ac_desc->get_property = wacom_ac_get_property;
 		sprintf(wacom->wacom_wac.ac_name, "wacom_ac_%ld", n);
-		wacom->ac.name = wacom->wacom_wac.ac_name;
-		wacom->ac.type = POWER_SUPPLY_TYPE_MAINS;
-		wacom->ac.use_for_apm = 0;
-
-		error = power_supply_register(&wacom->hdev->dev,
-					      &wacom->battery, NULL);
-		if (error)
-			return error;
-
-		power_supply_powers(&wacom->battery, &wacom->hdev->dev);
-
-		error = power_supply_register(&wacom->hdev->dev, &wacom->ac,
-					      NULL);
-		if (error) {
-			power_supply_unregister(&wacom->battery);
-			return error;
+		ac_desc->name = wacom->wacom_wac.ac_name;
+		ac_desc->type = POWER_SUPPLY_TYPE_MAINS;
+		ac_desc->use_for_apm = 0;
+
+		wacom->battery = power_supply_register(&wacom->hdev->dev,
+					      &wacom->battery_desc, &psy_cfg);
+		if (IS_ERR(wacom->battery))
+			return PTR_ERR(wacom->battery);
+
+		power_supply_powers(wacom->battery, &wacom->hdev->dev);
+
+		wacom->ac = power_supply_register(&wacom->hdev->dev,
+						  &wacom->ac_desc,
+						  &psy_cfg);
+		if (IS_ERR(wacom->ac)) {
+			power_supply_unregister(wacom->battery);
+			return PTR_ERR(wacom->ac);
 		}
 
-		power_supply_powers(&wacom->ac, &wacom->hdev->dev);
+		power_supply_powers(wacom->ac, &wacom->hdev->dev);
 	}
 
 	return 0;
@@ -1043,11 +1046,11 @@ static int wacom_initialize_battery(struct wacom *wacom)
 static void wacom_destroy_battery(struct wacom *wacom)
 {
 	if ((wacom->wacom_wac.features.quirks & WACOM_QUIRK_BATTERY) &&
-	     wacom->battery.dev) {
-		power_supply_unregister(&wacom->battery);
-		wacom->battery.dev = NULL;
-		power_supply_unregister(&wacom->ac);
-		wacom->ac.dev = NULL;
+	     wacom->battery) {
+		power_supply_unregister(wacom->battery);
+		wacom->battery = NULL;
+		power_supply_unregister(wacom->ac);
+		wacom->ac = NULL;
 	}
 }
 
diff --git a/drivers/platform/x86/compal-laptop.c b/drivers/platform/x86/compal-laptop.c
index 041a592fe753..b4e94471f3d5 100644
--- a/drivers/platform/x86/compal-laptop.c
+++ b/drivers/platform/x86/compal-laptop.c
@@ -177,7 +177,7 @@ struct compal_data{
 	unsigned char curr_pwm;
 
 	/* Power supply */
-	struct power_supply psy;
+	struct power_supply *psy;
 	struct power_supply_info psy_info;
 	char bat_model_name[BAT_MODEL_NAME_LEN + 1];
 	char bat_manufacturer_name[BAT_MANUFACTURER_NAME_LEN + 1];
@@ -565,8 +565,7 @@ static int bat_get_property(struct power_supply *psy,
 				enum power_supply_property psp,
 				union power_supply_propval *val)
 {
-	struct compal_data *data;
-	data = container_of(psy, struct compal_data, psy);
+	struct compal_data *data = power_supply_get_drvdata(psy);
 
 	switch (psp) {
 	case POWER_SUPPLY_PROP_STATUS:
@@ -875,13 +874,16 @@ static struct dmi_system_id __initdata compal_dmi_table[] = {
 };
 MODULE_DEVICE_TABLE(dmi, compal_dmi_table);
 
+static const struct power_supply_desc psy_bat_desc = {
+	.name		= DRIVER_NAME,
+	.type		= POWER_SUPPLY_TYPE_BATTERY,
+	.properties	= compal_bat_properties,
+	.num_properties	= ARRAY_SIZE(compal_bat_properties),
+	.get_property	= bat_get_property,
+};
+
 static void initialize_power_supply_data(struct compal_data *data)
 {
-	data->psy.name = DRIVER_NAME;
-	data->psy.type = POWER_SUPPLY_TYPE_BATTERY;
-	data->psy.properties = compal_bat_properties;
-	data->psy.num_properties = ARRAY_SIZE(compal_bat_properties);
-	data->psy.get_property = bat_get_property;
 
 	ec_read_sequence(BAT_MANUFACTURER_NAME_ADDR,
 					data->bat_manufacturer_name,
@@ -1011,6 +1013,7 @@ static int compal_probe(struct platform_device *pdev)
 	int err;
 	struct compal_data *data;
 	struct device *hwmon_dev;
+	struct power_supply_config psy_cfg = {};
 
 	if (!extra_features)
 		return 0;
@@ -1036,9 +1039,13 @@ static int compal_probe(struct platform_device *pdev)
 
 	/* Power supply */
 	initialize_power_supply_data(data);
-	err = power_supply_register(&compal_device->dev, &data->psy, NULL);
-	if (err < 0)
+	psy_cfg.drv_data = data;
+	data->psy = power_supply_register(&compal_device->dev, &psy_bat_desc,
+					  &psy_cfg);
+	if (IS_ERR(data->psy)) {
+		err = PTR_ERR(data->psy);
 		goto remove;
+	}
 
 	platform_set_drvdata(pdev, data);
 
@@ -1073,7 +1080,7 @@ static int compal_remove(struct platform_device *pdev)
 	pwm_disable_control();
 
 	data = platform_get_drvdata(pdev);
-	power_supply_unregister(&data->psy);
+	power_supply_unregister(data->psy);
 
 	sysfs_remove_group(&pdev->dev.kobj, &compal_platform_attr_group);
 
diff --git a/drivers/power/88pm860x_battery.c b/drivers/power/88pm860x_battery.c
index a99d7f2829a7..d49579b227ec 100644
--- a/drivers/power/88pm860x_battery.c
+++ b/drivers/power/88pm860x_battery.c
@@ -98,7 +98,7 @@ struct pm860x_battery_info {
 	struct i2c_client *i2c;
 	struct device *dev;
 
-	struct power_supply battery;
+	struct power_supply *battery;
 	struct mutex lock;
 	int status;
 	int irq_cc;
@@ -798,9 +798,8 @@ out:
 
 static void pm860x_external_power_changed(struct power_supply *psy)
 {
-	struct pm860x_battery_info *info;
+	struct pm860x_battery_info *info = dev_get_drvdata(psy->dev.parent);
 
-	info = container_of(psy, struct pm860x_battery_info, battery);
 	calc_resistor(info);
 }
 
@@ -808,7 +807,7 @@ static int pm860x_batt_get_prop(struct power_supply *psy,
 				enum power_supply_property psp,
 				union power_supply_propval *val)
 {
-	struct pm860x_battery_info *info = dev_get_drvdata(psy->dev->parent);
+	struct pm860x_battery_info *info = dev_get_drvdata(psy->dev.parent);
 	int data;
 	int ret;
 
@@ -874,7 +873,7 @@ static int pm860x_batt_set_prop(struct power_supply *psy,
 				       enum power_supply_property psp,
 				       const union power_supply_propval *val)
 {
-	struct pm860x_battery_info *info = dev_get_drvdata(psy->dev->parent);
+	struct pm860x_battery_info *info = dev_get_drvdata(psy->dev.parent);
 
 	switch (psp) {
 	case POWER_SUPPLY_PROP_CHARGE_FULL:
@@ -901,6 +900,16 @@ static enum power_supply_property pm860x_batt_props[] = {
 	POWER_SUPPLY_PROP_TEMP,
 };
 
+static const struct power_supply_desc pm860x_battery_desc = {
+	.name			= "battery-monitor",
+	.type			= POWER_SUPPLY_TYPE_BATTERY,
+	.properties		= pm860x_batt_props,
+	.num_properties		= ARRAY_SIZE(pm860x_batt_props),
+	.get_property		= pm860x_batt_get_prop,
+	.set_property		= pm860x_batt_set_prop,
+	.external_power_changed	= pm860x_external_power_changed,
+};
+
 static int pm860x_battery_probe(struct platform_device *pdev)
 {
 	struct pm860x_chip *chip = dev_get_drvdata(pdev->dev.parent);
@@ -936,14 +945,6 @@ static int pm860x_battery_probe(struct platform_device *pdev)
 
 	pm860x_init_battery(info);
 
-	info->battery.name = "battery-monitor";
-	info->battery.type = POWER_SUPPLY_TYPE_BATTERY;
-	info->battery.properties = pm860x_batt_props;
-	info->battery.num_properties = ARRAY_SIZE(pm860x_batt_props);
-	info->battery.get_property = pm860x_batt_get_prop;
-	info->battery.set_property = pm860x_batt_set_prop;
-	info->battery.external_power_changed = pm860x_external_power_changed;
-
 	if (pdata && pdata->max_capacity)
 		info->max_capacity = pdata->max_capacity;
 	else
@@ -953,10 +954,11 @@ static int pm860x_battery_probe(struct platform_device *pdev)
 	else
 		info->resistor = 300;	/* set default internal resistor */
 
-	ret = power_supply_register(&pdev->dev, &info->battery, NULL);
-	if (ret)
-		return ret;
-	info->battery.dev->parent = &pdev->dev;
+	info->battery = power_supply_register(&pdev->dev, &pm860x_battery_desc,
+					      NULL);
+	if (IS_ERR(info->battery))
+		return PTR_ERR(info->battery);
+	info->battery->dev.parent = &pdev->dev;
 
 	ret = request_threaded_irq(info->irq_cc, NULL,
 				pm860x_coulomb_handler, IRQF_ONESHOT,
@@ -981,7 +983,7 @@ static int pm860x_battery_probe(struct platform_device *pdev)
 out_coulomb:
 	free_irq(info->irq_cc, info);
 out_reg:
-	power_supply_unregister(&info->battery);
+	power_supply_unregister(info->battery);
 	return ret;
 }
 
@@ -991,7 +993,7 @@ static int pm860x_battery_remove(struct platform_device *pdev)
 
 	free_irq(info->irq_batt, info);
 	free_irq(info->irq_cc, info);
-	power_supply_unregister(&info->battery);
+	power_supply_unregister(info->battery);
 	return 0;
 }
 
diff --git a/drivers/power/88pm860x_charger.c b/drivers/power/88pm860x_charger.c
index 98e31419d9cf..a7f32a5b2299 100644
--- a/drivers/power/88pm860x_charger.c
+++ b/drivers/power/88pm860x_charger.c
@@ -102,7 +102,7 @@ struct pm860x_charger_info {
 	struct i2c_client *i2c_8606;
 	struct device *dev;
 
-	struct power_supply usb;
+	struct power_supply *usb;
 	struct mutex lock;
 	int irq_nums;
 	int irq[7];
@@ -415,7 +415,7 @@ static irqreturn_t pm860x_charger_handler(int irq, void *data)
 
 	set_charging_fsm(info);
 
-	power_supply_changed(&info->usb);
+	power_supply_changed(info->usb);
 out:
 	return IRQ_HANDLED;
 }
@@ -587,8 +587,7 @@ static int pm860x_usb_get_prop(struct power_supply *psy,
 			       enum power_supply_property psp,
 			       union power_supply_propval *val)
 {
-	struct pm860x_charger_info *info =
-	    dev_get_drvdata(psy->dev->parent);
+	struct pm860x_charger_info *info = power_supply_get_drvdata(psy);
 
 	switch (psp) {
 	case POWER_SUPPLY_PROP_STATUS:
@@ -648,6 +647,14 @@ static struct pm860x_irq_desc {
 	{ "vchg", pm860x_vchg_handler },
 };
 
+static const struct power_supply_desc pm860x_charger_desc = {
+	.name		= "usb",
+	.type		= POWER_SUPPLY_TYPE_USB,
+	.properties	= pm860x_usb_props,
+	.num_properties	= ARRAY_SIZE(pm860x_usb_props),
+	.get_property	= pm860x_usb_get_prop,
+};
+
 static int pm860x_charger_probe(struct platform_device *pdev)
 {
 	struct pm860x_chip *chip = dev_get_drvdata(pdev->dev.parent);
@@ -689,16 +696,15 @@ static int pm860x_charger_probe(struct platform_device *pdev)
 	mutex_init(&info->lock);
 	platform_set_drvdata(pdev, info);
 
-	info->usb.name = "usb";
-	info->usb.type = POWER_SUPPLY_TYPE_USB;
-	info->usb.properties = pm860x_usb_props;
-	info->usb.num_properties = ARRAY_SIZE(pm860x_usb_props);
-	info->usb.get_property = pm860x_usb_get_prop;
+	psy_cfg.drv_data = info;
 	psy_cfg.supplied_to = pm860x_supplied_to;
 	psy_cfg.num_supplicants = ARRAY_SIZE(pm860x_supplied_to);
-	ret = power_supply_register(&pdev->dev, &info->usb, &psy_cfg);
-	if (ret)
+	info->usb = power_supply_register(&pdev->dev, &pm860x_charger_desc,
+					  &psy_cfg);
+	if (IS_ERR(info->usb)) {
+		ret = PTR_ERR(info->usb);
 		goto out;
+	}
 
 	pm860x_init_charger(info);
 
@@ -715,7 +721,7 @@ static int pm860x_charger_probe(struct platform_device *pdev)
 	return 0;
 
 out_irq:
-	power_supply_unregister(&info->usb);
+	power_supply_unregister(info->usb);
 	while (--i >= 0)
 		free_irq(info->irq[i], info);
 out:
@@ -727,7 +733,7 @@ static int pm860x_charger_remove(struct platform_device *pdev)
 	struct pm860x_charger_info *info = platform_get_drvdata(pdev);
 	int i;
 
-	power_supply_unregister(&info->usb);
+	power_supply_unregister(info->usb);
 	free_irq(info->irq[0], info);
 	for (i = 0; i < info->irq_nums; i++)
 		free_irq(info->irq[i], info);
diff --git a/drivers/power/ab8500_btemp.c b/drivers/power/ab8500_btemp.c
index 4d18464d6400..8f8044e1acf3 100644
--- a/drivers/power/ab8500_btemp.c
+++ b/drivers/power/ab8500_btemp.c
@@ -45,9 +45,6 @@
 #define BTEMP_BATCTRL_CURR_SRC_60UA	60
 #define BTEMP_BATCTRL_CURR_SRC_120UA	120
 
-#define to_ab8500_btemp_device_info(x) container_of((x), \
-	struct ab8500_btemp, btemp_psy);
-
 /**
  * struct ab8500_btemp_interrupts - ab8500 interrupts
  * @name:	name of the interrupt
@@ -102,7 +99,7 @@ struct ab8500_btemp {
 	struct ab8500_gpadc *gpadc;
 	struct ab8500_fg *fg;
 	struct abx500_bm_data *bm;
-	struct power_supply btemp_psy;
+	struct power_supply *btemp_psy;
 	struct ab8500_btemp_events events;
 	struct ab8500_btemp_ranges btemp_ranges;
 	struct workqueue_struct *btemp_wq;
@@ -654,14 +651,14 @@ static void ab8500_btemp_periodic_work(struct work_struct *work)
 		if ((di->bat_temp != di->prev_bat_temp) || !di->initialized) {
 			di->initialized = true;
 			di->bat_temp = bat_temp;
-			power_supply_changed(&di->btemp_psy);
+			power_supply_changed(di->btemp_psy);
 		}
 	} else if (bat_temp < di->prev_bat_temp) {
 		di->bat_temp--;
-		power_supply_changed(&di->btemp_psy);
+		power_supply_changed(di->btemp_psy);
 	} else if (bat_temp > di->prev_bat_temp) {
 		di->bat_temp++;
-		power_supply_changed(&di->btemp_psy);
+		power_supply_changed(di->btemp_psy);
 	}
 	di->prev_bat_temp = bat_temp;
 
@@ -689,7 +686,7 @@ static irqreturn_t ab8500_btemp_batctrlindb_handler(int irq, void *_di)
 	dev_err(di->dev, "Battery removal detected!\n");
 
 	di->events.batt_rem = true;
-	power_supply_changed(&di->btemp_psy);
+	power_supply_changed(di->btemp_psy);
 
 	return IRQ_HANDLED;
 }
@@ -715,7 +712,7 @@ static irqreturn_t ab8500_btemp_templow_handler(int irq, void *_di)
 		di->events.btemp_high = false;
 		di->events.btemp_medhigh = false;
 		di->events.btemp_lowmed = false;
-		power_supply_changed(&di->btemp_psy);
+		power_supply_changed(di->btemp_psy);
 	}
 
 	return IRQ_HANDLED;
@@ -738,7 +735,7 @@ static irqreturn_t ab8500_btemp_temphigh_handler(int irq, void *_di)
 	di->events.btemp_medhigh = false;
 	di->events.btemp_lowmed = false;
 	di->events.btemp_low = false;
-	power_supply_changed(&di->btemp_psy);
+	power_supply_changed(di->btemp_psy);
 
 	return IRQ_HANDLED;
 }
@@ -760,7 +757,7 @@ static irqreturn_t ab8500_btemp_lowmed_handler(int irq, void *_di)
 	di->events.btemp_medhigh = false;
 	di->events.btemp_high = false;
 	di->events.btemp_low = false;
-	power_supply_changed(&di->btemp_psy);
+	power_supply_changed(di->btemp_psy);
 
 	return IRQ_HANDLED;
 }
@@ -782,7 +779,7 @@ static irqreturn_t ab8500_btemp_medhigh_handler(int irq, void *_di)
 	di->events.btemp_lowmed = false;
 	di->events.btemp_high = false;
 	di->events.btemp_low = false;
-	power_supply_changed(&di->btemp_psy);
+	power_supply_changed(di->btemp_psy);
 
 	return IRQ_HANDLED;
 }
@@ -884,9 +881,7 @@ static int ab8500_btemp_get_property(struct power_supply *psy,
 	enum power_supply_property psp,
 	union power_supply_propval *val)
 {
-	struct ab8500_btemp *di;
-
-	di = to_ab8500_btemp_device_info(psy);
+	struct ab8500_btemp *di = power_supply_get_drvdata(psy);
 
 	switch (psp) {
 	case POWER_SUPPLY_PROP_PRESENT:
@@ -919,14 +914,14 @@ static int ab8500_btemp_get_ext_psy_data(struct device *dev, void *data)
 
 	psy = (struct power_supply *)data;
 	ext = dev_get_drvdata(dev);
-	di = to_ab8500_btemp_device_info(psy);
+	di = power_supply_get_drvdata(psy);
 
 	/*
 	 * For all psy where the name of your driver
 	 * appears in any supplied_to
 	 */
 	for (i = 0; i < ext->num_supplicants; i++) {
-		if (!strcmp(ext->supplied_to[i], psy->name))
+		if (!strcmp(ext->supplied_to[i], psy->desc->name))
 			psy_found = true;
 	}
 
@@ -934,16 +929,16 @@ static int ab8500_btemp_get_ext_psy_data(struct device *dev, void *data)
 		return 0;
 
 	/* Go through all properties for the psy */
-	for (j = 0; j < ext->num_properties; j++) {
+	for (j = 0; j < ext->desc->num_properties; j++) {
 		enum power_supply_property prop;
-		prop = ext->properties[j];
+		prop = ext->desc->properties[j];
 
 		if (power_supply_get_property(ext, prop, &ret))
 			continue;
 
 		switch (prop) {
 		case POWER_SUPPLY_PROP_PRESENT:
-			switch (ext->type) {
+			switch (ext->desc->type) {
 			case POWER_SUPPLY_TYPE_MAINS:
 				/* AC disconnected */
 				if (!ret.intval && di->events.ac_conn) {
@@ -990,10 +985,10 @@ static int ab8500_btemp_get_ext_psy_data(struct device *dev, void *data)
  */
 static void ab8500_btemp_external_power_changed(struct power_supply *psy)
 {
-	struct ab8500_btemp *di = to_ab8500_btemp_device_info(psy);
+	struct ab8500_btemp *di = power_supply_get_drvdata(psy);
 
 	class_for_each_device(power_supply_class, NULL,
-		&di->btemp_psy, ab8500_btemp_get_ext_psy_data);
+		di->btemp_psy, ab8500_btemp_get_ext_psy_data);
 }
 
 /* ab8500 btemp driver interrupts and their respective isr */
@@ -1044,7 +1039,7 @@ static int ab8500_btemp_remove(struct platform_device *pdev)
 	destroy_workqueue(di->btemp_wq);
 
 	flush_scheduled_work();
-	power_supply_unregister(&di->btemp_psy);
+	power_supply_unregister(di->btemp_psy);
 
 	return 0;
 }
@@ -1054,6 +1049,15 @@ static char *supply_interface[] = {
 	"ab8500_fg",
 };
 
+static const struct power_supply_desc ab8500_btemp_desc = {
+	.name			= "ab8500_btemp",
+	.type			= POWER_SUPPLY_TYPE_BATTERY,
+	.properties		= ab8500_btemp_props,
+	.num_properties		= ARRAY_SIZE(ab8500_btemp_props),
+	.get_property		= ab8500_btemp_get_property,
+	.external_power_changed	= ab8500_btemp_external_power_changed,
+};
+
 static int ab8500_btemp_probe(struct platform_device *pdev)
 {
 	struct device_node *np = pdev->dev.of_node;
@@ -1090,17 +1094,9 @@ static int ab8500_btemp_probe(struct platform_device *pdev)
 
 	di->initialized = false;
 
-	/* BTEMP supply */
-	di->btemp_psy.name = "ab8500_btemp";
-	di->btemp_psy.type = POWER_SUPPLY_TYPE_BATTERY;
-	di->btemp_psy.properties = ab8500_btemp_props;
-	di->btemp_psy.num_properties = ARRAY_SIZE(ab8500_btemp_props);
-	di->btemp_psy.get_property = ab8500_btemp_get_property;
-	di->btemp_psy.external_power_changed =
-		ab8500_btemp_external_power_changed;
-
 	psy_cfg.supplied_to = supply_interface;
 	psy_cfg.num_supplicants = ARRAY_SIZE(supply_interface);
+	psy_cfg.drv_data = di;
 
 	/* Create a work queue for the btemp */
 	di->btemp_wq =
@@ -1141,9 +1137,11 @@ static int ab8500_btemp_probe(struct platform_device *pdev)
 	}
 
 	/* Register BTEMP power supply class */
-	ret = power_supply_register(di->dev, &di->btemp_psy, &psy_cfg);
-	if (ret) {
+	di->btemp_psy = power_supply_register(di->dev, &ab8500_btemp_desc,
+					      &psy_cfg);
+	if (IS_ERR(di->btemp_psy)) {
 		dev_err(di->dev, "failed to register BTEMP psy\n");
+		ret = PTR_ERR(di->btemp_psy);
 		goto free_btemp_wq;
 	}
 
@@ -1172,7 +1170,7 @@ static int ab8500_btemp_probe(struct platform_device *pdev)
 	return ret;
 
 free_irq:
-	power_supply_unregister(&di->btemp_psy);
+	power_supply_unregister(di->btemp_psy);
 
 	/* We also have to free all successfully registered irqs */
 	for (i = i - 1; i >= 0; i--) {
diff --git a/drivers/power/ab8500_charger.c b/drivers/power/ab8500_charger.c
index f9eb7fff1d65..e388171f4e58 100644
--- a/drivers/power/ab8500_charger.c
+++ b/drivers/power/ab8500_charger.c
@@ -435,7 +435,7 @@ static void ab8500_charger_set_usb_connected(struct ab8500_charger *di,
 		if (!connected)
 			di->flags.vbus_drop_end = false;
 
-		sysfs_notify(&di->usb_chg.psy.dev->kobj, NULL, "present");
+		sysfs_notify(&di->usb_chg.psy->dev.kobj, NULL, "present");
 
 		if (connected) {
 			mutex_lock(&di->charger_attached_mutex);
@@ -1516,7 +1516,7 @@ static int ab8500_charger_ac_en(struct ux500_charger *charger,
 
 		dev_dbg(di->dev, "%s Disabled AC charging\n", __func__);
 	}
-	ab8500_power_supply_changed(di, &di->ac_chg.psy);
+	ab8500_power_supply_changed(di, di->ac_chg.psy);
 
 	return ret;
 }
@@ -1672,7 +1672,7 @@ static int ab8500_charger_usb_en(struct ux500_charger *charger,
 		cancel_delayed_work(&di->check_vbat_work);
 
 	}
-	ab8500_power_supply_changed(di, &di->usb_chg.psy);
+	ab8500_power_supply_changed(di, di->usb_chg.psy);
 
 	return ret;
 }
@@ -1811,9 +1811,9 @@ static int ab8500_charger_watchdog_kick(struct ux500_charger *charger)
 	int ret;
 	struct ab8500_charger *di;
 
-	if (charger->psy.type == POWER_SUPPLY_TYPE_MAINS)
+	if (charger->psy->desc->type == POWER_SUPPLY_TYPE_MAINS)
 		di = to_ab8500_charger_ac_device_info(charger);
-	else if (charger->psy.type == POWER_SUPPLY_TYPE_USB)
+	else if (charger->psy->desc->type == POWER_SUPPLY_TYPE_USB)
 		di = to_ab8500_charger_usb_device_info(charger);
 	else
 		return -ENXIO;
@@ -1839,9 +1839,9 @@ static int ab8500_charger_update_charger_current(struct ux500_charger *charger,
 	int ret;
 	struct ab8500_charger *di;
 
-	if (charger->psy.type == POWER_SUPPLY_TYPE_MAINS)
+	if (charger->psy->desc->type == POWER_SUPPLY_TYPE_MAINS)
 		di = to_ab8500_charger_ac_device_info(charger);
-	else if (charger->psy.type == POWER_SUPPLY_TYPE_USB)
+	else if (charger->psy->desc->type == POWER_SUPPLY_TYPE_USB)
 		di = to_ab8500_charger_usb_device_info(charger);
 	else
 		return -ENXIO;
@@ -1879,7 +1879,7 @@ static int ab8540_charger_power_path_enable(struct ux500_charger *charger,
 	int ret;
 	struct ab8500_charger *di;
 
-	if (charger->psy.type == POWER_SUPPLY_TYPE_USB)
+	if (charger->psy->desc->type == POWER_SUPPLY_TYPE_USB)
 		di = to_ab8500_charger_usb_device_info(charger);
 	else
 		return -ENXIO;
@@ -1910,7 +1910,7 @@ static int ab8540_charger_usb_pre_chg_enable(struct ux500_charger *charger,
 	int ret;
 	struct ab8500_charger *di;
 
-	if (charger->psy.type == POWER_SUPPLY_TYPE_USB)
+	if (charger->psy->desc->type == POWER_SUPPLY_TYPE_USB)
 		di = to_ab8500_charger_usb_device_info(charger);
 	else
 		return -ENXIO;
@@ -1937,7 +1937,7 @@ static int ab8500_charger_get_ext_psy_data(struct device *dev, void *data)
 	struct ux500_charger *usb_chg;
 
 	usb_chg = (struct ux500_charger *)data;
-	psy = &usb_chg->psy;
+	psy = usb_chg->psy;
 
 	di = to_ab8500_charger_usb_device_info(usb_chg);
 
@@ -1945,7 +1945,7 @@ static int ab8500_charger_get_ext_psy_data(struct device *dev, void *data)
 
 	/* For all psy where the driver name appears in any supplied_to */
 	for (i = 0; i < ext->num_supplicants; i++) {
-		if (!strcmp(ext->supplied_to[i], psy->name))
+		if (!strcmp(ext->supplied_to[i], psy->desc->name))
 			psy_found = true;
 	}
 
@@ -1953,16 +1953,16 @@ static int ab8500_charger_get_ext_psy_data(struct device *dev, void *data)
 		return 0;
 
 	/* Go through all properties for the psy */
-	for (j = 0; j < ext->num_properties; j++) {
+	for (j = 0; j < ext->desc->num_properties; j++) {
 		enum power_supply_property prop;
-		prop = ext->properties[j];
+		prop = ext->desc->properties[j];
 
 		if (power_supply_get_property(ext, prop, &ret))
 			continue;
 
 		switch (prop) {
 		case POWER_SUPPLY_PROP_VOLTAGE_NOW:
-			switch (ext->type) {
+			switch (ext->desc->type) {
 			case POWER_SUPPLY_TYPE_BATTERY:
 				di->vbat = ret.intval / 1000;
 				break;
@@ -1993,7 +1993,7 @@ static void ab8500_charger_check_vbat_work(struct work_struct *work)
 		struct ab8500_charger, check_vbat_work.work);
 
 	class_for_each_device(power_supply_class, NULL,
-		&di->usb_chg.psy, ab8500_charger_get_ext_psy_data);
+		di->usb_chg.psy, ab8500_charger_get_ext_psy_data);
 
 	/* First run old_vbat is 0. */
 	if (di->old_vbat == 0)
@@ -2009,7 +2009,7 @@ static void ab8500_charger_check_vbat_work(struct work_struct *work)
 			di->vbat, di->old_vbat);
 		ab8500_charger_set_vbus_in_curr(di,
 					di->max_usb_in_curr.usb_type_max);
-		power_supply_changed(&di->usb_chg.psy);
+		power_supply_changed(di->usb_chg.psy);
 	}
 
 	di->old_vbat = di->vbat;
@@ -2049,7 +2049,7 @@ static void ab8500_charger_check_hw_failure_work(struct work_struct *work)
 		}
 		if (!(reg_value & MAIN_CH_NOK)) {
 			di->flags.mainextchnotok = false;
-			ab8500_power_supply_changed(di, &di->ac_chg.psy);
+			ab8500_power_supply_changed(di, di->ac_chg.psy);
 		}
 	}
 	if (di->flags.vbus_ovv) {
@@ -2062,7 +2062,7 @@ static void ab8500_charger_check_hw_failure_work(struct work_struct *work)
 		}
 		if (!(reg_value & VBUS_OVV_TH)) {
 			di->flags.vbus_ovv = false;
-			ab8500_power_supply_changed(di, &di->usb_chg.psy);
+			ab8500_power_supply_changed(di, di->usb_chg.psy);
 		}
 	}
 	/* If we still have a failure, schedule a new check */
@@ -2132,8 +2132,8 @@ static void ab8500_charger_ac_work(struct work_struct *work)
 		di->ac.charger_connected = 0;
 	}
 
-	ab8500_power_supply_changed(di, &di->ac_chg.psy);
-	sysfs_notify(&di->ac_chg.psy.dev->kobj, NULL, "present");
+	ab8500_power_supply_changed(di, di->ac_chg.psy);
+	sysfs_notify(&di->ac_chg.psy->dev.kobj, NULL, "present");
 }
 
 static void ab8500_charger_usb_attached_work(struct work_struct *work)
@@ -2240,7 +2240,7 @@ static void ab8500_charger_detect_usb_type_work(struct work_struct *work)
 		dev_dbg(di->dev, "%s di->vbus_detected = false\n", __func__);
 		di->vbus_detected = false;
 		ab8500_charger_set_usb_connected(di, false);
-		ab8500_power_supply_changed(di, &di->usb_chg.psy);
+		ab8500_power_supply_changed(di, di->usb_chg.psy);
 	} else {
 		dev_dbg(di->dev, "%s di->vbus_detected = true\n", __func__);
 		di->vbus_detected = true;
@@ -2250,7 +2250,7 @@ static void ab8500_charger_detect_usb_type_work(struct work_struct *work)
 			if (!ret) {
 				ab8500_charger_set_usb_connected(di, true);
 				ab8500_power_supply_changed(di,
-							    &di->usb_chg.psy);
+							    di->usb_chg.psy);
 			}
 		} else {
 			/*
@@ -2267,7 +2267,7 @@ static void ab8500_charger_detect_usb_type_work(struct work_struct *work)
 					ab8500_charger_set_usb_connected(di,
 						true);
 					ab8500_power_supply_changed(di,
-						&di->usb_chg.psy);
+						di->usb_chg.psy);
 				}
 			}
 		}
@@ -2295,7 +2295,7 @@ static void ab8500_charger_usb_link_attach_work(struct work_struct *work)
 	}
 
 	ab8500_charger_set_usb_connected(di, true);
-	ab8500_power_supply_changed(di, &di->usb_chg.psy);
+	ab8500_power_supply_changed(di, di->usb_chg.psy);
 }
 
 /**
@@ -2393,7 +2393,7 @@ static void ab8500_charger_usb_link_status_work(struct work_struct *work)
 	if (!(detected_chargers & USB_PW_CONN)) {
 		di->vbus_detected = false;
 		ab8500_charger_set_usb_connected(di, false);
-		ab8500_power_supply_changed(di, &di->usb_chg.psy);
+		ab8500_power_supply_changed(di, di->usb_chg.psy);
 		return;
 	}
 
@@ -2404,7 +2404,7 @@ static void ab8500_charger_usb_link_status_work(struct work_struct *work)
 		if (ret == -ENXIO) {
 			/* No valid charger type detected */
 			ab8500_charger_set_usb_connected(di, false);
-			ab8500_power_supply_changed(di, &di->usb_chg.psy);
+			ab8500_power_supply_changed(di, di->usb_chg.psy);
 		}
 		return;
 	}
@@ -2463,7 +2463,7 @@ static void ab8500_charger_usb_state_changed_work(struct work_struct *work)
 	case AB8500_BM_USB_STATE_SUSPEND:
 	case AB8500_BM_USB_STATE_MAX:
 		ab8500_charger_set_usb_connected(di, false);
-		ab8500_power_supply_changed(di, &di->usb_chg.psy);
+		ab8500_power_supply_changed(di, di->usb_chg.psy);
 		break;
 
 	case AB8500_BM_USB_STATE_RESUME:
@@ -2486,7 +2486,7 @@ static void ab8500_charger_usb_state_changed_work(struct work_struct *work)
 				return;
 
 			ab8500_charger_set_usb_connected(di, true);
-			ab8500_power_supply_changed(di, &di->usb_chg.psy);
+			ab8500_power_supply_changed(di, di->usb_chg.psy);
 		}
 		break;
 
@@ -2530,7 +2530,7 @@ static void ab8500_charger_check_usbchargernotok_work(struct work_struct *work)
 	}
 
 	if (prev_status != di->flags.usbchargernotok)
-		ab8500_power_supply_changed(di, &di->usb_chg.psy);
+		ab8500_power_supply_changed(di, di->usb_chg.psy);
 }
 
 /**
@@ -2560,7 +2560,7 @@ static void ab8500_charger_check_main_thermal_prot_work(
 	else
 		di->flags.main_thermal_prot = false;
 
-	ab8500_power_supply_changed(di, &di->ac_chg.psy);
+	ab8500_power_supply_changed(di, di->ac_chg.psy);
 }
 
 /**
@@ -2590,7 +2590,7 @@ static void ab8500_charger_check_usb_thermal_prot_work(
 	else
 		di->flags.usb_thermal_prot = false;
 
-	ab8500_power_supply_changed(di, &di->usb_chg.psy);
+	ab8500_power_supply_changed(di, di->usb_chg.psy);
 }
 
 /**
@@ -2651,7 +2651,7 @@ static irqreturn_t ab8500_charger_mainextchnotok_handler(int irq, void *_di)
 
 	dev_dbg(di->dev, "Main charger not ok\n");
 	di->flags.mainextchnotok = true;
-	ab8500_power_supply_changed(di, &di->ac_chg.psy);
+	ab8500_power_supply_changed(di, di->ac_chg.psy);
 
 	/* Schedule a new HW failure check */
 	queue_delayed_work(di->charger_wq, &di->check_hw_failure_work, 0);
@@ -2880,11 +2880,11 @@ static irqreturn_t ab8500_charger_chwdexp_handler(int irq, void *_di)
 	 */
 	if (di->ac.charger_online) {
 		di->ac.wd_expired = true;
-		ab8500_power_supply_changed(di, &di->ac_chg.psy);
+		ab8500_power_supply_changed(di, di->ac_chg.psy);
 	}
 	if (di->usb.charger_online) {
 		di->usb.wd_expired = true;
-		ab8500_power_supply_changed(di, &di->usb_chg.psy);
+		ab8500_power_supply_changed(di, di->usb_chg.psy);
 	}
 
 	return IRQ_HANDLED;
@@ -2927,7 +2927,7 @@ static irqreturn_t ab8500_charger_vbusovv_handler(int irq, void *_di)
 
 	dev_dbg(di->dev, "VBUS overvoltage detected\n");
 	di->flags.vbus_ovv = true;
-	ab8500_power_supply_changed(di, &di->usb_chg.psy);
+	ab8500_power_supply_changed(di, di->usb_chg.psy);
 
 	/* Schedule a new HW failure check */
 	queue_delayed_work(di->charger_wq, &di->check_hw_failure_work, 0);
@@ -3428,10 +3428,10 @@ static int ab8500_charger_remove(struct platform_device *pdev)
 
 	flush_scheduled_work();
 	if (di->usb_chg.enabled)
-		power_supply_unregister(&di->usb_chg.psy);
+		power_supply_unregister(di->usb_chg.psy);
 
 	if (di->ac_chg.enabled && !di->ac_chg.external)
-		power_supply_unregister(&di->ac_chg.psy);
+		power_supply_unregister(di->ac_chg.psy);
 
 	return 0;
 }
@@ -3442,11 +3442,27 @@ static char *supply_interface[] = {
 	"ab8500_btemp",
 };
 
+static const struct power_supply_desc ab8500_ac_chg_desc = {
+	.name		= "ab8500_ac",
+	.type		= POWER_SUPPLY_TYPE_MAINS,
+	.properties	= ab8500_charger_ac_props,
+	.num_properties	= ARRAY_SIZE(ab8500_charger_ac_props),
+	.get_property	= ab8500_charger_ac_get_property,
+};
+
+static const struct power_supply_desc ab8500_usb_chg_desc = {
+	.name		= "ab8500_usb",
+	.type		= POWER_SUPPLY_TYPE_USB,
+	.properties	= ab8500_charger_usb_props,
+	.num_properties	= ARRAY_SIZE(ab8500_charger_usb_props),
+	.get_property	= ab8500_charger_usb_get_property,
+};
+
 static int ab8500_charger_probe(struct platform_device *pdev)
 {
 	struct device_node *np = pdev->dev.of_node;
 	struct abx500_bm_data *plat = pdev->dev.platform_data;
-	struct power_supply_config psy_cfg = {};
+	struct power_supply_config ac_psy_cfg = {}, usb_psy_cfg = {};
 	struct ab8500_charger *di;
 	int irq, i, charger_status, ret = 0, ch_stat;
 
@@ -3485,16 +3501,14 @@ static int ab8500_charger_probe(struct platform_device *pdev)
 	di->invalid_charger_detect_state = 0;
 
 	/* AC and USB supply config */
-	psy_cfg.supplied_to = supply_interface;
-	psy_cfg.num_supplicants = ARRAY_SIZE(supply_interface);
+	ac_psy_cfg.supplied_to = supply_interface;
+	ac_psy_cfg.num_supplicants = ARRAY_SIZE(supply_interface);
+	ac_psy_cfg.drv_data = &di->ac_chg;
+	usb_psy_cfg.supplied_to = supply_interface;
+	usb_psy_cfg.num_supplicants = ARRAY_SIZE(supply_interface);
+	usb_psy_cfg.drv_data = &di->usb_chg;
 
 	/* AC supply */
-	/* power_supply base class */
-	di->ac_chg.psy.name = "ab8500_ac";
-	di->ac_chg.psy.type = POWER_SUPPLY_TYPE_MAINS;
-	di->ac_chg.psy.properties = ab8500_charger_ac_props;
-	di->ac_chg.psy.num_properties = ARRAY_SIZE(ab8500_charger_ac_props);
-	di->ac_chg.psy.get_property = ab8500_charger_ac_get_property;
 	/* ux500_charger sub-class */
 	di->ac_chg.ops.enable = &ab8500_charger_ac_en;
 	di->ac_chg.ops.check_enable = &ab8500_charger_ac_check_enable;
@@ -3514,12 +3528,6 @@ static int ab8500_charger_probe(struct platform_device *pdev)
 			&charger_notifier_list, &charger_nb);
 
 	/* USB supply */
-	/* power_supply base class */
-	di->usb_chg.psy.name = "ab8500_usb";
-	di->usb_chg.psy.type = POWER_SUPPLY_TYPE_USB;
-	di->usb_chg.psy.properties = ab8500_charger_usb_props;
-	di->usb_chg.psy.num_properties = ARRAY_SIZE(ab8500_charger_usb_props);
-	di->usb_chg.psy.get_property = ab8500_charger_usb_get_property;
 	/* ux500_charger sub-class */
 	di->usb_chg.ops.enable = &ab8500_charger_usb_en;
 	di->usb_chg.ops.check_enable = &ab8500_charger_usb_check_enable;
@@ -3617,20 +3625,24 @@ static int ab8500_charger_probe(struct platform_device *pdev)
 
 	/* Register AC charger class */
 	if (di->ac_chg.enabled) {
-		ret = power_supply_register(di->dev, &di->ac_chg.psy,
-						&psy_cfg);
-		if (ret) {
+		di->ac_chg.psy = power_supply_register(di->dev,
+						       &ab8500_ac_chg_desc,
+						       &ac_psy_cfg);
+		if (IS_ERR(di->ac_chg.psy)) {
 			dev_err(di->dev, "failed to register AC charger\n");
+			ret = PTR_ERR(di->ac_chg.psy);
 			goto free_charger_wq;
 		}
 	}
 
 	/* Register USB charger class */
 	if (di->usb_chg.enabled) {
-		ret = power_supply_register(di->dev, &di->usb_chg.psy,
-						&psy_cfg);
-		if (ret) {
+		di->usb_chg.psy = power_supply_register(di->dev,
+							&ab8500_usb_chg_desc,
+							&usb_psy_cfg);
+		if (IS_ERR(di->usb_chg.psy)) {
 			dev_err(di->dev, "failed to register USB charger\n");
+			ret = PTR_ERR(di->usb_chg.psy);
 			goto free_ac;
 		}
 	}
@@ -3653,8 +3665,8 @@ static int ab8500_charger_probe(struct platform_device *pdev)
 	if (charger_status & AC_PW_CONN) {
 		di->ac.charger_connected = 1;
 		di->ac_conn = true;
-		ab8500_power_supply_changed(di, &di->ac_chg.psy);
-		sysfs_notify(&di->ac_chg.psy.dev->kobj, NULL, "present");
+		ab8500_power_supply_changed(di, di->ac_chg.psy);
+		sysfs_notify(&di->ac_chg.psy->dev.kobj, NULL, "present");
 	}
 
 	if (charger_status & USB_PW_CONN) {
@@ -3715,10 +3727,10 @@ put_usb_phy:
 	usb_put_phy(di->usb_phy);
 free_usb:
 	if (di->usb_chg.enabled)
-		power_supply_unregister(&di->usb_chg.psy);
+		power_supply_unregister(di->usb_chg.psy);
 free_ac:
 	if (di->ac_chg.enabled)
-		power_supply_unregister(&di->ac_chg.psy);
+		power_supply_unregister(di->ac_chg.psy);
 free_charger_wq:
 	destroy_workqueue(di->charger_wq);
 	return ret;
diff --git a/drivers/power/ab8500_fg.c b/drivers/power/ab8500_fg.c
index 7a2e3ac44cf3..3830dade5d69 100644
--- a/drivers/power/ab8500_fg.c
+++ b/drivers/power/ab8500_fg.c
@@ -57,9 +57,6 @@
 #define interpolate(x, x1, y1, x2, y2) \
 	((y1) + ((((y2) - (y1)) * ((x) - (x1))) / ((x2) - (x1))));
 
-#define to_ab8500_fg_device_info(x) container_of((x), \
-	struct ab8500_fg, fg_psy);
-
 /**
  * struct ab8500_fg_interrupts - ab8500 fg interupts
  * @name:	name of the interrupt
@@ -229,7 +226,7 @@ struct ab8500_fg {
 	struct ab8500 *parent;
 	struct ab8500_gpadc *gpadc;
 	struct abx500_bm_data *bm;
-	struct power_supply fg_psy;
+	struct power_supply *fg_psy;
 	struct workqueue_struct *fg_wq;
 	struct delayed_work fg_periodic_work;
 	struct delayed_work fg_low_bat_work;
@@ -1391,7 +1388,7 @@ static void ab8500_fg_check_capacity_limits(struct ab8500_fg *di, bool init)
 				di->bat_cap.prev_percent,
 				di->bat_cap.cap_scale.scaled_cap);
 		}
-		power_supply_changed(&di->fg_psy);
+		power_supply_changed(di->fg_psy);
 		if (di->flags.fully_charged && di->flags.force_full) {
 			dev_dbg(di->dev, "Battery full, notifying.\n");
 			di->flags.force_full = false;
@@ -1850,7 +1847,7 @@ static void ab8500_fg_check_hw_failure_work(struct work_struct *work)
 		if (!di->flags.bat_ovv) {
 			dev_dbg(di->dev, "Battery OVV\n");
 			di->flags.bat_ovv = true;
-			power_supply_changed(&di->fg_psy);
+			power_supply_changed(di->fg_psy);
 		}
 		/* Not yet recovered from ovv, reschedule this test */
 		queue_delayed_work(di->fg_wq, &di->fg_check_hw_failure_work,
@@ -1858,7 +1855,7 @@ static void ab8500_fg_check_hw_failure_work(struct work_struct *work)
 		} else {
 			dev_dbg(di->dev, "Battery recovered from OVV\n");
 			di->flags.bat_ovv = false;
-			power_supply_changed(&di->fg_psy);
+			power_supply_changed(di->fg_psy);
 	}
 }
 
@@ -2096,9 +2093,7 @@ static int ab8500_fg_get_property(struct power_supply *psy,
 	enum power_supply_property psp,
 	union power_supply_propval *val)
 {
-	struct ab8500_fg *di;
-
-	di = to_ab8500_fg_device_info(psy);
+	struct ab8500_fg *di = power_supply_get_drvdata(psy);
 
 	/*
 	 * If battery is identified as unknown and charging of unknown
@@ -2181,14 +2176,14 @@ static int ab8500_fg_get_ext_psy_data(struct device *dev, void *data)
 
 	psy = (struct power_supply *)data;
 	ext = dev_get_drvdata(dev);
-	di = to_ab8500_fg_device_info(psy);
+	di = power_supply_get_drvdata(psy);
 
 	/*
 	 * For all psy where the name of your driver
 	 * appears in any supplied_to
 	 */
 	for (i = 0; i < ext->num_supplicants; i++) {
-		if (!strcmp(ext->supplied_to[i], psy->name))
+		if (!strcmp(ext->supplied_to[i], psy->desc->name))
 			psy_found = true;
 	}
 
@@ -2196,16 +2191,16 @@ static int ab8500_fg_get_ext_psy_data(struct device *dev, void *data)
 		return 0;
 
 	/* Go through all properties for the psy */
-	for (j = 0; j < ext->num_properties; j++) {
+	for (j = 0; j < ext->desc->num_properties; j++) {
 		enum power_supply_property prop;
-		prop = ext->properties[j];
+		prop = ext->desc->properties[j];
 
 		if (power_supply_get_property(ext, prop, &ret))
 			continue;
 
 		switch (prop) {
 		case POWER_SUPPLY_PROP_STATUS:
-			switch (ext->type) {
+			switch (ext->desc->type) {
 			case POWER_SUPPLY_TYPE_BATTERY:
 				switch (ret.intval) {
 				case POWER_SUPPLY_STATUS_UNKNOWN:
@@ -2244,7 +2239,7 @@ static int ab8500_fg_get_ext_psy_data(struct device *dev, void *data)
 			};
 			break;
 		case POWER_SUPPLY_PROP_TECHNOLOGY:
-			switch (ext->type) {
+			switch (ext->desc->type) {
 			case POWER_SUPPLY_TYPE_BATTERY:
 				if (!di->flags.batt_id_received &&
 				    di->bm->batt_id != BATTERY_UNKNOWN) {
@@ -2274,7 +2269,7 @@ static int ab8500_fg_get_ext_psy_data(struct device *dev, void *data)
 			}
 			break;
 		case POWER_SUPPLY_PROP_TEMP:
-			switch (ext->type) {
+			switch (ext->desc->type) {
 			case POWER_SUPPLY_TYPE_BATTERY:
 				if (di->flags.batt_id_received)
 					di->bat_temp = ret.intval;
@@ -2399,10 +2394,10 @@ out:
  */
 static void ab8500_fg_external_power_changed(struct power_supply *psy)
 {
-	struct ab8500_fg *di = to_ab8500_fg_device_info(psy);
+	struct ab8500_fg *di = power_supply_get_drvdata(psy);
 
 	class_for_each_device(power_supply_class, NULL,
-		&di->fg_psy, ab8500_fg_get_ext_psy_data);
+		di->fg_psy, ab8500_fg_get_ext_psy_data);
 }
 
 /**
@@ -2580,9 +2575,7 @@ static ssize_t ab8505_powercut_flagtime_read(struct device *dev,
 	int ret;
 	u8 reg_value;
 	struct power_supply *psy = dev_get_drvdata(dev);
-	struct ab8500_fg *di;
-
-	di = to_ab8500_fg_device_info(psy);
+	struct ab8500_fg *di = power_supply_get_drvdata(psy);
 
 	ret = abx500_get_register_interruptible(di->dev, AB8500_RTC,
 		AB8505_RTC_PCUT_FLAG_TIME_REG, &reg_value);
@@ -2605,9 +2598,7 @@ static ssize_t ab8505_powercut_flagtime_write(struct device *dev,
 	int ret;
 	long unsigned reg_value;
 	struct power_supply *psy = dev_get_drvdata(dev);
-	struct ab8500_fg *di;
-
-	di = to_ab8500_fg_device_info(psy);
+	struct ab8500_fg *di = power_supply_get_drvdata(psy);
 
 	reg_value = simple_strtoul(buf, NULL, 10);
 
@@ -2633,9 +2624,7 @@ static ssize_t ab8505_powercut_maxtime_read(struct device *dev,
 	int ret;
 	u8 reg_value;
 	struct power_supply *psy = dev_get_drvdata(dev);
-	struct ab8500_fg *di;
-
-	di = to_ab8500_fg_device_info(psy);
+	struct ab8500_fg *di = power_supply_get_drvdata(psy);
 
 	ret = abx500_get_register_interruptible(di->dev, AB8500_RTC,
 		AB8505_RTC_PCUT_MAX_TIME_REG, &reg_value);
@@ -2659,9 +2648,7 @@ static ssize_t ab8505_powercut_maxtime_write(struct device *dev,
 	int ret;
 	int reg_value;
 	struct power_supply *psy = dev_get_drvdata(dev);
-	struct ab8500_fg *di;
-
-	di = to_ab8500_fg_device_info(psy);
+	struct ab8500_fg *di = power_supply_get_drvdata(psy);
 
 	reg_value = simple_strtoul(buf, NULL, 10);
 	if (reg_value > 0x7F) {
@@ -2686,9 +2673,7 @@ static ssize_t ab8505_powercut_restart_read(struct device *dev,
 	int ret;
 	u8 reg_value;
 	struct power_supply *psy = dev_get_drvdata(dev);
-	struct ab8500_fg *di;
-
-	di = to_ab8500_fg_device_info(psy);
+	struct ab8500_fg *di = power_supply_get_drvdata(psy);
 
 	ret = abx500_get_register_interruptible(di->dev, AB8500_RTC,
 		AB8505_RTC_PCUT_RESTART_REG, &reg_value);
@@ -2711,9 +2696,7 @@ static ssize_t ab8505_powercut_restart_write(struct device *dev,
 	int ret;
 	int reg_value;
 	struct power_supply *psy = dev_get_drvdata(dev);
-	struct ab8500_fg *di;
-
-	di = to_ab8500_fg_device_info(psy);
+	struct ab8500_fg *di = power_supply_get_drvdata(psy);
 
 	reg_value = simple_strtoul(buf, NULL, 10);
 	if (reg_value > 0xF) {
@@ -2739,9 +2722,7 @@ static ssize_t ab8505_powercut_timer_read(struct device *dev,
 	int ret;
 	u8 reg_value;
 	struct power_supply *psy = dev_get_drvdata(dev);
-	struct ab8500_fg *di;
-
-	di = to_ab8500_fg_device_info(psy);
+	struct ab8500_fg *di = power_supply_get_drvdata(psy);
 
 	ret = abx500_get_register_interruptible(di->dev, AB8500_RTC,
 						AB8505_RTC_PCUT_TIME_REG, &reg_value);
@@ -2764,9 +2745,7 @@ static ssize_t ab8505_powercut_restart_counter_read(struct device *dev,
 	int ret;
 	u8 reg_value;
 	struct power_supply *psy = dev_get_drvdata(dev);
-	struct ab8500_fg *di;
-
-	di = to_ab8500_fg_device_info(psy);
+	struct ab8500_fg *di = power_supply_get_drvdata(psy);
 
 	ret = abx500_get_register_interruptible(di->dev, AB8500_RTC,
 						AB8505_RTC_PCUT_RESTART_REG, &reg_value);
@@ -2789,9 +2768,7 @@ static ssize_t ab8505_powercut_read(struct device *dev,
 	int ret;
 	u8 reg_value;
 	struct power_supply *psy = dev_get_drvdata(dev);
-	struct ab8500_fg *di;
-
-	di = to_ab8500_fg_device_info(psy);
+	struct ab8500_fg *di = power_supply_get_drvdata(psy);
 
 	ret = abx500_get_register_interruptible(di->dev, AB8500_RTC,
 						AB8505_RTC_PCUT_CTL_STATUS_REG, &reg_value);
@@ -2812,9 +2789,7 @@ static ssize_t ab8505_powercut_write(struct device *dev,
 	int ret;
 	int reg_value;
 	struct power_supply *psy = dev_get_drvdata(dev);
-	struct ab8500_fg *di;
-
-	di = to_ab8500_fg_device_info(psy);
+	struct ab8500_fg *di = power_supply_get_drvdata(psy);
 
 	reg_value = simple_strtoul(buf, NULL, 10);
 	if (reg_value > 0x1) {
@@ -2840,9 +2815,7 @@ static ssize_t ab8505_powercut_flag_read(struct device *dev,
 	int ret;
 	u8 reg_value;
 	struct power_supply *psy = dev_get_drvdata(dev);
-	struct ab8500_fg *di;
-
-	di = to_ab8500_fg_device_info(psy);
+	struct ab8500_fg *di = power_supply_get_drvdata(psy);
 
 	ret = abx500_get_register_interruptible(di->dev, AB8500_RTC,
 						AB8505_RTC_PCUT_CTL_STATUS_REG,  &reg_value);
@@ -2865,9 +2838,7 @@ static ssize_t ab8505_powercut_debounce_read(struct device *dev,
 	int ret;
 	u8 reg_value;
 	struct power_supply *psy = dev_get_drvdata(dev);
-	struct ab8500_fg *di;
-
-	di = to_ab8500_fg_device_info(psy);
+	struct ab8500_fg *di = power_supply_get_drvdata(psy);
 
 	ret = abx500_get_register_interruptible(di->dev, AB8500_RTC,
 						AB8505_RTC_PCUT_DEBOUNCE_REG,  &reg_value);
@@ -2890,9 +2861,7 @@ static ssize_t ab8505_powercut_debounce_write(struct device *dev,
 	int ret;
 	int reg_value;
 	struct power_supply *psy = dev_get_drvdata(dev);
-	struct ab8500_fg *di;
-
-	di = to_ab8500_fg_device_info(psy);
+	struct ab8500_fg *di = power_supply_get_drvdata(psy);
 
 	reg_value = simple_strtoul(buf, NULL, 10);
 	if (reg_value > 0x7) {
@@ -2917,9 +2886,7 @@ static ssize_t ab8505_powercut_enable_status_read(struct device *dev,
 	int ret;
 	u8 reg_value;
 	struct power_supply *psy = dev_get_drvdata(dev);
-	struct ab8500_fg *di;
-
-	di = to_ab8500_fg_device_info(psy);
+	struct ab8500_fg *di = power_supply_get_drvdata(psy);
 
 	ret = abx500_get_register_interruptible(di->dev, AB8500_RTC,
 						AB8505_RTC_PCUT_CTL_STATUS_REG, &reg_value);
@@ -2962,15 +2929,16 @@ static int ab8500_fg_sysfs_psy_create_attrs(struct ab8500_fg *di)
 	     abx500_get_chip_id(di->dev) >= AB8500_CUT2P0)
 	    || is_ab8540(di->parent)) {
 		for (i = 0; i < ARRAY_SIZE(ab8505_fg_sysfs_psy_attrs); i++)
-			if (device_create_file(di->fg_psy.dev,
+			if (device_create_file(&di->fg_psy->dev,
 					       &ab8505_fg_sysfs_psy_attrs[i]))
 				goto sysfs_psy_create_attrs_failed_ab8505;
 	}
 	return 0;
 sysfs_psy_create_attrs_failed_ab8505:
-	dev_err(di->fg_psy.dev, "Failed creating sysfs psy attrs for ab8505.\n");
+	dev_err(&di->fg_psy->dev, "Failed creating sysfs psy attrs for ab8505.\n");
 	while (i--)
-		device_remove_file(di->fg_psy.dev, &ab8505_fg_sysfs_psy_attrs[i]);
+		device_remove_file(&di->fg_psy->dev,
+				   &ab8505_fg_sysfs_psy_attrs[i]);
 
 	return -EIO;
 }
@@ -2983,7 +2951,7 @@ static void ab8500_fg_sysfs_psy_remove_attrs(struct ab8500_fg *di)
 	     abx500_get_chip_id(di->dev) >= AB8500_CUT2P0)
 	    || is_ab8540(di->parent)) {
 		for (i = 0; i < ARRAY_SIZE(ab8505_fg_sysfs_psy_attrs); i++)
-			(void)device_remove_file(di->fg_psy.dev,
+			(void)device_remove_file(&di->fg_psy->dev,
 						 &ab8505_fg_sysfs_psy_attrs[i]);
 	}
 }
@@ -3050,7 +3018,7 @@ static int ab8500_fg_remove(struct platform_device *pdev)
 
 	flush_scheduled_work();
 	ab8500_fg_sysfs_psy_remove_attrs(di);
-	power_supply_unregister(&di->fg_psy);
+	power_supply_unregister(di->fg_psy);
 	return ret;
 }
 
@@ -3071,6 +3039,15 @@ static char *supply_interface[] = {
 	"ab8500_usb",
 };
 
+static const struct power_supply_desc ab8500_fg_desc = {
+	.name			= "ab8500_fg",
+	.type			= POWER_SUPPLY_TYPE_BATTERY,
+	.properties		= ab8500_fg_props,
+	.num_properties		= ARRAY_SIZE(ab8500_fg_props),
+	.get_property		= ab8500_fg_get_property,
+	.external_power_changed	= ab8500_fg_external_power_changed,
+};
+
 static int ab8500_fg_probe(struct platform_device *pdev)
 {
 	struct device_node *np = pdev->dev.of_node;
@@ -3107,15 +3084,9 @@ static int ab8500_fg_probe(struct platform_device *pdev)
 	di->parent = dev_get_drvdata(pdev->dev.parent);
 	di->gpadc = ab8500_gpadc_get("ab8500-gpadc.0");
 
-	di->fg_psy.name = "ab8500_fg";
-	di->fg_psy.type = POWER_SUPPLY_TYPE_BATTERY;
-	di->fg_psy.properties = ab8500_fg_props;
-	di->fg_psy.num_properties = ARRAY_SIZE(ab8500_fg_props);
-	di->fg_psy.get_property = ab8500_fg_get_property;
-	di->fg_psy.external_power_changed = ab8500_fg_external_power_changed;
-
 	psy_cfg.supplied_to = supply_interface;
 	psy_cfg.num_supplicants = ARRAY_SIZE(supply_interface);
+	psy_cfg.drv_data = di;
 
 	di->bat_cap.max_mah_design = MILLI_TO_MICRO *
 		di->bm->bat_type[di->bm->batt_id].charge_full_design;
@@ -3176,9 +3147,10 @@ static int ab8500_fg_probe(struct platform_device *pdev)
 	di->flags.batt_id_received = false;
 
 	/* Register FG power supply class */
-	ret = power_supply_register(di->dev, &di->fg_psy, &psy_cfg);
-	if (ret) {
+	di->fg_psy = power_supply_register(di->dev, &ab8500_fg_desc, &psy_cfg);
+	if (IS_ERR(di->fg_psy)) {
 		dev_err(di->dev, "failed to register FG psy\n");
+		ret = PTR_ERR(di->fg_psy);
 		goto free_inst_curr_wq;
 	}
 
@@ -3256,7 +3228,7 @@ static int ab8500_fg_probe(struct platform_device *pdev)
 	return ret;
 
 free_irq:
-	power_supply_unregister(&di->fg_psy);
+	power_supply_unregister(di->fg_psy);
 
 	/* We also have to free all registered irqs */
 	for (i = 0; i < ARRAY_SIZE(ab8500_fg_irq_th); i++) {
diff --git a/drivers/power/abx500_chargalg.c b/drivers/power/abx500_chargalg.c
index ac6f4a22f846..541f702e0451 100644
--- a/drivers/power/abx500_chargalg.c
+++ b/drivers/power/abx500_chargalg.c
@@ -50,9 +50,6 @@
 #define CHARGALG_CURR_STEP_LOW		0
 #define CHARGALG_CURR_STEP_HIGH	100
 
-#define to_abx500_chargalg_device_info(x) container_of((x), \
-	struct abx500_chargalg, chargalg_psy);
-
 enum abx500_chargers {
 	NO_CHG,
 	AC_CHG,
@@ -256,7 +253,7 @@ struct abx500_chargalg {
 	struct ab8500 *parent;
 	struct abx500_chargalg_current_step_status curr_status;
 	struct abx500_bm_data *bm;
-	struct power_supply chargalg_psy;
+	struct power_supply *chargalg_psy;
 	struct ux500_charger *ac_chg;
 	struct ux500_charger *usb_chg;
 	struct abx500_chargalg_events events;
@@ -695,7 +692,7 @@ static void abx500_chargalg_stop_charging(struct abx500_chargalg *di)
 	di->charge_status = POWER_SUPPLY_STATUS_NOT_CHARGING;
 	di->maintenance_chg = false;
 	cancel_delayed_work(&di->chargalg_wd_work);
-	power_supply_changed(&di->chargalg_psy);
+	power_supply_changed(di->chargalg_psy);
 }
 
 /**
@@ -715,7 +712,7 @@ static void abx500_chargalg_hold_charging(struct abx500_chargalg *di)
 	di->charge_status = POWER_SUPPLY_STATUS_CHARGING;
 	di->maintenance_chg = false;
 	cancel_delayed_work(&di->chargalg_wd_work);
-	power_supply_changed(&di->chargalg_psy);
+	power_supply_changed(di->chargalg_psy);
 }
 
 /**
@@ -842,7 +839,7 @@ static void abx500_chargalg_end_of_charge(struct abx500_chargalg *di)
 			di->charge_status = POWER_SUPPLY_STATUS_FULL;
 			di->maintenance_chg = true;
 			dev_dbg(di->dev, "EOC reached!\n");
-			power_supply_changed(&di->chargalg_psy);
+			power_supply_changed(di->chargalg_psy);
 		} else {
 			dev_dbg(di->dev,
 				" EOC limit reached for the %d"
@@ -987,10 +984,10 @@ static int abx500_chargalg_get_ext_psy_data(struct device *dev, void *data)
 
 	psy = (struct power_supply *)data;
 	ext = dev_get_drvdata(dev);
-	di = to_abx500_chargalg_device_info(psy);
+	di = power_supply_get_drvdata(psy);
 	/* For all psy where the driver name appears in any supplied_to */
 	for (i = 0; i < ext->num_supplicants; i++) {
-		if (!strcmp(ext->supplied_to[i], psy->name))
+		if (!strcmp(ext->supplied_to[i], psy->desc->name))
 			psy_found = true;
 	}
 	if (!psy_found)
@@ -1007,23 +1004,25 @@ static int abx500_chargalg_get_ext_psy_data(struct device *dev, void *data)
 	}
 
 	/* Go through all properties for the psy */
-	for (j = 0; j < ext->num_properties; j++) {
+	for (j = 0; j < ext->desc->num_properties; j++) {
 		enum power_supply_property prop;
-		prop = ext->properties[j];
+		prop = ext->desc->properties[j];
 
-		/* Initialize chargers if not already done */
+		/*
+		 * Initialize chargers if not already done.
+		 * The ab8500_charger*/
 		if (!di->ac_chg &&
-			ext->type == POWER_SUPPLY_TYPE_MAINS)
+			ext->desc->type == POWER_SUPPLY_TYPE_MAINS)
 			di->ac_chg = psy_to_ux500_charger(ext);
 		else if (!di->usb_chg &&
-			ext->type == POWER_SUPPLY_TYPE_USB)
+			ext->desc->type == POWER_SUPPLY_TYPE_USB)
 			di->usb_chg = psy_to_ux500_charger(ext);
 
 		if (power_supply_get_property(ext, prop, &ret))
 			continue;
 		switch (prop) {
 		case POWER_SUPPLY_PROP_PRESENT:
-			switch (ext->type) {
+			switch (ext->desc->type) {
 			case POWER_SUPPLY_TYPE_BATTERY:
 				/* Battery present */
 				if (ret.intval)
@@ -1070,7 +1069,7 @@ static int abx500_chargalg_get_ext_psy_data(struct device *dev, void *data)
 			break;
 
 		case POWER_SUPPLY_PROP_ONLINE:
-			switch (ext->type) {
+			switch (ext->desc->type) {
 			case POWER_SUPPLY_TYPE_BATTERY:
 				break;
 			case POWER_SUPPLY_TYPE_MAINS:
@@ -1115,7 +1114,7 @@ static int abx500_chargalg_get_ext_psy_data(struct device *dev, void *data)
 			break;
 
 		case POWER_SUPPLY_PROP_HEALTH:
-			switch (ext->type) {
+			switch (ext->desc->type) {
 			case POWER_SUPPLY_TYPE_BATTERY:
 				break;
 			case POWER_SUPPLY_TYPE_MAINS:
@@ -1198,7 +1197,7 @@ static int abx500_chargalg_get_ext_psy_data(struct device *dev, void *data)
 			break;
 
 		case POWER_SUPPLY_PROP_VOLTAGE_NOW:
-			switch (ext->type) {
+			switch (ext->desc->type) {
 			case POWER_SUPPLY_TYPE_BATTERY:
 				di->batt_data.volt = ret.intval / 1000;
 				break;
@@ -1214,7 +1213,7 @@ static int abx500_chargalg_get_ext_psy_data(struct device *dev, void *data)
 			break;
 
 		case POWER_SUPPLY_PROP_VOLTAGE_AVG:
-			switch (ext->type) {
+			switch (ext->desc->type) {
 			case POWER_SUPPLY_TYPE_MAINS:
 				/* AVG is used to indicate when we are
 				 * in CV mode */
@@ -1239,7 +1238,7 @@ static int abx500_chargalg_get_ext_psy_data(struct device *dev, void *data)
 			break;
 
 		case POWER_SUPPLY_PROP_TECHNOLOGY:
-			switch (ext->type) {
+			switch (ext->desc->type) {
 			case POWER_SUPPLY_TYPE_BATTERY:
 				if (ret.intval)
 					di->events.batt_unknown = false;
@@ -1257,7 +1256,7 @@ static int abx500_chargalg_get_ext_psy_data(struct device *dev, void *data)
 			break;
 
 		case POWER_SUPPLY_PROP_CURRENT_NOW:
-			switch (ext->type) {
+			switch (ext->desc->type) {
 			case POWER_SUPPLY_TYPE_MAINS:
 					di->chg_info.ac_curr =
 						ret.intval / 1000;
@@ -1275,7 +1274,7 @@ static int abx500_chargalg_get_ext_psy_data(struct device *dev, void *data)
 			break;
 
 		case POWER_SUPPLY_PROP_CURRENT_AVG:
-			switch (ext->type) {
+			switch (ext->desc->type) {
 			case POWER_SUPPLY_TYPE_BATTERY:
 				di->batt_data.avg_curr = ret.intval / 1000;
 				break;
@@ -1311,7 +1310,7 @@ static int abx500_chargalg_get_ext_psy_data(struct device *dev, void *data)
  */
 static void abx500_chargalg_external_power_changed(struct power_supply *psy)
 {
-	struct abx500_chargalg *di = to_abx500_chargalg_device_info(psy);
+	struct abx500_chargalg *di = power_supply_get_drvdata(psy);
 
 	/*
 	 * Trigger execution of the algorithm instantly and read
@@ -1336,7 +1335,7 @@ static void abx500_chargalg_algorithm(struct abx500_chargalg *di)
 
 	/* Collect data from all power_supply class devices */
 	class_for_each_device(power_supply_class, NULL,
-		&di->chargalg_psy, abx500_chargalg_get_ext_psy_data);
+		di->chargalg_psy, abx500_chargalg_get_ext_psy_data);
 
 	abx500_chargalg_end_of_charge(di);
 	abx500_chargalg_check_temp(di);
@@ -1478,7 +1477,7 @@ static void abx500_chargalg_algorithm(struct abx500_chargalg *di)
 		di->charge_status = POWER_SUPPLY_STATUS_NOT_CHARGING;
 		di->maintenance_chg = false;
 		abx500_chargalg_state_to(di, STATE_SUSPENDED);
-		power_supply_changed(&di->chargalg_psy);
+		power_supply_changed(di->chargalg_psy);
 		/* Intentional fallthrough */
 
 	case STATE_SUSPENDED:
@@ -1576,7 +1575,7 @@ static void abx500_chargalg_algorithm(struct abx500_chargalg *di)
 		di->charge_status = POWER_SUPPLY_STATUS_CHARGING;
 		di->eoc_cnt = 0;
 		di->maintenance_chg = false;
-		power_supply_changed(&di->chargalg_psy);
+		power_supply_changed(di->chargalg_psy);
 
 		break;
 
@@ -1624,7 +1623,7 @@ static void abx500_chargalg_algorithm(struct abx500_chargalg *di)
 			di->bm->bat_type[
 				di->bm->batt_id].maint_a_cur_lvl);
 		abx500_chargalg_state_to(di, STATE_MAINTENANCE_A);
-		power_supply_changed(&di->chargalg_psy);
+		power_supply_changed(di->chargalg_psy);
 		/* Intentional fallthrough*/
 
 	case STATE_MAINTENANCE_A:
@@ -1644,7 +1643,7 @@ static void abx500_chargalg_algorithm(struct abx500_chargalg *di)
 			di->bm->bat_type[
 				di->bm->batt_id].maint_b_cur_lvl);
 		abx500_chargalg_state_to(di, STATE_MAINTENANCE_B);
-		power_supply_changed(&di->chargalg_psy);
+		power_supply_changed(di->chargalg_psy);
 		/* Intentional fallthrough*/
 
 	case STATE_MAINTENANCE_B:
@@ -1663,7 +1662,7 @@ static void abx500_chargalg_algorithm(struct abx500_chargalg *di)
 		abx500_chargalg_stop_maintenance_timer(di);
 		di->charge_status = POWER_SUPPLY_STATUS_CHARGING;
 		abx500_chargalg_state_to(di, STATE_TEMP_LOWHIGH);
-		power_supply_changed(&di->chargalg_psy);
+		power_supply_changed(di->chargalg_psy);
 		/* Intentional fallthrough */
 
 	case STATE_TEMP_LOWHIGH:
@@ -1779,9 +1778,7 @@ static int abx500_chargalg_get_property(struct power_supply *psy,
 	enum power_supply_property psp,
 	union power_supply_propval *val)
 {
-	struct abx500_chargalg *di;
-
-	di = to_abx500_chargalg_device_info(psy);
+	struct abx500_chargalg *di = power_supply_get_drvdata(psy);
 
 	switch (psp) {
 	case POWER_SUPPLY_PROP_STATUS:
@@ -2034,7 +2031,7 @@ static int abx500_chargalg_remove(struct platform_device *pdev)
 	/* Delete the work queue */
 	destroy_workqueue(di->chargalg_wq);
 
-	power_supply_unregister(&di->chargalg_psy);
+	power_supply_unregister(di->chargalg_psy);
 
 	return 0;
 }
@@ -2043,6 +2040,15 @@ static char *supply_interface[] = {
 	"ab8500_fg",
 };
 
+static const struct power_supply_desc abx500_chargalg_desc = {
+	.name			= "abx500_chargalg",
+	.type			= POWER_SUPPLY_TYPE_BATTERY,
+	.properties		= abx500_chargalg_props,
+	.num_properties		= ARRAY_SIZE(abx500_chargalg_props),
+	.get_property		= abx500_chargalg_get_property,
+	.external_power_changed	= abx500_chargalg_external_power_changed,
+};
+
 static int abx500_chargalg_probe(struct platform_device *pdev)
 {
 	struct device_node *np = pdev->dev.of_node;
@@ -2075,17 +2081,9 @@ static int abx500_chargalg_probe(struct platform_device *pdev)
 	di->dev = &pdev->dev;
 	di->parent = dev_get_drvdata(pdev->dev.parent);
 
-	/* chargalg supply */
-	di->chargalg_psy.name = "abx500_chargalg";
-	di->chargalg_psy.type = POWER_SUPPLY_TYPE_BATTERY;
-	di->chargalg_psy.properties = abx500_chargalg_props;
-	di->chargalg_psy.num_properties = ARRAY_SIZE(abx500_chargalg_props);
-	di->chargalg_psy.get_property = abx500_chargalg_get_property;
-	di->chargalg_psy.external_power_changed =
-		abx500_chargalg_external_power_changed;
-
 	psy_cfg.supplied_to = supply_interface;
 	psy_cfg.num_supplicants = ARRAY_SIZE(supply_interface);
+	psy_cfg.drv_data = di;
 
 	/* Initilialize safety timer */
 	hrtimer_init(&di->safety_timer, CLOCK_REALTIME, HRTIMER_MODE_ABS);
@@ -2117,9 +2115,11 @@ static int abx500_chargalg_probe(struct platform_device *pdev)
 	di->chg_info.prev_conn_chg = -1;
 
 	/* Register chargalg power supply class */
-	ret = power_supply_register(di->dev, &di->chargalg_psy, &psy_cfg);
-	if (ret) {
+	di->chargalg_psy = power_supply_register(di->dev, &abx500_chargalg_desc,
+						 &psy_cfg);
+	if (IS_ERR(di->chargalg_psy)) {
 		dev_err(di->dev, "failed to register chargalg psy\n");
+		ret = PTR_ERR(di->chargalg_psy);
 		goto free_chargalg_wq;
 	}
 
@@ -2140,7 +2140,7 @@ static int abx500_chargalg_probe(struct platform_device *pdev)
 	return ret;
 
 free_psy:
-	power_supply_unregister(&di->chargalg_psy);
+	power_supply_unregister(di->chargalg_psy);
 free_chargalg_wq:
 	destroy_workqueue(di->chargalg_wq);
 	return ret;
diff --git a/drivers/power/apm_power.c b/drivers/power/apm_power.c
index 2206f9ff6488..9d1a7fbcaed4 100644
--- a/drivers/power/apm_power.c
+++ b/drivers/power/apm_power.c
@@ -48,7 +48,7 @@ static int __find_main_battery(struct device *dev, void *data)
 
 	bp->bat = dev_get_drvdata(dev);
 
-	if (bp->bat->use_for_apm) {
+	if (bp->bat->desc->use_for_apm) {
 		/* nice, we explicitly asked to report this battery. */
 		bp->main = bp->bat;
 		return 1;
diff --git a/drivers/power/axp288_fuel_gauge.c b/drivers/power/axp288_fuel_gauge.c
index 2bec8d241e62..ca1cc5a47eb1 100644
--- a/drivers/power/axp288_fuel_gauge.c
+++ b/drivers/power/axp288_fuel_gauge.c
@@ -127,7 +127,7 @@ struct axp288_fg_info {
 	struct regmap *regmap;
 	struct regmap_irq_chip_data *regmap_irqc;
 	int irq[AXP288_FG_INTR_NUM];
-	struct power_supply bat;
+	struct power_supply *bat;
 	struct mutex lock;
 	int status;
 	struct delayed_work status_monitor;
@@ -588,8 +588,7 @@ static int fuel_gauge_get_property(struct power_supply *ps,
 		enum power_supply_property prop,
 		union power_supply_propval *val)
 {
-	struct axp288_fg_info *info = container_of(ps,
-			struct axp288_fg_info, bat);
+	struct axp288_fg_info *info = power_supply_get_drvdata(ps);
 	int ret = 0, value;
 
 	mutex_lock(&info->lock);
@@ -715,8 +714,7 @@ static int fuel_gauge_set_property(struct power_supply *ps,
 		enum power_supply_property prop,
 		const union power_supply_propval *val)
 {
-	struct axp288_fg_info *info = container_of(ps,
-				struct axp288_fg_info, bat);
+	struct axp288_fg_info *info = power_supply_get_drvdata(ps);
 	int ret = 0;
 
 	mutex_lock(&info->lock);
@@ -798,7 +796,7 @@ static void fuel_gauge_status_monitor(struct work_struct *work)
 		struct axp288_fg_info, status_monitor.work);
 
 	fuel_gauge_get_status(info);
-	power_supply_changed(&info->bat);
+	power_supply_changed(info->bat);
 	schedule_delayed_work(&info->status_monitor, STATUS_MON_DELAY_JIFFIES);
 }
 
@@ -844,18 +842,28 @@ static irqreturn_t fuel_gauge_thread_handler(int irq, void *dev)
 		dev_warn(&info->pdev->dev, "Spurious Interrupt!!!\n");
 	}
 
-	power_supply_changed(&info->bat);
+	power_supply_changed(info->bat);
 	return IRQ_HANDLED;
 }
 
 static void fuel_gauge_external_power_changed(struct power_supply *psy)
 {
-	struct axp288_fg_info *info = container_of(psy,
-				struct axp288_fg_info, bat);
+	struct axp288_fg_info *info = power_supply_get_drvdata(psy);
 
-	power_supply_changed(&info->bat);
+	power_supply_changed(info->bat);
 }
 
+static const struct power_supply_desc fuel_gauge_desc = {
+	.name			= DEV_NAME,
+	.type			= POWER_SUPPLY_TYPE_BATTERY,
+	.properties		= fuel_gauge_props,
+	.num_properties		= ARRAY_SIZE(fuel_gauge_props),
+	.get_property		= fuel_gauge_get_property,
+	.set_property		= fuel_gauge_set_property,
+	.property_is_writeable	= fuel_gauge_property_is_writeable,
+	.external_power_changed	= fuel_gauge_external_power_changed,
+};
+
 static int fuel_gauge_set_lowbatt_thresholds(struct axp288_fg_info *info)
 {
 	int ret;
@@ -1070,9 +1078,10 @@ static void fuel_gauge_init_hw_regs(struct axp288_fg_info *info)
 
 static int axp288_fuel_gauge_probe(struct platform_device *pdev)
 {
-	int ret;
+	int ret = 0;
 	struct axp288_fg_info *info;
 	struct axp20x_dev *axp20x = dev_get_drvdata(pdev->dev.parent);
+	struct power_supply_config psy_cfg = {};
 
 	info = devm_kzalloc(&pdev->dev, sizeof(*info), GFP_KERNEL);
 	if (!info)
@@ -1091,16 +1100,10 @@ static int axp288_fuel_gauge_probe(struct platform_device *pdev)
 	mutex_init(&info->lock);
 	INIT_DELAYED_WORK(&info->status_monitor, fuel_gauge_status_monitor);
 
-	info->bat.name = DEV_NAME;
-	info->bat.type = POWER_SUPPLY_TYPE_BATTERY;
-	info->bat.properties = fuel_gauge_props;
-	info->bat.num_properties = ARRAY_SIZE(fuel_gauge_props);
-	info->bat.get_property = fuel_gauge_get_property;
-	info->bat.set_property = fuel_gauge_set_property;
-	info->bat.property_is_writeable = fuel_gauge_property_is_writeable;
-	info->bat.external_power_changed = fuel_gauge_external_power_changed;
-	ret = power_supply_register(&pdev->dev, &info->bat, NULL);
-	if (ret) {
+	psy_cfg.drv_data = info;
+	info->bat = power_supply_register(&pdev->dev, &fuel_gauge_desc, &psy_cfg);
+	if (IS_ERR(info->bat)) {
+		ret = PTR_ERR(info->bat);
 		dev_err(&pdev->dev, "failed to register battery: %d\n", ret);
 		return ret;
 	}
@@ -1125,7 +1128,7 @@ static int axp288_fuel_gauge_remove(struct platform_device *pdev)
 	int i;
 
 	cancel_delayed_work_sync(&info->status_monitor);
-	power_supply_unregister(&info->bat);
+	power_supply_unregister(info->bat);
 	fuel_gauge_remove_debugfs(info);
 
 	for (i = 0; i < AXP288_FG_INTR_NUM; i++)
diff --git a/drivers/power/bq2415x_charger.c b/drivers/power/bq2415x_charger.c
index d31ccc7935d0..c745d278815d 100644
--- a/drivers/power/bq2415x_charger.c
+++ b/drivers/power/bq2415x_charger.c
@@ -166,7 +166,8 @@ static char *bq2415x_chip_name[] = {
 struct bq2415x_device {
 	struct device *dev;
 	struct bq2415x_platform_data init_data;
-	struct power_supply charger;
+	struct power_supply *charger;
+	struct power_supply_desc charger_desc;
 	struct delayed_work work;
 	struct power_supply *notify_psy;
 	struct notifier_block nb;
@@ -784,7 +785,7 @@ static int bq2415x_set_mode(struct bq2415x_device *bq, enum bq2415x_mode mode)
 	bq2415x_set_default_value(bq, battery_regulation_voltage);
 
 	bq->mode = mode;
-	sysfs_notify(&bq->charger.dev->kobj, NULL, "mode");
+	sysfs_notify(&bq->charger->dev.kobj, NULL, "mode");
 
 	return 0;
 
@@ -868,7 +869,7 @@ static void bq2415x_set_autotimer(struct bq2415x_device *bq, int state)
 static void bq2415x_timer_error(struct bq2415x_device *bq, const char *msg)
 {
 	bq->timer_error = msg;
-	sysfs_notify(&bq->charger.dev->kobj, NULL, "timer");
+	sysfs_notify(&bq->charger->dev.kobj, NULL, "timer");
 	dev_err(bq->dev, "%s\n", msg);
 	if (bq->automode > 0)
 		bq->automode = 0;
@@ -886,7 +887,7 @@ static void bq2415x_timer_work(struct work_struct *work)
 	int boost;
 
 	if (bq->automode > 0 && (bq->reported_mode != bq->mode)) {
-		sysfs_notify(&bq->charger.dev->kobj, NULL, "reported_mode");
+		sysfs_notify(&bq->charger->dev.kobj, NULL, "reported_mode");
 		bq2415x_set_mode(bq, bq->reported_mode);
 	}
 
@@ -992,8 +993,7 @@ static int bq2415x_power_supply_get_property(struct power_supply *psy,
 					     enum power_supply_property psp,
 					     union power_supply_propval *val)
 {
-	struct bq2415x_device *bq = container_of(psy, struct bq2415x_device,
-						 charger);
+	struct bq2415x_device *bq = power_supply_get_drvdata(psy);
 	int ret;
 
 	switch (psp) {
@@ -1024,12 +1024,14 @@ static int bq2415x_power_supply_init(struct bq2415x_device *bq)
 	int ret;
 	int chip;
 	char revstr[8];
+	struct power_supply_config psy_cfg = { .drv_data = bq, };
 
-	bq->charger.name = bq->name;
-	bq->charger.type = POWER_SUPPLY_TYPE_USB;
-	bq->charger.properties = bq2415x_power_supply_props;
-	bq->charger.num_properties = ARRAY_SIZE(bq2415x_power_supply_props);
-	bq->charger.get_property = bq2415x_power_supply_get_property;
+	bq->charger_desc.name = bq->name;
+	bq->charger_desc.type = POWER_SUPPLY_TYPE_USB;
+	bq->charger_desc.properties = bq2415x_power_supply_props;
+	bq->charger_desc.num_properties =
+			ARRAY_SIZE(bq2415x_power_supply_props);
+	bq->charger_desc.get_property = bq2415x_power_supply_get_property;
 
 	ret = bq2415x_detect_chip(bq);
 	if (ret < 0)
@@ -1052,10 +1054,11 @@ static int bq2415x_power_supply_init(struct bq2415x_device *bq)
 		return -ENOMEM;
 	}
 
-	ret = power_supply_register(bq->dev, &bq->charger, NULL);
-	if (ret) {
+	bq->charger = power_supply_register(bq->dev, &bq->charger_desc,
+					    &psy_cfg);
+	if (IS_ERR(bq->charger)) {
 		kfree(bq->model);
-		return ret;
+		return PTR_ERR(bq->charger);
 	}
 
 	return 0;
@@ -1067,7 +1070,7 @@ static void bq2415x_power_supply_exit(struct bq2415x_device *bq)
 	if (bq->automode > 0)
 		bq->automode = 0;
 	cancel_delayed_work_sync(&bq->work);
-	power_supply_unregister(&bq->charger);
+	power_supply_unregister(bq->charger);
 	kfree(bq->model);
 }
 
@@ -1079,8 +1082,7 @@ static ssize_t bq2415x_sysfs_show_status(struct device *dev,
 					 char *buf)
 {
 	struct power_supply *psy = dev_get_drvdata(dev);
-	struct bq2415x_device *bq = container_of(psy, struct bq2415x_device,
-						charger);
+	struct bq2415x_device *bq = power_supply_get_drvdata(psy);
 	enum bq2415x_command command;
 	int ret;
 
@@ -1113,8 +1115,7 @@ static ssize_t bq2415x_sysfs_set_timer(struct device *dev,
 				       size_t count)
 {
 	struct power_supply *psy = dev_get_drvdata(dev);
-	struct bq2415x_device *bq = container_of(psy, struct bq2415x_device,
-						charger);
+	struct bq2415x_device *bq = power_supply_get_drvdata(psy);
 	int ret = 0;
 
 	if (strncmp(buf, "auto", 4) == 0)
@@ -1135,8 +1136,7 @@ static ssize_t bq2415x_sysfs_show_timer(struct device *dev,
 					char *buf)
 {
 	struct power_supply *psy = dev_get_drvdata(dev);
-	struct bq2415x_device *bq = container_of(psy, struct bq2415x_device,
-						 charger);
+	struct bq2415x_device *bq = power_supply_get_drvdata(psy);
 
 	if (bq->timer_error)
 		return sprintf(buf, "%s\n", bq->timer_error);
@@ -1160,8 +1160,7 @@ static ssize_t bq2415x_sysfs_set_mode(struct device *dev,
 				      size_t count)
 {
 	struct power_supply *psy = dev_get_drvdata(dev);
-	struct bq2415x_device *bq = container_of(psy, struct bq2415x_device,
-						 charger);
+	struct bq2415x_device *bq = power_supply_get_drvdata(psy);
 	enum bq2415x_mode mode;
 	int ret = 0;
 
@@ -1213,8 +1212,7 @@ static ssize_t bq2415x_sysfs_show_mode(struct device *dev,
 				       char *buf)
 {
 	struct power_supply *psy = dev_get_drvdata(dev);
-	struct bq2415x_device *bq = container_of(psy, struct bq2415x_device,
-						charger);
+	struct bq2415x_device *bq = power_supply_get_drvdata(psy);
 	ssize_t ret = 0;
 
 	if (bq->automode > 0)
@@ -1251,8 +1249,7 @@ static ssize_t bq2415x_sysfs_show_reported_mode(struct device *dev,
 						char *buf)
 {
 	struct power_supply *psy = dev_get_drvdata(dev);
-	struct bq2415x_device *bq = container_of(psy, struct bq2415x_device,
-						 charger);
+	struct bq2415x_device *bq = power_supply_get_drvdata(psy);
 
 	if (bq->automode < 0)
 		return -EINVAL;
@@ -1280,8 +1277,7 @@ static ssize_t bq2415x_sysfs_set_registers(struct device *dev,
 					   size_t count)
 {
 	struct power_supply *psy = dev_get_drvdata(dev);
-	struct bq2415x_device *bq = container_of(psy, struct bq2415x_device,
-						 charger);
+	struct bq2415x_device *bq = power_supply_get_drvdata(psy);
 	ssize_t ret = 0;
 	unsigned int reg;
 	unsigned int val;
@@ -1316,8 +1312,7 @@ static ssize_t bq2415x_sysfs_show_registers(struct device *dev,
 					    char *buf)
 {
 	struct power_supply *psy = dev_get_drvdata(dev);
-	struct bq2415x_device *bq = container_of(psy, struct bq2415x_device,
-						 charger);
+	struct bq2415x_device *bq = power_supply_get_drvdata(psy);
 	ssize_t ret = 0;
 
 	ret += bq2415x_sysfs_print_reg(bq, BQ2415X_REG_STATUS, buf+ret);
@@ -1335,8 +1330,7 @@ static ssize_t bq2415x_sysfs_set_limit(struct device *dev,
 				       size_t count)
 {
 	struct power_supply *psy = dev_get_drvdata(dev);
-	struct bq2415x_device *bq = container_of(psy, struct bq2415x_device,
-						 charger);
+	struct bq2415x_device *bq = power_supply_get_drvdata(psy);
 	long val;
 	int ret;
 
@@ -1367,8 +1361,7 @@ static ssize_t bq2415x_sysfs_show_limit(struct device *dev,
 					char *buf)
 {
 	struct power_supply *psy = dev_get_drvdata(dev);
-	struct bq2415x_device *bq = container_of(psy, struct bq2415x_device,
-						 charger);
+	struct bq2415x_device *bq = power_supply_get_drvdata(psy);
 	int ret;
 
 	if (strcmp(attr->attr.name, "current_limit") == 0)
@@ -1396,8 +1389,7 @@ static ssize_t bq2415x_sysfs_set_enable(struct device *dev,
 					size_t count)
 {
 	struct power_supply *psy = dev_get_drvdata(dev);
-	struct bq2415x_device *bq = container_of(psy, struct bq2415x_device,
-						 charger);
+	struct bq2415x_device *bq = power_supply_get_drvdata(psy);
 	enum bq2415x_command command;
 	long val;
 	int ret;
@@ -1432,8 +1424,7 @@ static ssize_t bq2415x_sysfs_show_enable(struct device *dev,
 					 char *buf)
 {
 	struct power_supply *psy = dev_get_drvdata(dev);
-	struct bq2415x_device *bq = container_of(psy, struct bq2415x_device,
-						 charger);
+	struct bq2415x_device *bq = power_supply_get_drvdata(psy);
 	enum bq2415x_command command;
 	int ret;
 
@@ -1524,13 +1515,13 @@ static const struct attribute_group bq2415x_sysfs_attr_group = {
 
 static int bq2415x_sysfs_init(struct bq2415x_device *bq)
 {
-	return sysfs_create_group(&bq->charger.dev->kobj,
+	return sysfs_create_group(&bq->charger->dev.kobj,
 			&bq2415x_sysfs_attr_group);
 }
 
 static void bq2415x_sysfs_exit(struct bq2415x_device *bq)
 {
-	sysfs_remove_group(&bq->charger.dev->kobj, &bq2415x_sysfs_attr_group);
+	sysfs_remove_group(&bq->charger->dev.kobj, &bq2415x_sysfs_attr_group);
 }
 
 /* main bq2415x probe function */
diff --git a/drivers/power/bq24190_charger.c b/drivers/power/bq24190_charger.c
index 54eb58485d55..407c4af83891 100644
--- a/drivers/power/bq24190_charger.c
+++ b/drivers/power/bq24190_charger.c
@@ -152,8 +152,8 @@
 struct bq24190_dev_info {
 	struct i2c_client		*client;
 	struct device			*dev;
-	struct power_supply		charger;
-	struct power_supply		battery;
+	struct power_supply		*charger;
+	struct power_supply		*battery;
 	char				model_name[I2C_NAME_SIZE];
 	kernel_ulong_t			model;
 	unsigned int			gpio_int;
@@ -423,8 +423,7 @@ static ssize_t bq24190_sysfs_show(struct device *dev,
 		struct device_attribute *attr, char *buf)
 {
 	struct power_supply *psy = dev_get_drvdata(dev);
-	struct bq24190_dev_info *bdi =
-			container_of(psy, struct bq24190_dev_info, charger);
+	struct bq24190_dev_info *bdi = power_supply_get_drvdata(psy);
 	struct bq24190_sysfs_field_info *info;
 	int ret;
 	u8 v;
@@ -444,8 +443,7 @@ static ssize_t bq24190_sysfs_store(struct device *dev,
 		struct device_attribute *attr, const char *buf, size_t count)
 {
 	struct power_supply *psy = dev_get_drvdata(dev);
-	struct bq24190_dev_info *bdi =
-			container_of(psy, struct bq24190_dev_info, charger);
+	struct bq24190_dev_info *bdi = power_supply_get_drvdata(psy);
 	struct bq24190_sysfs_field_info *info;
 	int ret;
 	u8 v;
@@ -469,13 +467,13 @@ static int bq24190_sysfs_create_group(struct bq24190_dev_info *bdi)
 {
 	bq24190_sysfs_init_attrs();
 
-	return sysfs_create_group(&bdi->charger.dev->kobj,
+	return sysfs_create_group(&bdi->charger->dev.kobj,
 			&bq24190_sysfs_attr_group);
 }
 
 static void bq24190_sysfs_remove_group(struct bq24190_dev_info *bdi)
 {
-	sysfs_remove_group(&bdi->charger.dev->kobj, &bq24190_sysfs_attr_group);
+	sysfs_remove_group(&bdi->charger->dev.kobj, &bq24190_sysfs_attr_group);
 }
 #else
 static int bq24190_sysfs_create_group(struct bq24190_dev_info *bdi)
@@ -807,8 +805,7 @@ static int bq24190_charger_set_voltage(struct bq24190_dev_info *bdi,
 static int bq24190_charger_get_property(struct power_supply *psy,
 		enum power_supply_property psp, union power_supply_propval *val)
 {
-	struct bq24190_dev_info *bdi =
-			container_of(psy, struct bq24190_dev_info, charger);
+	struct bq24190_dev_info *bdi = power_supply_get_drvdata(psy);
 	int ret;
 
 	dev_dbg(bdi->dev, "prop: %d\n", psp);
@@ -861,8 +858,7 @@ static int bq24190_charger_set_property(struct power_supply *psy,
 		enum power_supply_property psp,
 		const union power_supply_propval *val)
 {
-	struct bq24190_dev_info *bdi =
-			container_of(psy, struct bq24190_dev_info, charger);
+	struct bq24190_dev_info *bdi = power_supply_get_drvdata(psy);
 	int ret;
 
 	dev_dbg(bdi->dev, "prop: %d\n", psp);
@@ -922,18 +918,15 @@ static char *bq24190_charger_supplied_to[] = {
 	"main-battery",
 };
 
-static void bq24190_charger_init(struct power_supply *charger)
-{
-	charger->name = "bq24190-charger";
-	charger->type = POWER_SUPPLY_TYPE_USB;
-	charger->properties = bq24190_charger_properties;
-	charger->num_properties = ARRAY_SIZE(bq24190_charger_properties);
-	charger->supplied_to = bq24190_charger_supplied_to;
-	charger->num_supplicants = ARRAY_SIZE(bq24190_charger_supplied_to);
-	charger->get_property = bq24190_charger_get_property;
-	charger->set_property = bq24190_charger_set_property;
-	charger->property_is_writeable = bq24190_charger_property_is_writeable;
-}
+static const struct power_supply_desc bq24190_charger_desc = {
+	.name			= "bq24190-charger",
+	.type			= POWER_SUPPLY_TYPE_USB,
+	.properties		= bq24190_charger_properties,
+	.num_properties		= ARRAY_SIZE(bq24190_charger_properties),
+	.get_property		= bq24190_charger_get_property,
+	.set_property		= bq24190_charger_set_property,
+	.property_is_writeable	= bq24190_charger_property_is_writeable,
+};
 
 /* Battery power supply property routines */
 
@@ -1102,8 +1095,7 @@ static int bq24190_battery_set_temp_alert_max(struct bq24190_dev_info *bdi,
 static int bq24190_battery_get_property(struct power_supply *psy,
 		enum power_supply_property psp, union power_supply_propval *val)
 {
-	struct bq24190_dev_info *bdi =
-			container_of(psy, struct bq24190_dev_info, battery);
+	struct bq24190_dev_info *bdi = power_supply_get_drvdata(psy);
 	int ret;
 
 	dev_dbg(bdi->dev, "prop: %d\n", psp);
@@ -1144,8 +1136,7 @@ static int bq24190_battery_set_property(struct power_supply *psy,
 		enum power_supply_property psp,
 		const union power_supply_propval *val)
 {
-	struct bq24190_dev_info *bdi =
-			container_of(psy, struct bq24190_dev_info, battery);
+	struct bq24190_dev_info *bdi = power_supply_get_drvdata(psy);
 	int ret;
 
 	dev_dbg(bdi->dev, "prop: %d\n", psp);
@@ -1193,16 +1184,15 @@ static enum power_supply_property bq24190_battery_properties[] = {
 	POWER_SUPPLY_PROP_SCOPE,
 };
 
-static void bq24190_battery_init(struct power_supply *battery)
-{
-	battery->name = "bq24190-battery";
-	battery->type = POWER_SUPPLY_TYPE_BATTERY;
-	battery->properties = bq24190_battery_properties;
-	battery->num_properties = ARRAY_SIZE(bq24190_battery_properties);
-	battery->get_property = bq24190_battery_get_property;
-	battery->set_property = bq24190_battery_set_property;
-	battery->property_is_writeable = bq24190_battery_property_is_writeable;
-}
+static const struct power_supply_desc bq24190_battery_desc = {
+	.name			= "bq24190-battery",
+	.type			= POWER_SUPPLY_TYPE_BATTERY,
+	.properties		= bq24190_battery_properties,
+	.num_properties		= ARRAY_SIZE(bq24190_battery_properties),
+	.get_property		= bq24190_battery_get_property,
+	.set_property		= bq24190_battery_set_property,
+	.property_is_writeable	= bq24190_battery_property_is_writeable,
+};
 
 static irqreturn_t bq24190_irq_handler_thread(int irq, void *data)
 {
@@ -1269,8 +1259,8 @@ static irqreturn_t bq24190_irq_handler_thread(int irq, void *data)
 	 * interrupt received).
 	 */
 	if (alert_userspace && !bdi->first_time) {
-		power_supply_changed(&bdi->charger);
-		power_supply_changed(&bdi->battery);
+		power_supply_changed(bdi->charger);
+		power_supply_changed(bdi->battery);
 		bdi->first_time = false;
 	}
 
@@ -1362,6 +1352,7 @@ static int bq24190_probe(struct i2c_client *client,
 	struct i2c_adapter *adapter = to_i2c_adapter(client->dev.parent);
 	struct device *dev = &client->dev;
 	struct bq24190_platform_data *pdata = client->dev.platform_data;
+	struct power_supply_config charger_cfg = {}, battery_cfg = {};
 	struct bq24190_dev_info *bdi;
 	int ret;
 
@@ -1416,19 +1407,23 @@ static int bq24190_probe(struct i2c_client *client,
 		goto out2;
 	}
 
-	bq24190_charger_init(&bdi->charger);
-
-	ret = power_supply_register(dev, &bdi->charger, NULL);
-	if (ret) {
+	charger_cfg.drv_data = bdi;
+	charger_cfg.supplied_to = bq24190_charger_supplied_to;
+	charger_cfg.num_supplicants = ARRAY_SIZE(bq24190_charger_supplied_to),
+	bdi->charger = power_supply_register(dev, &bq24190_charger_desc,
+						&charger_cfg);
+	if (IS_ERR(bdi->charger)) {
 		dev_err(dev, "Can't register charger\n");
+		ret = PTR_ERR(bdi->charger);
 		goto out2;
 	}
 
-	bq24190_battery_init(&bdi->battery);
-
-	ret = power_supply_register(dev, &bdi->battery, NULL);
-	if (ret) {
+	battery_cfg.drv_data = bdi;
+	bdi->battery = power_supply_register(dev, &bq24190_battery_desc,
+						&battery_cfg);
+	if (IS_ERR(bdi->battery)) {
 		dev_err(dev, "Can't register battery\n");
+		ret = PTR_ERR(bdi->battery);
 		goto out3;
 	}
 
@@ -1441,9 +1436,9 @@ static int bq24190_probe(struct i2c_client *client,
 	return 0;
 
 out4:
-	power_supply_unregister(&bdi->battery);
+	power_supply_unregister(bdi->battery);
 out3:
-	power_supply_unregister(&bdi->charger);
+	power_supply_unregister(bdi->charger);
 out2:
 	pm_runtime_disable(dev);
 out1:
@@ -1462,8 +1457,8 @@ static int bq24190_remove(struct i2c_client *client)
 	pm_runtime_put_sync(bdi->dev);
 
 	bq24190_sysfs_remove_group(bdi);
-	power_supply_unregister(&bdi->battery);
-	power_supply_unregister(&bdi->charger);
+	power_supply_unregister(bdi->battery);
+	power_supply_unregister(bdi->charger);
 	pm_runtime_disable(bdi->dev);
 
 	if (bdi->gpio_int)
@@ -1499,8 +1494,8 @@ static int bq24190_pm_resume(struct device *dev)
 	pm_runtime_put_sync(bdi->dev);
 
 	/* Things may have changed while suspended so alert upper layer */
-	power_supply_changed(&bdi->charger);
-	power_supply_changed(&bdi->battery);
+	power_supply_changed(bdi->charger);
+	power_supply_changed(bdi->battery);
 
 	return 0;
 }
diff --git a/drivers/power/bq24735-charger.c b/drivers/power/bq24735-charger.c
index 242e79bfe217..961a18930027 100644
--- a/drivers/power/bq24735-charger.c
+++ b/drivers/power/bq24735-charger.c
@@ -44,14 +44,15 @@
 #define BQ24735_DEVICE_ID		0xff
 
 struct bq24735 {
-	struct power_supply	charger;
-	struct i2c_client	*client;
-	struct bq24735_platform	*pdata;
+	struct power_supply		*charger;
+	struct power_supply_desc	charger_desc;
+	struct i2c_client		*client;
+	struct bq24735_platform		*pdata;
 };
 
 static inline struct bq24735 *to_bq24735(struct power_supply *psy)
 {
-	return container_of(psy, struct bq24735, charger);
+	return power_supply_get_drvdata(psy);
 }
 
 static enum power_supply_property bq24735_charger_properties[] = {
@@ -192,9 +193,7 @@ static int bq24735_charger_get_property(struct power_supply *psy,
 					enum power_supply_property psp,
 					union power_supply_propval *val)
 {
-	struct bq24735 *charger;
-
-	charger = container_of(psy, struct bq24735, charger);
+	struct bq24735 *charger = to_bq24735(psy);
 
 	switch (psp) {
 	case POWER_SUPPLY_PROP_ONLINE:
@@ -248,7 +247,7 @@ static int bq24735_charger_probe(struct i2c_client *client,
 {
 	int ret;
 	struct bq24735 *charger;
-	struct power_supply *supply;
+	struct power_supply_desc *supply_desc;
 	struct power_supply_config psy_cfg = {};
 	char *name;
 
@@ -278,17 +277,18 @@ static int bq24735_charger_probe(struct i2c_client *client,
 
 	charger->client = client;
 
-	supply = &charger->charger;
+	supply_desc = &charger->charger_desc;
 
-	supply->name = name;
-	supply->type = POWER_SUPPLY_TYPE_MAINS;
-	supply->properties = bq24735_charger_properties;
-	supply->num_properties = ARRAY_SIZE(bq24735_charger_properties);
-	supply->get_property = bq24735_charger_get_property;
+	supply_desc->name = name;
+	supply_desc->type = POWER_SUPPLY_TYPE_MAINS;
+	supply_desc->properties = bq24735_charger_properties;
+	supply_desc->num_properties = ARRAY_SIZE(bq24735_charger_properties);
+	supply_desc->get_property = bq24735_charger_get_property;
 
 	psy_cfg.supplied_to = charger->pdata->supplied_to;
 	psy_cfg.num_supplicants = charger->pdata->num_supplicants;
 	psy_cfg.of_node = client->dev.of_node;
+	psy_cfg.drv_data = charger;
 
 	i2c_set_clientdata(client, charger);
 
@@ -343,8 +343,10 @@ static int bq24735_charger_probe(struct i2c_client *client,
 		}
 	}
 
-	ret = power_supply_register(&client->dev, supply, &psy_cfg);
-	if (ret < 0) {
+	charger->charger = power_supply_register(&client->dev, supply_desc,
+						 &psy_cfg);
+	if (IS_ERR(charger->charger)) {
+		ret = PTR_ERR(charger->charger);
 		dev_err(&client->dev, "Failed to register power supply: %d\n",
 			ret);
 		goto err_free_name;
@@ -356,7 +358,8 @@ static int bq24735_charger_probe(struct i2c_client *client,
 						IRQF_TRIGGER_RISING |
 						IRQF_TRIGGER_FALLING |
 						IRQF_ONESHOT,
-						supply->name, supply);
+						supply_desc->name,
+						charger->charger);
 		if (ret) {
 			dev_err(&client->dev,
 				"Unable to register IRQ %d err %d\n",
@@ -367,7 +370,7 @@ static int bq24735_charger_probe(struct i2c_client *client,
 
 	return 0;
 err_unregister_supply:
-	power_supply_unregister(supply);
+	power_supply_unregister(charger->charger);
 err_free_name:
 	if (name != charger->pdata->name)
 		kfree(name);
@@ -383,10 +386,10 @@ static int bq24735_charger_remove(struct i2c_client *client)
 		devm_free_irq(&charger->client->dev, charger->client->irq,
 			      &charger->charger);
 
-	power_supply_unregister(&charger->charger);
+	power_supply_unregister(charger->charger);
 
-	if (charger->charger.name != charger->pdata->name)
-		kfree(charger->charger.name);
+	if (charger->charger_desc.name != charger->pdata->name)
+		kfree(charger->charger_desc.name);
 
 	return 0;
 }
diff --git a/drivers/power/bq27x00_battery.c b/drivers/power/bq27x00_battery.c
index a992e43908a2..a57433de5c24 100644
--- a/drivers/power/bq27x00_battery.c
+++ b/drivers/power/bq27x00_battery.c
@@ -116,7 +116,7 @@ struct bq27x00_device_info {
 	unsigned long last_update;
 	struct delayed_work work;
 
-	struct power_supply	bat;
+	struct power_supply	*bat;
 
 	struct bq27x00_access_methods bus;
 
@@ -531,7 +531,7 @@ static void bq27x00_update(struct bq27x00_device_info *di)
 	}
 
 	if (di->cache.capacity != cache.capacity)
-		power_supply_changed(&di->bat);
+		power_supply_changed(di->bat);
 
 	if (memcmp(&di->cache, &cache, sizeof(cache)) != 0)
 		di->cache = cache;
@@ -603,7 +603,7 @@ static int bq27x00_battery_status(struct bq27x00_device_info *di,
 			status = POWER_SUPPLY_STATUS_FULL;
 		else if (di->cache.flags & BQ27000_FLAG_CHGS)
 			status = POWER_SUPPLY_STATUS_CHARGING;
-		else if (power_supply_am_i_supplied(&di->bat))
+		else if (power_supply_am_i_supplied(di->bat))
 			status = POWER_SUPPLY_STATUS_NOT_CHARGING;
 		else
 			status = POWER_SUPPLY_STATUS_DISCHARGING;
@@ -675,15 +675,12 @@ static int bq27x00_simple_value(int value,
 	return 0;
 }
 
-#define to_bq27x00_device_info(x) container_of((x), \
-				struct bq27x00_device_info, bat);
-
 static int bq27x00_battery_get_property(struct power_supply *psy,
 					enum power_supply_property psp,
 					union power_supply_propval *val)
 {
 	int ret = 0;
-	struct bq27x00_device_info *di = to_bq27x00_device_info(psy);
+	struct bq27x00_device_info *di = power_supply_get_drvdata(psy);
 
 	mutex_lock(&di->lock);
 	if (time_is_before_jiffies(di->last_update + 5 * HZ)) {
@@ -761,38 +758,47 @@ static int bq27x00_battery_get_property(struct power_supply *psy,
 
 static void bq27x00_external_power_changed(struct power_supply *psy)
 {
-	struct bq27x00_device_info *di = to_bq27x00_device_info(psy);
+	struct bq27x00_device_info *di = power_supply_get_drvdata(psy);
 
 	cancel_delayed_work_sync(&di->work);
 	schedule_delayed_work(&di->work, 0);
 }
 
-static int bq27x00_powersupply_init(struct bq27x00_device_info *di)
+static int bq27x00_powersupply_init(struct bq27x00_device_info *di,
+				    const char *name)
 {
 	int ret;
+	struct power_supply_desc *psy_desc;
+	struct power_supply_config psy_cfg = { .drv_data = di, };
+
+	psy_desc = devm_kzalloc(di->dev, sizeof(*psy_desc), GFP_KERNEL);
+	if (!psy_desc)
+		return -ENOMEM;
 
-	di->bat.type = POWER_SUPPLY_TYPE_BATTERY;
+	psy_desc->name = name;
+	psy_desc->type = POWER_SUPPLY_TYPE_BATTERY;
 	if (di->chip == BQ27425) {
-		di->bat.properties = bq27425_battery_props;
-		di->bat.num_properties = ARRAY_SIZE(bq27425_battery_props);
+		psy_desc->properties = bq27425_battery_props;
+		psy_desc->num_properties = ARRAY_SIZE(bq27425_battery_props);
 	} else if (di->chip == BQ27742) {
-		di->bat.properties = bq27742_battery_props;
-		di->bat.num_properties = ARRAY_SIZE(bq27742_battery_props);
+		psy_desc->properties = bq27742_battery_props;
+		psy_desc->num_properties = ARRAY_SIZE(bq27742_battery_props);
 	} else if (di->chip == BQ27510) {
-		di->bat.properties = bq27510_battery_props;
-		di->bat.num_properties = ARRAY_SIZE(bq27510_battery_props);
+		psy_desc->properties = bq27510_battery_props;
+		psy_desc->num_properties = ARRAY_SIZE(bq27510_battery_props);
 	} else {
-		di->bat.properties = bq27x00_battery_props;
-		di->bat.num_properties = ARRAY_SIZE(bq27x00_battery_props);
+		psy_desc->properties = bq27x00_battery_props;
+		psy_desc->num_properties = ARRAY_SIZE(bq27x00_battery_props);
 	}
-	di->bat.get_property = bq27x00_battery_get_property;
-	di->bat.external_power_changed = bq27x00_external_power_changed;
+	psy_desc->get_property = bq27x00_battery_get_property;
+	psy_desc->external_power_changed = bq27x00_external_power_changed;
 
 	INIT_DELAYED_WORK(&di->work, bq27x00_battery_poll);
 	mutex_init(&di->lock);
 
-	ret = power_supply_register_no_ws(di->dev, &di->bat, NULL);
-	if (ret) {
+	di->bat = power_supply_register_no_ws(di->dev, psy_desc, &psy_cfg);
+	if (IS_ERR(di->bat)) {
+		ret = PTR_ERR(di->bat);
 		dev_err(di->dev, "failed to register battery: %d\n", ret);
 		return ret;
 	}
@@ -816,7 +822,7 @@ static void bq27x00_powersupply_unregister(struct bq27x00_device_info *di)
 
 	cancel_delayed_work_sync(&di->work);
 
-	power_supply_unregister(&di->bat);
+	power_supply_unregister(di->bat);
 
 	mutex_destroy(&di->lock);
 }
@@ -880,37 +886,34 @@ static int bq27x00_battery_probe(struct i2c_client *client,
 	if (num < 0)
 		return num;
 
-	name = kasprintf(GFP_KERNEL, "%s-%d", id->name, num);
+	name = devm_kasprintf(&client->dev, GFP_KERNEL, "%s-%d", id->name, num);
 	if (!name) {
 		dev_err(&client->dev, "failed to allocate device name\n");
 		retval = -ENOMEM;
-		goto batt_failed_1;
+		goto batt_failed;
 	}
 
 	di = devm_kzalloc(&client->dev, sizeof(*di), GFP_KERNEL);
 	if (!di) {
 		dev_err(&client->dev, "failed to allocate device info data\n");
 		retval = -ENOMEM;
-		goto batt_failed_2;
+		goto batt_failed;
 	}
 
 	di->id = num;
 	di->dev = &client->dev;
 	di->chip = id->driver_data;
-	di->bat.name = name;
 	di->bus.read = &bq27x00_read_i2c;
 
-	retval = bq27x00_powersupply_init(di);
+	retval = bq27x00_powersupply_init(di, name);
 	if (retval)
-		goto batt_failed_2;
+		goto batt_failed;
 
 	i2c_set_clientdata(client, di);
 
 	return 0;
 
-batt_failed_2:
-	kfree(name);
-batt_failed_1:
+batt_failed:
 	mutex_lock(&battery_mutex);
 	idr_remove(&battery_id, num);
 	mutex_unlock(&battery_mutex);
@@ -924,8 +927,6 @@ static int bq27x00_battery_remove(struct i2c_client *client)
 
 	bq27x00_powersupply_unregister(di);
 
-	kfree(di->bat.name);
-
 	mutex_lock(&battery_mutex);
 	idr_remove(&battery_id, di->id);
 	mutex_unlock(&battery_mutex);
@@ -1014,6 +1015,7 @@ static int bq27000_battery_probe(struct platform_device *pdev)
 {
 	struct bq27x00_device_info *di;
 	struct bq27000_platform_data *pdata = pdev->dev.platform_data;
+	const char *name;
 
 	if (!pdata) {
 		dev_err(&pdev->dev, "no platform_data supplied\n");
@@ -1036,10 +1038,10 @@ static int bq27000_battery_probe(struct platform_device *pdev)
 	di->dev = &pdev->dev;
 	di->chip = BQ27000;
 
-	di->bat.name = pdata->name ?: dev_name(&pdev->dev);
+	name = pdata->name ?: dev_name(&pdev->dev);
 	di->bus.read = &bq27000_read_platform;
 
-	return bq27x00_powersupply_init(di);
+	return bq27x00_powersupply_init(di, name);
 }
 
 static int bq27000_battery_remove(struct platform_device *pdev)
diff --git a/drivers/power/charger-manager.c b/drivers/power/charger-manager.c
index a5b86b60d244..5c47409c6889 100644
--- a/drivers/power/charger-manager.c
+++ b/drivers/power/charger-manager.c
@@ -864,8 +864,7 @@ static int charger_get_property(struct power_supply *psy,
 		enum power_supply_property psp,
 		union power_supply_propval *val)
 {
-	struct charger_manager *cm = container_of(psy,
-			struct charger_manager, charger_psy);
+	struct charger_manager *cm = power_supply_get_drvdata(psy);
 	struct charger_desc *desc = cm->desc;
 	struct power_supply *fuel_gauge;
 	int ret = 0;
@@ -1018,7 +1017,7 @@ static enum power_supply_property default_charger_props[] = {
 	 */
 };
 
-static struct power_supply psy_default = {
+static const struct power_supply_desc psy_default = {
 	.name = "battery",
 	.type = POWER_SUPPLY_TYPE_BATTERY,
 	.properties = default_charger_props,
@@ -1399,7 +1398,7 @@ static int charger_manager_register_sysfs(struct charger_manager *cm)
 		dev_info(cm->dev, "'%s' regulator's externally_control is %d\n",
 			 charger->regulator_name, charger->externally_control);
 
-		ret = sysfs_create_group(&cm->charger_psy.dev->kobj,
+		ret = sysfs_create_group(&cm->charger_psy->dev.kobj,
 					&charger->attr_g);
 		if (ret < 0) {
 			dev_err(cm->dev, "Cannot create sysfs entry of %s regulator\n",
@@ -1431,9 +1430,9 @@ static int cm_init_thermal_data(struct charger_manager *cm,
 					POWER_SUPPLY_PROP_TEMP, &val);
 
 	if (!ret) {
-		cm->charger_psy.properties[cm->charger_psy.num_properties] =
+		cm->charger_psy_desc.properties[cm->charger_psy_desc.num_properties] =
 				POWER_SUPPLY_PROP_TEMP;
-		cm->charger_psy.num_properties++;
+		cm->charger_psy_desc.num_properties++;
 		cm->desc->measure_battery_temp = true;
 	}
 #ifdef CONFIG_THERMAL
@@ -1444,9 +1443,9 @@ static int cm_init_thermal_data(struct charger_manager *cm,
 			return PTR_ERR(cm->tzd_batt);
 
 		/* Use external thermometer */
-		cm->charger_psy.properties[cm->charger_psy.num_properties] =
+		cm->charger_psy_desc.properties[cm->charger_psy_desc.num_properties] =
 				POWER_SUPPLY_PROP_TEMP_AMBIENT;
-		cm->charger_psy.num_properties++;
+		cm->charger_psy_desc.num_properties++;
 		cm->desc->measure_battery_temp = true;
 		ret = 0;
 	}
@@ -1606,6 +1605,7 @@ static int charger_manager_probe(struct platform_device *pdev)
 	int j = 0;
 	union power_supply_propval val;
 	struct power_supply *fuel_gauge;
+	struct power_supply_config psy_cfg = {};
 
 	if (IS_ERR(desc)) {
 		dev_err(&pdev->dev, "No platform data (desc) found\n");
@@ -1620,6 +1620,7 @@ static int charger_manager_probe(struct platform_device *pdev)
 	/* Basic Values. Unspecified are Null or 0 */
 	cm->dev = &pdev->dev;
 	cm->desc = desc;
+	psy_cfg.drv_data = cm;
 
 	/* Initialize alarm timer */
 	if (alarmtimer_get_rtcdev()) {
@@ -1699,40 +1700,40 @@ static int charger_manager_probe(struct platform_device *pdev)
 
 	platform_set_drvdata(pdev, cm);
 
-	memcpy(&cm->charger_psy, &psy_default, sizeof(psy_default));
+	memcpy(&cm->charger_psy_desc, &psy_default, sizeof(psy_default));
 
 	if (!desc->psy_name)
 		strncpy(cm->psy_name_buf, psy_default.name, PSY_NAME_MAX);
 	else
 		strncpy(cm->psy_name_buf, desc->psy_name, PSY_NAME_MAX);
-	cm->charger_psy.name = cm->psy_name_buf;
+	cm->charger_psy_desc.name = cm->psy_name_buf;
 
 	/* Allocate for psy properties because they may vary */
-	cm->charger_psy.properties = devm_kzalloc(&pdev->dev,
+	cm->charger_psy_desc.properties = devm_kzalloc(&pdev->dev,
 				sizeof(enum power_supply_property)
 				* (ARRAY_SIZE(default_charger_props) +
 				NUM_CHARGER_PSY_OPTIONAL), GFP_KERNEL);
-	if (!cm->charger_psy.properties)
+	if (!cm->charger_psy_desc.properties)
 		return -ENOMEM;
 
-	memcpy(cm->charger_psy.properties, default_charger_props,
+	memcpy(cm->charger_psy_desc.properties, default_charger_props,
 		sizeof(enum power_supply_property) *
 		ARRAY_SIZE(default_charger_props));
-	cm->charger_psy.num_properties = psy_default.num_properties;
+	cm->charger_psy_desc.num_properties = psy_default.num_properties;
 
 	/* Find which optional psy-properties are available */
 	if (!power_supply_get_property(fuel_gauge,
 					  POWER_SUPPLY_PROP_CHARGE_NOW, &val)) {
-		cm->charger_psy.properties[cm->charger_psy.num_properties] =
+		cm->charger_psy_desc.properties[cm->charger_psy_desc.num_properties] =
 				POWER_SUPPLY_PROP_CHARGE_NOW;
-		cm->charger_psy.num_properties++;
+		cm->charger_psy_desc.num_properties++;
 	}
 	if (!power_supply_get_property(fuel_gauge,
 					  POWER_SUPPLY_PROP_CURRENT_NOW,
 					  &val)) {
-		cm->charger_psy.properties[cm->charger_psy.num_properties] =
+		cm->charger_psy_desc.properties[cm->charger_psy_desc.num_properties] =
 				POWER_SUPPLY_PROP_CURRENT_NOW;
-		cm->charger_psy.num_properties++;
+		cm->charger_psy_desc.num_properties++;
 	}
 
 	ret = cm_init_thermal_data(cm, fuel_gauge);
@@ -1743,11 +1744,12 @@ static int charger_manager_probe(struct platform_device *pdev)
 
 	INIT_DELAYED_WORK(&cm->fullbatt_vchk_work, fullbatt_vchk);
 
-	ret = power_supply_register(NULL, &cm->charger_psy, NULL);
-	if (ret) {
+	cm->charger_psy = power_supply_register(NULL, &cm->charger_psy_desc,
+						&psy_cfg);
+	if (IS_ERR(cm->charger_psy)) {
 		dev_err(&pdev->dev, "Cannot register charger-manager with name \"%s\"\n",
-			cm->charger_psy.name);
-		return ret;
+			cm->charger_psy->desc->name);
+		return PTR_ERR(cm->charger_psy);
 	}
 
 	/* Register extcon device for charger cable */
@@ -1793,7 +1795,7 @@ err_reg_sysfs:
 		struct charger_regulator *charger;
 
 		charger = &desc->charger_regulators[i];
-		sysfs_remove_group(&cm->charger_psy.dev->kobj,
+		sysfs_remove_group(&cm->charger_psy->dev.kobj,
 				&charger->attr_g);
 	}
 err_reg_extcon:
@@ -1811,7 +1813,7 @@ err_reg_extcon:
 		regulator_put(desc->charger_regulators[i].consumer);
 	}
 
-	power_supply_unregister(&cm->charger_psy);
+	power_supply_unregister(cm->charger_psy);
 
 	return ret;
 }
@@ -1843,7 +1845,7 @@ static int charger_manager_remove(struct platform_device *pdev)
 	for (i = 0 ; i < desc->num_charger_regulators ; i++)
 		regulator_put(desc->charger_regulators[i].consumer);
 
-	power_supply_unregister(&cm->charger_psy);
+	power_supply_unregister(cm->charger_psy);
 
 	try_charger_enable(cm, false);
 
@@ -2002,7 +2004,7 @@ static bool find_power_supply(struct charger_manager *cm,
 	bool found = false;
 
 	for (i = 0; cm->desc->psy_charger_stat[i]; i++) {
-		if (!strcmp(psy->name, cm->desc->psy_charger_stat[i])) {
+		if (!strcmp(psy->desc->name, cm->desc->psy_charger_stat[i])) {
 			found = true;
 			break;
 		}
diff --git a/drivers/power/collie_battery.c b/drivers/power/collie_battery.c
index e7a808d1758a..2da9ed8ccbb5 100644
--- a/drivers/power/collie_battery.c
+++ b/drivers/power/collie_battery.c
@@ -30,7 +30,7 @@ static int wakeup_enabled;
 
 struct collie_bat {
 	int status;
-	struct power_supply psy;
+	struct power_supply *psy;
 	int full_chrg;
 
 	struct mutex work_lock; /* protects data */
@@ -98,7 +98,7 @@ static int collie_bat_get_property(struct power_supply *psy,
 			    union power_supply_propval *val)
 {
 	int ret = 0;
-	struct collie_bat *bat = container_of(psy, struct collie_bat, psy);
+	struct collie_bat *bat = power_supply_get_drvdata(psy);
 
 	if (bat->is_present && !bat->is_present(bat)
 			&& psp != POWER_SUPPLY_PROP_PRESENT) {
@@ -155,14 +155,14 @@ static irqreturn_t collie_bat_gpio_isr(int irq, void *data)
 static void collie_bat_update(struct collie_bat *bat)
 {
 	int old;
-	struct power_supply *psy = &bat->psy;
+	struct power_supply *psy = bat->psy;
 
 	mutex_lock(&bat->work_lock);
 
 	old = bat->status;
 
 	if (bat->is_present && !bat->is_present(bat)) {
-		printk(KERN_NOTICE "%s not present\n", psy->name);
+		printk(KERN_NOTICE "%s not present\n", psy->desc->name);
 		bat->status = POWER_SUPPLY_STATUS_UNKNOWN;
 		bat->full_chrg = -1;
 	} else if (power_supply_am_i_supplied(psy)) {
@@ -220,18 +220,20 @@ static enum power_supply_property collie_bat_bu_props[] = {
 	POWER_SUPPLY_PROP_PRESENT,
 };
 
+static const struct power_supply_desc collie_bat_main_desc = {
+	.name		= "main-battery",
+	.type		= POWER_SUPPLY_TYPE_BATTERY,
+	.properties	= collie_bat_main_props,
+	.num_properties	= ARRAY_SIZE(collie_bat_main_props),
+	.get_property	= collie_bat_get_property,
+	.external_power_changed = collie_bat_external_power_changed,
+	.use_for_apm	= 1,
+};
+
 static struct collie_bat collie_bat_main = {
 	.status = POWER_SUPPLY_STATUS_DISCHARGING,
 	.full_chrg = -1,
-	.psy = {
-		.name		= "main-battery",
-		.type		= POWER_SUPPLY_TYPE_BATTERY,
-		.properties	= collie_bat_main_props,
-		.num_properties	= ARRAY_SIZE(collie_bat_main_props),
-		.get_property	= collie_bat_get_property,
-		.external_power_changed = collie_bat_external_power_changed,
-		.use_for_apm	= 1,
-	},
+	.psy = NULL,
 
 	.gpio_full = COLLIE_GPIO_CO,
 	.gpio_charge_on = COLLIE_GPIO_CHARGE_ON,
@@ -249,18 +251,19 @@ static struct collie_bat collie_bat_main = {
 	.adc_temp_divider = 10000,
 };
 
+static const struct power_supply_desc collie_bat_bu_desc = {
+	.name		= "backup-battery",
+	.type		= POWER_SUPPLY_TYPE_BATTERY,
+	.properties	= collie_bat_bu_props,
+	.num_properties	= ARRAY_SIZE(collie_bat_bu_props),
+	.get_property	= collie_bat_get_property,
+	.external_power_changed = collie_bat_external_power_changed,
+};
+
 static struct collie_bat collie_bat_bu = {
 	.status = POWER_SUPPLY_STATUS_UNKNOWN,
 	.full_chrg = -1,
-
-	.psy = {
-		.name		= "backup-battery",
-		.type		= POWER_SUPPLY_TYPE_BATTERY,
-		.properties	= collie_bat_bu_props,
-		.num_properties	= ARRAY_SIZE(collie_bat_bu_props),
-		.get_property	= collie_bat_get_property,
-		.external_power_changed = collie_bat_external_power_changed,
-	},
+	.psy = NULL,
 
 	.gpio_full = -1,
 	.gpio_charge_on = -1,
@@ -319,6 +322,7 @@ static int collie_bat_resume(struct ucb1x00_dev *dev)
 static int collie_bat_probe(struct ucb1x00_dev *dev)
 {
 	int ret;
+	struct power_supply_config psy_main_cfg = {}, psy_bu_cfg = {};
 
 	if (!machine_is_collie())
 		return -ENODEV;
@@ -334,12 +338,23 @@ static int collie_bat_probe(struct ucb1x00_dev *dev)
 
 	INIT_WORK(&bat_work, collie_bat_work);
 
-	ret = power_supply_register(&dev->ucb->dev, &collie_bat_main.psy, NULL);
-	if (ret)
+	psy_main_cfg.drv_data = &collie_bat_main;
+	collie_bat_main.psy = power_supply_register(&dev->ucb->dev,
+						    &collie_bat_main_desc,
+						    &psy_main_cfg);
+	if (IS_ERR(collie_bat_main.psy)) {
+		ret = PTR_ERR(collie_bat_main.psy);
 		goto err_psy_reg_main;
-	ret = power_supply_register(&dev->ucb->dev, &collie_bat_bu.psy, NULL);
-	if (ret)
+	}
+
+	psy_main_cfg.drv_data = &collie_bat_bu;
+	collie_bat_bu.psy = power_supply_register(&dev->ucb->dev,
+						  &collie_bat_bu_desc,
+						  &psy_bu_cfg);
+	if (IS_ERR(collie_bat_bu.psy)) {
+		ret = PTR_ERR(collie_bat_bu.psy);
 		goto err_psy_reg_bu;
+	}
 
 	ret = request_irq(gpio_to_irq(COLLIE_GPIO_CO),
 				collie_bat_gpio_isr,
@@ -354,9 +369,9 @@ static int collie_bat_probe(struct ucb1x00_dev *dev)
 	return 0;
 
 err_irq:
-	power_supply_unregister(&collie_bat_bu.psy);
+	power_supply_unregister(collie_bat_bu.psy);
 err_psy_reg_bu:
-	power_supply_unregister(&collie_bat_main.psy);
+	power_supply_unregister(collie_bat_main.psy);
 err_psy_reg_main:
 
 	/* see comment in collie_bat_remove */
@@ -369,8 +384,8 @@ static void collie_bat_remove(struct ucb1x00_dev *dev)
 {
 	free_irq(gpio_to_irq(COLLIE_GPIO_CO), &collie_bat_main);
 
-	power_supply_unregister(&collie_bat_bu.psy);
-	power_supply_unregister(&collie_bat_main.psy);
+	power_supply_unregister(collie_bat_bu.psy);
+	power_supply_unregister(collie_bat_main.psy);
 
 	/*
 	 * Now cancel the bat_work.  We won't get any more schedules,
diff --git a/drivers/power/da9030_battery.c b/drivers/power/da9030_battery.c
index a87406ef18ee..5ca0f4d90792 100644
--- a/drivers/power/da9030_battery.c
+++ b/drivers/power/da9030_battery.c
@@ -89,7 +89,8 @@ struct da9030_battery_thresholds {
 };
 
 struct da9030_charger {
-	struct power_supply psy;
+	struct power_supply *psy;
+	struct power_supply_desc psy_desc;
 
 	struct device *master;
 
@@ -245,7 +246,7 @@ static void da9030_set_charge(struct da9030_charger *charger, int on)
 
 	da903x_write(charger->master, DA9030_CHARGE_CONTROL, val);
 
-	power_supply_changed(&charger->psy);
+	power_supply_changed(charger->psy);
 }
 
 static void da9030_charger_check_state(struct da9030_charger *charger)
@@ -341,8 +342,7 @@ static int da9030_battery_get_property(struct power_supply *psy,
 				   enum power_supply_property psp,
 				   union power_supply_propval *val)
 {
-	struct da9030_charger *charger;
-	charger = container_of(psy, struct da9030_charger, psy);
+	struct da9030_charger *charger = power_supply_get_drvdata(psy);
 
 	switch (psp) {
 	case POWER_SUPPLY_PROP_STATUS:
@@ -447,16 +447,16 @@ static void da9030_battery_convert_thresholds(struct da9030_charger *charger,
 
 static void da9030_battery_setup_psy(struct da9030_charger *charger)
 {
-	struct power_supply *psy = &charger->psy;
+	struct power_supply_desc *psy_desc = &charger->psy_desc;
 	struct power_supply_info *info = charger->battery_info;
 
-	psy->name = info->name;
-	psy->use_for_apm = info->use_for_apm;
-	psy->type = POWER_SUPPLY_TYPE_BATTERY;
-	psy->get_property = da9030_battery_get_property;
+	psy_desc->name = info->name;
+	psy_desc->use_for_apm = info->use_for_apm;
+	psy_desc->type = POWER_SUPPLY_TYPE_BATTERY;
+	psy_desc->get_property = da9030_battery_get_property;
 
-	psy->properties = da9030_battery_props;
-	psy->num_properties = ARRAY_SIZE(da9030_battery_props);
+	psy_desc->properties = da9030_battery_props;
+	psy_desc->num_properties = ARRAY_SIZE(da9030_battery_props);
 };
 
 static int da9030_battery_charger_init(struct da9030_charger *charger)
@@ -494,6 +494,7 @@ static int da9030_battery_charger_init(struct da9030_charger *charger)
 static int da9030_battery_probe(struct platform_device *pdev)
 {
 	struct da9030_charger *charger;
+	struct power_supply_config psy_cfg = {};
 	struct da9030_battery_info *pdata = pdev->dev.platform_data;
 	int ret;
 
@@ -541,9 +542,13 @@ static int da9030_battery_probe(struct platform_device *pdev)
 		goto err_notifier;
 
 	da9030_battery_setup_psy(charger);
-	ret = power_supply_register(&pdev->dev, &charger->psy, NULL);
-	if (ret)
+	psy_cfg.drv_data = charger;
+	charger->psy = power_supply_register(&pdev->dev, &charger->psy_desc,
+					     &psy_cfg);
+	if (IS_ERR(charger->psy)) {
+		ret = PTR_ERR(charger->psy);
 		goto err_ps_register;
+	}
 
 	charger->debug_file = da9030_bat_create_debugfs(charger);
 	platform_set_drvdata(pdev, charger);
@@ -571,7 +576,7 @@ static int da9030_battery_remove(struct platform_device *dev)
 				   DA9030_EVENT_CHIOVER | DA9030_EVENT_TBAT);
 	cancel_delayed_work_sync(&charger->work);
 	da9030_set_charge(charger, 0);
-	power_supply_unregister(&charger->psy);
+	power_supply_unregister(charger->psy);
 
 	return 0;
 }
diff --git a/drivers/power/da9052-battery.c b/drivers/power/da9052-battery.c
index 54ba9ddb6d4f..830ec46fe7d0 100644
--- a/drivers/power/da9052-battery.c
+++ b/drivers/power/da9052-battery.c
@@ -169,7 +169,7 @@ static u32 const vc_tbl[3][68][2] = {
 
 struct da9052_battery {
 	struct da9052 *da9052;
-	struct power_supply psy;
+	struct power_supply *psy;
 	struct notifier_block nb;
 	int charger_type;
 	int status;
@@ -452,7 +452,7 @@ static irqreturn_t da9052_bat_irq(int irq, void *data)
 
 	if (irq == DA9052_IRQ_CHGEND || irq == DA9052_IRQ_DCIN ||
 	    irq == DA9052_IRQ_VBUS || irq == DA9052_IRQ_TBAT) {
-		power_supply_changed(&bat->psy);
+		power_supply_changed(bat->psy);
 	}
 
 	return IRQ_HANDLED;
@@ -499,8 +499,7 @@ static int da9052_bat_get_property(struct power_supply *psy,
 {
 	int ret;
 	int illegal;
-	struct da9052_battery *bat = container_of(psy, struct da9052_battery,
-						  psy);
+	struct da9052_battery *bat = power_supply_get_drvdata(psy);
 
 	ret = da9052_bat_check_presence(bat, &illegal);
 	if (ret < 0)
@@ -561,7 +560,7 @@ static enum power_supply_property da9052_bat_props[] = {
 	POWER_SUPPLY_PROP_TECHNOLOGY,
 };
 
-static struct power_supply template_battery = {
+static struct power_supply_desc psy_desc = {
 	.name		= "da9052-bat",
 	.type		= POWER_SUPPLY_TYPE_BATTERY,
 	.properties	= da9052_bat_props,
@@ -591,6 +590,7 @@ static s32 da9052_bat_probe(struct platform_device *pdev)
 {
 	struct da9052_pdata *pdata;
 	struct da9052_battery *bat;
+	struct power_supply_config psy_cfg = {};
 	int ret;
 	int i;
 
@@ -599,8 +599,9 @@ static s32 da9052_bat_probe(struct platform_device *pdev)
 	if (!bat)
 		return -ENOMEM;
 
+	psy_cfg.drv_data = bat;
+
 	bat->da9052 = dev_get_drvdata(pdev->dev.parent);
-	bat->psy = template_battery;
 	bat->charger_type = DA9052_NOCHARGER;
 	bat->status = POWER_SUPPLY_STATUS_UNKNOWN;
 	bat->health = POWER_SUPPLY_HEALTH_UNKNOWN;
@@ -608,9 +609,9 @@ static s32 da9052_bat_probe(struct platform_device *pdev)
 
 	pdata = bat->da9052->dev->platform_data;
 	if (pdata != NULL && pdata->use_for_apm)
-		bat->psy.use_for_apm = pdata->use_for_apm;
+		psy_desc.use_for_apm = pdata->use_for_apm;
 	else
-		bat->psy.use_for_apm = 1;
+		psy_desc.use_for_apm = 1;
 
 	for (i = 0; i < ARRAY_SIZE(da9052_bat_irqs); i++) {
 		ret = da9052_request_irq(bat->da9052,
@@ -625,9 +626,11 @@ static s32 da9052_bat_probe(struct platform_device *pdev)
 		}
 	}
 
-	ret = power_supply_register(&pdev->dev, &bat->psy, NULL);
-	 if (ret)
+	bat->psy = power_supply_register(&pdev->dev, &psy_desc, &psy_cfg);
+	if (IS_ERR(bat->psy)) {
+		ret = PTR_ERR(bat->psy);
 		goto err;
+	}
 
 	platform_set_drvdata(pdev, bat);
 	return 0;
@@ -646,7 +649,7 @@ static int da9052_bat_remove(struct platform_device *pdev)
 	for (i = 0; i < ARRAY_SIZE(da9052_bat_irqs); i++)
 		da9052_free_irq(bat->da9052, da9052_bat_irq_bits[i], bat);
 
-	power_supply_unregister(&bat->psy);
+	power_supply_unregister(bat->psy);
 
 	return 0;
 }
diff --git a/drivers/power/da9150-charger.c b/drivers/power/da9150-charger.c
index db8ba5d8d1e3..60099815296e 100644
--- a/drivers/power/da9150-charger.c
+++ b/drivers/power/da9150-charger.c
@@ -30,8 +30,8 @@ struct da9150_charger {
 	struct da9150 *da9150;
 	struct device *dev;
 
-	struct power_supply usb;
-	struct power_supply battery;
+	struct power_supply *usb;
+	struct power_supply *battery;
 	struct power_supply *supply_online;
 
 	struct usb_phy *usb_phy;
@@ -114,7 +114,7 @@ static int da9150_charger_get_prop(struct power_supply *psy,
 				   enum power_supply_property psp,
 				   union power_supply_propval *val)
 {
-	struct da9150_charger *charger = dev_get_drvdata(psy->dev->parent);
+	struct da9150_charger *charger = dev_get_drvdata(psy->dev.parent);
 	int ret;
 
 	switch (psp) {
@@ -326,7 +326,7 @@ static int da9150_charger_battery_get_prop(struct power_supply *psy,
 					   enum power_supply_property psp,
 					   union power_supply_propval *val)
 {
-	struct da9150_charger *charger = dev_get_drvdata(psy->dev->parent);
+	struct da9150_charger *charger = dev_get_drvdata(psy->dev.parent);
 	int ret;
 
 	switch (psp) {
@@ -369,7 +369,7 @@ static irqreturn_t da9150_charger_chg_irq(int irq, void *data)
 {
 	struct da9150_charger *charger = data;
 
-	power_supply_changed(&charger->battery);
+	power_supply_changed(charger->battery);
 
 	return IRQ_HANDLED;
 }
@@ -380,7 +380,7 @@ static irqreturn_t da9150_charger_tjunc_irq(int irq, void *data)
 
 	/* Nothing we can really do except report this. */
 	dev_crit(charger->dev, "TJunc over temperature!!!\n");
-	power_supply_changed(&charger->usb);
+	power_supply_changed(charger->usb);
 
 	return IRQ_HANDLED;
 }
@@ -391,8 +391,8 @@ static irqreturn_t da9150_charger_vfault_irq(int irq, void *data)
 
 	/* Nothing we can really do except report this. */
 	dev_crit(charger->dev, "VSYS under voltage!!!\n");
-	power_supply_changed(&charger->usb);
-	power_supply_changed(&charger->battery);
+	power_supply_changed(charger->usb);
+	power_supply_changed(charger->battery);
 
 	return IRQ_HANDLED;
 }
@@ -408,10 +408,10 @@ static irqreturn_t da9150_charger_vbus_irq(int irq, void *data)
 	switch (reg & DA9150_VBUS_STAT_MASK) {
 	case DA9150_VBUS_STAT_OFF:
 	case DA9150_VBUS_STAT_WAIT:
-		charger->supply_online = &charger->battery;
+		charger->supply_online = charger->battery;
 		break;
 	case DA9150_VBUS_STAT_CHG:
-		charger->supply_online = &charger->usb;
+		charger->supply_online = charger->usb;
 		break;
 	default:
 		dev_warn(charger->dev, "Unknown VBUS state - reg = 0x%x\n",
@@ -420,8 +420,8 @@ static irqreturn_t da9150_charger_vbus_irq(int irq, void *data)
 		break;
 	}
 
-	power_supply_changed(&charger->usb);
-	power_supply_changed(&charger->battery);
+	power_supply_changed(charger->usb);
+	power_supply_changed(charger->battery);
 
 	return IRQ_HANDLED;
 }
@@ -439,8 +439,8 @@ static void da9150_charger_otg_work(struct work_struct *data)
 		break;
 	case USB_EVENT_NONE:
 		/* Revert to charge mode */
-		power_supply_changed(&charger->usb);
-		power_supply_changed(&charger->battery);
+		power_supply_changed(charger->usb);
+		power_supply_changed(charger->battery);
 		da9150_set_bits(charger->da9150, DA9150_PPR_BKCTRL_A,
 				DA9150_VBUS_MODE_MASK, DA9150_VBUS_MODE_CHG);
 		break;
@@ -499,12 +499,27 @@ static void da9150_charger_unregister_irq(struct platform_device *pdev,
 	free_irq(irq, charger);
 }
 
+static const struct power_supply_desc usb_desc = {
+	.name		= "da9150-usb",
+	.type		= POWER_SUPPLY_TYPE_USB,
+	.properties	= da9150_charger_props,
+	.num_properties	= ARRAY_SIZE(da9150_charger_props),
+	.get_property	= da9150_charger_get_prop,
+};
+
+static const struct power_supply_desc battery_desc = {
+	.name		= "da9150-battery",
+	.type		= POWER_SUPPLY_TYPE_BATTERY,
+	.properties	= da9150_charger_bat_props,
+	.num_properties	= ARRAY_SIZE(da9150_charger_bat_props),
+	.get_property	= da9150_charger_battery_get_prop,
+};
+
 static int da9150_charger_probe(struct platform_device *pdev)
 {
 	struct device *dev = &pdev->dev;
 	struct da9150 *da9150 = dev_get_drvdata(dev->parent);
 	struct da9150_charger *charger;
-	struct power_supply *usb, *battery;
 	u8 reg;
 	int ret;
 
@@ -542,26 +557,17 @@ static int da9150_charger_probe(struct platform_device *pdev)
 	}
 
 	/* Register power supplies */
-	usb = &charger->usb;
-	battery = &charger->battery;
-
-	usb->name = "da9150-usb",
-	usb->type = POWER_SUPPLY_TYPE_USB;
-	usb->properties = da9150_charger_props;
-	usb->num_properties = ARRAY_SIZE(da9150_charger_props);
-	usb->get_property = da9150_charger_get_prop;
-	ret = power_supply_register(dev, usb, NULL);
-	if (ret)
+	charger->usb = power_supply_register(dev, &usb_desc, NULL);
+	if (IS_ERR(charger->usb)) {
+		ret = PTR_ERR(charger->usb);
 		goto usb_fail;
+	}
 
-	battery->name = "da9150-battery";
-	battery->type = POWER_SUPPLY_TYPE_BATTERY;
-	battery->properties = da9150_charger_bat_props;
-	battery->num_properties = ARRAY_SIZE(da9150_charger_bat_props);
-	battery->get_property = da9150_charger_battery_get_prop;
-	ret = power_supply_register(dev, battery, NULL);
-	if (ret)
+	charger->battery = power_supply_register(dev, &battery_desc, NULL);
+	if (IS_ERR(charger->battery)) {
+		ret = PTR_ERR(charger->battery);
 		goto battery_fail;
+	}
 
 	/* Get initial online supply */
 	reg = da9150_reg_read(da9150, DA9150_STATUS_H);
@@ -569,10 +575,10 @@ static int da9150_charger_probe(struct platform_device *pdev)
 	switch (reg & DA9150_VBUS_STAT_MASK) {
 	case DA9150_VBUS_STAT_OFF:
 	case DA9150_VBUS_STAT_WAIT:
-		charger->supply_online = &charger->battery;
+		charger->supply_online = charger->battery;
 		break;
 	case DA9150_VBUS_STAT_CHG:
-		charger->supply_online = &charger->usb;
+		charger->supply_online = charger->usb;
 		break;
 	default:
 		dev_warn(dev, "Unknown VBUS state - reg = 0x%x\n", reg);
@@ -622,7 +628,7 @@ chg_irq_fail:
 	if (!IS_ERR_OR_NULL(charger->usb_phy))
 		usb_unregister_notifier(charger->usb_phy, &charger->otg_nb);
 battery_fail:
-	power_supply_unregister(usb);
+	power_supply_unregister(charger->usb);
 
 usb_fail:
 	iio_channel_release(charger->vbat_chan);
@@ -661,8 +667,8 @@ static int da9150_charger_remove(struct platform_device *pdev)
 	if (!IS_ERR_OR_NULL(charger->usb_phy))
 		usb_unregister_notifier(charger->usb_phy, &charger->otg_nb);
 
-	power_supply_unregister(&charger->battery);
-	power_supply_unregister(&charger->usb);
+	power_supply_unregister(charger->battery);
+	power_supply_unregister(charger->usb);
 
 	/* Release ADC channels */
 	iio_channel_release(charger->ibus_chan);
diff --git a/drivers/power/ds2760_battery.c b/drivers/power/ds2760_battery.c
index e82dff0bbb20..80f73ccb77ab 100644
--- a/drivers/power/ds2760_battery.c
+++ b/drivers/power/ds2760_battery.c
@@ -53,7 +53,8 @@ struct ds2760_device_info {
 	int charge_status;		/* POWER_SUPPLY_STATUS_* */
 
 	int full_counter;
-	struct power_supply bat;
+	struct power_supply *bat;
+	struct power_supply_desc bat_desc;
 	struct device *w1_dev;
 	struct workqueue_struct *monitor_wqueue;
 	struct delayed_work monitor_work;
@@ -254,7 +255,7 @@ static void ds2760_battery_update_status(struct ds2760_device_info *di)
 	if (di->charge_status == POWER_SUPPLY_STATUS_UNKNOWN)
 		di->full_counter = 0;
 
-	if (power_supply_am_i_supplied(&di->bat)) {
+	if (power_supply_am_i_supplied(di->bat)) {
 		if (di->current_uA > 10000) {
 			di->charge_status = POWER_SUPPLY_STATUS_CHARGING;
 			di->full_counter = 0;
@@ -287,7 +288,7 @@ static void ds2760_battery_update_status(struct ds2760_device_info *di)
 	}
 
 	if (di->charge_status != old_charge_status)
-		power_supply_changed(&di->bat);
+		power_supply_changed(di->bat);
 }
 
 static void ds2760_battery_write_status(struct ds2760_device_info *di,
@@ -346,12 +347,9 @@ static void ds2760_battery_work(struct work_struct *work)
 	queue_delayed_work(di->monitor_wqueue, &di->monitor_work, interval);
 }
 
-#define to_ds2760_device_info(x) container_of((x), struct ds2760_device_info, \
-					      bat);
-
 static void ds2760_battery_external_power_changed(struct power_supply *psy)
 {
-	struct ds2760_device_info *di = to_ds2760_device_info(psy);
+	struct ds2760_device_info *di = power_supply_get_drvdata(psy);
 
 	dev_dbg(di->dev, "%s\n", __func__);
 
@@ -377,7 +375,7 @@ static void ds2760_battery_set_charged_work(struct work_struct *work)
 	 * that error.
 	 */
 
-	if (!power_supply_am_i_supplied(&di->bat))
+	if (!power_supply_am_i_supplied(di->bat))
 		return;
 
 	bias = (signed char) di->current_raw +
@@ -396,7 +394,7 @@ static void ds2760_battery_set_charged_work(struct work_struct *work)
 
 static void ds2760_battery_set_charged(struct power_supply *psy)
 {
-	struct ds2760_device_info *di = to_ds2760_device_info(psy);
+	struct ds2760_device_info *di = power_supply_get_drvdata(psy);
 
 	/* postpone the actual work by 20 secs. This is for debouncing GPIO
 	 * signals and to let the current value settle. See AN4188. */
@@ -407,7 +405,7 @@ static int ds2760_battery_get_property(struct power_supply *psy,
 				       enum power_supply_property psp,
 				       union power_supply_propval *val)
 {
-	struct ds2760_device_info *di = to_ds2760_device_info(psy);
+	struct ds2760_device_info *di = power_supply_get_drvdata(psy);
 
 	switch (psp) {
 	case POWER_SUPPLY_PROP_STATUS:
@@ -458,7 +456,7 @@ static int ds2760_battery_set_property(struct power_supply *psy,
 				       enum power_supply_property psp,
 				       const union power_supply_propval *val)
 {
-	struct ds2760_device_info *di = to_ds2760_device_info(psy);
+	struct ds2760_device_info *di = power_supply_get_drvdata(psy);
 
 	switch (psp) {
 	case POWER_SUPPLY_PROP_CHARGE_FULL:
@@ -508,6 +506,7 @@ static enum power_supply_property ds2760_battery_props[] = {
 
 static int ds2760_battery_probe(struct platform_device *pdev)
 {
+	struct power_supply_config psy_cfg = {};
 	char status;
 	int retval = 0;
 	struct ds2760_device_info *di;
@@ -520,20 +519,22 @@ static int ds2760_battery_probe(struct platform_device *pdev)
 
 	platform_set_drvdata(pdev, di);
 
-	di->dev			= &pdev->dev;
-	di->w1_dev		= pdev->dev.parent;
-	di->bat.name		= dev_name(&pdev->dev);
-	di->bat.type		= POWER_SUPPLY_TYPE_BATTERY;
-	di->bat.properties	= ds2760_battery_props;
-	di->bat.num_properties	= ARRAY_SIZE(ds2760_battery_props);
-	di->bat.get_property	= ds2760_battery_get_property;
-	di->bat.set_property	= ds2760_battery_set_property;
-	di->bat.property_is_writeable =
+	di->dev				= &pdev->dev;
+	di->w1_dev			= pdev->dev.parent;
+	di->bat_desc.name		= dev_name(&pdev->dev);
+	di->bat_desc.type		= POWER_SUPPLY_TYPE_BATTERY;
+	di->bat_desc.properties		= ds2760_battery_props;
+	di->bat_desc.num_properties	= ARRAY_SIZE(ds2760_battery_props);
+	di->bat_desc.get_property	= ds2760_battery_get_property;
+	di->bat_desc.set_property	= ds2760_battery_set_property;
+	di->bat_desc.property_is_writeable =
 				  ds2760_battery_property_is_writeable;
-	di->bat.set_charged	= ds2760_battery_set_charged;
-	di->bat.external_power_changed =
+	di->bat_desc.set_charged	= ds2760_battery_set_charged;
+	di->bat_desc.external_power_changed =
 				  ds2760_battery_external_power_changed;
 
+	psy_cfg.drv_data		= di;
+
 	di->charge_status = POWER_SUPPLY_STATUS_UNKNOWN;
 
 	/* enable sleep mode feature */
@@ -555,9 +556,10 @@ static int ds2760_battery_probe(struct platform_device *pdev)
 	if (current_accum)
 		ds2760_battery_set_current_accum(di, current_accum);
 
-	retval = power_supply_register(&pdev->dev, &di->bat, NULL);
-	if (retval) {
+	di->bat = power_supply_register(&pdev->dev, &di->bat_desc, &psy_cfg);
+	if (IS_ERR(di->bat)) {
 		dev_err(di->dev, "failed to register battery\n");
+		retval = PTR_ERR(di->bat);
 		goto batt_failed;
 	}
 
@@ -574,7 +576,7 @@ static int ds2760_battery_probe(struct platform_device *pdev)
 	goto success;
 
 workqueue_failed:
-	power_supply_unregister(&di->bat);
+	power_supply_unregister(di->bat);
 batt_failed:
 di_alloc_failed:
 success:
@@ -588,7 +590,7 @@ static int ds2760_battery_remove(struct platform_device *pdev)
 	cancel_delayed_work_sync(&di->monitor_work);
 	cancel_delayed_work_sync(&di->set_charged_work);
 	destroy_workqueue(di->monitor_wqueue);
-	power_supply_unregister(&di->bat);
+	power_supply_unregister(di->bat);
 
 	return 0;
 }
@@ -610,7 +612,7 @@ static int ds2760_battery_resume(struct platform_device *pdev)
 	struct ds2760_device_info *di = platform_get_drvdata(pdev);
 
 	di->charge_status = POWER_SUPPLY_STATUS_UNKNOWN;
-	power_supply_changed(&di->bat);
+	power_supply_changed(di->bat);
 
 	mod_delayed_work(di->monitor_wqueue, &di->monitor_work, HZ);
 
diff --git a/drivers/power/ds2780_battery.c b/drivers/power/ds2780_battery.c
index b1d3570ea730..a7a0427343f3 100644
--- a/drivers/power/ds2780_battery.c
+++ b/drivers/power/ds2780_battery.c
@@ -37,7 +37,8 @@
 
 struct ds2780_device_info {
 	struct device *dev;
-	struct power_supply bat;
+	struct power_supply *bat;
+	struct power_supply_desc bat_desc;
 	struct device *w1_dev;
 };
 
@@ -52,7 +53,7 @@ static const char manufacturer[] = "Maxim/Dallas";
 static inline struct ds2780_device_info *
 to_ds2780_device_info(struct power_supply *psy)
 {
-	return container_of(psy, struct ds2780_device_info, bat);
+	return power_supply_get_drvdata(psy);
 }
 
 static inline struct power_supply *to_power_supply(struct device *dev)
@@ -757,6 +758,7 @@ static const struct attribute_group ds2780_attr_group = {
 
 static int ds2780_battery_probe(struct platform_device *pdev)
 {
+	struct power_supply_config psy_cfg = {};
 	int ret = 0;
 	struct ds2780_device_info *dev_info;
 
@@ -770,25 +772,29 @@ static int ds2780_battery_probe(struct platform_device *pdev)
 
 	dev_info->dev			= &pdev->dev;
 	dev_info->w1_dev		= pdev->dev.parent;
-	dev_info->bat.name		= dev_name(&pdev->dev);
-	dev_info->bat.type		= POWER_SUPPLY_TYPE_BATTERY;
-	dev_info->bat.properties	= ds2780_battery_props;
-	dev_info->bat.num_properties	= ARRAY_SIZE(ds2780_battery_props);
-	dev_info->bat.get_property	= ds2780_battery_get_property;
+	dev_info->bat_desc.name		= dev_name(&pdev->dev);
+	dev_info->bat_desc.type		= POWER_SUPPLY_TYPE_BATTERY;
+	dev_info->bat_desc.properties	= ds2780_battery_props;
+	dev_info->bat_desc.num_properties = ARRAY_SIZE(ds2780_battery_props);
+	dev_info->bat_desc.get_property	= ds2780_battery_get_property;
 
-	ret = power_supply_register(&pdev->dev, &dev_info->bat, NULL);
-	if (ret) {
+	psy_cfg.drv_data		= dev_info;
+
+	dev_info->bat = power_supply_register(&pdev->dev, &dev_info->bat_desc,
+					      &psy_cfg);
+	if (IS_ERR(dev_info->bat)) {
 		dev_err(dev_info->dev, "failed to register battery\n");
+		ret = PTR_ERR(dev_info->bat);
 		goto fail;
 	}
 
-	ret = sysfs_create_group(&dev_info->bat.dev->kobj, &ds2780_attr_group);
+	ret = sysfs_create_group(&dev_info->bat->dev.kobj, &ds2780_attr_group);
 	if (ret) {
 		dev_err(dev_info->dev, "failed to create sysfs group\n");
 		goto fail_unregister;
 	}
 
-	ret = sysfs_create_bin_file(&dev_info->bat.dev->kobj,
+	ret = sysfs_create_bin_file(&dev_info->bat->dev.kobj,
 					&ds2780_param_eeprom_bin_attr);
 	if (ret) {
 		dev_err(dev_info->dev,
@@ -796,7 +802,7 @@ static int ds2780_battery_probe(struct platform_device *pdev)
 		goto fail_remove_group;
 	}
 
-	ret = sysfs_create_bin_file(&dev_info->bat.dev->kobj,
+	ret = sysfs_create_bin_file(&dev_info->bat->dev.kobj,
 					&ds2780_user_eeprom_bin_attr);
 	if (ret) {
 		dev_err(dev_info->dev,
@@ -807,12 +813,12 @@ static int ds2780_battery_probe(struct platform_device *pdev)
 	return 0;
 
 fail_remove_bin_file:
-	sysfs_remove_bin_file(&dev_info->bat.dev->kobj,
+	sysfs_remove_bin_file(&dev_info->bat->dev.kobj,
 				&ds2780_param_eeprom_bin_attr);
 fail_remove_group:
-	sysfs_remove_group(&dev_info->bat.dev->kobj, &ds2780_attr_group);
+	sysfs_remove_group(&dev_info->bat->dev.kobj, &ds2780_attr_group);
 fail_unregister:
-	power_supply_unregister(&dev_info->bat);
+	power_supply_unregister(dev_info->bat);
 fail:
 	return ret;
 }
@@ -821,10 +827,13 @@ static int ds2780_battery_remove(struct platform_device *pdev)
 {
 	struct ds2780_device_info *dev_info = platform_get_drvdata(pdev);
 
-	/* remove attributes */
-	sysfs_remove_group(&dev_info->bat.dev->kobj, &ds2780_attr_group);
+	/*
+	 * Remove attributes before unregistering power supply
+	 * because 'bat' will be freed on power_supply_unregister() call.
+	 */
+	sysfs_remove_group(&dev_info->bat->dev.kobj, &ds2780_attr_group);
 
-	power_supply_unregister(&dev_info->bat);
+	power_supply_unregister(dev_info->bat);
 
 	return 0;
 }
diff --git a/drivers/power/ds2781_battery.c b/drivers/power/ds2781_battery.c
index 50686dc59711..56d583dae908 100644
--- a/drivers/power/ds2781_battery.c
+++ b/drivers/power/ds2781_battery.c
@@ -35,7 +35,8 @@
 
 struct ds2781_device_info {
 	struct device *dev;
-	struct power_supply bat;
+	struct power_supply *bat;
+	struct power_supply_desc bat_desc;
 	struct device *w1_dev;
 };
 
@@ -50,7 +51,7 @@ static const char manufacturer[] = "Maxim/Dallas";
 static inline struct ds2781_device_info *
 to_ds2781_device_info(struct power_supply *psy)
 {
-	return container_of(psy, struct ds2781_device_info, bat);
+	return power_supply_get_drvdata(psy);
 }
 
 static inline struct power_supply *to_power_supply(struct device *dev)
@@ -328,7 +329,7 @@ static int ds2781_get_status(struct ds2781_device_info *dev_info, int *status)
 	if (ret < 0)
 		return ret;
 
-	if (power_supply_am_i_supplied(&dev_info->bat)) {
+	if (power_supply_am_i_supplied(dev_info->bat)) {
 		if (capacity == 100)
 			*status = POWER_SUPPLY_STATUS_FULL;
 		else if (current_uA > 50000)
@@ -752,6 +753,7 @@ static const struct attribute_group ds2781_attr_group = {
 
 static int ds2781_battery_probe(struct platform_device *pdev)
 {
+	struct power_supply_config psy_cfg = {};
 	int ret = 0;
 	struct ds2781_device_info *dev_info;
 
@@ -763,25 +765,29 @@ static int ds2781_battery_probe(struct platform_device *pdev)
 
 	dev_info->dev			= &pdev->dev;
 	dev_info->w1_dev		= pdev->dev.parent;
-	dev_info->bat.name		= dev_name(&pdev->dev);
-	dev_info->bat.type		= POWER_SUPPLY_TYPE_BATTERY;
-	dev_info->bat.properties	= ds2781_battery_props;
-	dev_info->bat.num_properties	= ARRAY_SIZE(ds2781_battery_props);
-	dev_info->bat.get_property	= ds2781_battery_get_property;
+	dev_info->bat_desc.name		= dev_name(&pdev->dev);
+	dev_info->bat_desc.type		= POWER_SUPPLY_TYPE_BATTERY;
+	dev_info->bat_desc.properties	= ds2781_battery_props;
+	dev_info->bat_desc.num_properties = ARRAY_SIZE(ds2781_battery_props);
+	dev_info->bat_desc.get_property	= ds2781_battery_get_property;
 
-	ret = power_supply_register(&pdev->dev, &dev_info->bat, NULL);
-	if (ret) {
+	psy_cfg.drv_data		= dev_info;
+
+	dev_info->bat = power_supply_register(&pdev->dev, &dev_info->bat_desc,
+						&psy_cfg);
+	if (IS_ERR(dev_info->bat)) {
 		dev_err(dev_info->dev, "failed to register battery\n");
+		ret = PTR_ERR(dev_info->bat);
 		goto fail;
 	}
 
-	ret = sysfs_create_group(&dev_info->bat.dev->kobj, &ds2781_attr_group);
+	ret = sysfs_create_group(&dev_info->bat->dev.kobj, &ds2781_attr_group);
 	if (ret) {
 		dev_err(dev_info->dev, "failed to create sysfs group\n");
 		goto fail_unregister;
 	}
 
-	ret = sysfs_create_bin_file(&dev_info->bat.dev->kobj,
+	ret = sysfs_create_bin_file(&dev_info->bat->dev.kobj,
 					&ds2781_param_eeprom_bin_attr);
 	if (ret) {
 		dev_err(dev_info->dev,
@@ -789,7 +795,7 @@ static int ds2781_battery_probe(struct platform_device *pdev)
 		goto fail_remove_group;
 	}
 
-	ret = sysfs_create_bin_file(&dev_info->bat.dev->kobj,
+	ret = sysfs_create_bin_file(&dev_info->bat->dev.kobj,
 					&ds2781_user_eeprom_bin_attr);
 	if (ret) {
 		dev_err(dev_info->dev,
@@ -800,12 +806,12 @@ static int ds2781_battery_probe(struct platform_device *pdev)
 	return 0;
 
 fail_remove_bin_file:
-	sysfs_remove_bin_file(&dev_info->bat.dev->kobj,
+	sysfs_remove_bin_file(&dev_info->bat->dev.kobj,
 				&ds2781_param_eeprom_bin_attr);
 fail_remove_group:
-	sysfs_remove_group(&dev_info->bat.dev->kobj, &ds2781_attr_group);
+	sysfs_remove_group(&dev_info->bat->dev.kobj, &ds2781_attr_group);
 fail_unregister:
-	power_supply_unregister(&dev_info->bat);
+	power_supply_unregister(dev_info->bat);
 fail:
 	return ret;
 }
@@ -814,10 +820,13 @@ static int ds2781_battery_remove(struct platform_device *pdev)
 {
 	struct ds2781_device_info *dev_info = platform_get_drvdata(pdev);
 
-	/* remove attributes */
-	sysfs_remove_group(&dev_info->bat.dev->kobj, &ds2781_attr_group);
+	/*
+	 * Remove attributes before unregistering power supply
+	 * because 'bat' will be freed on power_supply_unregister() call.
+	 */
+	sysfs_remove_group(&dev_info->bat->dev.kobj, &ds2781_attr_group);
 
-	power_supply_unregister(&dev_info->bat);
+	power_supply_unregister(dev_info->bat);
 
 	return 0;
 }
diff --git a/drivers/power/ds2782_battery.c b/drivers/power/ds2782_battery.c
index 2dcb96a83cee..ed4d756d21e4 100644
--- a/drivers/power/ds2782_battery.c
+++ b/drivers/power/ds2782_battery.c
@@ -53,11 +53,12 @@ struct ds278x_battery_ops {
 	int (*get_battery_capacity)(struct ds278x_info *info, int *capacity);
 };
 
-#define to_ds278x_info(x) container_of(x, struct ds278x_info, battery)
+#define to_ds278x_info(x) power_supply_get_drvdata(x)
 
 struct ds278x_info {
 	struct i2c_client	*client;
-	struct power_supply	battery;
+	struct power_supply	*battery;
+	struct power_supply_desc	battery_desc;
 	struct ds278x_battery_ops  *ops;
 	struct delayed_work	bat_work;
 	int			id;
@@ -285,7 +286,7 @@ static void ds278x_bat_update(struct ds278x_info *info)
 	ds278x_get_status(info, &info->status);
 
 	if ((old_status != info->status) || (old_capacity != info->capacity))
-		power_supply_changed(&info->battery);
+		power_supply_changed(info->battery);
 }
 
 static void ds278x_bat_work(struct work_struct *work)
@@ -306,7 +307,7 @@ static enum power_supply_property ds278x_battery_props[] = {
 	POWER_SUPPLY_PROP_TEMP,
 };
 
-static void ds278x_power_supply_init(struct power_supply *battery)
+static void ds278x_power_supply_init(struct power_supply_desc *battery)
 {
 	battery->type			= POWER_SUPPLY_TYPE_BATTERY;
 	battery->properties		= ds278x_battery_props;
@@ -319,8 +320,8 @@ static int ds278x_battery_remove(struct i2c_client *client)
 {
 	struct ds278x_info *info = i2c_get_clientdata(client);
 
-	power_supply_unregister(&info->battery);
-	kfree(info->battery.name);
+	power_supply_unregister(info->battery);
+	kfree(info->battery_desc.name);
 
 	mutex_lock(&battery_lock);
 	idr_remove(&battery_id, info->id);
@@ -377,6 +378,7 @@ static int ds278x_battery_probe(struct i2c_client *client,
 				const struct i2c_device_id *id)
 {
 	struct ds278x_platform_data *pdata = client->dev.platform_data;
+	struct power_supply_config psy_cfg = {};
 	struct ds278x_info *info;
 	int ret;
 	int num;
@@ -404,8 +406,9 @@ static int ds278x_battery_probe(struct i2c_client *client,
 		goto fail_info;
 	}
 
-	info->battery.name = kasprintf(GFP_KERNEL, "%s-%d", client->name, num);
-	if (!info->battery.name) {
+	info->battery_desc.name = kasprintf(GFP_KERNEL, "%s-%d",
+					    client->name, num);
+	if (!info->battery_desc.name) {
 		ret = -ENOMEM;
 		goto fail_name;
 	}
@@ -417,16 +420,19 @@ static int ds278x_battery_probe(struct i2c_client *client,
 	info->client = client;
 	info->id = num;
 	info->ops  = &ds278x_ops[id->driver_data];
-	ds278x_power_supply_init(&info->battery);
+	ds278x_power_supply_init(&info->battery_desc);
+	psy_cfg.drv_data = info;
 
 	info->capacity = 100;
 	info->status = POWER_SUPPLY_STATUS_FULL;
 
 	INIT_DELAYED_WORK(&info->bat_work, ds278x_bat_work);
 
-	ret = power_supply_register(&client->dev, &info->battery, NULL);
-	if (ret) {
+	info->battery = power_supply_register(&client->dev,
+					      &info->battery_desc, &psy_cfg);
+	if (IS_ERR(info->battery)) {
 		dev_err(&client->dev, "failed to register battery\n");
+		ret = PTR_ERR(info->battery);
 		goto fail_register;
 	} else {
 		schedule_delayed_work(&info->bat_work, DS278x_DELAY);
@@ -435,7 +441,7 @@ static int ds278x_battery_probe(struct i2c_client *client,
 	return 0;
 
 fail_register:
-	kfree(info->battery.name);
+	kfree(info->battery_desc.name);
 fail_name:
 	kfree(info);
 fail_info:
diff --git a/drivers/power/generic-adc-battery.c b/drivers/power/generic-adc-battery.c
index 4575955de7c5..fedc5818fab7 100644
--- a/drivers/power/generic-adc-battery.c
+++ b/drivers/power/generic-adc-battery.c
@@ -44,7 +44,8 @@ static const char *const gab_chan_name[] = {
 };
 
 struct gab {
-	struct power_supply	psy;
+	struct power_supply		*psy;
+	struct power_supply_desc	psy_desc;
 	struct iio_channel	*channel[GAB_MAX_CHAN_TYPE];
 	struct gab_platform_data	*pdata;
 	struct delayed_work bat_work;
@@ -55,7 +56,7 @@ struct gab {
 
 static struct gab *to_generic_bat(struct power_supply *psy)
 {
-	return container_of(psy, struct gab, psy);
+	return power_supply_get_drvdata(psy);
 }
 
 static void gab_ext_power_changed(struct power_supply *psy)
@@ -151,7 +152,7 @@ static int gab_get_property(struct power_supply *psy,
 
 	adc_bat = to_generic_bat(psy);
 	if (!adc_bat) {
-		dev_err(psy->dev, "no battery infos ?!\n");
+		dev_err(&psy->dev, "no battery infos ?!\n");
 		return -EINVAL;
 	}
 	pdata = adc_bat->pdata;
@@ -210,7 +211,7 @@ static void gab_work(struct work_struct *work)
 	pdata = adc_bat->pdata;
 	status = adc_bat->status;
 
-	is_plugged = power_supply_am_i_supplied(&adc_bat->psy);
+	is_plugged = power_supply_am_i_supplied(adc_bat->psy);
 	adc_bat->cable_plugged = is_plugged;
 
 	if (!is_plugged)
@@ -221,7 +222,7 @@ static void gab_work(struct work_struct *work)
 		adc_bat->status = POWER_SUPPLY_STATUS_CHARGING;
 
 	if (status != adc_bat->status)
-		power_supply_changed(&adc_bat->psy);
+		power_supply_changed(adc_bat->psy);
 }
 
 static irqreturn_t gab_charged(int irq, void *dev_id)
@@ -239,7 +240,8 @@ static irqreturn_t gab_charged(int irq, void *dev_id)
 static int gab_probe(struct platform_device *pdev)
 {
 	struct gab *adc_bat;
-	struct power_supply *psy;
+	struct power_supply_desc *psy_desc;
+	struct power_supply_config psy_cfg = {};
 	struct gab_platform_data *pdata = pdev->dev.platform_data;
 	enum power_supply_property *properties;
 	int ret = 0;
@@ -252,32 +254,34 @@ static int gab_probe(struct platform_device *pdev)
 		return -ENOMEM;
 	}
 
-	psy = &adc_bat->psy;
-	psy->name = pdata->battery_info.name;
+	psy_cfg.drv_data = adc_bat;
+	psy_desc = &adc_bat->psy_desc;
+	psy_desc->name = pdata->battery_info.name;
 
 	/* bootup default values for the battery */
 	adc_bat->cable_plugged = false;
 	adc_bat->status = POWER_SUPPLY_STATUS_DISCHARGING;
-	psy->type = POWER_SUPPLY_TYPE_BATTERY;
-	psy->get_property = gab_get_property;
-	psy->external_power_changed = gab_ext_power_changed;
+	psy_desc->type = POWER_SUPPLY_TYPE_BATTERY;
+	psy_desc->get_property = gab_get_property;
+	psy_desc->external_power_changed = gab_ext_power_changed;
 	adc_bat->pdata = pdata;
 
 	/*
 	 * copying the static properties and allocating extra memory for holding
 	 * the extra configurable properties received from platform data.
 	 */
-	psy->properties = kcalloc(ARRAY_SIZE(gab_props) +
+	psy_desc->properties = kcalloc(ARRAY_SIZE(gab_props) +
 					ARRAY_SIZE(gab_chan_name),
-					sizeof(*psy->properties), GFP_KERNEL);
-	if (!psy->properties) {
+					sizeof(*psy_desc->properties),
+					GFP_KERNEL);
+	if (!psy_desc->properties) {
 		ret = -ENOMEM;
 		goto first_mem_fail;
 	}
 
-	memcpy(psy->properties, gab_props, sizeof(gab_props));
+	memcpy(psy_desc->properties, gab_props, sizeof(gab_props));
 	properties = (enum power_supply_property *)
-				((char *)psy->properties + sizeof(gab_props));
+			((char *)psy_desc->properties + sizeof(gab_props));
 
 	/*
 	 * getting channel from iio and copying the battery properties
@@ -291,7 +295,7 @@ static int gab_probe(struct platform_device *pdev)
 			adc_bat->channel[chan] = NULL;
 		} else {
 			/* copying properties for supported channels only */
-			memcpy(properties + sizeof(*(psy->properties)) * index,
+			memcpy(properties + sizeof(*(psy_desc->properties)) * index,
 					&gab_dyn_props[chan],
 					sizeof(gab_dyn_props[chan]));
 			index++;
@@ -310,11 +314,13 @@ static int gab_probe(struct platform_device *pdev)
 	 * as come channels may be not be supported by the device.So
 	 * we need to take care of that.
 	 */
-	psy->num_properties = ARRAY_SIZE(gab_props) + index;
+	psy_desc->num_properties = ARRAY_SIZE(gab_props) + index;
 
-	ret = power_supply_register(&pdev->dev, psy, NULL);
-	if (ret)
+	adc_bat->psy = power_supply_register(&pdev->dev, psy_desc, &psy_cfg);
+	if (IS_ERR(adc_bat->psy)) {
+		ret = PTR_ERR(adc_bat->psy);
 		goto err_reg_fail;
+	}
 
 	INIT_DELAYED_WORK(&adc_bat->bat_work, gab_work);
 
@@ -342,14 +348,14 @@ static int gab_probe(struct platform_device *pdev)
 err_gpio:
 	gpio_free(pdata->gpio_charge_finished);
 gpio_req_fail:
-	power_supply_unregister(psy);
+	power_supply_unregister(adc_bat->psy);
 err_reg_fail:
 	for (chan = 0; chan < ARRAY_SIZE(gab_chan_name); chan++) {
 		if (adc_bat->channel[chan])
 			iio_channel_release(adc_bat->channel[chan]);
 	}
 second_mem_fail:
-	kfree(psy->properties);
+	kfree(psy_desc->properties);
 first_mem_fail:
 	return ret;
 }
@@ -360,7 +366,7 @@ static int gab_remove(struct platform_device *pdev)
 	struct gab *adc_bat = platform_get_drvdata(pdev);
 	struct gab_platform_data *pdata = adc_bat->pdata;
 
-	power_supply_unregister(&adc_bat->psy);
+	power_supply_unregister(adc_bat->psy);
 
 	if (gpio_is_valid(pdata->gpio_charge_finished)) {
 		free_irq(gpio_to_irq(pdata->gpio_charge_finished), adc_bat);
@@ -372,7 +378,7 @@ static int gab_remove(struct platform_device *pdev)
 			iio_channel_release(adc_bat->channel[chan]);
 	}
 
-	kfree(adc_bat->psy.properties);
+	kfree(adc_bat->psy_desc.properties);
 	cancel_delayed_work(&adc_bat->bat_work);
 	return 0;
 }
diff --git a/drivers/power/goldfish_battery.c b/drivers/power/goldfish_battery.c
index 61d437f8cf76..a50bb988c69a 100644
--- a/drivers/power/goldfish_battery.c
+++ b/drivers/power/goldfish_battery.c
@@ -30,8 +30,8 @@ struct goldfish_battery_data {
 	int irq;
 	spinlock_t lock;
 
-	struct power_supply battery;
-	struct power_supply ac;
+	struct power_supply *battery;
+	struct power_supply *ac;
 };
 
 #define GOLDFISH_BATTERY_READ(data, addr) \
@@ -67,8 +67,7 @@ static int goldfish_ac_get_property(struct power_supply *psy,
 			enum power_supply_property psp,
 			union power_supply_propval *val)
 {
-	struct goldfish_battery_data *data = container_of(psy,
-		struct goldfish_battery_data, ac);
+	struct goldfish_battery_data *data = power_supply_get_drvdata(psy);
 	int ret = 0;
 
 	switch (psp) {
@@ -86,8 +85,7 @@ static int goldfish_battery_get_property(struct power_supply *psy,
 				 enum power_supply_property psp,
 				 union power_supply_propval *val)
 {
-	struct goldfish_battery_data *data = container_of(psy,
-		struct goldfish_battery_data, battery);
+	struct goldfish_battery_data *data = power_supply_get_drvdata(psy);
 	int ret = 0;
 
 	switch (psp) {
@@ -139,20 +137,36 @@ static irqreturn_t goldfish_battery_interrupt(int irq, void *dev_id)
 	status &= BATTERY_INT_MASK;
 
 	if (status & BATTERY_STATUS_CHANGED)
-		power_supply_changed(&data->battery);
+		power_supply_changed(data->battery);
 	if (status & AC_STATUS_CHANGED)
-		power_supply_changed(&data->ac);
+		power_supply_changed(data->ac);
 
 	spin_unlock_irqrestore(&data->lock, irq_flags);
 	return status ? IRQ_HANDLED : IRQ_NONE;
 }
 
+static const struct power_supply_desc battery_desc = {
+	.properties	= goldfish_battery_props,
+	.num_properties	= ARRAY_SIZE(goldfish_battery_props),
+	.get_property	= goldfish_battery_get_property,
+	.name		= "battery",
+	.type		= POWER_SUPPLY_TYPE_BATTERY,
+};
+
+static const struct power_supply_desc ac_desc = {
+	.properties	= goldfish_ac_props,
+	.num_properties	= ARRAY_SIZE(goldfish_ac_props),
+	.get_property	= goldfish_ac_get_property,
+	.name		= "ac",
+	.type		= POWER_SUPPLY_TYPE_MAINS,
+};
 
 static int goldfish_battery_probe(struct platform_device *pdev)
 {
 	int ret;
 	struct resource *r;
 	struct goldfish_battery_data *data;
+	struct power_supply_config psy_cfg = {};
 
 	data = devm_kzalloc(&pdev->dev, sizeof(*data), GFP_KERNEL);
 	if (data == NULL)
@@ -160,18 +174,6 @@ static int goldfish_battery_probe(struct platform_device *pdev)
 
 	spin_lock_init(&data->lock);
 
-	data->battery.properties = goldfish_battery_props;
-	data->battery.num_properties = ARRAY_SIZE(goldfish_battery_props);
-	data->battery.get_property = goldfish_battery_get_property;
-	data->battery.name = "battery";
-	data->battery.type = POWER_SUPPLY_TYPE_BATTERY;
-
-	data->ac.properties = goldfish_ac_props;
-	data->ac.num_properties = ARRAY_SIZE(goldfish_ac_props);
-	data->ac.get_property = goldfish_ac_get_property;
-	data->ac.name = "ac";
-	data->ac.type = POWER_SUPPLY_TYPE_MAINS;
-
 	r = platform_get_resource(pdev, IORESOURCE_MEM, 0);
 	if (r == NULL) {
 		dev_err(&pdev->dev, "platform_get_resource failed\n");
@@ -195,14 +197,17 @@ static int goldfish_battery_probe(struct platform_device *pdev)
 	if (ret)
 		return ret;
 
-	ret = power_supply_register(&pdev->dev, &data->ac, NULL);
-	if (ret)
-		return ret;
+	psy_cfg.drv_data = data;
 
-	ret = power_supply_register(&pdev->dev, &data->battery, NULL);
-	if (ret) {
-		power_supply_unregister(&data->ac);
-		return ret;
+	data->ac = power_supply_register(&pdev->dev, &ac_desc, &psy_cfg);
+	if (IS_ERR(data->ac))
+		return PTR_ERR(data->ac);
+
+	data->battery = power_supply_register(&pdev->dev, &battery_desc,
+						&psy_cfg);
+	if (IS_ERR(data->battery)) {
+		power_supply_unregister(data->ac);
+		return PTR_ERR(data->battery);
 	}
 
 	platform_set_drvdata(pdev, data);
@@ -216,8 +221,8 @@ static int goldfish_battery_remove(struct platform_device *pdev)
 {
 	struct goldfish_battery_data *data = platform_get_drvdata(pdev);
 
-	power_supply_unregister(&data->battery);
-	power_supply_unregister(&data->ac);
+	power_supply_unregister(data->battery);
+	power_supply_unregister(data->ac);
 	battery_data = NULL;
 	return 0;
 }
diff --git a/drivers/power/gpio-charger.c b/drivers/power/gpio-charger.c
index 47a9e2bd94d9..c5869b1941ac 100644
--- a/drivers/power/gpio-charger.c
+++ b/drivers/power/gpio-charger.c
@@ -32,7 +32,8 @@ struct gpio_charger {
 	unsigned int irq;
 	bool wakeup_enabled;
 
-	struct power_supply charger;
+	struct power_supply *charger;
+	struct power_supply_desc charger_desc;
 };
 
 static irqreturn_t gpio_charger_irq(int irq, void *devid)
@@ -46,7 +47,7 @@ static irqreturn_t gpio_charger_irq(int irq, void *devid)
 
 static inline struct gpio_charger *psy_to_gpio_charger(struct power_supply *psy)
 {
-	return container_of(psy, struct gpio_charger, charger);
+	return power_supply_get_drvdata(psy);
 }
 
 static int gpio_charger_get_property(struct power_supply *psy,
@@ -129,7 +130,7 @@ static int gpio_charger_probe(struct platform_device *pdev)
 	const struct gpio_charger_platform_data *pdata = pdev->dev.platform_data;
 	struct power_supply_config psy_cfg = {};
 	struct gpio_charger *gpio_charger;
-	struct power_supply *charger;
+	struct power_supply_desc *charger_desc;
 	int ret;
 	int irq;
 
@@ -155,17 +156,18 @@ static int gpio_charger_probe(struct platform_device *pdev)
 		return -ENOMEM;
 	}
 
-	charger = &gpio_charger->charger;
+	charger_desc = &gpio_charger->charger_desc;
 
-	charger->name = pdata->name ? pdata->name : "gpio-charger";
-	charger->type = pdata->type;
-	charger->properties = gpio_charger_properties;
-	charger->num_properties = ARRAY_SIZE(gpio_charger_properties);
-	charger->get_property = gpio_charger_get_property;
+	charger_desc->name = pdata->name ? pdata->name : "gpio-charger";
+	charger_desc->type = pdata->type;
+	charger_desc->properties = gpio_charger_properties;
+	charger_desc->num_properties = ARRAY_SIZE(gpio_charger_properties);
+	charger_desc->get_property = gpio_charger_get_property;
 
 	psy_cfg.supplied_to = pdata->supplied_to;
 	psy_cfg.num_supplicants = pdata->num_supplicants;
 	psy_cfg.of_node = pdev->dev.of_node;
+	psy_cfg.drv_data = gpio_charger;
 
 	ret = gpio_request(pdata->gpio, dev_name(&pdev->dev));
 	if (ret) {
@@ -180,8 +182,10 @@ static int gpio_charger_probe(struct platform_device *pdev)
 
 	gpio_charger->pdata = pdata;
 
-	ret = power_supply_register(&pdev->dev, charger, &psy_cfg);
-	if (ret < 0) {
+	gpio_charger->charger = power_supply_register(&pdev->dev,
+						      charger_desc, &psy_cfg);
+	if (IS_ERR(gpio_charger->charger)) {
+		ret = PTR_ERR(gpio_charger->charger);
 		dev_err(&pdev->dev, "Failed to register power supply: %d\n",
 			ret);
 		goto err_gpio_free;
@@ -191,7 +195,7 @@ static int gpio_charger_probe(struct platform_device *pdev)
 	if (irq > 0) {
 		ret = request_any_context_irq(irq, gpio_charger_irq,
 				IRQF_TRIGGER_RISING | IRQF_TRIGGER_FALLING,
-				dev_name(&pdev->dev), charger);
+				dev_name(&pdev->dev), gpio_charger->charger);
 		if (ret < 0)
 			dev_warn(&pdev->dev, "Failed to request irq: %d\n", ret);
 		else
@@ -215,9 +219,9 @@ static int gpio_charger_remove(struct platform_device *pdev)
 	struct gpio_charger *gpio_charger = platform_get_drvdata(pdev);
 
 	if (gpio_charger->irq)
-		free_irq(gpio_charger->irq, &gpio_charger->charger);
+		free_irq(gpio_charger->irq, gpio_charger->charger);
 
-	power_supply_unregister(&gpio_charger->charger);
+	power_supply_unregister(gpio_charger->charger);
 
 	gpio_free(gpio_charger->pdata->gpio);
 
@@ -243,7 +247,7 @@ static int gpio_charger_resume(struct device *dev)
 
 	if (device_may_wakeup(dev) && gpio_charger->wakeup_enabled)
 		disable_irq_wake(gpio_charger->irq);
-	power_supply_changed(&gpio_charger->charger);
+	power_supply_changed(gpio_charger->charger);
 
 	return 0;
 }
diff --git a/drivers/power/intel_mid_battery.c b/drivers/power/intel_mid_battery.c
index 8a149657cd71..9fa4acc107ca 100644
--- a/drivers/power/intel_mid_battery.c
+++ b/drivers/power/intel_mid_battery.c
@@ -107,8 +107,8 @@ struct pmic_power_module_info {
 	unsigned int batt_prev_charge_full;	/* in mAS */
 	unsigned int batt_charge_rate;		/* in units per second */
 
-	struct power_supply usb;
-	struct power_supply batt;
+	struct power_supply *usb;
+	struct power_supply *batt;
 	int irq;				/* GPE_ID or IRQ# */
 	struct workqueue_struct *monitor_wqueue;
 	struct delayed_work monitor_battery;
@@ -404,8 +404,7 @@ static int pmic_usb_get_property(struct power_supply *psy,
 				enum power_supply_property psp,
 				union power_supply_propval *val)
 {
-	struct pmic_power_module_info *pbi = container_of(psy,
-				struct pmic_power_module_info, usb);
+	struct pmic_power_module_info *pbi = power_supply_get_drvdata(psy);
 
 	/* update pmic_power_module_info members */
 	pmic_battery_read_status(pbi);
@@ -444,8 +443,7 @@ static int pmic_battery_get_property(struct power_supply *psy,
 				enum power_supply_property psp,
 				union power_supply_propval *val)
 {
-	struct pmic_power_module_info *pbi = container_of(psy,
-				struct pmic_power_module_info, batt);
+	struct pmic_power_module_info *pbi = power_supply_get_drvdata(psy);
 
 	/* update pmic_power_module_info members */
 	pmic_battery_read_status(pbi);
@@ -640,6 +638,25 @@ static void pmic_battery_handle_intrpt(struct work_struct *work)
 			__func__);
 }
 
+/*
+ * Description of power supplies
+ */
+static const struct power_supply_desc pmic_usb_desc = {
+	.name		= "pmic-usb",
+	.type		= POWER_SUPPLY_TYPE_USB,
+	.properties	= pmic_usb_props,
+	.num_properties	= ARRAY_SIZE(pmic_usb_props),
+	.get_property	= pmic_usb_get_property,
+};
+
+static const struct power_supply_desc pmic_batt_desc = {
+	.name		= "pmic-batt",
+	.type		= POWER_SUPPLY_TYPE_BATTERY,
+	.properties	= pmic_battery_props,
+	.num_properties	= ARRAY_SIZE(pmic_battery_props),
+	.get_property	= pmic_battery_get_property,
+};
+
 /**
  * pmic_battery_probe - pmic battery initialize
  * @irq: pmic battery device irq
@@ -653,6 +670,7 @@ static int probe(int irq, struct device *dev)
 {
 	int retval = 0;
 	struct pmic_power_module_info *pbi;
+	struct power_supply_config psy_cfg = {};
 
 	dev_dbg(dev, "pmic-battery: found pmic battery device\n");
 
@@ -666,6 +684,7 @@ static int probe(int irq, struct device *dev)
 	pbi->dev = dev;
 	pbi->irq = irq;
 	dev_set_drvdata(dev, pbi);
+	psy_cfg.drv_data = pbi;
 
 	/* initialize all required framework before enabling interrupts */
 	INIT_WORK(&pbi->handler, pmic_battery_handle_intrpt);
@@ -687,16 +706,12 @@ static int probe(int irq, struct device *dev)
 	}
 
 	/* register pmic-batt with power supply subsystem */
-	pbi->batt.name = "pmic-batt";
-	pbi->batt.type = POWER_SUPPLY_TYPE_BATTERY;
-	pbi->batt.properties = pmic_battery_props;
-	pbi->batt.num_properties = ARRAY_SIZE(pmic_battery_props);
-	pbi->batt.get_property = pmic_battery_get_property;
-	retval = power_supply_register(dev, &pbi->batt, NULL);
-	if (retval) {
+	pbi->batt = power_supply_register(dev, &pmic_usb_desc, &psy_cfg);
+	if (IS_ERR(pbi->batt)) {
 		dev_err(dev,
 			"%s(): failed to register pmic battery device with power supply subsystem\n",
 				__func__);
+		retval = PTR_ERR(pbi->batt);
 		goto power_reg_failed;
 	}
 
@@ -707,16 +722,12 @@ static int probe(int irq, struct device *dev)
 	queue_delayed_work(pbi->monitor_wqueue, &pbi->monitor_battery, HZ * 1);
 
 	/* register pmic-usb with power supply subsystem */
-	pbi->usb.name = "pmic-usb";
-	pbi->usb.type = POWER_SUPPLY_TYPE_USB;
-	pbi->usb.properties = pmic_usb_props;
-	pbi->usb.num_properties = ARRAY_SIZE(pmic_usb_props);
-	pbi->usb.get_property = pmic_usb_get_property;
-	retval = power_supply_register(dev, &pbi->usb, NULL);
-	if (retval) {
+	pbi->usb = power_supply_register(dev, &pmic_batt_desc, &psy_cfg);
+	if (IS_ERR(pbi->usb)) {
 		dev_err(dev,
 			"%s(): failed to register pmic usb device with power supply subsystem\n",
 				__func__);
+		retval = PTR_ERR(pbi->usb);
 		goto power_reg_failed_1;
 	}
 
@@ -728,7 +739,7 @@ static int probe(int irq, struct device *dev)
 	return retval;
 
 power_reg_failed_1:
-	power_supply_unregister(&pbi->batt);
+	power_supply_unregister(pbi->batt);
 power_reg_failed:
 	cancel_delayed_work_sync(&pbi->monitor_battery);
 requestirq_failed:
@@ -762,8 +773,8 @@ static int platform_pmic_battery_remove(struct platform_device *pdev)
 	cancel_delayed_work_sync(&pbi->monitor_battery);
 	destroy_workqueue(pbi->monitor_wqueue);
 
-	power_supply_unregister(&pbi->usb);
-	power_supply_unregister(&pbi->batt);
+	power_supply_unregister(pbi->usb);
+	power_supply_unregister(pbi->batt);
 
 	cancel_work_sync(&pbi->handler);
 	kfree(pbi);
diff --git a/drivers/power/ipaq_micro_battery.c b/drivers/power/ipaq_micro_battery.c
index 842e7e2e1cb5..f03014ea1dc4 100644
--- a/drivers/power/ipaq_micro_battery.c
+++ b/drivers/power/ipaq_micro_battery.c
@@ -93,7 +93,7 @@ static void micro_battery_work(struct work_struct *work)
 
 static int get_capacity(struct power_supply *b)
 {
-	struct micro_battery *mb = dev_get_drvdata(b->dev->parent);
+	struct micro_battery *mb = dev_get_drvdata(b->dev.parent);
 
 	switch (mb->flag & 0x07) {
 	case MICRO_BATT_STATUS_HIGH:
@@ -113,7 +113,7 @@ static int get_capacity(struct power_supply *b)
 
 static int get_status(struct power_supply *b)
 {
-	struct micro_battery *mb = dev_get_drvdata(b->dev->parent);
+	struct micro_battery *mb = dev_get_drvdata(b->dev.parent);
 
 	if (mb->flag == MICRO_BATT_STATUS_UNKNOWN)
 		return POWER_SUPPLY_STATUS_UNKNOWN;
@@ -132,7 +132,7 @@ static int micro_batt_get_property(struct power_supply *b,
 					enum power_supply_property psp,
 					union power_supply_propval *val)
 {
-	struct micro_battery *mb = dev_get_drvdata(b->dev->parent);
+	struct micro_battery *mb = dev_get_drvdata(b->dev.parent);
 
 	switch (psp) {
 	case POWER_SUPPLY_PROP_TECHNOLOGY:
@@ -180,7 +180,7 @@ static int micro_ac_get_property(struct power_supply *b,
 				 enum power_supply_property psp,
 				 union power_supply_propval *val)
 {
-	struct micro_battery *mb = dev_get_drvdata(b->dev->parent);
+	struct micro_battery *mb = dev_get_drvdata(b->dev.parent);
 
 	switch (psp) {
 	case POWER_SUPPLY_PROP_ONLINE:
@@ -202,7 +202,7 @@ static enum power_supply_property micro_batt_power_props[] = {
 	POWER_SUPPLY_PROP_VOLTAGE_NOW,
 };
 
-static struct power_supply micro_batt_power = {
+static const struct power_supply_desc micro_batt_power_desc = {
 	.name			= "main-battery",
 	.type			= POWER_SUPPLY_TYPE_BATTERY,
 	.properties		= micro_batt_power_props,
@@ -215,7 +215,7 @@ static enum power_supply_property micro_ac_power_props[] = {
 	POWER_SUPPLY_PROP_ONLINE,
 };
 
-static struct power_supply micro_ac_power = {
+static const struct power_supply_desc micro_ac_power_desc = {
 	.name			= "ac",
 	.type			= POWER_SUPPLY_TYPE_MAINS,
 	.properties		= micro_ac_power_props,
@@ -223,6 +223,8 @@ static struct power_supply micro_ac_power = {
 	.get_property		= micro_ac_get_property,
 };
 
+static struct power_supply *micro_batt_power, *micro_ac_power;
+
 static int micro_batt_probe(struct platform_device *pdev)
 {
 	struct micro_battery *mb;
@@ -241,19 +243,25 @@ static int micro_batt_probe(struct platform_device *pdev)
 	platform_set_drvdata(pdev, mb);
 	queue_delayed_work(mb->wq, &mb->update, 1);
 
-	ret = power_supply_register(&pdev->dev, &micro_batt_power, NULL);
-	if (ret < 0)
+	micro_batt_power = power_supply_register(&pdev->dev,
+						 &micro_batt_power_desc, NULL);
+	if (IS_ERR(micro_batt_power)) {
+		ret = PTR_ERR(micro_batt_power);
 		goto batt_err;
+	}
 
-	ret = power_supply_register(&pdev->dev, &micro_ac_power, NULL);
-	if (ret < 0)
+	micro_ac_power = power_supply_register(&pdev->dev,
+					       &micro_ac_power_desc, NULL);
+	if (IS_ERR(micro_ac_power)) {
+		ret = PTR_ERR(micro_ac_power);
 		goto ac_err;
+	}
 
 	dev_info(&pdev->dev, "iPAQ micro battery driver\n");
 	return 0;
 
 ac_err:
-	power_supply_unregister(&micro_ac_power);
+	power_supply_unregister(micro_ac_power);
 batt_err:
 	cancel_delayed_work_sync(&mb->update);
 	destroy_workqueue(mb->wq);
@@ -265,8 +273,8 @@ static int micro_batt_remove(struct platform_device *pdev)
 {
 	struct micro_battery *mb = platform_get_drvdata(pdev);
 
-	power_supply_unregister(&micro_ac_power);
-	power_supply_unregister(&micro_batt_power);
+	power_supply_unregister(micro_ac_power);
+	power_supply_unregister(micro_batt_power);
 	cancel_delayed_work_sync(&mb->update);
 	destroy_workqueue(mb->wq);
 
diff --git a/drivers/power/isp1704_charger.c b/drivers/power/isp1704_charger.c
index 5521178bdc08..f2a7d970388f 100644
--- a/drivers/power/isp1704_charger.c
+++ b/drivers/power/isp1704_charger.c
@@ -57,11 +57,12 @@ static u16 isp170x_id[] = {
 };
 
 struct isp1704_charger {
-	struct device		*dev;
-	struct power_supply	psy;
-	struct usb_phy		*phy;
-	struct notifier_block	nb;
-	struct work_struct	work;
+	struct device			*dev;
+	struct power_supply		*psy;
+	struct power_supply_desc	psy_desc;
+	struct usb_phy			*phy;
+	struct notifier_block		nb;
+	struct work_struct		work;
 
 	/* properties */
 	char			model[8];
@@ -259,10 +260,10 @@ static void isp1704_charger_work(struct work_struct *data)
 
 			/* detect wall charger */
 			if (isp1704_charger_detect_dcp(isp)) {
-				isp->psy.type = POWER_SUPPLY_TYPE_USB_DCP;
+				isp->psy_desc.type = POWER_SUPPLY_TYPE_USB_DCP;
 				isp->current_max = 1800;
 			} else {
-				isp->psy.type = POWER_SUPPLY_TYPE_USB;
+				isp->psy_desc.type = POWER_SUPPLY_TYPE_USB;
 				isp->current_max = 500;
 			}
 
@@ -271,7 +272,7 @@ static void isp1704_charger_work(struct work_struct *data)
 				usb_gadget_connect(isp->phy->otg->gadget);
 		}
 
-		if (isp->psy.type != POWER_SUPPLY_TYPE_USB_DCP) {
+		if (isp->psy_desc.type != POWER_SUPPLY_TYPE_USB_DCP) {
 			/*
 			 * Only 500mA here or high speed chirp
 			 * handshaking may break
@@ -280,14 +281,14 @@ static void isp1704_charger_work(struct work_struct *data)
 				isp->current_max = 500;
 
 			if (isp->current_max > 100)
-				isp->psy.type = POWER_SUPPLY_TYPE_USB_CDP;
+				isp->psy_desc.type = POWER_SUPPLY_TYPE_USB_CDP;
 		}
 		break;
 	case USB_EVENT_NONE:
 		isp->online = false;
 		isp->present = 0;
 		isp->current_max = 0;
-		isp->psy.type = POWER_SUPPLY_TYPE_USB;
+		isp->psy_desc.type = POWER_SUPPLY_TYPE_USB;
 
 		/*
 		 * Disable data pullups. We need to prevent the controller from
@@ -306,7 +307,7 @@ static void isp1704_charger_work(struct work_struct *data)
 		goto out;
 	}
 
-	power_supply_changed(&isp->psy);
+	power_supply_changed(isp->psy);
 out:
 	mutex_unlock(&lock);
 }
@@ -326,8 +327,7 @@ static int isp1704_charger_get_property(struct power_supply *psy,
 				enum power_supply_property psp,
 				union power_supply_propval *val)
 {
-	struct isp1704_charger *isp =
-		container_of(psy, struct isp1704_charger, psy);
+	struct isp1704_charger *isp = power_supply_get_drvdata(psy);
 
 	switch (psp) {
 	case POWER_SUPPLY_PROP_PRESENT:
@@ -403,6 +403,7 @@ static int isp1704_charger_probe(struct platform_device *pdev)
 {
 	struct isp1704_charger	*isp;
 	int			ret = -ENODEV;
+	struct power_supply_config psy_cfg = {};
 
 	struct isp1704_charger_data *pdata = dev_get_platdata(&pdev->dev);
 	struct device_node *np = pdev->dev.of_node;
@@ -454,15 +455,19 @@ static int isp1704_charger_probe(struct platform_device *pdev)
 	if (ret < 0)
 		goto fail1;
 
-	isp->psy.name		= "isp1704";
-	isp->psy.type		= POWER_SUPPLY_TYPE_USB;
-	isp->psy.properties	= power_props;
-	isp->psy.num_properties	= ARRAY_SIZE(power_props);
-	isp->psy.get_property	= isp1704_charger_get_property;
+	isp->psy_desc.name		= "isp1704";
+	isp->psy_desc.type		= POWER_SUPPLY_TYPE_USB;
+	isp->psy_desc.properties	= power_props;
+	isp->psy_desc.num_properties	= ARRAY_SIZE(power_props);
+	isp->psy_desc.get_property	= isp1704_charger_get_property;
 
-	ret = power_supply_register(isp->dev, &isp->psy, NULL);
-	if (ret)
+	psy_cfg.drv_data		= isp;
+
+	isp->psy = power_supply_register(isp->dev, &isp->psy_desc, &psy_cfg);
+	if (IS_ERR(isp->psy)) {
+		ret = PTR_ERR(isp->psy);
 		goto fail1;
+	}
 
 	/*
 	 * REVISIT: using work in order to allow the usb notifications to be
@@ -498,7 +503,7 @@ static int isp1704_charger_probe(struct platform_device *pdev)
 
 	return 0;
 fail2:
-	power_supply_unregister(&isp->psy);
+	power_supply_unregister(isp->psy);
 fail1:
 	isp1704_charger_set_power(isp, 0);
 fail0:
@@ -512,7 +517,7 @@ static int isp1704_charger_remove(struct platform_device *pdev)
 	struct isp1704_charger *isp = platform_get_drvdata(pdev);
 
 	usb_unregister_notifier(isp->phy, &isp->nb);
-	power_supply_unregister(&isp->psy);
+	power_supply_unregister(isp->psy);
 	isp1704_charger_set_power(isp, 0);
 
 	return 0;
diff --git a/drivers/power/jz4740-battery.c b/drivers/power/jz4740-battery.c
index 0444434e1927..abdfc21ec13f 100644
--- a/drivers/power/jz4740-battery.c
+++ b/drivers/power/jz4740-battery.c
@@ -46,7 +46,8 @@ struct jz_battery {
 
 	struct completion read_completion;
 
-	struct power_supply battery;
+	struct power_supply *battery;
+	struct power_supply_desc battery_desc;
 	struct delayed_work work;
 
 	struct mutex lock;
@@ -54,7 +55,7 @@ struct jz_battery {
 
 static inline struct jz_battery *psy_to_jz_battery(struct power_supply *psy)
 {
-	return container_of(psy, struct jz_battery, battery);
+	return power_supply_get_drvdata(psy);
 }
 
 static irqreturn_t jz_battery_irq_handler(int irq, void *devid)
@@ -213,7 +214,7 @@ static void jz_battery_update(struct jz_battery *jz_battery)
 	}
 
 	if (has_changed)
-		power_supply_changed(&jz_battery->battery);
+		power_supply_changed(jz_battery->battery);
 }
 
 static enum power_supply_property jz_battery_properties[] = {
@@ -242,8 +243,9 @@ static int jz_battery_probe(struct platform_device *pdev)
 {
 	int ret = 0;
 	struct jz_battery_platform_data *pdata = pdev->dev.parent->platform_data;
+	struct power_supply_config psy_cfg = {};
 	struct jz_battery *jz_battery;
-	struct power_supply *battery;
+	struct power_supply_desc *battery_desc;
 	struct resource *mem;
 
 	if (!pdata) {
@@ -271,14 +273,17 @@ static int jz_battery_probe(struct platform_device *pdev)
 	if (IS_ERR(jz_battery->base))
 		return PTR_ERR(jz_battery->base);
 
-	battery = &jz_battery->battery;
-	battery->name = pdata->info.name;
-	battery->type = POWER_SUPPLY_TYPE_BATTERY;
-	battery->properties	= jz_battery_properties;
-	battery->num_properties	= ARRAY_SIZE(jz_battery_properties);
-	battery->get_property = jz_battery_get_property;
-	battery->external_power_changed = jz_battery_external_power_changed;
-	battery->use_for_apm = 1;
+	battery_desc = &jz_battery->battery_desc;
+	battery_desc->name = pdata->info.name;
+	battery_desc->type = POWER_SUPPLY_TYPE_BATTERY;
+	battery_desc->properties	= jz_battery_properties;
+	battery_desc->num_properties	= ARRAY_SIZE(jz_battery_properties);
+	battery_desc->get_property	= jz_battery_get_property;
+	battery_desc->external_power_changed =
+					jz_battery_external_power_changed;
+	battery_desc->use_for_apm	= 1;
+
+	psy_cfg.drv_data = jz_battery;
 
 	jz_battery->pdata = pdata;
 	jz_battery->pdev = pdev;
@@ -330,9 +335,11 @@ static int jz_battery_probe(struct platform_device *pdev)
 	else
 		jz4740_adc_set_config(pdev->dev.parent, JZ_ADC_CONFIG_BAT_MB, 0);
 
-	ret = power_supply_register(&pdev->dev, &jz_battery->battery, NULL);
-	if (ret) {
+	jz_battery->battery = power_supply_register(&pdev->dev, battery_desc,
+							&psy_cfg);
+	if (IS_ERR(jz_battery->battery)) {
 		dev_err(&pdev->dev, "power supply battery register failed.\n");
+		ret = PTR_ERR(jz_battery->battery);
 		goto err_free_charge_irq;
 	}
 
@@ -364,7 +371,7 @@ static int jz_battery_remove(struct platform_device *pdev)
 		gpio_free(jz_battery->pdata->gpio_charge);
 	}
 
-	power_supply_unregister(&jz_battery->battery);
+	power_supply_unregister(jz_battery->battery);
 
 	free_irq(jz_battery->irq, jz_battery);
 
diff --git a/drivers/power/lp8727_charger.c b/drivers/power/lp8727_charger.c
index 1f71af7a3811..7e741f1d3cd5 100644
--- a/drivers/power/lp8727_charger.c
+++ b/drivers/power/lp8727_charger.c
@@ -80,9 +80,9 @@ enum lp8727_die_temp {
 };
 
 struct lp8727_psy {
-	struct power_supply ac;
-	struct power_supply usb;
-	struct power_supply batt;
+	struct power_supply *ac;
+	struct power_supply *usb;
+	struct power_supply *batt;
 };
 
 struct lp8727_chg {
@@ -242,9 +242,9 @@ static void lp8727_delayed_func(struct work_struct *_work)
 	lp8727_id_detection(pchg, idno, vbus);
 	lp8727_enable_chgdet(pchg);
 
-	power_supply_changed(&pchg->psy->ac);
-	power_supply_changed(&pchg->psy->usb);
-	power_supply_changed(&pchg->psy->batt);
+	power_supply_changed(pchg->psy->ac);
+	power_supply_changed(pchg->psy->usb);
+	power_supply_changed(pchg->psy->batt);
 }
 
 static irqreturn_t lp8727_isr_func(int irq, void *ptr)
@@ -311,12 +311,12 @@ static int lp8727_charger_get_property(struct power_supply *psy,
 				       enum power_supply_property psp,
 				       union power_supply_propval *val)
 {
-	struct lp8727_chg *pchg = dev_get_drvdata(psy->dev->parent);
+	struct lp8727_chg *pchg = dev_get_drvdata(psy->dev.parent);
 
 	if (psp != POWER_SUPPLY_PROP_ONLINE)
 		return -EINVAL;
 
-	val->intval = lp8727_is_charger_attached(psy->name, pchg->devid);
+	val->intval = lp8727_is_charger_attached(psy->desc->name, pchg->devid);
 
 	return 0;
 }
@@ -337,14 +337,14 @@ static int lp8727_battery_get_property(struct power_supply *psy,
 				       enum power_supply_property psp,
 				       union power_supply_propval *val)
 {
-	struct lp8727_chg *pchg = dev_get_drvdata(psy->dev->parent);
+	struct lp8727_chg *pchg = dev_get_drvdata(psy->dev.parent);
 	struct lp8727_platform_data *pdata = pchg->pdata;
 	enum lp8727_die_temp temp;
 	u8 read;
 
 	switch (psp) {
 	case POWER_SUPPLY_PROP_STATUS:
-		if (!lp8727_is_charger_attached(psy->name, pchg->devid)) {
+		if (!lp8727_is_charger_attached(psy->desc->name, pchg->devid)) {
 			val->intval = POWER_SUPPLY_STATUS_DISCHARGING;
 			return 0;
 		}
@@ -400,13 +400,13 @@ static int lp8727_battery_get_property(struct power_supply *psy,
 
 static void lp8727_charger_changed(struct power_supply *psy)
 {
-	struct lp8727_chg *pchg = dev_get_drvdata(psy->dev->parent);
+	struct lp8727_chg *pchg = dev_get_drvdata(psy->dev.parent);
 	u8 eoc_level;
 	u8 ichg;
 	u8 val;
 
 	/* skip if no charger exists */
-	if (!lp8727_is_charger_attached(psy->name, pchg->devid))
+	if (!lp8727_is_charger_attached(psy->desc->name, pchg->devid))
 		return;
 
 	/* update charging parameters */
@@ -418,6 +418,31 @@ static void lp8727_charger_changed(struct power_supply *psy)
 	}
 }
 
+static const struct power_supply_desc lp8727_ac_desc = {
+	.name			= "ac",
+	.type			= POWER_SUPPLY_TYPE_MAINS,
+	.properties		= lp8727_charger_prop,
+	.num_properties		= ARRAY_SIZE(lp8727_charger_prop),
+	.get_property		= lp8727_charger_get_property,
+};
+
+static const struct power_supply_desc lp8727_usb_desc = {
+	.name			= "usb",
+	.type			= POWER_SUPPLY_TYPE_USB,
+	.properties		= lp8727_charger_prop,
+	.num_properties		= ARRAY_SIZE(lp8727_charger_prop),
+	.get_property		= lp8727_charger_get_property,
+};
+
+static const struct power_supply_desc lp8727_batt_desc = {
+	.name			= "main_batt",
+	.type			= POWER_SUPPLY_TYPE_BATTERY,
+	.properties		= lp8727_battery_prop,
+	.num_properties		= ARRAY_SIZE(lp8727_battery_prop),
+	.get_property		= lp8727_battery_get_property,
+	.external_power_changed	= lp8727_charger_changed,
+};
+
 static int lp8727_register_psy(struct lp8727_chg *pchg)
 {
 	struct power_supply_config psy_cfg = {}; /* Only for ac and usb */
@@ -432,40 +457,25 @@ static int lp8727_register_psy(struct lp8727_chg *pchg)
 	psy_cfg.supplied_to = battery_supplied_to;
 	psy_cfg.num_supplicants = ARRAY_SIZE(battery_supplied_to);
 
-	psy->ac.name = "ac";
-	psy->ac.type = POWER_SUPPLY_TYPE_MAINS;
-	psy->ac.properties = lp8727_charger_prop;
-	psy->ac.num_properties = ARRAY_SIZE(lp8727_charger_prop);
-	psy->ac.get_property = lp8727_charger_get_property;
-
-	if (power_supply_register(pchg->dev, &psy->ac, &psy_cfg))
+	psy->ac = power_supply_register(pchg->dev, &lp8727_ac_desc, &psy_cfg);
+	if (IS_ERR(psy->ac))
 		goto err_psy_ac;
 
-	psy->usb.name = "usb";
-	psy->usb.type = POWER_SUPPLY_TYPE_USB;
-	psy->usb.properties = lp8727_charger_prop;
-	psy->usb.num_properties = ARRAY_SIZE(lp8727_charger_prop);
-	psy->usb.get_property = lp8727_charger_get_property;
-
-	if (power_supply_register(pchg->dev, &psy->usb, &psy_cfg))
+	psy->usb = power_supply_register(pchg->dev, &lp8727_usb_desc,
+					 &psy_cfg);
+	if (IS_ERR(psy->usb))
 		goto err_psy_usb;
 
-	psy->batt.name = "main_batt";
-	psy->batt.type = POWER_SUPPLY_TYPE_BATTERY;
-	psy->batt.properties = lp8727_battery_prop;
-	psy->batt.num_properties = ARRAY_SIZE(lp8727_battery_prop);
-	psy->batt.get_property = lp8727_battery_get_property;
-	psy->batt.external_power_changed = lp8727_charger_changed;
-
-	if (power_supply_register(pchg->dev, &psy->batt, NULL))
+	psy->batt = power_supply_register(pchg->dev, &lp8727_batt_desc, NULL);
+	if (IS_ERR(psy->batt))
 		goto err_psy_batt;
 
 	return 0;
 
 err_psy_batt:
-	power_supply_unregister(&psy->usb);
+	power_supply_unregister(psy->usb);
 err_psy_usb:
-	power_supply_unregister(&psy->ac);
+	power_supply_unregister(psy->ac);
 err_psy_ac:
 	return -EPERM;
 }
@@ -477,9 +487,9 @@ static void lp8727_unregister_psy(struct lp8727_chg *pchg)
 	if (!psy)
 		return;
 
-	power_supply_unregister(&psy->ac);
-	power_supply_unregister(&psy->usb);
-	power_supply_unregister(&psy->batt);
+	power_supply_unregister(psy->ac);
+	power_supply_unregister(psy->usb);
+	power_supply_unregister(psy->batt);
 }
 
 #ifdef CONFIG_OF
diff --git a/drivers/power/lp8788-charger.c b/drivers/power/lp8788-charger.c
index 8e4d228519c1..f5a48fd68b01 100644
--- a/drivers/power/lp8788-charger.c
+++ b/drivers/power/lp8788-charger.c
@@ -105,8 +105,8 @@ struct lp8788_chg_irq {
  */
 struct lp8788_charger {
 	struct lp8788 *lp;
-	struct power_supply charger;
-	struct power_supply battery;
+	struct power_supply *charger;
+	struct power_supply *battery;
 	struct work_struct charger_work;
 	struct iio_channel *chan[LP8788_NUM_CHG_ADC];
 	struct lp8788_chg_irq irqs[LP8788_MAX_CHG_IRQS];
@@ -148,7 +148,7 @@ static int lp8788_charger_get_property(struct power_supply *psy,
 					enum power_supply_property psp,
 					union power_supply_propval *val)
 {
-	struct lp8788_charger *pchg = dev_get_drvdata(psy->dev->parent);
+	struct lp8788_charger *pchg = dev_get_drvdata(psy->dev.parent);
 	u8 read;
 
 	switch (psp) {
@@ -337,7 +337,7 @@ static int lp8788_battery_get_property(struct power_supply *psy,
 					enum power_supply_property psp,
 					union power_supply_propval *val)
 {
-	struct lp8788_charger *pchg = dev_get_drvdata(psy->dev->parent);
+	struct lp8788_charger *pchg = dev_get_drvdata(psy->dev.parent);
 
 	switch (psp) {
 	case POWER_SUPPLY_PROP_STATUS:
@@ -397,31 +397,40 @@ static int lp8788_update_charger_params(struct platform_device *pdev,
 	return 0;
 }
 
+static const struct power_supply_desc lp8788_psy_charger_desc = {
+	.name		= LP8788_CHARGER_NAME,
+	.type		= POWER_SUPPLY_TYPE_MAINS,
+	.properties	= lp8788_charger_prop,
+	.num_properties	= ARRAY_SIZE(lp8788_charger_prop),
+	.get_property	= lp8788_charger_get_property,
+};
+
+static const struct power_supply_desc lp8788_psy_battery_desc = {
+	.name		= LP8788_BATTERY_NAME,
+	.type		= POWER_SUPPLY_TYPE_BATTERY,
+	.properties	= lp8788_battery_prop,
+	.num_properties	= ARRAY_SIZE(lp8788_battery_prop),
+	.get_property	= lp8788_battery_get_property,
+};
+
 static int lp8788_psy_register(struct platform_device *pdev,
 				struct lp8788_charger *pchg)
 {
 	struct power_supply_config charger_cfg = {};
 
-	pchg->charger.name = LP8788_CHARGER_NAME;
-	pchg->charger.type = POWER_SUPPLY_TYPE_MAINS;
-	pchg->charger.properties = lp8788_charger_prop;
-	pchg->charger.num_properties = ARRAY_SIZE(lp8788_charger_prop);
-	pchg->charger.get_property = lp8788_charger_get_property;
-
 	charger_cfg.supplied_to = battery_supplied_to;
 	charger_cfg.num_supplicants = ARRAY_SIZE(battery_supplied_to);
 
-	if (power_supply_register(&pdev->dev, &pchg->charger, &charger_cfg))
+	pchg->charger = power_supply_register(&pdev->dev,
+					      &lp8788_psy_charger_desc,
+					      &charger_cfg);
+	if (IS_ERR(pchg->charger))
 		return -EPERM;
 
-	pchg->battery.name = LP8788_BATTERY_NAME;
-	pchg->battery.type = POWER_SUPPLY_TYPE_BATTERY;
-	pchg->battery.properties = lp8788_battery_prop;
-	pchg->battery.num_properties = ARRAY_SIZE(lp8788_battery_prop);
-	pchg->battery.get_property = lp8788_battery_get_property;
-
-	if (power_supply_register(&pdev->dev, &pchg->battery, NULL)) {
-		power_supply_unregister(&pchg->charger);
+	pchg->battery = power_supply_register(&pdev->dev,
+					      &lp8788_psy_battery_desc, NULL);
+	if (IS_ERR(pchg->battery)) {
+		power_supply_unregister(pchg->charger);
 		return -EPERM;
 	}
 
@@ -430,8 +439,8 @@ static int lp8788_psy_register(struct platform_device *pdev,
 
 static void lp8788_psy_unregister(struct lp8788_charger *pchg)
 {
-	power_supply_unregister(&pchg->battery);
-	power_supply_unregister(&pchg->charger);
+	power_supply_unregister(pchg->battery);
+	power_supply_unregister(pchg->charger);
 }
 
 static void lp8788_charger_event(struct work_struct *work)
@@ -475,8 +484,8 @@ static irqreturn_t lp8788_charger_irq_thread(int virq, void *ptr)
 	case LP8788_INT_EOC:
 	case LP8788_INT_BATT_LOW:
 	case LP8788_INT_NO_BATT:
-		power_supply_changed(&pchg->charger);
-		power_supply_changed(&pchg->battery);
+		power_supply_changed(pchg->charger);
+		power_supply_changed(pchg->battery);
 		break;
 	default:
 		break;
diff --git a/drivers/power/ltc2941-battery-gauge.c b/drivers/power/ltc2941-battery-gauge.c
index 9bc545393ef8..daeb0860736c 100644
--- a/drivers/power/ltc2941-battery-gauge.c
+++ b/drivers/power/ltc2941-battery-gauge.c
@@ -59,7 +59,8 @@ enum ltc294x_reg {
 
 struct ltc294x_info {
 	struct i2c_client *client;	/* I2C Client pointer */
-	struct power_supply supply;	/* Supply pointer */
+	struct power_supply *supply;	/* Supply pointer */
+	struct power_supply_desc supply_desc;	/* Supply description */
 	struct delayed_work work;	/* Work scheduler */
 	int num_regs;	/* Number of registers (chip type) */
 	int id;		/* Identifier of ltc294x chip */
@@ -294,8 +295,7 @@ static int ltc294x_get_property(struct power_supply *psy,
 				enum power_supply_property prop,
 				union power_supply_propval *val)
 {
-	struct ltc294x_info *info =
-		container_of(psy, struct ltc294x_info, supply);
+	struct ltc294x_info *info = power_supply_get_drvdata(psy);
 
 	switch (prop) {
 	case POWER_SUPPLY_PROP_CHARGE_NOW:
@@ -317,8 +317,7 @@ static int ltc294x_set_property(struct power_supply *psy,
 	enum power_supply_property psp,
 	const union power_supply_propval *val)
 {
-	struct ltc294x_info *info =
-		container_of(psy, struct ltc294x_info, supply);
+	struct ltc294x_info *info = power_supply_get_drvdata(psy);
 
 	switch (psp) {
 	case POWER_SUPPLY_PROP_CHARGE_NOW:
@@ -345,7 +344,7 @@ static void ltc294x_update(struct ltc294x_info *info)
 
 	if (charge != info->charge) {
 		info->charge = charge;
-		power_supply_changed(&info->supply);
+		power_supply_changed(info->supply);
 	}
 }
 
@@ -371,8 +370,8 @@ static int ltc294x_i2c_remove(struct i2c_client *client)
 	struct ltc294x_info *info = i2c_get_clientdata(client);
 
 	cancel_delayed_work(&info->work);
-	power_supply_unregister(&info->supply);
-	kfree(info->supply.name);
+	power_supply_unregister(info->supply);
+	kfree(info->supply_desc.name);
 	mutex_lock(&ltc294x_lock);
 	idr_remove(&ltc294x_id, info->id);
 	mutex_unlock(&ltc294x_lock);
@@ -382,6 +381,7 @@ static int ltc294x_i2c_remove(struct i2c_client *client)
 static int ltc294x_i2c_probe(struct i2c_client *client,
 	const struct i2c_device_id *id)
 {
+	struct power_supply_config psy_cfg = {};
 	struct ltc294x_info *info;
 	int ret;
 	int num;
@@ -406,8 +406,9 @@ static int ltc294x_i2c_probe(struct i2c_client *client,
 	i2c_set_clientdata(client, info);
 
 	info->num_regs = id->driver_data;
-	info->supply.name = kasprintf(GFP_KERNEL, "%s-%d", client->name, num);
-	if (!info->supply.name) {
+	info->supply_desc.name = kasprintf(GFP_KERNEL, "%s-%d", client->name,
+					   num);
+	if (!info->supply_desc.name) {
 		ret = -ENOMEM;
 		goto fail_name;
 	}
@@ -446,24 +447,26 @@ static int ltc294x_i2c_probe(struct i2c_client *client,
 
 	info->client = client;
 	info->id = num;
-	info->supply.type = POWER_SUPPLY_TYPE_BATTERY;
-	info->supply.properties = ltc294x_properties;
+	info->supply_desc.type = POWER_SUPPLY_TYPE_BATTERY;
+	info->supply_desc.properties = ltc294x_properties;
 	if (info->num_regs >= LTC294X_REG_TEMPERATURE_LSB)
-		info->supply.num_properties =
+		info->supply_desc.num_properties =
 			ARRAY_SIZE(ltc294x_properties);
 	else if (info->num_regs >= LTC294X_REG_CURRENT_LSB)
-		info->supply.num_properties =
+		info->supply_desc.num_properties =
 			ARRAY_SIZE(ltc294x_properties) - 1;
 	else if (info->num_regs >= LTC294X_REG_VOLTAGE_LSB)
-		info->supply.num_properties =
+		info->supply_desc.num_properties =
 			ARRAY_SIZE(ltc294x_properties) - 2;
 	else
-		info->supply.num_properties =
+		info->supply_desc.num_properties =
 			ARRAY_SIZE(ltc294x_properties) - 3;
-	info->supply.get_property = ltc294x_get_property;
-	info->supply.set_property = ltc294x_set_property;
-	info->supply.property_is_writeable = ltc294x_property_is_writeable;
-	info->supply.external_power_changed	= NULL;
+	info->supply_desc.get_property = ltc294x_get_property;
+	info->supply_desc.set_property = ltc294x_set_property;
+	info->supply_desc.property_is_writeable = ltc294x_property_is_writeable;
+	info->supply_desc.external_power_changed	= NULL;
+
+	psy_cfg.drv_data = info;
 
 	INIT_DELAYED_WORK(&info->work, ltc294x_work);
 
@@ -473,9 +476,11 @@ static int ltc294x_i2c_probe(struct i2c_client *client,
 		goto fail_comm;
 	}
 
-	ret = power_supply_register(&client->dev, &info->supply, NULL);
-	if (ret) {
+	info->supply = power_supply_register(&client->dev, &info->supply_desc,
+					     &psy_cfg);
+	if (IS_ERR(info->supply)) {
 		dev_err(&client->dev, "failed to register ltc2941\n");
+		ret = PTR_ERR(info->supply);
 		goto fail_register;
 	} else {
 		schedule_delayed_work(&info->work, LTC294X_WORK_DELAY * HZ);
@@ -484,7 +489,7 @@ static int ltc294x_i2c_probe(struct i2c_client *client,
 	return 0;
 
 fail_register:
-	kfree(info->supply.name);
+	kfree(info->supply_desc.name);
 fail_comm:
 fail_name:
 fail_info:
diff --git a/drivers/power/max14577_charger.c b/drivers/power/max14577_charger.c
index c5f2a535c81a..a36bcaf62dd4 100644
--- a/drivers/power/max14577_charger.c
+++ b/drivers/power/max14577_charger.c
@@ -22,9 +22,9 @@
 #include <linux/mfd/max14577.h>
 
 struct max14577_charger {
-	struct device *dev;
-	struct max14577	*max14577;
-	struct power_supply	charger;
+	struct device		*dev;
+	struct max14577		*max14577;
+	struct power_supply	*charger;
 
 	struct max14577_charger_platform_data	*pdata;
 };
@@ -421,9 +421,7 @@ static int max14577_charger_get_property(struct power_supply *psy,
 			    enum power_supply_property psp,
 			    union power_supply_propval *val)
 {
-	struct max14577_charger *chg = container_of(psy,
-						  struct max14577_charger,
-						  charger);
+	struct max14577_charger *chg = power_supply_get_drvdata(psy);
 	int ret = 0;
 
 	switch (psp) {
@@ -456,6 +454,14 @@ static int max14577_charger_get_property(struct power_supply *psy,
 	return ret;
 }
 
+static const struct power_supply_desc max14577_charger_desc = {
+	.name = "max14577-charger",
+	.type = POWER_SUPPLY_TYPE_BATTERY,
+	.properties = max14577_charger_props,
+	.num_properties = ARRAY_SIZE(max14577_charger_props),
+	.get_property = max14577_charger_get_property,
+};
+
 #ifdef CONFIG_OF
 static struct max14577_charger_platform_data *max14577_charger_dt_init(
 		struct platform_device *pdev)
@@ -563,6 +569,7 @@ static DEVICE_ATTR(fast_charge_timer, S_IRUGO | S_IWUSR,
 static int max14577_charger_probe(struct platform_device *pdev)
 {
 	struct max14577_charger *chg;
+	struct power_supply_config psy_cfg = {};
 	struct max14577 *max14577 = dev_get_drvdata(pdev->dev.parent);
 	int ret;
 
@@ -582,21 +589,18 @@ static int max14577_charger_probe(struct platform_device *pdev)
 	if (ret)
 		return ret;
 
-	chg->charger.name = "max14577-charger",
-	chg->charger.type = POWER_SUPPLY_TYPE_BATTERY,
-	chg->charger.properties = max14577_charger_props,
-	chg->charger.num_properties = ARRAY_SIZE(max14577_charger_props),
-	chg->charger.get_property = max14577_charger_get_property,
-
 	ret = device_create_file(&pdev->dev, &dev_attr_fast_charge_timer);
 	if (ret) {
 		dev_err(&pdev->dev, "failed: create sysfs entry\n");
 		return ret;
 	}
 
-	ret = power_supply_register(&pdev->dev, &chg->charger, NULL);
-	if (ret) {
+	psy_cfg.drv_data = chg;
+	chg->charger = power_supply_register(&pdev->dev, &max14577_charger_desc,
+						&psy_cfg);
+	if (IS_ERR(chg->charger)) {
 		dev_err(&pdev->dev, "failed: power supply register\n");
+		ret = PTR_ERR(chg->charger);
 		goto err;
 	}
 
@@ -617,7 +621,7 @@ static int max14577_charger_remove(struct platform_device *pdev)
 	struct max14577_charger *chg = platform_get_drvdata(pdev);
 
 	device_remove_file(&pdev->dev, &dev_attr_fast_charge_timer);
-	power_supply_unregister(&chg->charger);
+	power_supply_unregister(chg->charger);
 
 	return 0;
 }
diff --git a/drivers/power/max17040_battery.c b/drivers/power/max17040_battery.c
index d36b2f6c2053..8689c80202b5 100644
--- a/drivers/power/max17040_battery.c
+++ b/drivers/power/max17040_battery.c
@@ -40,7 +40,7 @@
 struct max17040_chip {
 	struct i2c_client		*client;
 	struct delayed_work		work;
-	struct power_supply		battery;
+	struct power_supply		*battery;
 	struct max17040_platform_data	*pdata;
 
 	/* State Of Connect */
@@ -57,8 +57,7 @@ static int max17040_get_property(struct power_supply *psy,
 			    enum power_supply_property psp,
 			    union power_supply_propval *val)
 {
-	struct max17040_chip *chip = container_of(psy,
-				struct max17040_chip, battery);
+	struct max17040_chip *chip = power_supply_get_drvdata(psy);
 
 	switch (psp) {
 	case POWER_SUPPLY_PROP_STATUS:
@@ -199,12 +198,20 @@ static enum power_supply_property max17040_battery_props[] = {
 	POWER_SUPPLY_PROP_CAPACITY,
 };
 
+static const struct power_supply_desc max17040_battery_desc = {
+	.name		= "battery",
+	.type		= POWER_SUPPLY_TYPE_BATTERY,
+	.get_property	= max17040_get_property,
+	.properties	= max17040_battery_props,
+	.num_properties	= ARRAY_SIZE(max17040_battery_props),
+};
+
 static int max17040_probe(struct i2c_client *client,
 			const struct i2c_device_id *id)
 {
 	struct i2c_adapter *adapter = to_i2c_adapter(client->dev.parent);
+	struct power_supply_config psy_cfg = {};
 	struct max17040_chip *chip;
-	int ret;
 
 	if (!i2c_check_functionality(adapter, I2C_FUNC_SMBUS_BYTE))
 		return -EIO;
@@ -217,17 +224,13 @@ static int max17040_probe(struct i2c_client *client,
 	chip->pdata = client->dev.platform_data;
 
 	i2c_set_clientdata(client, chip);
+	psy_cfg.drv_data = chip;
 
-	chip->battery.name		= "battery";
-	chip->battery.type		= POWER_SUPPLY_TYPE_BATTERY;
-	chip->battery.get_property	= max17040_get_property;
-	chip->battery.properties	= max17040_battery_props;
-	chip->battery.num_properties	= ARRAY_SIZE(max17040_battery_props);
-
-	ret = power_supply_register(&client->dev, &chip->battery, NULL);
-	if (ret) {
+	chip->battery = power_supply_register(&client->dev,
+				&max17040_battery_desc, &psy_cfg);
+	if (IS_ERR(chip->battery)) {
 		dev_err(&client->dev, "failed: power supply register\n");
-		return ret;
+		return PTR_ERR(chip->battery);
 	}
 
 	max17040_reset(client);
@@ -244,7 +247,7 @@ static int max17040_remove(struct i2c_client *client)
 {
 	struct max17040_chip *chip = i2c_get_clientdata(client);
 
-	power_supply_unregister(&chip->battery);
+	power_supply_unregister(chip->battery);
 	cancel_delayed_work(&chip->work);
 	return 0;
 }
diff --git a/drivers/power/max17042_battery.c b/drivers/power/max17042_battery.c
index b1ebbc3b8640..e5645ea6f9d8 100644
--- a/drivers/power/max17042_battery.c
+++ b/drivers/power/max17042_battery.c
@@ -69,7 +69,7 @@
 struct max17042_chip {
 	struct i2c_client *client;
 	struct regmap *regmap;
-	struct power_supply battery;
+	struct power_supply *battery;
 	enum max170xx_chip_type chip_type;
 	struct max17042_platform_data *pdata;
 	struct work_struct work;
@@ -96,8 +96,7 @@ static int max17042_get_property(struct power_supply *psy,
 			    enum power_supply_property psp,
 			    union power_supply_propval *val)
 {
-	struct max17042_chip *chip = container_of(psy,
-				struct max17042_chip, battery);
+	struct max17042_chip *chip = power_supply_get_drvdata(psy);
 	struct regmap *map = chip->regmap;
 	int ret;
 	u32 data;
@@ -602,7 +601,7 @@ static irqreturn_t max17042_thread_handler(int id, void *dev)
 		max17042_set_soc_threshold(chip, 1);
 	}
 
-	power_supply_changed(&chip->battery);
+	power_supply_changed(chip->battery);
 	return IRQ_HANDLED;
 }
 
@@ -662,10 +661,28 @@ static const struct regmap_config max17042_regmap_config = {
 	.val_format_endian = REGMAP_ENDIAN_NATIVE,
 };
 
+static const struct power_supply_desc max17042_psy_desc = {
+	.name		= "max170xx_battery",
+	.type		= POWER_SUPPLY_TYPE_BATTERY,
+	.get_property	= max17042_get_property,
+	.properties	= max17042_battery_props,
+	.num_properties	= ARRAY_SIZE(max17042_battery_props),
+};
+
+static const struct power_supply_desc max17042_no_current_sense_psy_desc = {
+	.name		= "max170xx_battery",
+	.type		= POWER_SUPPLY_TYPE_BATTERY,
+	.get_property	= max17042_get_property,
+	.properties	= max17042_battery_props,
+	.num_properties	= ARRAY_SIZE(max17042_battery_props) - 2,
+};
+
 static int max17042_probe(struct i2c_client *client,
 			const struct i2c_device_id *id)
 {
 	struct i2c_adapter *adapter = to_i2c_adapter(client->dev.parent);
+	const struct power_supply_desc *max17042_desc = &max17042_psy_desc;
+	struct power_supply_config psy_cfg = {};
 	struct max17042_chip *chip;
 	int ret;
 	int i;
@@ -692,6 +709,7 @@ static int max17042_probe(struct i2c_client *client,
 	}
 
 	i2c_set_clientdata(client, chip);
+	psy_cfg.drv_data = chip;
 
 	regmap_read(chip->regmap, MAX17042_DevName, &val);
 	if (val == MAX17042_IC_VERSION) {
@@ -705,16 +723,10 @@ static int max17042_probe(struct i2c_client *client,
 		return -EIO;
 	}
 
-	chip->battery.name		= "max170xx_battery";
-	chip->battery.type		= POWER_SUPPLY_TYPE_BATTERY;
-	chip->battery.get_property	= max17042_get_property;
-	chip->battery.properties	= max17042_battery_props;
-	chip->battery.num_properties	= ARRAY_SIZE(max17042_battery_props);
-
 	/* When current is not measured,
 	 * CURRENT_NOW and CURRENT_AVG properties should be invisible. */
 	if (!chip->pdata->enable_current_sense)
-		chip->battery.num_properties -= 2;
+		max17042_desc = &max17042_no_current_sense_psy_desc;
 
 	if (chip->pdata->r_sns == 0)
 		chip->pdata->r_sns = MAX17042_DEFAULT_SNS_RESISTOR;
@@ -731,17 +743,18 @@ static int max17042_probe(struct i2c_client *client,
 		regmap_write(chip->regmap, MAX17042_LearnCFG, 0x0007);
 	}
 
-	ret = power_supply_register(&client->dev, &chip->battery, NULL);
-	if (ret) {
+	chip->battery = power_supply_register(&client->dev, max17042_desc,
+						&psy_cfg);
+	if (IS_ERR(chip->battery)) {
 		dev_err(&client->dev, "failed: power supply register\n");
-		return ret;
+		return PTR_ERR(chip->battery);
 	}
 
 	if (client->irq) {
 		ret = request_threaded_irq(client->irq, NULL,
 					max17042_thread_handler,
 					IRQF_TRIGGER_FALLING | IRQF_ONESHOT,
-					chip->battery.name, chip);
+					chip->battery->desc->name, chip);
 		if (!ret) {
 			regmap_update_bits(chip->regmap, MAX17042_CONFIG,
 					CONFIG_ALRT_BIT_ENBL,
@@ -771,7 +784,7 @@ static int max17042_remove(struct i2c_client *client)
 
 	if (client->irq)
 		free_irq(client->irq, chip);
-	power_supply_unregister(&chip->battery);
+	power_supply_unregister(chip->battery);
 	return 0;
 }
 
diff --git a/drivers/power/max77693_charger.c b/drivers/power/max77693_charger.c
index 86ea0231175c..754879eb59f6 100644
--- a/drivers/power/max77693_charger.c
+++ b/drivers/power/max77693_charger.c
@@ -22,14 +22,14 @@
 #include <linux/mfd/max77693.h>
 #include <linux/mfd/max77693-private.h>
 
-static const char *max77693_charger_name		= "max77693-charger";
+#define MAX77693_CHARGER_NAME				"max77693-charger"
 static const char *max77693_charger_model		= "MAX77693";
 static const char *max77693_charger_manufacturer	= "Maxim Integrated";
 
 struct max77693_charger {
 	struct device		*dev;
 	struct max77693_dev	*max77693;
-	struct power_supply	charger;
+	struct power_supply	*charger;
 
 	u32 constant_volt;
 	u32 min_system_volt;
@@ -220,9 +220,7 @@ static int max77693_charger_get_property(struct power_supply *psy,
 			    enum power_supply_property psp,
 			    union power_supply_propval *val)
 {
-	struct max77693_charger *chg = container_of(psy,
-						  struct max77693_charger,
-						  charger);
+	struct max77693_charger *chg = power_supply_get_drvdata(psy);
 	struct regmap *regmap = chg->max77693->regmap;
 	int ret = 0;
 
@@ -255,6 +253,14 @@ static int max77693_charger_get_property(struct power_supply *psy,
 	return ret;
 }
 
+static const struct power_supply_desc max77693_charger_desc = {
+	.name		= MAX77693_CHARGER_NAME,
+	.type		= POWER_SUPPLY_TYPE_BATTERY,
+	.properties	= max77693_charger_props,
+	.num_properties	= ARRAY_SIZE(max77693_charger_props),
+	.get_property	= max77693_charger_get_property,
+};
+
 static ssize_t device_attr_store(struct device *dev,
 		struct device_attribute *attr, const char *buf, size_t count,
 		int (*fn)(struct max77693_charger *, unsigned long))
@@ -670,6 +676,7 @@ static int max77693_dt_init(struct device *dev, struct max77693_charger *chg)
 static int max77693_charger_probe(struct platform_device *pdev)
 {
 	struct max77693_charger *chg;
+	struct power_supply_config psy_cfg = {};
 	struct max77693_dev *max77693 = dev_get_drvdata(pdev->dev.parent);
 	int ret;
 
@@ -689,11 +696,7 @@ static int max77693_charger_probe(struct platform_device *pdev)
 	if (ret)
 		return ret;
 
-	chg->charger.name = max77693_charger_name;
-	chg->charger.type = POWER_SUPPLY_TYPE_BATTERY;
-	chg->charger.properties = max77693_charger_props;
-	chg->charger.num_properties = ARRAY_SIZE(max77693_charger_props);
-	chg->charger.get_property = max77693_charger_get_property;
+	psy_cfg.drv_data = chg;
 
 	ret = device_create_file(&pdev->dev, &dev_attr_fast_charge_timer);
 	if (ret) {
@@ -714,9 +717,12 @@ static int max77693_charger_probe(struct platform_device *pdev)
 		goto err;
 	}
 
-	ret = power_supply_register(&pdev->dev, &chg->charger, NULL);
-	if (ret) {
+	chg->charger = power_supply_register(&pdev->dev,
+						&max77693_charger_desc,
+						&psy_cfg);
+	if (IS_ERR(chg->charger)) {
 		dev_err(&pdev->dev, "failed: power supply register\n");
+		ret = PTR_ERR(chg->charger);
 		goto err;
 	}
 
@@ -738,7 +744,7 @@ static int max77693_charger_remove(struct platform_device *pdev)
 	device_remove_file(&pdev->dev, &dev_attr_top_off_threshold_current);
 	device_remove_file(&pdev->dev, &dev_attr_fast_charge_timer);
 
-	power_supply_unregister(&chg->charger);
+	power_supply_unregister(chg->charger);
 
 	return 0;
 }
diff --git a/drivers/power/max8903_charger.c b/drivers/power/max8903_charger.c
index 2f769faa85f6..bf2b4b3a7cae 100644
--- a/drivers/power/max8903_charger.c
+++ b/drivers/power/max8903_charger.c
@@ -31,7 +31,8 @@
 struct max8903_data {
 	struct max8903_pdata pdata;
 	struct device *dev;
-	struct power_supply psy;
+	struct power_supply *psy;
+	struct power_supply_desc psy_desc;
 	bool fault;
 	bool usb_in;
 	bool ta_in;
@@ -47,8 +48,7 @@ static int max8903_get_property(struct power_supply *psy,
 		enum power_supply_property psp,
 		union power_supply_propval *val)
 {
-	struct max8903_data *data = container_of(psy,
-			struct max8903_data, psy);
+	struct max8903_data *data = power_supply_get_drvdata(psy);
 
 	switch (psp) {
 	case POWER_SUPPLY_PROP_STATUS:
@@ -104,17 +104,17 @@ static irqreturn_t max8903_dcin(int irq, void *_data)
 	dev_dbg(data->dev, "TA(DC-IN) Charger %s.\n", ta_in ?
 			"Connected" : "Disconnected");
 
-	old_type = data->psy.type;
+	old_type = data->psy_desc.type;
 
 	if (data->ta_in)
-		data->psy.type = POWER_SUPPLY_TYPE_MAINS;
+		data->psy_desc.type = POWER_SUPPLY_TYPE_MAINS;
 	else if (data->usb_in)
-		data->psy.type = POWER_SUPPLY_TYPE_USB;
+		data->psy_desc.type = POWER_SUPPLY_TYPE_USB;
 	else
-		data->psy.type = POWER_SUPPLY_TYPE_BATTERY;
+		data->psy_desc.type = POWER_SUPPLY_TYPE_BATTERY;
 
-	if (old_type != data->psy.type)
-		power_supply_changed(&data->psy);
+	if (old_type != data->psy_desc.type)
+		power_supply_changed(data->psy);
 
 	return IRQ_HANDLED;
 }
@@ -143,17 +143,17 @@ static irqreturn_t max8903_usbin(int irq, void *_data)
 	dev_dbg(data->dev, "USB Charger %s.\n", usb_in ?
 			"Connected" : "Disconnected");
 
-	old_type = data->psy.type;
+	old_type = data->psy_desc.type;
 
 	if (data->ta_in)
-		data->psy.type = POWER_SUPPLY_TYPE_MAINS;
+		data->psy_desc.type = POWER_SUPPLY_TYPE_MAINS;
 	else if (data->usb_in)
-		data->psy.type = POWER_SUPPLY_TYPE_USB;
+		data->psy_desc.type = POWER_SUPPLY_TYPE_USB;
 	else
-		data->psy.type = POWER_SUPPLY_TYPE_BATTERY;
+		data->psy_desc.type = POWER_SUPPLY_TYPE_BATTERY;
 
-	if (old_type != data->psy.type)
-		power_supply_changed(&data->psy);
+	if (old_type != data->psy_desc.type)
+		power_supply_changed(data->psy);
 
 	return IRQ_HANDLED;
 }
@@ -184,6 +184,7 @@ static int max8903_probe(struct platform_device *pdev)
 	struct max8903_data *data;
 	struct device *dev = &pdev->dev;
 	struct max8903_pdata *pdata = pdev->dev.platform_data;
+	struct power_supply_config psy_cfg = {};
 	int ret = 0;
 	int gpio;
 	int ta_in = 0;
@@ -280,17 +281,20 @@ static int max8903_probe(struct platform_device *pdev)
 	data->ta_in = ta_in;
 	data->usb_in = usb_in;
 
-	data->psy.name = "max8903_charger";
-	data->psy.type = (ta_in) ? POWER_SUPPLY_TYPE_MAINS :
+	data->psy_desc.name = "max8903_charger";
+	data->psy_desc.type = (ta_in) ? POWER_SUPPLY_TYPE_MAINS :
 			((usb_in) ? POWER_SUPPLY_TYPE_USB :
 			 POWER_SUPPLY_TYPE_BATTERY);
-	data->psy.get_property = max8903_get_property;
-	data->psy.properties = max8903_charger_props;
-	data->psy.num_properties = ARRAY_SIZE(max8903_charger_props);
+	data->psy_desc.get_property = max8903_get_property;
+	data->psy_desc.properties = max8903_charger_props;
+	data->psy_desc.num_properties = ARRAY_SIZE(max8903_charger_props);
 
-	ret = power_supply_register(dev, &data->psy, NULL);
-	if (ret) {
+	psy_cfg.drv_data = data;
+
+	data->psy = power_supply_register(dev, &data->psy_desc, &psy_cfg);
+	if (IS_ERR(data->psy)) {
 		dev_err(dev, "failed: power supply register.\n");
+		ret = PTR_ERR(data->psy);
 		goto err;
 	}
 
@@ -339,7 +343,7 @@ err_dc_irq:
 	if (pdata->dc_valid)
 		free_irq(gpio_to_irq(pdata->dok), data);
 err_psy:
-	power_supply_unregister(&data->psy);
+	power_supply_unregister(data->psy);
 err:
 	return ret;
 }
@@ -357,7 +361,7 @@ static int max8903_remove(struct platform_device *pdev)
 			free_irq(gpio_to_irq(pdata->uok), data);
 		if (pdata->dc_valid)
 			free_irq(gpio_to_irq(pdata->dok), data);
-		power_supply_unregister(&data->psy);
+		power_supply_unregister(data->psy);
 	}
 
 	return 0;
diff --git a/drivers/power/max8925_power.c b/drivers/power/max8925_power.c
index 71c087e3feaf..57eb5c2bfc21 100644
--- a/drivers/power/max8925_power.c
+++ b/drivers/power/max8925_power.c
@@ -68,9 +68,9 @@ struct max8925_power_info {
 	struct i2c_client	*gpm;
 	struct i2c_client	*adc;
 
-	struct power_supply	ac;
-	struct power_supply	usb;
-	struct power_supply	battery;
+	struct power_supply	*ac;
+	struct power_supply	*usb;
+	struct power_supply	*battery;
 	int			irq_base;
 	unsigned		ac_online:1;
 	unsigned		usb_online:1;
@@ -196,7 +196,7 @@ static int max8925_ac_get_prop(struct power_supply *psy,
 			       enum power_supply_property psp,
 			       union power_supply_propval *val)
 {
-	struct max8925_power_info *info = dev_get_drvdata(psy->dev->parent);
+	struct max8925_power_info *info = dev_get_drvdata(psy->dev.parent);
 	int ret = 0;
 
 	switch (psp) {
@@ -230,7 +230,7 @@ static int max8925_usb_get_prop(struct power_supply *psy,
 				enum power_supply_property psp,
 				union power_supply_propval *val)
 {
-	struct max8925_power_info *info = dev_get_drvdata(psy->dev->parent);
+	struct max8925_power_info *info = dev_get_drvdata(psy->dev.parent);
 	int ret = 0;
 
 	switch (psp) {
@@ -264,7 +264,7 @@ static int max8925_bat_get_prop(struct power_supply *psy,
 				enum power_supply_property psp,
 				union power_supply_propval *val)
 {
-	struct max8925_power_info *info = dev_get_drvdata(psy->dev->parent);
+	struct max8925_power_info *info = dev_get_drvdata(psy->dev.parent);
 	int ret = 0;
 
 	switch (psp) {
@@ -347,6 +347,30 @@ static enum power_supply_property max8925_battery_props[] = {
 	POWER_SUPPLY_PROP_STATUS,
 };
 
+static const struct power_supply_desc ac_desc = {
+	.name		= "max8925-ac",
+	.type		= POWER_SUPPLY_TYPE_MAINS,
+	.properties	= max8925_ac_props,
+	.num_properties	= ARRAY_SIZE(max8925_ac_props),
+	.get_property	= max8925_ac_get_prop,
+};
+
+static const struct power_supply_desc usb_desc = {
+	.name		= "max8925-usb",
+	.type		= POWER_SUPPLY_TYPE_USB,
+	.properties	= max8925_usb_props,
+	.num_properties	= ARRAY_SIZE(max8925_usb_props),
+	.get_property	= max8925_usb_get_prop,
+};
+
+static const struct power_supply_desc battery_desc = {
+	.name		= "max8925-battery",
+	.type		= POWER_SUPPLY_TYPE_BATTERY,
+	.properties	= max8925_battery_props,
+	.num_properties	= ARRAY_SIZE(max8925_battery_props),
+	.get_property	= max8925_bat_get_prop,
+};
+
 #define REQUEST_IRQ(_irq, _name)					\
 do {									\
 	ret = request_threaded_irq(chip->irq_base + _irq, NULL,		\
@@ -506,36 +530,26 @@ static int max8925_power_probe(struct platform_device *pdev)
 	psy_cfg.supplied_to = pdata->supplied_to;
 	psy_cfg.num_supplicants = pdata->num_supplicants;
 
-	info->ac.name = "max8925-ac";
-	info->ac.type = POWER_SUPPLY_TYPE_MAINS;
-	info->ac.properties = max8925_ac_props;
-	info->ac.num_properties = ARRAY_SIZE(max8925_ac_props);
-	info->ac.get_property = max8925_ac_get_prop;
-	ret = power_supply_register(&pdev->dev, &info->ac, &psy_cfg);
-	if (ret)
+	info->ac = power_supply_register(&pdev->dev, &ac_desc, &psy_cfg);
+	if (IS_ERR(info->ac)) {
+		ret = PTR_ERR(info->ac);
 		goto out;
-	info->ac.dev->parent = &pdev->dev;
-
-	info->usb.name = "max8925-usb";
-	info->usb.type = POWER_SUPPLY_TYPE_USB;
-	info->usb.properties = max8925_usb_props;
-	info->usb.num_properties = ARRAY_SIZE(max8925_usb_props);
-	info->usb.get_property = max8925_usb_get_prop;
+	}
+	info->ac->dev.parent = &pdev->dev;
 
-	ret = power_supply_register(&pdev->dev, &info->usb, &psy_cfg);
-	if (ret)
+	info->usb = power_supply_register(&pdev->dev, &usb_desc, &psy_cfg);
+	if (IS_ERR(info->usb)) {
+		ret = PTR_ERR(info->usb);
 		goto out_usb;
-	info->usb.dev->parent = &pdev->dev;
-
-	info->battery.name = "max8925-battery";
-	info->battery.type = POWER_SUPPLY_TYPE_BATTERY;
-	info->battery.properties = max8925_battery_props;
-	info->battery.num_properties = ARRAY_SIZE(max8925_battery_props);
-	info->battery.get_property = max8925_bat_get_prop;
-	ret = power_supply_register(&pdev->dev, &info->battery, NULL);
-	if (ret)
+	}
+	info->usb->dev.parent = &pdev->dev;
+
+	info->battery = power_supply_register(&pdev->dev, &battery_desc, NULL);
+	if (IS_ERR(info->battery)) {
+		ret = PTR_ERR(info->battery);
 		goto out_battery;
-	info->battery.dev->parent = &pdev->dev;
+	}
+	info->battery->dev.parent = &pdev->dev;
 
 	info->batt_detect = pdata->batt_detect;
 	info->topoff_threshold = pdata->topoff_threshold;
@@ -547,9 +561,9 @@ static int max8925_power_probe(struct platform_device *pdev)
 	max8925_init_charger(chip, info);
 	return 0;
 out_battery:
-	power_supply_unregister(&info->battery);
+	power_supply_unregister(info->battery);
 out_usb:
-	power_supply_unregister(&info->ac);
+	power_supply_unregister(info->ac);
 out:
 	return ret;
 }
@@ -559,9 +573,9 @@ static int max8925_power_remove(struct platform_device *pdev)
 	struct max8925_power_info *info = platform_get_drvdata(pdev);
 
 	if (info) {
-		power_supply_unregister(&info->ac);
-		power_supply_unregister(&info->usb);
-		power_supply_unregister(&info->battery);
+		power_supply_unregister(info->ac);
+		power_supply_unregister(info->usb);
+		power_supply_unregister(info->battery);
 		max8925_deinit_charger(info);
 	}
 	return 0;
diff --git a/drivers/power/max8997_charger.c b/drivers/power/max8997_charger.c
index a9f4d506eb44..0b2eab571528 100644
--- a/drivers/power/max8997_charger.c
+++ b/drivers/power/max8997_charger.c
@@ -30,7 +30,7 @@
 struct charger_data {
 	struct device *dev;
 	struct max8997_dev *iodev;
-	struct power_supply battery;
+	struct power_supply *battery;
 };
 
 static enum power_supply_property max8997_battery_props[] = {
@@ -44,8 +44,7 @@ static int max8997_battery_get_property(struct power_supply *psy,
 		enum power_supply_property psp,
 		union power_supply_propval *val)
 {
-	struct charger_data *charger = container_of(psy,
-			struct charger_data, battery);
+	struct charger_data *charger = power_supply_get_drvdata(psy);
 	struct i2c_client *i2c = charger->iodev->i2c;
 	int ret;
 	u8 reg;
@@ -86,12 +85,21 @@ static int max8997_battery_get_property(struct power_supply *psy,
 	return 0;
 }
 
+static const struct power_supply_desc max8997_battery_desc = {
+	.name		= "max8997_pmic",
+	.type		= POWER_SUPPLY_TYPE_BATTERY,
+	.get_property	= max8997_battery_get_property,
+	.properties	= max8997_battery_props,
+	.num_properties	= ARRAY_SIZE(max8997_battery_props),
+};
+
 static int max8997_battery_probe(struct platform_device *pdev)
 {
 	int ret = 0;
 	struct charger_data *charger;
 	struct max8997_dev *iodev = dev_get_drvdata(pdev->dev.parent);
 	struct max8997_platform_data *pdata = dev_get_platdata(iodev->dev);
+	struct power_supply_config psy_cfg = {};
 
 	if (!pdata)
 		return -EINVAL;
@@ -147,19 +155,18 @@ static int max8997_battery_probe(struct platform_device *pdev)
 
 	platform_set_drvdata(pdev, charger);
 
-	charger->battery.name = "max8997_pmic";
-	charger->battery.type = POWER_SUPPLY_TYPE_BATTERY;
-	charger->battery.get_property = max8997_battery_get_property;
-	charger->battery.properties = max8997_battery_props;
-	charger->battery.num_properties = ARRAY_SIZE(max8997_battery_props);
 
 	charger->dev = &pdev->dev;
 	charger->iodev = iodev;
 
-	ret = power_supply_register(&pdev->dev, &charger->battery, NULL);
-	if (ret) {
+	psy_cfg.drv_data = charger;
+
+	charger->battery = power_supply_register(&pdev->dev,
+						 &max8997_battery_desc,
+						 &psy_cfg);
+	if (IS_ERR(charger->battery)) {
 		dev_err(&pdev->dev, "failed: power supply register\n");
-		return ret;
+		return PTR_ERR(charger->battery);
 	}
 
 	return 0;
@@ -169,7 +176,7 @@ static int max8997_battery_remove(struct platform_device *pdev)
 {
 	struct charger_data *charger = platform_get_drvdata(pdev);
 
-	power_supply_unregister(&charger->battery);
+	power_supply_unregister(charger->battery);
 	return 0;
 }
 
diff --git a/drivers/power/max8998_charger.c b/drivers/power/max8998_charger.c
index 9ee72314b3d0..47448d4bc6cd 100644
--- a/drivers/power/max8998_charger.c
+++ b/drivers/power/max8998_charger.c
@@ -30,7 +30,7 @@
 struct max8998_battery_data {
 	struct device *dev;
 	struct max8998_dev *iodev;
-	struct power_supply battery;
+	struct power_supply *battery;
 };
 
 static enum power_supply_property max8998_battery_props[] = {
@@ -43,8 +43,7 @@ static int max8998_battery_get_property(struct power_supply *psy,
 		enum power_supply_property psp,
 		union power_supply_propval *val)
 {
-	struct max8998_battery_data *max8998 = container_of(psy,
-			struct max8998_battery_data, battery);
+	struct max8998_battery_data *max8998 = power_supply_get_drvdata(psy);
 	struct i2c_client *i2c = max8998->iodev->i2c;
 	int ret;
 	u8 reg;
@@ -75,10 +74,19 @@ static int max8998_battery_get_property(struct power_supply *psy,
 	return 0;
 }
 
+static const struct power_supply_desc max8998_battery_desc = {
+	.name		= "max8998_pmic",
+	.type		= POWER_SUPPLY_TYPE_BATTERY,
+	.get_property	= max8998_battery_get_property,
+	.properties	= max8998_battery_props,
+	.num_properties	= ARRAY_SIZE(max8998_battery_props),
+};
+
 static int max8998_battery_probe(struct platform_device *pdev)
 {
 	struct max8998_dev *iodev = dev_get_drvdata(pdev->dev.parent);
 	struct max8998_platform_data *pdata = dev_get_platdata(iodev->dev);
+	struct power_supply_config psy_cfg = {};
 	struct max8998_battery_data *max8998;
 	struct i2c_client *i2c;
 	int ret = 0;
@@ -161,15 +169,15 @@ static int max8998_battery_probe(struct platform_device *pdev)
 		goto err;
 	}
 
-	max8998->battery.name = "max8998_pmic";
-	max8998->battery.type = POWER_SUPPLY_TYPE_BATTERY;
-	max8998->battery.get_property = max8998_battery_get_property;
-	max8998->battery.properties = max8998_battery_props;
-	max8998->battery.num_properties = ARRAY_SIZE(max8998_battery_props);
+	psy_cfg.drv_data = max8998;
 
-	ret = power_supply_register(max8998->dev, &max8998->battery, NULL);
-	if (ret) {
-		dev_err(max8998->dev, "failed: power supply register\n");
+	max8998->battery = power_supply_register(max8998->dev,
+						 &max8998_battery_desc,
+						 &psy_cfg);
+	if (IS_ERR(max8998->battery)) {
+		ret = PTR_ERR(max8998->battery);
+		dev_err(max8998->dev, "failed: power supply register: %d\n",
+			ret);
 		goto err;
 	}
 
@@ -182,7 +190,7 @@ static int max8998_battery_remove(struct platform_device *pdev)
 {
 	struct max8998_battery_data *max8998 = platform_get_drvdata(pdev);
 
-	power_supply_unregister(&max8998->battery);
+	power_supply_unregister(max8998->battery);
 
 	return 0;
 }
diff --git a/drivers/power/olpc_battery.c b/drivers/power/olpc_battery.c
index 1340a1a75325..a944338a39de 100644
--- a/drivers/power/olpc_battery.c
+++ b/drivers/power/olpc_battery.c
@@ -81,7 +81,7 @@ static enum power_supply_property olpc_ac_props[] = {
 	POWER_SUPPLY_PROP_ONLINE,
 };
 
-static struct power_supply olpc_ac = {
+static const struct power_supply_desc olpc_ac_desc = {
 	.name = "olpc-ac",
 	.type = POWER_SUPPLY_TYPE_MAINS,
 	.properties = olpc_ac_props,
@@ -89,6 +89,8 @@ static struct power_supply olpc_ac = {
 	.get_property = olpc_ac_get_prop,
 };
 
+static struct power_supply *olpc_ac;
+
 static char bat_serial[17]; /* Ick */
 
 static int olpc_bat_get_status(union power_supply_propval *val, uint8_t ec_byte)
@@ -574,21 +576,23 @@ static struct device_attribute olpc_bat_error = {
  *		Initialisation
  *********************************************************************/
 
-static struct power_supply olpc_bat = {
+static struct power_supply_desc olpc_bat_desc = {
 	.name = "olpc-battery",
 	.get_property = olpc_bat_get_property,
 	.use_for_apm = 1,
 };
 
+static struct power_supply *olpc_bat;
+
 static int olpc_battery_suspend(struct platform_device *pdev,
 				pm_message_t state)
 {
-	if (device_may_wakeup(olpc_ac.dev))
+	if (device_may_wakeup(&olpc_ac->dev))
 		olpc_ec_wakeup_set(EC_SCI_SRC_ACPWR);
 	else
 		olpc_ec_wakeup_clear(EC_SCI_SRC_ACPWR);
 
-	if (device_may_wakeup(olpc_bat.dev))
+	if (device_may_wakeup(&olpc_bat->dev))
 		olpc_ec_wakeup_set(EC_SCI_SRC_BATTERY | EC_SCI_SRC_BATSOC
 				   | EC_SCI_SRC_BATERR);
 	else
@@ -619,52 +623,54 @@ static int olpc_battery_probe(struct platform_device *pdev)
 
 	/* Ignore the status. It doesn't actually matter */
 
-	ret = power_supply_register(&pdev->dev, &olpc_ac, NULL);
-	if (ret)
-		return ret;
+	olpc_ac = power_supply_register(&pdev->dev, &olpc_ac_desc, NULL);
+	if (IS_ERR(olpc_ac))
+		return PTR_ERR(olpc_ac);
 
 	if (olpc_board_at_least(olpc_board_pre(0xd0))) { /* XO-1.5 */
-		olpc_bat.properties = olpc_xo15_bat_props;
-		olpc_bat.num_properties = ARRAY_SIZE(olpc_xo15_bat_props);
+		olpc_bat_desc.properties = olpc_xo15_bat_props;
+		olpc_bat_desc.num_properties = ARRAY_SIZE(olpc_xo15_bat_props);
 	} else { /* XO-1 */
-		olpc_bat.properties = olpc_xo1_bat_props;
-		olpc_bat.num_properties = ARRAY_SIZE(olpc_xo1_bat_props);
+		olpc_bat_desc.properties = olpc_xo1_bat_props;
+		olpc_bat_desc.num_properties = ARRAY_SIZE(olpc_xo1_bat_props);
 	}
 
-	ret = power_supply_register(&pdev->dev, &olpc_bat, NULL);
-	if (ret)
+	olpc_bat = power_supply_register(&pdev->dev, &olpc_bat_desc, NULL);
+	if (IS_ERR(olpc_bat)) {
+		ret = PTR_ERR(olpc_bat);
 		goto battery_failed;
+	}
 
-	ret = device_create_bin_file(olpc_bat.dev, &olpc_bat_eeprom);
+	ret = device_create_bin_file(&olpc_bat->dev, &olpc_bat_eeprom);
 	if (ret)
 		goto eeprom_failed;
 
-	ret = device_create_file(olpc_bat.dev, &olpc_bat_error);
+	ret = device_create_file(&olpc_bat->dev, &olpc_bat_error);
 	if (ret)
 		goto error_failed;
 
 	if (olpc_ec_wakeup_available()) {
-		device_set_wakeup_capable(olpc_ac.dev, true);
-		device_set_wakeup_capable(olpc_bat.dev, true);
+		device_set_wakeup_capable(&olpc_ac->dev, true);
+		device_set_wakeup_capable(&olpc_bat->dev, true);
 	}
 
 	return 0;
 
 error_failed:
-	device_remove_bin_file(olpc_bat.dev, &olpc_bat_eeprom);
+	device_remove_bin_file(&olpc_bat->dev, &olpc_bat_eeprom);
 eeprom_failed:
-	power_supply_unregister(&olpc_bat);
+	power_supply_unregister(olpc_bat);
 battery_failed:
-	power_supply_unregister(&olpc_ac);
+	power_supply_unregister(olpc_ac);
 	return ret;
 }
 
 static int olpc_battery_remove(struct platform_device *pdev)
 {
-	device_remove_file(olpc_bat.dev, &olpc_bat_error);
-	device_remove_bin_file(olpc_bat.dev, &olpc_bat_eeprom);
-	power_supply_unregister(&olpc_bat);
-	power_supply_unregister(&olpc_ac);
+	device_remove_file(&olpc_bat->dev, &olpc_bat_error);
+	device_remove_bin_file(&olpc_bat->dev, &olpc_bat_eeprom);
+	power_supply_unregister(olpc_bat);
+	power_supply_unregister(olpc_ac);
 	return 0;
 }
 
diff --git a/drivers/power/pcf50633-charger.c b/drivers/power/pcf50633-charger.c
index 88fe7db2afcf..d05597b4e40f 100644
--- a/drivers/power/pcf50633-charger.c
+++ b/drivers/power/pcf50633-charger.c
@@ -33,9 +33,9 @@ struct pcf50633_mbc {
 	int adapter_online;
 	int usb_online;
 
-	struct power_supply usb;
-	struct power_supply adapter;
-	struct power_supply ac;
+	struct power_supply *usb;
+	struct power_supply *adapter;
+	struct power_supply *ac;
 };
 
 int pcf50633_mbc_usb_curlim_set(struct pcf50633 *pcf, int ma)
@@ -104,7 +104,7 @@ int pcf50633_mbc_usb_curlim_set(struct pcf50633 *pcf, int ma)
 				PCF50633_MBCC1_CHGENA, PCF50633_MBCC1_CHGENA);
 	}
 
-	power_supply_changed(&mbc->usb);
+	power_supply_changed(mbc->usb);
 
 	return ret;
 }
@@ -278,9 +278,9 @@ pcf50633_mbc_irq_handler(int irq, void *data)
 	else if (irq == PCF50633_IRQ_ADPREM)
 		mbc->adapter_online = 0;
 
-	power_supply_changed(&mbc->ac);
-	power_supply_changed(&mbc->usb);
-	power_supply_changed(&mbc->adapter);
+	power_supply_changed(mbc->ac);
+	power_supply_changed(mbc->usb);
+	power_supply_changed(mbc->adapter);
 
 	if (mbc->pcf->pdata->mbc_event_callback)
 		mbc->pcf->pdata->mbc_event_callback(mbc->pcf, irq);
@@ -290,8 +290,7 @@ static int adapter_get_property(struct power_supply *psy,
 			enum power_supply_property psp,
 			union power_supply_propval *val)
 {
-	struct pcf50633_mbc *mbc = container_of(psy,
-				struct pcf50633_mbc, adapter);
+	struct pcf50633_mbc *mbc = power_supply_get_drvdata(psy);
 	int ret = 0;
 
 	switch (psp) {
@@ -309,7 +308,7 @@ static int usb_get_property(struct power_supply *psy,
 			enum power_supply_property psp,
 			union power_supply_propval *val)
 {
-	struct pcf50633_mbc *mbc = container_of(psy, struct pcf50633_mbc, usb);
+	struct pcf50633_mbc *mbc = power_supply_get_drvdata(psy);
 	int ret = 0;
 	u8 usblim = pcf50633_reg_read(mbc->pcf, PCF50633_REG_MBCC7) &
 						PCF50633_MBCC7_USB_MASK;
@@ -330,7 +329,7 @@ static int ac_get_property(struct power_supply *psy,
 			enum power_supply_property psp,
 			union power_supply_propval *val)
 {
-	struct pcf50633_mbc *mbc = container_of(psy, struct pcf50633_mbc, ac);
+	struct pcf50633_mbc *mbc = power_supply_get_drvdata(psy);
 	int ret = 0;
 	u8 usblim = pcf50633_reg_read(mbc->pcf, PCF50633_REG_MBCC7) &
 						PCF50633_MBCC7_USB_MASK;
@@ -366,6 +365,30 @@ static const u8 mbc_irq_handlers[] = {
 	PCF50633_IRQ_LOWBAT,
 };
 
+static const struct power_supply_desc pcf50633_mbc_adapter_desc = {
+	.name		= "adapter",
+	.type		= POWER_SUPPLY_TYPE_MAINS,
+	.properties	= power_props,
+	.num_properties	= ARRAY_SIZE(power_props),
+	.get_property	= &adapter_get_property,
+};
+
+static const struct power_supply_desc pcf50633_mbc_usb_desc = {
+	.name		= "usb",
+	.type		= POWER_SUPPLY_TYPE_USB,
+	.properties	= power_props,
+	.num_properties	= ARRAY_SIZE(power_props),
+	.get_property	= usb_get_property,
+};
+
+static const struct power_supply_desc pcf50633_mbc_ac_desc = {
+	.name		= "ac",
+	.type		= POWER_SUPPLY_TYPE_MAINS,
+	.properties	= power_props,
+	.num_properties	= ARRAY_SIZE(power_props),
+	.get_property	= ac_get_property,
+};
+
 static int pcf50633_mbc_probe(struct platform_device *pdev)
 {
 	struct power_supply_config psy_cfg = {};
@@ -388,44 +411,34 @@ static int pcf50633_mbc_probe(struct platform_device *pdev)
 
 	psy_cfg.supplied_to		= mbc->pcf->pdata->batteries;
 	psy_cfg.num_supplicants		= mbc->pcf->pdata->num_batteries;
+	psy_cfg.drv_data		= mbc;
 
 	/* Create power supplies */
-	mbc->adapter.name		= "adapter";
-	mbc->adapter.type		= POWER_SUPPLY_TYPE_MAINS;
-	mbc->adapter.properties		= power_props;
-	mbc->adapter.num_properties	= ARRAY_SIZE(power_props);
-	mbc->adapter.get_property	= &adapter_get_property;
-
-	mbc->usb.name			= "usb";
-	mbc->usb.type			= POWER_SUPPLY_TYPE_USB;
-	mbc->usb.properties		= power_props;
-	mbc->usb.num_properties		= ARRAY_SIZE(power_props);
-	mbc->usb.get_property		= usb_get_property;
-
-	mbc->ac.name			= "ac";
-	mbc->ac.type			= POWER_SUPPLY_TYPE_MAINS;
-	mbc->ac.properties		= power_props;
-	mbc->ac.num_properties		= ARRAY_SIZE(power_props);
-	mbc->ac.get_property		= ac_get_property;
-
-	ret = power_supply_register(&pdev->dev, &mbc->adapter, &psy_cfg);
-	if (ret) {
+	mbc->adapter = power_supply_register(&pdev->dev,
+					     &pcf50633_mbc_adapter_desc,
+					     &psy_cfg);
+	if (IS_ERR(mbc->adapter)) {
 		dev_err(mbc->pcf->dev, "failed to register adapter\n");
+		ret = PTR_ERR(mbc->adapter);
 		return ret;
 	}
 
-	ret = power_supply_register(&pdev->dev, &mbc->usb, &psy_cfg);
-	if (ret) {
+	mbc->usb = power_supply_register(&pdev->dev, &pcf50633_mbc_usb_desc,
+					 &psy_cfg);
+	if (IS_ERR(mbc->usb)) {
 		dev_err(mbc->pcf->dev, "failed to register usb\n");
-		power_supply_unregister(&mbc->adapter);
+		power_supply_unregister(mbc->adapter);
+		ret = PTR_ERR(mbc->usb);
 		return ret;
 	}
 
-	ret = power_supply_register(&pdev->dev, &mbc->ac, &psy_cfg);
-	if (ret) {
+	mbc->ac = power_supply_register(&pdev->dev, &pcf50633_mbc_ac_desc,
+					&psy_cfg);
+	if (IS_ERR(mbc->ac)) {
 		dev_err(mbc->pcf->dev, "failed to register ac\n");
-		power_supply_unregister(&mbc->adapter);
-		power_supply_unregister(&mbc->usb);
+		power_supply_unregister(mbc->adapter);
+		power_supply_unregister(mbc->usb);
+		ret = PTR_ERR(mbc->ac);
 		return ret;
 	}
 
@@ -452,9 +465,9 @@ static int pcf50633_mbc_remove(struct platform_device *pdev)
 		pcf50633_free_irq(mbc->pcf, mbc_irq_handlers[i]);
 
 	sysfs_remove_group(&pdev->dev.kobj, &mbc_attr_group);
-	power_supply_unregister(&mbc->usb);
-	power_supply_unregister(&mbc->adapter);
-	power_supply_unregister(&mbc->ac);
+	power_supply_unregister(mbc->usb);
+	power_supply_unregister(mbc->adapter);
+	power_supply_unregister(mbc->ac);
 
 	return 0;
 }
diff --git a/drivers/power/pda_power.c b/drivers/power/pda_power.c
index fd55fad1d0db..dfe1ee89f7c7 100644
--- a/drivers/power/pda_power.c
+++ b/drivers/power/pda_power.c
@@ -34,6 +34,7 @@ static struct timer_list charger_timer;
 static struct timer_list supply_timer;
 static struct timer_list polling_timer;
 static int polling;
+static struct power_supply *pda_psy_ac, *pda_psy_usb;
 
 #if IS_ENABLED(CONFIG_USB_PHY)
 static struct usb_phy *transceiver;
@@ -58,7 +59,7 @@ static int pda_power_get_property(struct power_supply *psy,
 {
 	switch (psp) {
 	case POWER_SUPPLY_PROP_ONLINE:
-		if (psy->type == POWER_SUPPLY_TYPE_MAINS)
+		if (psy->desc->type == POWER_SUPPLY_TYPE_MAINS)
 			val->intval = pdata->is_ac_online ?
 				      pdata->is_ac_online() : 0;
 		else
@@ -80,7 +81,7 @@ static char *pda_power_supplied_to[] = {
 	"backup-battery",
 };
 
-static struct power_supply pda_psy_ac = {
+static const struct power_supply_desc pda_psy_ac_desc = {
 	.name = "ac",
 	.type = POWER_SUPPLY_TYPE_MAINS,
 	.properties = pda_power_props,
@@ -88,7 +89,7 @@ static struct power_supply pda_psy_ac = {
 	.get_property = pda_power_get_property,
 };
 
-static struct power_supply pda_psy_usb = {
+static const struct power_supply_desc pda_psy_usb_desc = {
 	.name = "usb",
 	.type = POWER_SUPPLY_TYPE_USB,
 	.properties = pda_power_props,
@@ -143,12 +144,12 @@ static void supply_timer_func(unsigned long unused)
 {
 	if (ac_status == PDA_PSY_TO_CHANGE) {
 		ac_status = new_ac_status;
-		power_supply_changed(&pda_psy_ac);
+		power_supply_changed(pda_psy_ac);
 	}
 
 	if (usb_status == PDA_PSY_TO_CHANGE) {
 		usb_status = new_usb_status;
-		power_supply_changed(&pda_psy_usb);
+		power_supply_changed(pda_psy_usb);
 	}
 }
 
@@ -172,9 +173,9 @@ static void charger_timer_func(unsigned long unused)
 
 static irqreturn_t power_changed_isr(int irq, void *power_supply)
 {
-	if (power_supply == &pda_psy_ac)
+	if (power_supply == pda_psy_ac)
 		ac_status = PDA_PSY_TO_CHANGE;
-	else if (power_supply == &pda_psy_usb)
+	else if (power_supply == pda_psy_usb)
 		usb_status = PDA_PSY_TO_CHANGE;
 	else
 		return IRQ_NONE;
@@ -324,17 +325,19 @@ static int pda_power_probe(struct platform_device *pdev)
 #endif
 
 	if (pdata->is_ac_online) {
-		ret = power_supply_register(&pdev->dev, &pda_psy_ac, &psy_cfg);
-		if (ret) {
+		pda_psy_ac = power_supply_register(&pdev->dev,
+						   &pda_psy_ac_desc, &psy_cfg);
+		if (IS_ERR(pda_psy_ac)) {
 			dev_err(dev, "failed to register %s power supply\n",
-				pda_psy_ac.name);
+				pda_psy_ac_desc.name);
+			ret = PTR_ERR(pda_psy_ac);
 			goto ac_supply_failed;
 		}
 
 		if (ac_irq) {
 			ret = request_irq(ac_irq->start, power_changed_isr,
 					  get_irq_flags(ac_irq), ac_irq->name,
-					  &pda_psy_ac);
+					  pda_psy_ac);
 			if (ret) {
 				dev_err(dev, "request ac irq failed\n");
 				goto ac_irq_failed;
@@ -345,17 +348,20 @@ static int pda_power_probe(struct platform_device *pdev)
 	}
 
 	if (pdata->is_usb_online) {
-		ret = power_supply_register(&pdev->dev, &pda_psy_usb, &psy_cfg);
-		if (ret) {
+		pda_psy_usb = power_supply_register(&pdev->dev,
+						    &pda_psy_usb_desc,
+						    &psy_cfg);
+		if (IS_ERR(pda_psy_usb)) {
 			dev_err(dev, "failed to register %s power supply\n",
-				pda_psy_usb.name);
+				pda_psy_usb_desc.name);
+			ret = PTR_ERR(pda_psy_usb);
 			goto usb_supply_failed;
 		}
 
 		if (usb_irq) {
 			ret = request_irq(usb_irq->start, power_changed_isr,
 					  get_irq_flags(usb_irq),
-					  usb_irq->name, &pda_psy_usb);
+					  usb_irq->name, pda_psy_usb);
 			if (ret) {
 				dev_err(dev, "request usb irq failed\n");
 				goto usb_irq_failed;
@@ -392,21 +398,21 @@ static int pda_power_probe(struct platform_device *pdev)
 #if IS_ENABLED(CONFIG_USB_PHY)
 otg_reg_notifier_failed:
 	if (pdata->is_usb_online && usb_irq)
-		free_irq(usb_irq->start, &pda_psy_usb);
+		free_irq(usb_irq->start, pda_psy_usb);
 #endif
 usb_irq_failed:
 	if (pdata->is_usb_online)
-		power_supply_unregister(&pda_psy_usb);
+		power_supply_unregister(pda_psy_usb);
 usb_supply_failed:
 	if (pdata->is_ac_online && ac_irq)
-		free_irq(ac_irq->start, &pda_psy_ac);
+		free_irq(ac_irq->start, pda_psy_ac);
 #if IS_ENABLED(CONFIG_USB_PHY)
 	if (!IS_ERR_OR_NULL(transceiver))
 		usb_put_phy(transceiver);
 #endif
 ac_irq_failed:
 	if (pdata->is_ac_online)
-		power_supply_unregister(&pda_psy_ac);
+		power_supply_unregister(pda_psy_ac);
 ac_supply_failed:
 	if (ac_draw) {
 		regulator_put(ac_draw);
@@ -422,9 +428,9 @@ wrongid:
 static int pda_power_remove(struct platform_device *pdev)
 {
 	if (pdata->is_usb_online && usb_irq)
-		free_irq(usb_irq->start, &pda_psy_usb);
+		free_irq(usb_irq->start, pda_psy_usb);
 	if (pdata->is_ac_online && ac_irq)
-		free_irq(ac_irq->start, &pda_psy_ac);
+		free_irq(ac_irq->start, pda_psy_ac);
 
 	if (polling)
 		del_timer_sync(&polling_timer);
@@ -432,9 +438,9 @@ static int pda_power_remove(struct platform_device *pdev)
 	del_timer_sync(&supply_timer);
 
 	if (pdata->is_usb_online)
-		power_supply_unregister(&pda_psy_usb);
+		power_supply_unregister(pda_psy_usb);
 	if (pdata->is_ac_online)
-		power_supply_unregister(&pda_psy_ac);
+		power_supply_unregister(pda_psy_ac);
 #if IS_ENABLED(CONFIG_USB_PHY)
 	if (!IS_ERR_OR_NULL(transceiver))
 		usb_put_phy(transceiver);
diff --git a/drivers/power/pm2301_charger.c b/drivers/power/pm2301_charger.c
index d2e88e473238..cc0893ffbf7e 100644
--- a/drivers/power/pm2301_charger.c
+++ b/drivers/power/pm2301_charger.c
@@ -216,7 +216,7 @@ static int pm2xxx_charger_ovv_mngt(struct pm2xxx_charger *pm2, int val)
 {
 	dev_err(pm2->dev, "Overvoltage detected\n");
 	pm2->flags.ovv = true;
-	power_supply_changed(&pm2->ac_chg.psy);
+	power_supply_changed(pm2->ac_chg.psy);
 
 	/* Schedule a new HW failure check */
 	queue_delayed_work(pm2->charger_wq, &pm2->check_hw_failure_work, 0);
@@ -229,7 +229,7 @@ static int pm2xxx_charger_wd_exp_mngt(struct pm2xxx_charger *pm2, int val)
 	dev_dbg(pm2->dev , "20 minutes watchdog expired\n");
 
 	pm2->ac.wd_expired = true;
-	power_supply_changed(&pm2->ac_chg.psy);
+	power_supply_changed(pm2->ac_chg.psy);
 
 	return 0;
 }
@@ -573,7 +573,7 @@ static int pm2xxx_charger_update_charger_current(struct ux500_charger *charger,
 	struct pm2xxx_charger *pm2;
 	u8 val;
 
-	if (charger->psy.type == POWER_SUPPLY_TYPE_MAINS)
+	if (charger->psy->desc->type == POWER_SUPPLY_TYPE_MAINS)
 		pm2 = to_pm2xxx_charger_ac_device_info(charger);
 	else
 		return -ENXIO;
@@ -816,7 +816,7 @@ static int pm2xxx_charger_ac_en(struct ux500_charger *charger,
 
 		dev_dbg(pm2->dev, "PM2301: " "Disabled AC charging\n");
 	}
-	power_supply_changed(&pm2->ac_chg.psy);
+	power_supply_changed(pm2->ac_chg.psy);
 
 error_occured:
 	return ret;
@@ -827,7 +827,7 @@ static int pm2xxx_charger_watchdog_kick(struct ux500_charger *charger)
 	int ret;
 	struct pm2xxx_charger *pm2;
 
-	if (charger->psy.type == POWER_SUPPLY_TYPE_MAINS)
+	if (charger->psy->desc->type == POWER_SUPPLY_TYPE_MAINS)
 		pm2 = to_pm2xxx_charger_ac_device_info(charger);
 	else
 		return -ENXIO;
@@ -845,8 +845,8 @@ static void pm2xxx_charger_ac_work(struct work_struct *work)
 		struct pm2xxx_charger, ac_work);
 
 
-	power_supply_changed(&pm2->ac_chg.psy);
-	sysfs_notify(&pm2->ac_chg.psy.dev->kobj, NULL, "present");
+	power_supply_changed(pm2->ac_chg.psy);
+	sysfs_notify(&pm2->ac_chg.psy->dev.kobj, NULL, "present");
 };
 
 static void pm2xxx_charger_check_hw_failure_work(struct work_struct *work)
@@ -862,7 +862,7 @@ static void pm2xxx_charger_check_hw_failure_work(struct work_struct *work)
 		if (!(reg_value & (PM2XXX_INT4_S_ITVPWR1OVV |
 					PM2XXX_INT4_S_ITVPWR2OVV))) {
 			pm2->flags.ovv = false;
-			power_supply_changed(&pm2->ac_chg.psy);
+			power_supply_changed(pm2->ac_chg.psy);
 		}
 	}
 
@@ -895,7 +895,7 @@ static void pm2xxx_charger_check_main_thermal_prot_work(
 				| PM2XXX_INT5_S_ITTHERMALSHUTDOWNFALL))
 		pm2->flags.main_thermal_prot = false;
 
-	power_supply_changed(&pm2->ac_chg.psy);
+	power_supply_changed(pm2->ac_chg.psy);
 }
 
 static struct pm2xxx_interrupts pm2xxx_int = {
@@ -1043,11 +1043,11 @@ static int pm2xxx_wall_charger_probe(struct i2c_client *i2c_client,
 
 	/* AC supply */
 	/* power_supply base class */
-	pm2->ac_chg.psy.name = pm2->pdata->label;
-	pm2->ac_chg.psy.type = POWER_SUPPLY_TYPE_MAINS;
-	pm2->ac_chg.psy.properties = pm2xxx_charger_ac_props;
-	pm2->ac_chg.psy.num_properties = ARRAY_SIZE(pm2xxx_charger_ac_props);
-	pm2->ac_chg.psy.get_property = pm2xxx_charger_ac_get_property;
+	pm2->ac_chg_desc.name = pm2->pdata->label;
+	pm2->ac_chg_desc.type = POWER_SUPPLY_TYPE_MAINS;
+	pm2->ac_chg_desc.properties = pm2xxx_charger_ac_props;
+	pm2->ac_chg_desc.num_properties = ARRAY_SIZE(pm2xxx_charger_ac_props);
+	pm2->ac_chg_desc.get_property = pm2xxx_charger_ac_get_property;
 
 	psy_cfg.supplied_to = pm2->pdata->supplied_to;
 	psy_cfg.num_supplicants = pm2->pdata->num_supplicants;
@@ -1095,9 +1095,11 @@ static int pm2xxx_wall_charger_probe(struct i2c_client *i2c_client,
 	}
 
 	/* Register AC charger class */
-	ret = power_supply_register(pm2->dev, &pm2->ac_chg.psy, &psy_cfg);
-	if (ret) {
+	pm2->ac_chg.psy = power_supply_register(pm2->dev, &pm2->ac_chg_desc,
+						&psy_cfg);
+	if (IS_ERR(pm2->ac_chg.psy)) {
 		dev_err(pm2->dev, "failed to register AC charger\n");
+		ret = PTR_ERR(pm2->ac_chg.psy);
 		goto free_regulator;
 	}
 
@@ -1169,8 +1171,8 @@ static int pm2xxx_wall_charger_probe(struct i2c_client *i2c_client,
 		ab8500_override_turn_on_stat(~AB8500_POW_KEY_1_ON,
 					     AB8500_MAIN_CH_DET);
 		pm2->ac_conn = true;
-		power_supply_changed(&pm2->ac_chg.psy);
-		sysfs_notify(&pm2->ac_chg.psy.dev->kobj, NULL, "present");
+		power_supply_changed(pm2->ac_chg.psy);
+		sysfs_notify(&pm2->ac_chg.psy->dev.kobj, NULL, "present");
 	}
 
 	return 0;
@@ -1185,7 +1187,7 @@ unregister_pm2xxx_interrupt:
 	free_irq(gpio_to_irq(pm2->pdata->gpio_irq_number), pm2);
 unregister_pm2xxx_charger:
 	/* unregister power supply */
-	power_supply_unregister(&pm2->ac_chg.psy);
+	power_supply_unregister(pm2->ac_chg.psy);
 free_regulator:
 	/* disable the regulator */
 	regulator_put(pm2->regu);
@@ -1220,7 +1222,7 @@ static int pm2xxx_wall_charger_remove(struct i2c_client *i2c_client)
 	/* disable the regulator */
 	regulator_put(pm2->regu);
 
-	power_supply_unregister(&pm2->ac_chg.psy);
+	power_supply_unregister(pm2->ac_chg.psy);
 
 	if (gpio_is_valid(pm2->lpn_pin))
 		gpio_free(pm2->lpn_pin);
diff --git a/drivers/power/pm2301_charger.h b/drivers/power/pm2301_charger.h
index 8ce3cc0195df..24181cf9717b 100644
--- a/drivers/power/pm2301_charger.h
+++ b/drivers/power/pm2301_charger.h
@@ -486,6 +486,7 @@ struct pm2xxx_charger {
 	struct work_struct check_main_thermal_prot_work;
 	struct delayed_work check_hw_failure_work;
 	struct ux500_charger ac_chg;
+	struct power_supply_desc ac_chg_desc;
 	struct pm2xxx_charger_event_flags flags;
 };
 
diff --git a/drivers/power/pmu_battery.c b/drivers/power/pmu_battery.c
index fb026f19aa4a..9c8d5253812c 100644
--- a/drivers/power/pmu_battery.c
+++ b/drivers/power/pmu_battery.c
@@ -17,13 +17,14 @@
 #include <linux/slab.h>
 
 static struct pmu_battery_dev {
-	struct power_supply bat;
+	struct power_supply *bat;
+	struct power_supply_desc bat_desc;
 	struct pmu_battery_info *pbi;
 	char name[16];
 	int propval;
 } *pbats[PMU_MAX_BATTERIES];
 
-#define to_pmu_battery_dev(x) container_of(x, struct pmu_battery_dev, bat)
+#define to_pmu_battery_dev(x) power_supply_get_drvdata(x)
 
 /*********************************************************************
  *		Power
@@ -49,7 +50,7 @@ static enum power_supply_property pmu_ac_props[] = {
 	POWER_SUPPLY_PROP_ONLINE,
 };
 
-static struct power_supply pmu_ac = {
+static const struct power_supply_desc pmu_ac_desc = {
 	.name = "pmu-ac",
 	.type = POWER_SUPPLY_TYPE_MAINS,
 	.properties = pmu_ac_props,
@@ -57,6 +58,8 @@ static struct power_supply pmu_ac = {
 	.get_property = pmu_get_ac_prop,
 };
 
+static struct power_supply *pmu_ac;
+
 /*********************************************************************
  *		Battery properties
  *********************************************************************/
@@ -142,7 +145,7 @@ static struct platform_device *bat_pdev;
 
 static int __init pmu_bat_init(void)
 {
-	int ret;
+	int ret = 0;
 	int i;
 
 	bat_pdev = platform_device_register_simple("pmu-battery",
@@ -152,25 +155,32 @@ static int __init pmu_bat_init(void)
 		goto pdev_register_failed;
 	}
 
-	ret = power_supply_register(&bat_pdev->dev, &pmu_ac, NULL);
-	if (ret)
+	pmu_ac = power_supply_register(&bat_pdev->dev, &pmu_ac_desc, NULL);
+	if (IS_ERR(pmu_ac)) {
+		ret = PTR_ERR(pmu_ac);
 		goto ac_register_failed;
+	}
 
 	for (i = 0; i < pmu_battery_count; i++) {
+		struct power_supply_config psy_cfg = {};
 		struct pmu_battery_dev *pbat = kzalloc(sizeof(*pbat),
 						       GFP_KERNEL);
 		if (!pbat)
 			break;
 
 		sprintf(pbat->name, "PMU_battery_%d", i);
-		pbat->bat.name = pbat->name;
-		pbat->bat.properties = pmu_bat_props;
-		pbat->bat.num_properties = ARRAY_SIZE(pmu_bat_props);
-		pbat->bat.get_property = pmu_bat_get_property;
+		pbat->bat_desc.name = pbat->name;
+		pbat->bat_desc.properties = pmu_bat_props;
+		pbat->bat_desc.num_properties = ARRAY_SIZE(pmu_bat_props);
+		pbat->bat_desc.get_property = pmu_bat_get_property;
 		pbat->pbi = &pmu_batteries[i];
+		psy_cfg.drv_data = pbat;
 
-		ret = power_supply_register(&bat_pdev->dev, &pbat->bat, NULL);
-		if (ret) {
+		pbat->bat = power_supply_register(&bat_pdev->dev,
+						  &pbat->bat_desc,
+						  &psy_cfg);
+		if (IS_ERR(pbat->bat)) {
+			ret = PTR_ERR(pbat->bat);
 			kfree(pbat);
 			goto battery_register_failed;
 		}
@@ -183,10 +193,10 @@ battery_register_failed:
 	while (i--) {
 		if (!pbats[i])
 			continue;
-		power_supply_unregister(&pbats[i]->bat);
+		power_supply_unregister(pbats[i]->bat);
 		kfree(pbats[i]);
 	}
-	power_supply_unregister(&pmu_ac);
+	power_supply_unregister(pmu_ac);
 ac_register_failed:
 	platform_device_unregister(bat_pdev);
 pdev_register_failed:
@@ -201,10 +211,10 @@ static void __exit pmu_bat_exit(void)
 	for (i = 0; i < PMU_MAX_BATTERIES; i++) {
 		if (!pbats[i])
 			continue;
-		power_supply_unregister(&pbats[i]->bat);
+		power_supply_unregister(pbats[i]->bat);
 		kfree(pbats[i]);
 	}
-	power_supply_unregister(&pmu_ac);
+	power_supply_unregister(pmu_ac);
 	platform_device_unregister(bat_pdev);
 }
 
diff --git a/drivers/power/power_supply_core.c b/drivers/power/power_supply_core.c
index 583dece8845b..e51405b176c8 100644
--- a/drivers/power/power_supply_core.c
+++ b/drivers/power/power_supply_core.c
@@ -40,16 +40,16 @@ static bool __power_supply_is_supplied_by(struct power_supply *supplier,
 
 	/* Support both supplied_to and supplied_from modes */
 	if (supply->supplied_from) {
-		if (!supplier->name)
+		if (!supplier->desc->name)
 			return false;
 		for (i = 0; i < supply->num_supplies; i++)
-			if (!strcmp(supplier->name, supply->supplied_from[i]))
+			if (!strcmp(supplier->desc->name, supply->supplied_from[i]))
 				return true;
 	} else {
-		if (!supply->name)
+		if (!supply->desc->name)
 			return false;
 		for (i = 0; i < supplier->num_supplicants; i++)
-			if (!strcmp(supplier->supplied_to[i], supply->name))
+			if (!strcmp(supplier->supplied_to[i], supply->desc->name))
 				return true;
 	}
 
@@ -62,8 +62,8 @@ static int __power_supply_changed_work(struct device *dev, void *data)
 	struct power_supply *pst = dev_get_drvdata(dev);
 
 	if (__power_supply_is_supplied_by(psy, pst)) {
-		if (pst->external_power_changed)
-			pst->external_power_changed(pst);
+		if (pst->desc->external_power_changed)
+			pst->desc->external_power_changed(pst);
 	}
 
 	return 0;
@@ -75,7 +75,7 @@ static void power_supply_changed_work(struct work_struct *work)
 	struct power_supply *psy = container_of(work, struct power_supply,
 						changed_work);
 
-	dev_dbg(psy->dev, "%s\n", __func__);
+	dev_dbg(&psy->dev, "%s\n", __func__);
 
 	spin_lock_irqsave(&psy->changed_lock, flags);
 	/*
@@ -93,7 +93,7 @@ static void power_supply_changed_work(struct work_struct *work)
 		power_supply_update_leds(psy);
 		atomic_notifier_call_chain(&power_supply_notifier,
 				PSY_EVENT_PROP_CHANGED, psy);
-		kobject_uevent(&psy->dev->kobj, KOBJ_CHANGE);
+		kobject_uevent(&psy->dev.kobj, KOBJ_CHANGE);
 		spin_lock_irqsave(&psy->changed_lock, flags);
 	}
 
@@ -103,7 +103,7 @@ static void power_supply_changed_work(struct work_struct *work)
 	 * to true.
 	 */
 	if (likely(!psy->changed))
-		pm_relax(psy->dev);
+		pm_relax(&psy->dev);
 	spin_unlock_irqrestore(&psy->changed_lock, flags);
 }
 
@@ -111,11 +111,11 @@ void power_supply_changed(struct power_supply *psy)
 {
 	unsigned long flags;
 
-	dev_dbg(psy->dev, "%s\n", __func__);
+	dev_dbg(&psy->dev, "%s\n", __func__);
 
 	spin_lock_irqsave(&psy->changed_lock, flags);
 	psy->changed = true;
-	pm_stay_awake(psy->dev);
+	pm_stay_awake(&psy->dev);
 	spin_unlock_irqrestore(&psy->changed_lock, flags);
 	schedule_work(&psy->changed_work);
 }
@@ -138,9 +138,9 @@ static int __power_supply_populate_supplied_from(struct device *dev,
 			break;
 
 		if (np == epsy->of_node) {
-			dev_info(psy->dev, "%s: Found supply : %s\n",
-				psy->name, epsy->name);
-			psy->supplied_from[i-1] = (char *)epsy->name;
+			dev_info(&psy->dev, "%s: Found supply : %s\n",
+				psy->desc->name, epsy->desc->name);
+			psy->supplied_from[i-1] = (char *)epsy->desc->name;
 			psy->num_supplies++;
 			of_node_put(np);
 			break;
@@ -158,7 +158,7 @@ static int power_supply_populate_supplied_from(struct power_supply *psy)
 	error = class_for_each_device(power_supply_class, NULL, psy,
 				      __power_supply_populate_supplied_from);
 
-	dev_dbg(psy->dev, "%s %d\n", __func__, error);
+	dev_dbg(&psy->dev, "%s %d\n", __func__, error);
 
 	return error;
 }
@@ -220,7 +220,7 @@ static int power_supply_check_supplies(struct power_supply *psy)
 		of_node_put(np);
 
 		if (ret) {
-			dev_dbg(psy->dev, "Failed to find supply!\n");
+			dev_dbg(&psy->dev, "Failed to find supply!\n");
 			return ret;
 		}
 	} while (np);
@@ -230,17 +230,18 @@ static int power_supply_check_supplies(struct power_supply *psy)
 		return 0;
 
 	/* All supplies found, allocate char ** array for filling */
-	psy->supplied_from = devm_kzalloc(psy->dev, sizeof(psy->supplied_from),
+	psy->supplied_from = devm_kzalloc(&psy->dev, sizeof(psy->supplied_from),
 					  GFP_KERNEL);
 	if (!psy->supplied_from) {
-		dev_err(psy->dev, "Couldn't allocate memory for supply list\n");
+		dev_err(&psy->dev, "Couldn't allocate memory for supply list\n");
 		return -ENOMEM;
 	}
 
-	*psy->supplied_from = devm_kzalloc(psy->dev, sizeof(char *) * (cnt - 1),
+	*psy->supplied_from = devm_kzalloc(&psy->dev,
+					   sizeof(char *) * (cnt - 1),
 					   GFP_KERNEL);
 	if (!*psy->supplied_from) {
-		dev_err(psy->dev, "Couldn't allocate memory for supply list\n");
+		dev_err(&psy->dev, "Couldn't allocate memory for supply list\n");
 		return -ENOMEM;
 	}
 
@@ -260,7 +261,8 @@ static int __power_supply_am_i_supplied(struct device *dev, void *data)
 	struct power_supply *epsy = dev_get_drvdata(dev);
 
 	if (__power_supply_is_supplied_by(epsy, psy))
-		if (!epsy->get_property(epsy, POWER_SUPPLY_PROP_ONLINE, &ret))
+		if (!epsy->desc->get_property(epsy, POWER_SUPPLY_PROP_ONLINE,
+					&ret))
 			return ret.intval;
 
 	return 0;
@@ -273,7 +275,7 @@ int power_supply_am_i_supplied(struct power_supply *psy)
 	error = class_for_each_device(power_supply_class, NULL, psy,
 				      __power_supply_am_i_supplied);
 
-	dev_dbg(psy->dev, "%s %d\n", __func__, error);
+	dev_dbg(&psy->dev, "%s %d\n", __func__, error);
 
 	return error;
 }
@@ -286,8 +288,9 @@ static int __power_supply_is_system_supplied(struct device *dev, void *data)
 	unsigned int *count = data;
 
 	(*count)++;
-	if (psy->type != POWER_SUPPLY_TYPE_BATTERY)
-		if (!psy->get_property(psy, POWER_SUPPLY_PROP_ONLINE, &ret))
+	if (psy->desc->type != POWER_SUPPLY_TYPE_BATTERY)
+		if (!psy->desc->get_property(psy, POWER_SUPPLY_PROP_ONLINE,
+					&ret))
 			return ret.intval;
 
 	return 0;
@@ -315,9 +318,9 @@ EXPORT_SYMBOL_GPL(power_supply_is_system_supplied);
 int power_supply_set_battery_charged(struct power_supply *psy)
 {
 	if (atomic_read(&psy->use_cnt) >= 0 &&
-			psy->type == POWER_SUPPLY_TYPE_BATTERY &&
-			psy->set_charged) {
-		psy->set_charged(psy);
+			psy->desc->type == POWER_SUPPLY_TYPE_BATTERY &&
+			psy->desc->set_charged) {
+		psy->desc->set_charged(psy);
 		return 0;
 	}
 
@@ -330,7 +333,7 @@ static int power_supply_match_device_by_name(struct device *dev, const void *dat
 	const char *name = data;
 	struct power_supply *psy = dev_get_drvdata(dev);
 
-	return strcmp(psy->name, name) == 0;
+	return strcmp(psy->desc->name, name) == 0;
 }
 
 struct power_supply *power_supply_get_by_name(const char *name)
@@ -375,7 +378,7 @@ int power_supply_get_property(struct power_supply *psy,
 	if (atomic_read(&psy->use_cnt) <= 0)
 		return -ENODEV;
 
-	return psy->get_property(psy, psp, val);
+	return psy->desc->get_property(psy, psp, val);
 }
 EXPORT_SYMBOL_GPL(power_supply_get_property);
 
@@ -383,42 +386,45 @@ int power_supply_set_property(struct power_supply *psy,
 			    enum power_supply_property psp,
 			    const union power_supply_propval *val)
 {
-	if (atomic_read(&psy->use_cnt) <= 0 || !psy->set_property)
+	if (atomic_read(&psy->use_cnt) <= 0 || !psy->desc->set_property)
 		return -ENODEV;
 
-	return psy->set_property(psy, psp, val);
+	return psy->desc->set_property(psy, psp, val);
 }
 EXPORT_SYMBOL_GPL(power_supply_set_property);
 
 int power_supply_property_is_writeable(struct power_supply *psy,
 					enum power_supply_property psp)
 {
-	if (atomic_read(&psy->use_cnt) <= 0 || !psy->property_is_writeable)
+	if (atomic_read(&psy->use_cnt) <= 0 ||
+			!psy->desc->property_is_writeable)
 		return -ENODEV;
 
-	return psy->property_is_writeable(psy, psp);
+	return psy->desc->property_is_writeable(psy, psp);
 }
 EXPORT_SYMBOL_GPL(power_supply_property_is_writeable);
 
 void power_supply_external_power_changed(struct power_supply *psy)
 {
-	if (atomic_read(&psy->use_cnt) <= 0 || !psy->external_power_changed)
+	if (atomic_read(&psy->use_cnt) <= 0 ||
+			!psy->desc->external_power_changed)
 		return;
 
-	psy->external_power_changed(psy);
+	psy->desc->external_power_changed(psy);
 }
 EXPORT_SYMBOL_GPL(power_supply_external_power_changed);
 
 int power_supply_powers(struct power_supply *psy, struct device *dev)
 {
-	return sysfs_create_link(&psy->dev->kobj, &dev->kobj, "powers");
+	return sysfs_create_link(&psy->dev.kobj, &dev->kobj, "powers");
 }
 EXPORT_SYMBOL_GPL(power_supply_powers);
 
 static void power_supply_dev_release(struct device *dev)
 {
+	struct power_supply *psy = container_of(dev, struct power_supply, dev);
 	pr_debug("device: '%s': %s\n", dev_name(dev), __func__);
-	kfree(dev);
+	kfree(psy);
 }
 
 int power_supply_reg_notifier(struct notifier_block *nb)
@@ -443,7 +449,7 @@ static int power_supply_read_temp(struct thermal_zone_device *tzd,
 
 	WARN_ON(tzd == NULL);
 	psy = tzd->devdata;
-	ret = psy->get_property(psy, POWER_SUPPLY_PROP_TEMP, &val);
+	ret = psy->desc->get_property(psy, POWER_SUPPLY_PROP_TEMP, &val);
 
 	/* Convert tenths of degree Celsius to milli degree Celsius. */
 	if (!ret)
@@ -460,14 +466,14 @@ static int psy_register_thermal(struct power_supply *psy)
 {
 	int i;
 
-	if (psy->no_thermal)
+	if (psy->desc->no_thermal)
 		return 0;
 
 	/* Register battery zone device psy reports temperature */
-	for (i = 0; i < psy->num_properties; i++) {
-		if (psy->properties[i] == POWER_SUPPLY_PROP_TEMP) {
-			psy->tzd = thermal_zone_device_register(psy->name, 0, 0,
-					psy, &psy_tzd_ops, NULL, 0, 0);
+	for (i = 0; i < psy->desc->num_properties; i++) {
+		if (psy->desc->properties[i] == POWER_SUPPLY_PROP_TEMP) {
+			psy->tzd = thermal_zone_device_register(psy->desc->name,
+					0, 0, psy, &psy_tzd_ops, NULL, 0, 0);
 			return PTR_ERR_OR_ZERO(psy->tzd);
 		}
 	}
@@ -490,7 +496,7 @@ static int ps_get_max_charge_cntl_limit(struct thermal_cooling_device *tcd,
 	int ret;
 
 	psy = tcd->devdata;
-	ret = psy->get_property(psy,
+	ret = psy->desc->get_property(psy,
 		POWER_SUPPLY_PROP_CHARGE_CONTROL_LIMIT_MAX, &val);
 	if (!ret)
 		*state = val.intval;
@@ -506,7 +512,7 @@ static int ps_get_cur_chrage_cntl_limit(struct thermal_cooling_device *tcd,
 	int ret;
 
 	psy = tcd->devdata;
-	ret = psy->get_property(psy,
+	ret = psy->desc->get_property(psy,
 		POWER_SUPPLY_PROP_CHARGE_CONTROL_LIMIT, &val);
 	if (!ret)
 		*state = val.intval;
@@ -523,7 +529,7 @@ static int ps_set_cur_charge_cntl_limit(struct thermal_cooling_device *tcd,
 
 	psy = tcd->devdata;
 	val.intval = state;
-	ret = psy->set_property(psy,
+	ret = psy->desc->set_property(psy,
 		POWER_SUPPLY_PROP_CHARGE_CONTROL_LIMIT, &val);
 
 	return ret;
@@ -540,11 +546,11 @@ static int psy_register_cooler(struct power_supply *psy)
 	int i;
 
 	/* Register for cooling device if psy can control charging */
-	for (i = 0; i < psy->num_properties; i++) {
-		if (psy->properties[i] ==
+	for (i = 0; i < psy->desc->num_properties; i++) {
+		if (psy->desc->properties[i] ==
 				POWER_SUPPLY_PROP_CHARGE_CONTROL_LIMIT) {
 			psy->tcd = thermal_cooling_device_register(
-							(char *)psy->name,
+							(char *)psy->desc->name,
 							psy, &psy_tcd_ops);
 			return PTR_ERR_OR_ZERO(psy->tcd);
 		}
@@ -578,17 +584,21 @@ static void psy_unregister_cooler(struct power_supply *psy)
 }
 #endif
 
-static int __power_supply_register(struct device *parent,
-				   struct power_supply *psy,
+static struct power_supply *__must_check
+__power_supply_register(struct device *parent,
+				   const struct power_supply_desc *desc,
 				   const struct power_supply_config *cfg,
 				   bool ws)
 {
 	struct device *dev;
+	struct power_supply *psy;
 	int rc;
 
-	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
-	if (!dev)
-		return -ENOMEM;
+	psy = kzalloc(sizeof(*psy), GFP_KERNEL);
+	if (!psy)
+		return ERR_PTR(-ENOMEM);
+
+	dev = &psy->dev;
 
 	device_initialize(dev);
 
@@ -597,7 +607,7 @@ static int __power_supply_register(struct device *parent,
 	dev->parent = parent;
 	dev->release = power_supply_dev_release;
 	dev_set_drvdata(dev, psy);
-	psy->dev = dev;
+	psy->desc = desc;
 	atomic_inc(&psy->use_cnt);
 	if (cfg) {
 		psy->drv_data = cfg->drv_data;
@@ -606,7 +616,7 @@ static int __power_supply_register(struct device *parent,
 		psy->num_supplicants = cfg->num_supplicants;
 	}
 
-	rc = dev_set_name(dev, "%s", psy->name);
+	rc = dev_set_name(dev, "%s", desc->name);
 	if (rc)
 		goto dev_set_name_failed;
 
@@ -641,7 +651,7 @@ static int __power_supply_register(struct device *parent,
 
 	power_supply_changed(psy);
 
-	return 0;
+	return psy;
 
 create_triggers_failed:
 	psy_unregister_cooler(psy);
@@ -654,20 +664,49 @@ wakeup_init_failed:
 check_supplies_failed:
 dev_set_name_failed:
 	put_device(dev);
-	return rc;
+	return ERR_PTR(rc);
 }
 
-int power_supply_register(struct device *parent, struct power_supply *psy,
+/**
+ * power_supply_register() - Register new power supply
+ * @parent:	Device to be a parent of power supply's device
+ * @desc:	Description of power supply, must be valid through whole
+ *		lifetime of this power supply
+ * @cfg:	Run-time specific configuration accessed during registering,
+ *		may be NULL
+ *
+ * Return: A pointer to newly allocated power_supply on success
+ * or ERR_PTR otherwise.
+ * Use power_supply_unregister() on returned power_supply pointer to release
+ * resources.
+ */
+struct power_supply *__must_check power_supply_register(struct device *parent,
+		const struct power_supply_desc *desc,
 		const struct power_supply_config *cfg)
 {
-	return __power_supply_register(parent, psy, cfg, true);
+	return __power_supply_register(parent, desc, cfg, true);
 }
 EXPORT_SYMBOL_GPL(power_supply_register);
 
-int power_supply_register_no_ws(struct device *parent, struct power_supply *psy,
+/**
+ * power_supply_register() - Register new non-waking-source power supply
+ * @parent:	Device to be a parent of power supply's device
+ * @desc:	Description of power supply, must be valid through whole
+ *		lifetime of this power supply
+ * @cfg:	Run-time specific configuration accessed during registering,
+ *		may be NULL
+ *
+ * Return: A pointer to newly allocated power_supply on success
+ * or ERR_PTR otherwise.
+ * Use power_supply_unregister() on returned power_supply pointer to release
+ * resources.
+ */
+struct power_supply *__must_check
+power_supply_register_no_ws(struct device *parent,
+		const struct power_supply_desc *desc,
 		const struct power_supply_config *cfg)
 {
-	return __power_supply_register(parent, psy, cfg, false);
+	return __power_supply_register(parent, desc, cfg, false);
 }
 EXPORT_SYMBOL_GPL(power_supply_register_no_ws);
 
@@ -678,56 +717,93 @@ static void devm_power_supply_release(struct device *dev, void *res)
 	power_supply_unregister(*psy);
 }
 
-int devm_power_supply_register(struct device *parent, struct power_supply *psy,
+/**
+ * power_supply_register() - Register managed power supply
+ * @parent:	Device to be a parent of power supply's device
+ * @desc:	Description of power supply, must be valid through whole
+ *		lifetime of this power supply
+ * @cfg:	Run-time specific configuration accessed during registering,
+ *		may be NULL
+ *
+ * Return: A pointer to newly allocated power_supply on success
+ * or ERR_PTR otherwise.
+ * The returned power_supply pointer will be automatically unregistered
+ * on driver detach.
+ */
+struct power_supply *__must_check
+devm_power_supply_register(struct device *parent,
+		const struct power_supply_desc *desc,
 		const struct power_supply_config *cfg)
 {
-	struct power_supply **ptr = devres_alloc(devm_power_supply_release,
-						 sizeof(*ptr), GFP_KERNEL);
-	int ret;
+	struct power_supply **ptr, *psy;
+
+	ptr = devres_alloc(devm_power_supply_release, sizeof(*ptr), GFP_KERNEL);
 
 	if (!ptr)
-		return -ENOMEM;
-	ret = __power_supply_register(parent, psy, cfg, true);
-	if (ret < 0)
+		return ERR_PTR(-ENOMEM);
+	psy = __power_supply_register(parent, desc, cfg, true);
+	if (IS_ERR(psy)) {
 		devres_free(ptr);
-	else {
+	} else {
 		*ptr = psy;
 		devres_add(parent, ptr);
 	}
-	return ret;
+	return psy;
 }
 EXPORT_SYMBOL_GPL(devm_power_supply_register);
 
-int devm_power_supply_register_no_ws(struct device *parent, struct power_supply *psy,
+/**
+ * power_supply_register() - Register managed non-waking-source power supply
+ * @parent:	Device to be a parent of power supply's device
+ * @desc:	Description of power supply, must be valid through whole
+ *		lifetime of this power supply
+ * @cfg:	Run-time specific configuration accessed during registering,
+ *		may be NULL
+ *
+ * Return: A pointer to newly allocated power_supply on success
+ * or ERR_PTR otherwise.
+ * The returned power_supply pointer will be automatically unregistered
+ * on driver detach.
+ */
+struct power_supply *__must_check
+devm_power_supply_register_no_ws(struct device *parent,
+		const struct power_supply_desc *desc,
 		const struct power_supply_config *cfg)
 {
-	struct power_supply **ptr = devres_alloc(devm_power_supply_release,
-						 sizeof(*ptr), GFP_KERNEL);
-	int ret;
+	struct power_supply **ptr, *psy;
+
+	ptr = devres_alloc(devm_power_supply_release, sizeof(*ptr), GFP_KERNEL);
 
 	if (!ptr)
-		return -ENOMEM;
-	ret = __power_supply_register(parent, psy, cfg, false);
-	if (ret < 0)
+		return ERR_PTR(-ENOMEM);
+	psy = __power_supply_register(parent, desc, cfg, false);
+	if (IS_ERR(psy)) {
 		devres_free(ptr);
-	else {
+	} else {
 		*ptr = psy;
 		devres_add(parent, ptr);
 	}
-	return ret;
+	return psy;
 }
 EXPORT_SYMBOL_GPL(devm_power_supply_register_no_ws);
 
+/**
+ * power_supply_unregister() - Remove this power supply from system
+ * @psy:	Pointer to power supply to unregister
+ *
+ * Remove this power supply from the system. The resources of power supply
+ * will be freed here or on last power_supply_put() call.
+ */
 void power_supply_unregister(struct power_supply *psy)
 {
 	WARN_ON(atomic_dec_return(&psy->use_cnt));
 	cancel_work_sync(&psy->changed_work);
-	sysfs_remove_link(&psy->dev->kobj, "powers");
+	sysfs_remove_link(&psy->dev.kobj, "powers");
 	power_supply_remove_triggers(psy);
 	psy_unregister_cooler(psy);
 	psy_unregister_thermal(psy);
-	device_init_wakeup(psy->dev, false);
-	device_unregister(psy->dev);
+	device_init_wakeup(&psy->dev, false);
+	device_unregister(&psy->dev);
 }
 EXPORT_SYMBOL_GPL(power_supply_unregister);
 
diff --git a/drivers/power/power_supply_leds.c b/drivers/power/power_supply_leds.c
index effa093c37b0..2d41a43fc81a 100644
--- a/drivers/power/power_supply_leds.c
+++ b/drivers/power/power_supply_leds.c
@@ -25,10 +25,10 @@ static void power_supply_update_bat_leds(struct power_supply *psy)
 	unsigned long delay_on = 0;
 	unsigned long delay_off = 0;
 
-	if (psy->get_property(psy, POWER_SUPPLY_PROP_STATUS, &status))
+	if (psy->desc->get_property(psy, POWER_SUPPLY_PROP_STATUS, &status))
 		return;
 
-	dev_dbg(psy->dev, "%s %d\n", __func__, status.intval);
+	dev_dbg(&psy->dev, "%s %d\n", __func__, status.intval);
 
 	switch (status.intval) {
 	case POWER_SUPPLY_STATUS_FULL:
@@ -58,21 +58,21 @@ static void power_supply_update_bat_leds(struct power_supply *psy)
 static int power_supply_create_bat_triggers(struct power_supply *psy)
 {
 	psy->charging_full_trig_name = kasprintf(GFP_KERNEL,
-					"%s-charging-or-full", psy->name);
+					"%s-charging-or-full", psy->desc->name);
 	if (!psy->charging_full_trig_name)
 		goto charging_full_failed;
 
 	psy->charging_trig_name = kasprintf(GFP_KERNEL,
-					"%s-charging", psy->name);
+					"%s-charging", psy->desc->name);
 	if (!psy->charging_trig_name)
 		goto charging_failed;
 
-	psy->full_trig_name = kasprintf(GFP_KERNEL, "%s-full", psy->name);
+	psy->full_trig_name = kasprintf(GFP_KERNEL, "%s-full", psy->desc->name);
 	if (!psy->full_trig_name)
 		goto full_failed;
 
 	psy->charging_blink_full_solid_trig_name = kasprintf(GFP_KERNEL,
-		"%s-charging-blink-full-solid", psy->name);
+		"%s-charging-blink-full-solid", psy->desc->name);
 	if (!psy->charging_blink_full_solid_trig_name)
 		goto charging_blink_full_solid_failed;
 
@@ -115,10 +115,10 @@ static void power_supply_update_gen_leds(struct power_supply *psy)
 {
 	union power_supply_propval online;
 
-	if (psy->get_property(psy, POWER_SUPPLY_PROP_ONLINE, &online))
+	if (psy->desc->get_property(psy, POWER_SUPPLY_PROP_ONLINE, &online))
 		return;
 
-	dev_dbg(psy->dev, "%s %d\n", __func__, online.intval);
+	dev_dbg(&psy->dev, "%s %d\n", __func__, online.intval);
 
 	if (online.intval)
 		led_trigger_event(psy->online_trig, LED_FULL);
@@ -128,7 +128,8 @@ static void power_supply_update_gen_leds(struct power_supply *psy)
 
 static int power_supply_create_gen_triggers(struct power_supply *psy)
 {
-	psy->online_trig_name = kasprintf(GFP_KERNEL, "%s-online", psy->name);
+	psy->online_trig_name = kasprintf(GFP_KERNEL, "%s-online",
+					  psy->desc->name);
 	if (!psy->online_trig_name)
 		return -ENOMEM;
 
@@ -147,7 +148,7 @@ static void power_supply_remove_gen_triggers(struct power_supply *psy)
 
 void power_supply_update_leds(struct power_supply *psy)
 {
-	if (psy->type == POWER_SUPPLY_TYPE_BATTERY)
+	if (psy->desc->type == POWER_SUPPLY_TYPE_BATTERY)
 		power_supply_update_bat_leds(psy);
 	else
 		power_supply_update_gen_leds(psy);
@@ -155,14 +156,14 @@ void power_supply_update_leds(struct power_supply *psy)
 
 int power_supply_create_triggers(struct power_supply *psy)
 {
-	if (psy->type == POWER_SUPPLY_TYPE_BATTERY)
+	if (psy->desc->type == POWER_SUPPLY_TYPE_BATTERY)
 		return power_supply_create_bat_triggers(psy);
 	return power_supply_create_gen_triggers(psy);
 }
 
 void power_supply_remove_triggers(struct power_supply *psy)
 {
-	if (psy->type == POWER_SUPPLY_TYPE_BATTERY)
+	if (psy->desc->type == POWER_SUPPLY_TYPE_BATTERY)
 		power_supply_remove_bat_triggers(psy);
 	else
 		power_supply_remove_gen_triggers(psy);
diff --git a/drivers/power/power_supply_sysfs.c b/drivers/power/power_supply_sysfs.c
index f817aab80813..9134e3d2d95e 100644
--- a/drivers/power/power_supply_sysfs.c
+++ b/drivers/power/power_supply_sysfs.c
@@ -74,7 +74,7 @@ static ssize_t power_supply_show_property(struct device *dev,
 	union power_supply_propval value;
 
 	if (off == POWER_SUPPLY_PROP_TYPE) {
-		value.intval = psy->type;
+		value.intval = psy->desc->type;
 	} else {
 		ret = power_supply_get_property(psy, off, &value);
 
@@ -125,7 +125,7 @@ static ssize_t power_supply_store_property(struct device *dev,
 
 	value.intval = long_val;
 
-	ret = power_supply_set_property(psy, off, &value);
+	ret = psy->desc->set_property(psy, off, &value);
 	if (ret < 0)
 		return ret;
 
@@ -218,11 +218,11 @@ static umode_t power_supply_attr_is_visible(struct kobject *kobj,
 	if (attrno == POWER_SUPPLY_PROP_TYPE)
 		return mode;
 
-	for (i = 0; i < psy->num_properties; i++) {
-		int property = psy->properties[i];
+	for (i = 0; i < psy->desc->num_properties; i++) {
+		int property = psy->desc->properties[i];
 
 		if (property == attrno) {
-			if (psy->property_is_writeable &&
+			if (psy->desc->property_is_writeable &&
 			    power_supply_property_is_writeable(psy, property) > 0)
 				mode |= S_IWUSR;
 
@@ -279,14 +279,14 @@ int power_supply_uevent(struct device *dev, struct kobj_uevent_env *env)
 
 	dev_dbg(dev, "uevent\n");
 
-	if (!psy || !psy->dev) {
+	if (!psy || !psy->desc) {
 		dev_dbg(dev, "No power supply yet\n");
 		return ret;
 	}
 
-	dev_dbg(dev, "POWER_SUPPLY_NAME=%s\n", psy->name);
+	dev_dbg(dev, "POWER_SUPPLY_NAME=%s\n", psy->desc->name);
 
-	ret = add_uevent_var(env, "POWER_SUPPLY_NAME=%s", psy->name);
+	ret = add_uevent_var(env, "POWER_SUPPLY_NAME=%s", psy->desc->name);
 	if (ret)
 		return ret;
 
@@ -294,11 +294,11 @@ int power_supply_uevent(struct device *dev, struct kobj_uevent_env *env)
 	if (!prop_buf)
 		return -ENOMEM;
 
-	for (j = 0; j < psy->num_properties; j++) {
+	for (j = 0; j < psy->desc->num_properties; j++) {
 		struct device_attribute *attr;
 		char *line;
 
-		attr = &power_supply_attrs[psy->properties[j]];
+		attr = &power_supply_attrs[psy->desc->properties[j]];
 
 		ret = power_supply_show_property(dev, attr, prop_buf);
 		if (ret == -ENODEV || ret == -ENODATA) {
diff --git a/drivers/power/rt5033_battery.c b/drivers/power/rt5033_battery.c
index 8cf000baaf36..a7a6877b4e16 100644
--- a/drivers/power/rt5033_battery.c
+++ b/drivers/power/rt5033_battery.c
@@ -72,8 +72,7 @@ static int rt5033_battery_get_property(struct power_supply *psy,
 		enum power_supply_property psp,
 		union power_supply_propval *val)
 {
-	struct rt5033_battery *battery = container_of(psy,
-				struct rt5033_battery, psy);
+	struct rt5033_battery *battery = power_supply_get_drvdata(psy);
 
 	switch (psp) {
 	case POWER_SUPPLY_PROP_VOLTAGE_NOW:
@@ -108,10 +107,19 @@ static const struct regmap_config rt5033_battery_regmap_config = {
 	.max_register	= RT5033_FUEL_REG_END,
 };
 
+static const struct power_supply_desc rt5033_battery_desc = {
+	.name		= "rt5033-battery",
+	.type		= POWER_SUPPLY_TYPE_BATTERY,
+	.get_property	= rt5033_battery_get_property,
+	.properties	= rt5033_battery_props,
+	.num_properties	= ARRAY_SIZE(rt5033_battery_props),
+};
+
 static int rt5033_battery_probe(struct i2c_client *client,
 		const struct i2c_device_id *id)
 {
 	struct i2c_adapter *adapter = to_i2c_adapter(client->dev.parent);
+	struct power_supply_config psy_cfg = {};
 	struct rt5033_battery *battery;
 	u32 ret;
 
@@ -131,16 +139,13 @@ static int rt5033_battery_probe(struct i2c_client *client,
 	}
 
 	i2c_set_clientdata(client, battery);
+	psy_cfg.drv_data = battery;
 
-	battery->psy.name		= "rt5033-battery";
-	battery->psy.type		= POWER_SUPPLY_TYPE_BATTERY;
-	battery->psy.get_property	= rt5033_battery_get_property;
-	battery->psy.properties		= rt5033_battery_props;
-	battery->psy.num_properties	= ARRAY_SIZE(rt5033_battery_props);
-
-	ret = power_supply_register(&client->dev, &battery->psy, NULL);
-	if (ret) {
+	battery->psy = power_supply_register(&client->dev,
+					     &rt5033_battery_desc, &psy_cfg);
+	if (IS_ERR(battery->psy)) {
 		dev_err(&client->dev, "Failed to register power supply\n");
+		ret = PTR_ERR(battery->psy);
 		return ret;
 	}
 
@@ -151,7 +156,7 @@ static int rt5033_battery_remove(struct i2c_client *client)
 {
 	struct rt5033_battery *battery = i2c_get_clientdata(client);
 
-	power_supply_unregister(&battery->psy);
+	power_supply_unregister(battery->psy);
 
 	return 0;
 }
diff --git a/drivers/power/rx51_battery.c b/drivers/power/rx51_battery.c
index 804f60c7b715..ac6206951d58 100644
--- a/drivers/power/rx51_battery.c
+++ b/drivers/power/rx51_battery.c
@@ -29,7 +29,8 @@
 
 struct rx51_device_info {
 	struct device *dev;
-	struct power_supply bat;
+	struct power_supply *bat;
+	struct power_supply_desc bat_desc;
 	struct iio_channel *channel_temp;
 	struct iio_channel *channel_bsi;
 	struct iio_channel *channel_vbat;
@@ -161,8 +162,7 @@ static int rx51_battery_get_property(struct power_supply *psy,
 					enum power_supply_property psp,
 					union power_supply_propval *val)
 {
-	struct rx51_device_info *di = container_of((psy),
-				struct rx51_device_info, bat);
+	struct rx51_device_info *di = power_supply_get_drvdata(psy);
 
 	switch (psp) {
 	case POWER_SUPPLY_PROP_TECHNOLOGY:
@@ -204,6 +204,7 @@ static enum power_supply_property rx51_battery_props[] = {
 
 static int rx51_battery_probe(struct platform_device *pdev)
 {
+	struct power_supply_config psy_cfg = {};
 	struct rx51_device_info *di;
 	int ret;
 
@@ -214,11 +215,13 @@ static int rx51_battery_probe(struct platform_device *pdev)
 	platform_set_drvdata(pdev, di);
 
 	di->dev = &pdev->dev;
-	di->bat.name = dev_name(&pdev->dev);
-	di->bat.type = POWER_SUPPLY_TYPE_BATTERY;
-	di->bat.properties = rx51_battery_props;
-	di->bat.num_properties = ARRAY_SIZE(rx51_battery_props);
-	di->bat.get_property = rx51_battery_get_property;
+	di->bat_desc.name = dev_name(&pdev->dev);
+	di->bat_desc.type = POWER_SUPPLY_TYPE_BATTERY;
+	di->bat_desc.properties = rx51_battery_props;
+	di->bat_desc.num_properties = ARRAY_SIZE(rx51_battery_props);
+	di->bat_desc.get_property = rx51_battery_get_property;
+
+	psy_cfg.drv_data = di;
 
 	di->channel_temp = iio_channel_get(di->dev, "temp");
 	if (IS_ERR(di->channel_temp)) {
@@ -238,9 +241,11 @@ static int rx51_battery_probe(struct platform_device *pdev)
 		goto error_channel_bsi;
 	}
 
-	ret = power_supply_register(di->dev, &di->bat, NULL);
-	if (ret)
+	di->bat = power_supply_register(di->dev, &di->bat_desc, &psy_cfg);
+	if (IS_ERR(di->bat)) {
+		ret = PTR_ERR(di->bat);
 		goto error_channel_vbat;
+	}
 
 	return 0;
 
@@ -259,7 +264,7 @@ static int rx51_battery_remove(struct platform_device *pdev)
 {
 	struct rx51_device_info *di = platform_get_drvdata(pdev);
 
-	power_supply_unregister(&di->bat);
+	power_supply_unregister(di->bat);
 
 	iio_channel_release(di->channel_vbat);
 	iio_channel_release(di->channel_bsi);
diff --git a/drivers/power/s3c_adc_battery.c b/drivers/power/s3c_adc_battery.c
index b6ff213373dd..0ffe5cd3abf6 100644
--- a/drivers/power/s3c_adc_battery.c
+++ b/drivers/power/s3c_adc_battery.c
@@ -28,7 +28,7 @@
 #define JITTER_DELAY			500 /* ms */
 
 struct s3c_adc_bat {
-	struct power_supply		psy;
+	struct power_supply		*psy;
 	struct s3c_adc_client		*client;
 	struct s3c_adc_bat_pdata	*pdata;
 	int				volt_value;
@@ -73,10 +73,10 @@ static int s3c_adc_backup_bat_get_property(struct power_supply *psy,
 				enum power_supply_property psp,
 				union power_supply_propval *val)
 {
-	struct s3c_adc_bat *bat = container_of(psy, struct s3c_adc_bat, psy);
+	struct s3c_adc_bat *bat = power_supply_get_drvdata(psy);
 
 	if (!bat) {
-		dev_err(psy->dev, "%s: no battery infos ?!\n", __func__);
+		dev_err(&psy->dev, "%s: no battery infos ?!\n", __func__);
 		return -EINVAL;
 	}
 
@@ -105,17 +105,17 @@ static int s3c_adc_backup_bat_get_property(struct power_supply *psy,
 	}
 }
 
-static struct s3c_adc_bat backup_bat = {
-	.psy = {
-		.name		= "backup-battery",
-		.type		= POWER_SUPPLY_TYPE_BATTERY,
-		.properties	= s3c_adc_backup_bat_props,
-		.num_properties = ARRAY_SIZE(s3c_adc_backup_bat_props),
-		.get_property	= s3c_adc_backup_bat_get_property,
-		.use_for_apm	= 1,
-	},
+static const struct power_supply_desc backup_bat_desc = {
+	.name		= "backup-battery",
+	.type		= POWER_SUPPLY_TYPE_BATTERY,
+	.properties	= s3c_adc_backup_bat_props,
+	.num_properties = ARRAY_SIZE(s3c_adc_backup_bat_props),
+	.get_property	= s3c_adc_backup_bat_get_property,
+	.use_for_apm	= 1,
 };
 
+static struct s3c_adc_bat backup_bat;
+
 static enum power_supply_property s3c_adc_main_bat_props[] = {
 	POWER_SUPPLY_PROP_STATUS,
 	POWER_SUPPLY_PROP_CHARGE_FULL_DESIGN,
@@ -141,7 +141,7 @@ static int s3c_adc_bat_get_property(struct power_supply *psy,
 				    enum power_supply_property psp,
 				    union power_supply_propval *val)
 {
-	struct s3c_adc_bat *bat = container_of(psy, struct s3c_adc_bat, psy);
+	struct s3c_adc_bat *bat = power_supply_get_drvdata(psy);
 
 	int new_level;
 	int full_volt;
@@ -149,7 +149,7 @@ static int s3c_adc_bat_get_property(struct power_supply *psy,
 	unsigned int lut_size;
 
 	if (!bat) {
-		dev_err(psy->dev, "no battery infos ?!\n");
+		dev_err(&psy->dev, "no battery infos ?!\n");
 		return -EINVAL;
 	}
 
@@ -232,18 +232,18 @@ static int s3c_adc_bat_get_property(struct power_supply *psy,
 	}
 }
 
-static struct s3c_adc_bat main_bat = {
-	.psy = {
-		.name			= "main-battery",
-		.type			= POWER_SUPPLY_TYPE_BATTERY,
-		.properties		= s3c_adc_main_bat_props,
-		.num_properties		= ARRAY_SIZE(s3c_adc_main_bat_props),
-		.get_property		= s3c_adc_bat_get_property,
-		.external_power_changed = s3c_adc_bat_ext_power_changed,
-		.use_for_apm		= 1,
-	},
+static const struct power_supply_desc main_bat_desc = {
+	.name			= "main-battery",
+	.type			= POWER_SUPPLY_TYPE_BATTERY,
+	.properties		= s3c_adc_main_bat_props,
+	.num_properties		= ARRAY_SIZE(s3c_adc_main_bat_props),
+	.get_property		= s3c_adc_bat_get_property,
+	.external_power_changed = s3c_adc_bat_ext_power_changed,
+	.use_for_apm		= 1,
 };
 
+static struct s3c_adc_bat main_bat;
+
 static void s3c_adc_bat_work(struct work_struct *work)
 {
 	struct s3c_adc_bat *bat = &main_bat;
@@ -251,7 +251,7 @@ static void s3c_adc_bat_work(struct work_struct *work)
 	int is_plugged;
 	static int was_plugged;
 
-	is_plugged = power_supply_am_i_supplied(&bat->psy);
+	is_plugged = power_supply_am_i_supplied(bat->psy);
 	bat->cable_plugged = is_plugged;
 	if (is_plugged != was_plugged) {
 		was_plugged = is_plugged;
@@ -279,7 +279,7 @@ static void s3c_adc_bat_work(struct work_struct *work)
 		}
 	}
 
-	power_supply_changed(&bat->psy);
+	power_supply_changed(bat->psy);
 }
 
 static irqreturn_t s3c_adc_bat_charged(int irq, void *dev_id)
@@ -310,16 +310,25 @@ static int s3c_adc_bat_probe(struct platform_device *pdev)
 	main_bat.cable_plugged = 0;
 	main_bat.status = POWER_SUPPLY_STATUS_DISCHARGING;
 
-	ret = power_supply_register(&pdev->dev, &main_bat.psy, NULL);
-	if (ret)
+	main_bat.psy = power_supply_register(&pdev->dev, &main_bat_desc, NULL);
+	if (IS_ERR(main_bat.psy)) {
+		ret = PTR_ERR(main_bat.psy);
 		goto err_reg_main;
+	}
 	if (pdata->backup_volt_mult) {
+		const struct power_supply_config psy_cfg
+						= { .drv_data = &backup_bat, };
+
 		backup_bat.client = client;
 		backup_bat.pdata = pdev->dev.platform_data;
 		backup_bat.volt_value = -1;
-		ret = power_supply_register(&pdev->dev, &backup_bat.psy, NULL);
-		if (ret)
+		backup_bat.psy = power_supply_register(&pdev->dev,
+						       &backup_bat_desc,
+						       &psy_cfg);
+		if (IS_ERR(backup_bat.psy)) {
+			ret = PTR_ERR(backup_bat.psy);
 			goto err_reg_backup;
+		}
 	}
 
 	INIT_DELAYED_WORK(&bat_work, s3c_adc_bat_work);
@@ -360,9 +369,9 @@ err_irq:
 		gpio_free(pdata->gpio_charge_finished);
 err_gpio:
 	if (pdata->backup_volt_mult)
-		power_supply_unregister(&backup_bat.psy);
+		power_supply_unregister(backup_bat.psy);
 err_reg_backup:
-	power_supply_unregister(&main_bat.psy);
+	power_supply_unregister(main_bat.psy);
 err_reg_main:
 	return ret;
 }
@@ -372,9 +381,9 @@ static int s3c_adc_bat_remove(struct platform_device *pdev)
 	struct s3c_adc_client *client = platform_get_drvdata(pdev);
 	struct s3c_adc_bat_pdata *pdata = pdev->dev.platform_data;
 
-	power_supply_unregister(&main_bat.psy);
+	power_supply_unregister(main_bat.psy);
 	if (pdata->backup_volt_mult)
-		power_supply_unregister(&backup_bat.psy);
+		power_supply_unregister(backup_bat.psy);
 
 	s3c_adc_release(client);
 
diff --git a/drivers/power/sbs-battery.c b/drivers/power/sbs-battery.c
index 879f1448fc4a..de1178659d4b 100644
--- a/drivers/power/sbs-battery.c
+++ b/drivers/power/sbs-battery.c
@@ -156,7 +156,7 @@ static enum power_supply_property sbs_properties[] = {
 
 struct sbs_info {
 	struct i2c_client		*client;
-	struct power_supply		power_supply;
+	struct power_supply		*power_supply;
 	struct sbs_platform_data	*pdata;
 	bool				is_present;
 	bool				gpio_detect;
@@ -391,7 +391,7 @@ static int sbs_get_battery_property(struct i2c_client *client,
 			chip->last_state = val->intval;
 		else if (chip->last_state != val->intval) {
 			cancel_delayed_work_sync(&chip->work);
-			power_supply_changed(&chip->power_supply);
+			power_supply_changed(chip->power_supply);
 			chip->poll_time = 0;
 		}
 	} else {
@@ -556,8 +556,7 @@ static int sbs_get_property(struct power_supply *psy,
 	union power_supply_propval *val)
 {
 	int ret = 0;
-	struct sbs_info *chip = container_of(psy,
-				struct sbs_info, power_supply);
+	struct sbs_info *chip = power_supply_get_drvdata(psy);
 	struct i2c_client *client = chip->client;
 
 	switch (psp) {
@@ -638,7 +637,7 @@ static int sbs_get_property(struct power_supply *psy,
 	if (!chip->gpio_detect &&
 		chip->is_present != (ret >= 0)) {
 		chip->is_present = (ret >= 0);
-		power_supply_changed(&chip->power_supply);
+		power_supply_changed(chip->power_supply);
 	}
 
 done:
@@ -671,9 +670,7 @@ static irqreturn_t sbs_irq(int irq, void *devid)
 
 static void sbs_external_power_changed(struct power_supply *psy)
 {
-	struct sbs_info *chip;
-
-	chip = container_of(psy, struct sbs_info, power_supply);
+	struct sbs_info *chip = power_supply_get_drvdata(psy);
 
 	if (chip->ignore_changes > 0) {
 		chip->ignore_changes--;
@@ -712,7 +709,7 @@ static void sbs_delayed_work(struct work_struct *work)
 
 	if (chip->last_state != ret) {
 		chip->poll_time = 0;
-		power_supply_changed(&chip->power_supply);
+		power_supply_changed(chip->power_supply);
 		return;
 	}
 	if (chip->poll_time > 0) {
@@ -796,43 +793,48 @@ static struct sbs_platform_data *sbs_of_populate_pdata(
 }
 #endif
 
+static const struct power_supply_desc sbs_default_desc = {
+	.type = POWER_SUPPLY_TYPE_BATTERY,
+	.properties = sbs_properties,
+	.num_properties = ARRAY_SIZE(sbs_properties),
+	.get_property = sbs_get_property,
+	.external_power_changed = sbs_external_power_changed,
+};
+
 static int sbs_probe(struct i2c_client *client,
 	const struct i2c_device_id *id)
 {
 	struct sbs_info *chip;
+	struct power_supply_desc *sbs_desc;
 	struct sbs_platform_data *pdata = client->dev.platform_data;
 	struct power_supply_config psy_cfg = {};
 	int rc;
 	int irq;
-	char *name;
 
-	name = kasprintf(GFP_KERNEL, "sbs-%s", dev_name(&client->dev));
-	if (!name) {
-		dev_err(&client->dev, "Failed to allocate device name\n");
+	sbs_desc = devm_kmemdup(&client->dev, &sbs_default_desc,
+			sizeof(*sbs_desc), GFP_KERNEL);
+	if (!sbs_desc)
+		return -ENOMEM;
+
+	sbs_desc->name = devm_kasprintf(&client->dev, GFP_KERNEL, "sbs-%s",
+			dev_name(&client->dev));
+	if (!sbs_desc->name)
 		return -ENOMEM;
-	}
 
 	chip = kzalloc(sizeof(struct sbs_info), GFP_KERNEL);
-	if (!chip) {
-		rc = -ENOMEM;
-		goto exit_free_name;
-	}
+	if (!chip)
+		return -ENOMEM;
 
 	chip->client = client;
 	chip->enable_detection = false;
 	chip->gpio_detect = false;
-	chip->power_supply.name = name;
-	chip->power_supply.type = POWER_SUPPLY_TYPE_BATTERY;
-	chip->power_supply.properties = sbs_properties;
-	chip->power_supply.num_properties = ARRAY_SIZE(sbs_properties);
-	chip->power_supply.get_property = sbs_get_property;
 	psy_cfg.of_node = client->dev.of_node;
+	psy_cfg.drv_data = chip;
 	/* ignore first notification of external change, it is generated
 	 * from the power_supply_register call back
 	 */
 	chip->ignore_changes = 1;
 	chip->last_state = POWER_SUPPLY_STATUS_UNKNOWN;
-	chip->power_supply.external_power_changed = sbs_external_power_changed;
 
 	pdata = sbs_of_populate_pdata(client);
 
@@ -871,7 +873,7 @@ static int sbs_probe(struct i2c_client *client,
 
 	rc = request_irq(irq, sbs_irq,
 		IRQF_TRIGGER_RISING | IRQF_TRIGGER_FALLING,
-		dev_name(&client->dev), &chip->power_supply);
+		dev_name(&client->dev), chip->power_supply);
 	if (rc) {
 		dev_warn(&client->dev, "Failed to request irq: %d\n", rc);
 		gpio_free(pdata->battery_detect);
@@ -893,11 +895,12 @@ skip_gpio:
 		goto exit_psupply;
 	}
 
-	rc = power_supply_register(&client->dev, &chip->power_supply,
-			&psy_cfg);
-	if (rc) {
+	chip->power_supply = power_supply_register(&client->dev, sbs_desc,
+						   &psy_cfg);
+	if (IS_ERR(chip->power_supply)) {
 		dev_err(&client->dev,
 			"%s: Failed to register power supply\n", __func__);
+		rc = PTR_ERR(chip->power_supply);
 		goto exit_psupply;
 	}
 
@@ -912,15 +915,12 @@ skip_gpio:
 
 exit_psupply:
 	if (chip->irq)
-		free_irq(chip->irq, &chip->power_supply);
+		free_irq(chip->irq, chip->power_supply);
 	if (chip->gpio_detect)
 		gpio_free(pdata->battery_detect);
 
 	kfree(chip);
 
-exit_free_name:
-	kfree(name);
-
 	return rc;
 }
 
@@ -929,15 +929,14 @@ static int sbs_remove(struct i2c_client *client)
 	struct sbs_info *chip = i2c_get_clientdata(client);
 
 	if (chip->irq)
-		free_irq(chip->irq, &chip->power_supply);
+		free_irq(chip->irq, chip->power_supply);
 	if (chip->gpio_detect)
 		gpio_free(chip->pdata->battery_detect);
 
-	power_supply_unregister(&chip->power_supply);
+	power_supply_unregister(chip->power_supply);
 
 	cancel_delayed_work_sync(&chip->work);
 
-	kfree(chip->power_supply.name);
 	kfree(chip);
 	chip = NULL;
 
diff --git a/drivers/power/smb347-charger.c b/drivers/power/smb347-charger.c
index 4396a1ffeb1a..0b60a0b5878b 100644
--- a/drivers/power/smb347-charger.c
+++ b/drivers/power/smb347-charger.c
@@ -139,9 +139,9 @@ struct smb347_charger {
 	struct mutex		lock;
 	struct device		*dev;
 	struct regmap		*regmap;
-	struct power_supply	mains;
-	struct power_supply	usb;
-	struct power_supply	battery;
+	struct power_supply	*mains;
+	struct power_supply	*usb;
+	struct power_supply	*battery;
 	bool			mains_online;
 	bool			usb_online;
 	bool			charging_enabled;
@@ -741,7 +741,7 @@ static irqreturn_t smb347_interrupt(int irq, void *data)
 	 */
 	if (stat_c & STAT_C_CHARGER_ERROR) {
 		dev_err(smb->dev, "charging stopped due to charger error\n");
-		power_supply_changed(&smb->battery);
+		power_supply_changed(smb->battery);
 		handled = true;
 	}
 
@@ -752,7 +752,7 @@ static irqreturn_t smb347_interrupt(int irq, void *data)
 	 */
 	if (irqstat_c & (IRQSTAT_C_TERMINATION_IRQ | IRQSTAT_C_TAPER_IRQ)) {
 		if (irqstat_c & IRQSTAT_C_TERMINATION_STAT)
-			power_supply_changed(&smb->battery);
+			power_supply_changed(smb->battery);
 		dev_dbg(smb->dev, "going to HW maintenance mode\n");
 		handled = true;
 	}
@@ -766,7 +766,7 @@ static irqreturn_t smb347_interrupt(int irq, void *data)
 
 		if (irqstat_d & IRQSTAT_D_CHARGE_TIMEOUT_STAT)
 			dev_warn(smb->dev, "charging stopped due to timeout\n");
-		power_supply_changed(&smb->battery);
+		power_supply_changed(smb->battery);
 		handled = true;
 	}
 
@@ -778,9 +778,9 @@ static irqreturn_t smb347_interrupt(int irq, void *data)
 		if (smb347_update_ps_status(smb) > 0) {
 			smb347_start_stop_charging(smb);
 			if (smb->pdata->use_mains)
-				power_supply_changed(&smb->mains);
+				power_supply_changed(smb->mains);
 			if (smb->pdata->use_usb)
-				power_supply_changed(&smb->usb);
+				power_supply_changed(smb->usb);
 		}
 		handled = true;
 	}
@@ -935,8 +935,7 @@ static int smb347_mains_get_property(struct power_supply *psy,
 				     enum power_supply_property prop,
 				     union power_supply_propval *val)
 {
-	struct smb347_charger *smb =
-		container_of(psy, struct smb347_charger, mains);
+	struct smb347_charger *smb = power_supply_get_drvdata(psy);
 	int ret;
 
 	switch (prop) {
@@ -977,8 +976,7 @@ static int smb347_usb_get_property(struct power_supply *psy,
 				   enum power_supply_property prop,
 				   union power_supply_propval *val)
 {
-	struct smb347_charger *smb =
-		container_of(psy, struct smb347_charger, usb);
+	struct smb347_charger *smb = power_supply_get_drvdata(psy);
 	int ret;
 
 	switch (prop) {
@@ -1064,8 +1062,7 @@ static int smb347_battery_get_property(struct power_supply *psy,
 				       enum power_supply_property prop,
 				       union power_supply_propval *val)
 {
-	struct smb347_charger *smb =
-			container_of(psy, struct smb347_charger, battery);
+	struct smb347_charger *smb = power_supply_get_drvdata(psy);
 	const struct smb347_charger_platform_data *pdata = smb->pdata;
 	int ret;
 
@@ -1189,12 +1186,36 @@ static const struct regmap_config smb347_regmap = {
 	.readable_reg	= smb347_readable_reg,
 };
 
+static const struct power_supply_desc smb347_mains_desc = {
+	.name		= "smb347-mains",
+	.type		= POWER_SUPPLY_TYPE_MAINS,
+	.get_property	= smb347_mains_get_property,
+	.properties	= smb347_mains_properties,
+	.num_properties	= ARRAY_SIZE(smb347_mains_properties),
+};
+
+static const struct power_supply_desc smb347_usb_desc = {
+	.name		= "smb347-usb",
+	.type		= POWER_SUPPLY_TYPE_USB,
+	.get_property	= smb347_usb_get_property,
+	.properties	= smb347_usb_properties,
+	.num_properties	= ARRAY_SIZE(smb347_usb_properties),
+};
+
+static const struct power_supply_desc smb347_battery_desc = {
+	.name		= "smb347-battery",
+	.type		= POWER_SUPPLY_TYPE_BATTERY,
+	.get_property	= smb347_battery_get_property,
+	.properties	= smb347_battery_properties,
+	.num_properties	= ARRAY_SIZE(smb347_battery_properties),
+};
+
 static int smb347_probe(struct i2c_client *client,
 			const struct i2c_device_id *id)
 {
 	static char *battery[] = { "smb347-battery" };
 	const struct smb347_charger_platform_data *pdata;
-	struct power_supply_config psy_cfg = {}; /* Only for mains and usb */
+	struct power_supply_config mains_usb_cfg = {}, battery_cfg = {};
 	struct device *dev = &client->dev;
 	struct smb347_charger *smb;
 	int ret;
@@ -1224,47 +1245,35 @@ static int smb347_probe(struct i2c_client *client,
 	if (ret < 0)
 		return ret;
 
-	psy_cfg.supplied_to = battery;
-	psy_cfg.num_supplicants = ARRAY_SIZE(battery);
+	mains_usb_cfg.supplied_to = battery;
+	mains_usb_cfg.num_supplicants = ARRAY_SIZE(battery);
+	mains_usb_cfg.drv_data = smb;
 	if (smb->pdata->use_mains) {
-		smb->mains.name = "smb347-mains";
-		smb->mains.type = POWER_SUPPLY_TYPE_MAINS;
-		smb->mains.get_property = smb347_mains_get_property;
-		smb->mains.properties = smb347_mains_properties;
-		smb->mains.num_properties = ARRAY_SIZE(smb347_mains_properties);
-		ret = power_supply_register(dev, &smb->mains, &psy_cfg);
-		if (ret < 0)
-			return ret;
+		smb->mains = power_supply_register(dev, &smb347_mains_desc,
+						   &mains_usb_cfg);
+		if (IS_ERR(smb->mains))
+			return PTR_ERR(smb->mains);
 	}
 
 	if (smb->pdata->use_usb) {
-		smb->usb.name = "smb347-usb";
-		smb->usb.type = POWER_SUPPLY_TYPE_USB;
-		smb->usb.get_property = smb347_usb_get_property;
-		smb->usb.properties = smb347_usb_properties;
-		smb->usb.num_properties = ARRAY_SIZE(smb347_usb_properties);
-		ret = power_supply_register(dev, &smb->usb, &psy_cfg);
-		if (ret < 0) {
+		smb->usb = power_supply_register(dev, &smb347_usb_desc,
+						 &mains_usb_cfg);
+		if (IS_ERR(smb->usb)) {
 			if (smb->pdata->use_mains)
-				power_supply_unregister(&smb->mains);
-			return ret;
+				power_supply_unregister(smb->mains);
+			return PTR_ERR(smb->usb);
 		}
 	}
 
-	smb->battery.name = "smb347-battery";
-	smb->battery.type = POWER_SUPPLY_TYPE_BATTERY;
-	smb->battery.get_property = smb347_battery_get_property;
-	smb->battery.properties = smb347_battery_properties;
-	smb->battery.num_properties = ARRAY_SIZE(smb347_battery_properties);
-
-
-	ret = power_supply_register(dev, &smb->battery, NULL);
-	if (ret < 0) {
+	battery_cfg.drv_data = smb;
+	smb->battery = power_supply_register(dev, &smb347_battery_desc,
+					     &battery_cfg);
+	if (IS_ERR(smb->battery)) {
 		if (smb->pdata->use_usb)
-			power_supply_unregister(&smb->usb);
+			power_supply_unregister(smb->usb);
 		if (smb->pdata->use_mains)
-			power_supply_unregister(&smb->mains);
-		return ret;
+			power_supply_unregister(smb->mains);
+		return PTR_ERR(smb->battery);
 	}
 
 	/*
@@ -1294,11 +1303,11 @@ static int smb347_remove(struct i2c_client *client)
 		gpio_free(smb->pdata->irq_gpio);
 	}
 
-	power_supply_unregister(&smb->battery);
+	power_supply_unregister(smb->battery);
 	if (smb->pdata->use_usb)
-		power_supply_unregister(&smb->usb);
+		power_supply_unregister(smb->usb);
 	if (smb->pdata->use_mains)
-		power_supply_unregister(&smb->mains);
+		power_supply_unregister(smb->mains);
 	return 0;
 }
 
diff --git a/drivers/power/test_power.c b/drivers/power/test_power.c
index f6c92d1d7811..f986e0cca7ac 100644
--- a/drivers/power/test_power.c
+++ b/drivers/power/test_power.c
@@ -153,7 +153,9 @@ static char *test_power_ac_supplied_to[] = {
 	"test_battery",
 };
 
-static struct power_supply test_power_supplies[] = {
+static struct power_supply *test_power_supplies[TEST_POWER_NUM];
+
+static const struct power_supply_desc test_power_desc[] = {
 	[TEST_AC] = {
 		.name = "test_ac",
 		.type = POWER_SUPPLY_TYPE_MAINS,
@@ -200,11 +202,13 @@ static int __init test_power_init(void)
 	BUILD_BUG_ON(TEST_POWER_NUM != ARRAY_SIZE(test_power_configs));
 
 	for (i = 0; i < ARRAY_SIZE(test_power_supplies); i++) {
-		ret = power_supply_register(NULL, &test_power_supplies[i],
+		test_power_supplies[i] = power_supply_register(NULL,
+						&test_power_desc[i],
 						&test_power_configs[i]);
-		if (ret) {
+		if (IS_ERR(test_power_supplies[i])) {
 			pr_err("%s: failed to register %s\n", __func__,
-				test_power_supplies[i].name);
+				test_power_desc[i].name);
+			ret = PTR_ERR(test_power_supplies[i]);
 			goto failed;
 		}
 	}
@@ -213,7 +217,7 @@ static int __init test_power_init(void)
 	return 0;
 failed:
 	while (--i >= 0)
-		power_supply_unregister(&test_power_supplies[i]);
+		power_supply_unregister(test_power_supplies[i]);
 	return ret;
 }
 module_init(test_power_init);
@@ -227,13 +231,13 @@ static void __exit test_power_exit(void)
 	usb_online = 0;
 	battery_status = POWER_SUPPLY_STATUS_DISCHARGING;
 	for (i = 0; i < ARRAY_SIZE(test_power_supplies); i++)
-		power_supply_changed(&test_power_supplies[i]);
+		power_supply_changed(test_power_supplies[i]);
 	pr_info("%s: 'changed' event sent, sleeping for 10 seconds...\n",
 		__func__);
 	ssleep(10);
 
 	for (i = 0; i < ARRAY_SIZE(test_power_supplies); i++)
-		power_supply_unregister(&test_power_supplies[i]);
+		power_supply_unregister(test_power_supplies[i]);
 
 	module_initialized = false;
 }
@@ -331,7 +335,7 @@ static inline void signal_power_supply_changed(struct power_supply *psy)
 static int param_set_ac_online(const char *key, const struct kernel_param *kp)
 {
 	ac_online = map_get_value(map_ac_online, key, ac_online);
-	signal_power_supply_changed(&test_power_supplies[TEST_AC]);
+	signal_power_supply_changed(test_power_supplies[TEST_AC]);
 	return 0;
 }
 
@@ -344,7 +348,7 @@ static int param_get_ac_online(char *buffer, const struct kernel_param *kp)
 static int param_set_usb_online(const char *key, const struct kernel_param *kp)
 {
 	usb_online = map_get_value(map_ac_online, key, usb_online);
-	signal_power_supply_changed(&test_power_supplies[TEST_USB]);
+	signal_power_supply_changed(test_power_supplies[TEST_USB]);
 	return 0;
 }
 
@@ -358,7 +362,7 @@ static int param_set_battery_status(const char *key,
 					const struct kernel_param *kp)
 {
 	battery_status = map_get_value(map_status, key, battery_status);
-	signal_power_supply_changed(&test_power_supplies[TEST_BATTERY]);
+	signal_power_supply_changed(test_power_supplies[TEST_BATTERY]);
 	return 0;
 }
 
@@ -372,7 +376,7 @@ static int param_set_battery_health(const char *key,
 					const struct kernel_param *kp)
 {
 	battery_health = map_get_value(map_health, key, battery_health);
-	signal_power_supply_changed(&test_power_supplies[TEST_BATTERY]);
+	signal_power_supply_changed(test_power_supplies[TEST_BATTERY]);
 	return 0;
 }
 
@@ -386,7 +390,7 @@ static int param_set_battery_present(const char *key,
 					const struct kernel_param *kp)
 {
 	battery_present = map_get_value(map_present, key, battery_present);
-	signal_power_supply_changed(&test_power_supplies[TEST_AC]);
+	signal_power_supply_changed(test_power_supplies[TEST_AC]);
 	return 0;
 }
 
@@ -402,7 +406,7 @@ static int param_set_battery_technology(const char *key,
 {
 	battery_technology = map_get_value(map_technology, key,
 						battery_technology);
-	signal_power_supply_changed(&test_power_supplies[TEST_BATTERY]);
+	signal_power_supply_changed(test_power_supplies[TEST_BATTERY]);
 	return 0;
 }
 
@@ -423,7 +427,7 @@ static int param_set_battery_capacity(const char *key,
 		return -EINVAL;
 
 	battery_capacity = tmp;
-	signal_power_supply_changed(&test_power_supplies[TEST_BATTERY]);
+	signal_power_supply_changed(test_power_supplies[TEST_BATTERY]);
 	return 0;
 }
 
@@ -438,7 +442,7 @@ static int param_set_battery_voltage(const char *key,
 		return -EINVAL;
 
 	battery_voltage = tmp;
-	signal_power_supply_changed(&test_power_supplies[TEST_BATTERY]);
+	signal_power_supply_changed(test_power_supplies[TEST_BATTERY]);
 	return 0;
 }
 
diff --git a/drivers/power/tosa_battery.c b/drivers/power/tosa_battery.c
index 895e4b4dfcf6..6e88c1b37945 100644
--- a/drivers/power/tosa_battery.c
+++ b/drivers/power/tosa_battery.c
@@ -26,7 +26,7 @@ static struct work_struct bat_work;
 
 struct tosa_bat {
 	int status;
-	struct power_supply psy;
+	struct power_supply *psy;
 	int full_chrg;
 
 	struct mutex work_lock; /* protects data */
@@ -61,7 +61,7 @@ static unsigned long tosa_read_bat(struct tosa_bat *bat)
 	mutex_lock(&bat_lock);
 	gpio_set_value(bat->gpio_bat, 1);
 	msleep(5);
-	value = wm97xx_read_aux_adc(dev_get_drvdata(bat->psy.dev->parent),
+	value = wm97xx_read_aux_adc(dev_get_drvdata(bat->psy->dev.parent),
 			bat->adc_bat);
 	gpio_set_value(bat->gpio_bat, 0);
 	mutex_unlock(&bat_lock);
@@ -81,7 +81,7 @@ static unsigned long tosa_read_temp(struct tosa_bat *bat)
 	mutex_lock(&bat_lock);
 	gpio_set_value(bat->gpio_temp, 1);
 	msleep(5);
-	value = wm97xx_read_aux_adc(dev_get_drvdata(bat->psy.dev->parent),
+	value = wm97xx_read_aux_adc(dev_get_drvdata(bat->psy->dev.parent),
 			bat->adc_temp);
 	gpio_set_value(bat->gpio_temp, 0);
 	mutex_unlock(&bat_lock);
@@ -96,7 +96,7 @@ static int tosa_bat_get_property(struct power_supply *psy,
 			    union power_supply_propval *val)
 {
 	int ret = 0;
-	struct tosa_bat *bat = container_of(psy, struct tosa_bat, psy);
+	struct tosa_bat *bat = power_supply_get_drvdata(psy);
 
 	if (bat->is_present && !bat->is_present(bat)
 			&& psp != POWER_SUPPLY_PROP_PRESENT) {
@@ -158,14 +158,14 @@ static irqreturn_t tosa_bat_gpio_isr(int irq, void *data)
 static void tosa_bat_update(struct tosa_bat *bat)
 {
 	int old;
-	struct power_supply *psy = &bat->psy;
+	struct power_supply *psy = bat->psy;
 
 	mutex_lock(&bat->work_lock);
 
 	old = bat->status;
 
 	if (bat->is_present && !bat->is_present(bat)) {
-		printk(KERN_NOTICE "%s not present\n", psy->name);
+		printk(KERN_NOTICE "%s not present\n", psy->desc->name);
 		bat->status = POWER_SUPPLY_STATUS_UNKNOWN;
 		bat->full_chrg = -1;
 	} else if (power_supply_am_i_supplied(psy)) {
@@ -222,18 +222,38 @@ static enum power_supply_property tosa_bat_bu_props[] = {
 	POWER_SUPPLY_PROP_PRESENT,
 };
 
+static const struct power_supply_desc tosa_bat_main_desc = {
+	.name		= "main-battery",
+	.type		= POWER_SUPPLY_TYPE_BATTERY,
+	.properties	= tosa_bat_main_props,
+	.num_properties	= ARRAY_SIZE(tosa_bat_main_props),
+	.get_property	= tosa_bat_get_property,
+	.external_power_changed = tosa_bat_external_power_changed,
+	.use_for_apm	= 1,
+};
+
+static const struct power_supply_desc tosa_bat_jacket_desc = {
+	.name		= "jacket-battery",
+	.type		= POWER_SUPPLY_TYPE_BATTERY,
+	.properties	= tosa_bat_main_props,
+	.num_properties	= ARRAY_SIZE(tosa_bat_main_props),
+	.get_property	= tosa_bat_get_property,
+	.external_power_changed = tosa_bat_external_power_changed,
+};
+
+static const struct power_supply_desc tosa_bat_bu_desc = {
+	.name		= "backup-battery",
+	.type		= POWER_SUPPLY_TYPE_BATTERY,
+	.properties	= tosa_bat_bu_props,
+	.num_properties	= ARRAY_SIZE(tosa_bat_bu_props),
+	.get_property	= tosa_bat_get_property,
+	.external_power_changed = tosa_bat_external_power_changed,
+};
+
 static struct tosa_bat tosa_bat_main = {
 	.status = POWER_SUPPLY_STATUS_DISCHARGING,
 	.full_chrg = -1,
-	.psy = {
-		.name		= "main-battery",
-		.type		= POWER_SUPPLY_TYPE_BATTERY,
-		.properties	= tosa_bat_main_props,
-		.num_properties	= ARRAY_SIZE(tosa_bat_main_props),
-		.get_property	= tosa_bat_get_property,
-		.external_power_changed = tosa_bat_external_power_changed,
-		.use_for_apm	= 1,
-	},
+	.psy = NULL,
 
 	.gpio_full = TOSA_GPIO_BAT0_CRG,
 	.gpio_charge_off = TOSA_GPIO_CHARGE_OFF,
@@ -254,14 +274,7 @@ static struct tosa_bat tosa_bat_main = {
 static struct tosa_bat tosa_bat_jacket = {
 	.status = POWER_SUPPLY_STATUS_DISCHARGING,
 	.full_chrg = -1,
-	.psy = {
-		.name		= "jacket-battery",
-		.type		= POWER_SUPPLY_TYPE_BATTERY,
-		.properties	= tosa_bat_main_props,
-		.num_properties	= ARRAY_SIZE(tosa_bat_main_props),
-		.get_property	= tosa_bat_get_property,
-		.external_power_changed = tosa_bat_external_power_changed,
-	},
+	.psy = NULL,
 
 	.is_present = tosa_jacket_bat_is_present,
 	.gpio_full = TOSA_GPIO_BAT1_CRG,
@@ -283,15 +296,7 @@ static struct tosa_bat tosa_bat_jacket = {
 static struct tosa_bat tosa_bat_bu = {
 	.status = POWER_SUPPLY_STATUS_UNKNOWN,
 	.full_chrg = -1,
-
-	.psy = {
-		.name		= "backup-battery",
-		.type		= POWER_SUPPLY_TYPE_BATTERY,
-		.properties	= tosa_bat_bu_props,
-		.num_properties	= ARRAY_SIZE(tosa_bat_bu_props),
-		.get_property	= tosa_bat_get_property,
-		.external_power_changed = tosa_bat_external_power_changed,
-	},
+	.psy = NULL,
 
 	.gpio_full = -1,
 	.gpio_charge_off = -1,
@@ -345,6 +350,9 @@ static int tosa_bat_resume(struct platform_device *dev)
 static int tosa_bat_probe(struct platform_device *dev)
 {
 	int ret;
+	struct power_supply_config main_psy_cfg = {},
+				   jacket_psy_cfg = {},
+				   bu_psy_cfg = {};
 
 	if (!machine_is_tosa())
 		return -ENODEV;
@@ -358,15 +366,31 @@ static int tosa_bat_probe(struct platform_device *dev)
 
 	INIT_WORK(&bat_work, tosa_bat_work);
 
-	ret = power_supply_register(&dev->dev, &tosa_bat_main.psy, NULL);
-	if (ret)
+	main_psy_cfg.drv_data = &tosa_bat_main;
+	tosa_bat_main.psy = power_supply_register(&dev->dev,
+						  &tosa_bat_main_desc,
+						  &main_psy_cfg);
+	if (IS_ERR(tosa_bat_main.psy)) {
+		ret = PTR_ERR(tosa_bat_main.psy);
 		goto err_psy_reg_main;
-	ret = power_supply_register(&dev->dev, &tosa_bat_jacket.psy, NULL);
-	if (ret)
+	}
+
+	jacket_psy_cfg.drv_data = &tosa_bat_jacket;
+	tosa_bat_jacket.psy = power_supply_register(&dev->dev,
+						    &tosa_bat_jacket_desc,
+						    &jacket_psy_cfg);
+	if (IS_ERR(tosa_bat_jacket.psy)) {
+		ret = PTR_ERR(tosa_bat_jacket.psy);
 		goto err_psy_reg_jacket;
-	ret = power_supply_register(&dev->dev, &tosa_bat_bu.psy, NULL);
-	if (ret)
+	}
+
+	bu_psy_cfg.drv_data = &tosa_bat_bu;
+	tosa_bat_bu.psy = power_supply_register(&dev->dev, &tosa_bat_bu_desc,
+						&bu_psy_cfg);
+	if (IS_ERR(tosa_bat_bu.psy)) {
+		ret = PTR_ERR(tosa_bat_bu.psy);
 		goto err_psy_reg_bu;
+	}
 
 	ret = request_irq(gpio_to_irq(TOSA_GPIO_BAT0_CRG),
 				tosa_bat_gpio_isr,
@@ -395,11 +419,11 @@ static int tosa_bat_probe(struct platform_device *dev)
 err_req_jacket:
 	free_irq(gpio_to_irq(TOSA_GPIO_BAT0_CRG), &tosa_bat_main);
 err_req_main:
-	power_supply_unregister(&tosa_bat_bu.psy);
+	power_supply_unregister(tosa_bat_bu.psy);
 err_psy_reg_bu:
-	power_supply_unregister(&tosa_bat_jacket.psy);
+	power_supply_unregister(tosa_bat_jacket.psy);
 err_psy_reg_jacket:
-	power_supply_unregister(&tosa_bat_main.psy);
+	power_supply_unregister(tosa_bat_main.psy);
 err_psy_reg_main:
 
 	/* see comment in tosa_bat_remove */
@@ -415,9 +439,9 @@ static int tosa_bat_remove(struct platform_device *dev)
 	free_irq(gpio_to_irq(TOSA_GPIO_BAT1_CRG), &tosa_bat_jacket);
 	free_irq(gpio_to_irq(TOSA_GPIO_BAT0_CRG), &tosa_bat_main);
 
-	power_supply_unregister(&tosa_bat_bu.psy);
-	power_supply_unregister(&tosa_bat_jacket.psy);
-	power_supply_unregister(&tosa_bat_main.psy);
+	power_supply_unregister(tosa_bat_bu.psy);
+	power_supply_unregister(tosa_bat_jacket.psy);
+	power_supply_unregister(tosa_bat_main.psy);
 
 	/*
 	 * Now cancel the bat_work.  We won't get any more schedules,
diff --git a/drivers/power/tps65090-charger.c b/drivers/power/tps65090-charger.c
index 9872c901bd70..dcf9a3ca53d5 100644
--- a/drivers/power/tps65090-charger.c
+++ b/drivers/power/tps65090-charger.c
@@ -43,7 +43,7 @@ struct tps65090_charger {
 	int	irq;
 	struct task_struct	*poll_task;
 	bool			passive_mode;
-	struct power_supply	ac;
+	struct power_supply	*ac;
 	struct tps65090_platform_data *pdata;
 };
 
@@ -135,8 +135,7 @@ static int tps65090_ac_get_property(struct power_supply *psy,
 			enum power_supply_property psp,
 			union power_supply_propval *val)
 {
-	struct tps65090_charger *charger = container_of(psy,
-					struct tps65090_charger, ac);
+	struct tps65090_charger *charger = power_supply_get_drvdata(psy);
 
 	if (psp == POWER_SUPPLY_PROP_ONLINE) {
 		val->intval = charger->ac_online;
@@ -190,7 +189,7 @@ static irqreturn_t tps65090_charger_isr(int irq, void *dev_id)
 	}
 
 	if (charger->prev_ac_online != charger->ac_online)
-		power_supply_changed(&charger->ac);
+		power_supply_changed(charger->ac);
 
 	return IRQ_HANDLED;
 }
@@ -229,6 +228,14 @@ static int tps65090_charger_poll_task(void *data)
 	return 0;
 }
 
+static const struct power_supply_desc tps65090_charger_desc = {
+	.name			= "tps65090-ac",
+	.type			= POWER_SUPPLY_TYPE_MAINS,
+	.get_property		= tps65090_ac_get_property,
+	.properties		= tps65090_ac_props,
+	.num_properties		= ARRAY_SIZE(tps65090_ac_props),
+};
+
 static int tps65090_charger_probe(struct platform_device *pdev)
 {
 	struct tps65090_charger *cdata;
@@ -260,20 +267,16 @@ static int tps65090_charger_probe(struct platform_device *pdev)
 	cdata->dev			= &pdev->dev;
 	cdata->pdata			= pdata;
 
-	cdata->ac.name			= "tps65090-ac";
-	cdata->ac.type			= POWER_SUPPLY_TYPE_MAINS;
-	cdata->ac.get_property		= tps65090_ac_get_property;
-	cdata->ac.properties		= tps65090_ac_props;
-	cdata->ac.num_properties	= ARRAY_SIZE(tps65090_ac_props);
-
 	psy_cfg.supplied_to		= pdata->supplied_to;
 	psy_cfg.num_supplicants		= pdata->num_supplicants;
 	psy_cfg.of_node			= pdev->dev.of_node;
+	psy_cfg.drv_data		= cdata;
 
-	ret = power_supply_register(&pdev->dev, &cdata->ac, &psy_cfg);
-	if (ret) {
+	cdata->ac = power_supply_register(&pdev->dev, &tps65090_charger_desc,
+			&psy_cfg);
+	if (IS_ERR(cdata->ac)) {
 		dev_err(&pdev->dev, "failed: power supply register\n");
-		return ret;
+		return PTR_ERR(cdata->ac);
 	}
 
 	irq = platform_get_irq(pdev, 0);
@@ -303,7 +306,7 @@ static int tps65090_charger_probe(struct platform_device *pdev)
 			goto fail_unregister_supply;
 		}
 		cdata->ac_online = 1;
-		power_supply_changed(&cdata->ac);
+		power_supply_changed(cdata->ac);
 	}
 
 	if (irq != -ENXIO) {
@@ -330,7 +333,7 @@ static int tps65090_charger_probe(struct platform_device *pdev)
 	return 0;
 
 fail_unregister_supply:
-	power_supply_unregister(&cdata->ac);
+	power_supply_unregister(cdata->ac);
 
 	return ret;
 }
@@ -341,7 +344,7 @@ static int tps65090_charger_remove(struct platform_device *pdev)
 
 	if (cdata->irq == -ENXIO)
 		kthread_stop(cdata->poll_task);
-	power_supply_unregister(&cdata->ac);
+	power_supply_unregister(cdata->ac);
 
 	return 0;
 }
diff --git a/drivers/power/twl4030_charger.c b/drivers/power/twl4030_charger.c
index 156f30e64a75..02a522cb7753 100644
--- a/drivers/power/twl4030_charger.c
+++ b/drivers/power/twl4030_charger.c
@@ -87,8 +87,8 @@ MODULE_PARM_DESC(allow_usb, "Allow USB charge drawing default current");
 
 struct twl4030_bci {
 	struct device		*dev;
-	struct power_supply	ac;
-	struct power_supply	usb;
+	struct power_supply	*ac;
+	struct power_supply	*usb;
 	struct usb_phy		*transceiver;
 	struct notifier_block	usb_nb;
 	struct work_struct	work;
@@ -318,8 +318,8 @@ static irqreturn_t twl4030_charger_interrupt(int irq, void *arg)
 	struct twl4030_bci *bci = arg;
 
 	dev_dbg(bci->dev, "CHG_PRES irq\n");
-	power_supply_changed(&bci->ac);
-	power_supply_changed(&bci->usb);
+	power_supply_changed(bci->ac);
+	power_supply_changed(bci->usb);
 
 	return IRQ_HANDLED;
 }
@@ -347,8 +347,8 @@ static irqreturn_t twl4030_bci_interrupt(int irq, void *arg)
 
 	if (irqs1 & (TWL4030_ICHGLOW | TWL4030_ICHGEOC)) {
 		/* charger state change, inform the core */
-		power_supply_changed(&bci->ac);
-		power_supply_changed(&bci->usb);
+		power_supply_changed(bci->ac);
+		power_supply_changed(bci->usb);
 	}
 
 	/* various monitoring events, for now we just log them here */
@@ -463,7 +463,7 @@ static int twl4030_bci_get_property(struct power_supply *psy,
 				    enum power_supply_property psp,
 				    union power_supply_propval *val)
 {
-	struct twl4030_bci *bci = dev_get_drvdata(psy->dev->parent);
+	struct twl4030_bci *bci = dev_get_drvdata(psy->dev.parent);
 	int is_charging;
 	int state;
 	int ret;
@@ -472,7 +472,7 @@ static int twl4030_bci_get_property(struct power_supply *psy,
 	if (state < 0)
 		return state;
 
-	if (psy->type == POWER_SUPPLY_TYPE_USB)
+	if (psy->desc->type == POWER_SUPPLY_TYPE_USB)
 		is_charging = state & TWL4030_MSTATEC_USB;
 	else
 		is_charging = state & TWL4030_MSTATEC_AC;
@@ -488,7 +488,7 @@ static int twl4030_bci_get_property(struct power_supply *psy,
 		/* charging must be active for meaningful result */
 		if (!is_charging)
 			return -ENODATA;
-		if (psy->type == POWER_SUPPLY_TYPE_USB) {
+		if (psy->desc->type == POWER_SUPPLY_TYPE_USB) {
 			ret = twl4030bci_read_adc_val(TWL4030_BCIVBUS);
 			if (ret < 0)
 				return ret;
@@ -558,6 +558,22 @@ twl4030_bci_parse_dt(struct device *dev)
 }
 #endif
 
+static const struct power_supply_desc twl4030_bci_ac_desc = {
+	.name		= "twl4030_ac",
+	.type		= POWER_SUPPLY_TYPE_MAINS,
+	.properties	= twl4030_charger_props,
+	.num_properties	= ARRAY_SIZE(twl4030_charger_props),
+	.get_property	= twl4030_bci_get_property,
+};
+
+static const struct power_supply_desc twl4030_bci_usb_desc = {
+	.name		= "twl4030_usb",
+	.type		= POWER_SUPPLY_TYPE_USB,
+	.properties	= twl4030_charger_props,
+	.num_properties	= ARRAY_SIZE(twl4030_charger_props),
+	.get_property	= twl4030_bci_get_property,
+};
+
 static int __init twl4030_bci_probe(struct platform_device *pdev)
 {
 	struct twl4030_bci *bci;
@@ -584,28 +600,21 @@ static int __init twl4030_bci_probe(struct platform_device *pdev)
 	}
 
 	platform_set_drvdata(pdev, bci);
-	bci->ac.name = "twl4030_ac";
-	bci->ac.type = POWER_SUPPLY_TYPE_MAINS;
-	bci->ac.properties = twl4030_charger_props;
-	bci->ac.num_properties = ARRAY_SIZE(twl4030_charger_props);
-	bci->ac.get_property = twl4030_bci_get_property;
 
-	ret = power_supply_register(&pdev->dev, &bci->ac, NULL);
-	if (ret) {
+	bci->ac = power_supply_register(&pdev->dev, &twl4030_bci_ac_desc,
+					NULL);
+	if (IS_ERR(bci->ac)) {
+		ret = PTR_ERR(bci->ac);
 		dev_err(&pdev->dev, "failed to register ac: %d\n", ret);
 		goto fail_register_ac;
 	}
 
-	bci->usb.name = "twl4030_usb";
-	bci->usb.type = POWER_SUPPLY_TYPE_USB;
-	bci->usb.properties = twl4030_charger_props;
-	bci->usb.num_properties = ARRAY_SIZE(twl4030_charger_props);
-	bci->usb.get_property = twl4030_bci_get_property;
-
 	bci->usb_reg = regulator_get(bci->dev, "bci3v1");
 
-	ret = power_supply_register(&pdev->dev, &bci->usb, NULL);
-	if (ret) {
+	bci->usb = power_supply_register(&pdev->dev, &twl4030_bci_usb_desc,
+					 NULL);
+	if (IS_ERR(bci->usb)) {
+		ret = PTR_ERR(bci->usb);
 		dev_err(&pdev->dev, "failed to register usb: %d\n", ret);
 		goto fail_register_usb;
 	}
@@ -670,9 +679,9 @@ fail_unmask_interrupts:
 fail_bci_irq:
 	free_irq(bci->irq_chg, bci);
 fail_chg_irq:
-	power_supply_unregister(&bci->usb);
+	power_supply_unregister(bci->usb);
 fail_register_usb:
-	power_supply_unregister(&bci->ac);
+	power_supply_unregister(bci->ac);
 fail_register_ac:
 fail_no_battery:
 	kfree(bci);
@@ -700,8 +709,8 @@ static int __exit twl4030_bci_remove(struct platform_device *pdev)
 	}
 	free_irq(bci->irq_bci, bci);
 	free_irq(bci->irq_chg, bci);
-	power_supply_unregister(&bci->usb);
-	power_supply_unregister(&bci->ac);
+	power_supply_unregister(bci->usb);
+	power_supply_unregister(bci->ac);
 	kfree(bci);
 
 	return 0;
diff --git a/drivers/power/twl4030_madc_battery.c b/drivers/power/twl4030_madc_battery.c
index d065460c1cb3..36d3a0e229fa 100644
--- a/drivers/power/twl4030_madc_battery.c
+++ b/drivers/power/twl4030_madc_battery.c
@@ -21,7 +21,7 @@
 #include <linux/power/twl4030_madc_battery.h>
 
 struct twl4030_madc_battery {
-	struct power_supply psy;
+	struct power_supply *psy;
 	struct twl4030_madc_bat_platform_data *pdata;
 };
 
@@ -113,8 +113,7 @@ static int twl4030_madc_bat_get_property(struct power_supply *psy,
 					enum power_supply_property psp,
 					union power_supply_propval *val)
 {
-	struct twl4030_madc_battery *bat = container_of(psy,
-					struct twl4030_madc_battery, psy);
+	struct twl4030_madc_battery *bat = power_supply_get_drvdata(psy);
 
 	switch (psp) {
 	case POWER_SUPPLY_PROP_STATUS:
@@ -176,12 +175,19 @@ static int twl4030_madc_bat_get_property(struct power_supply *psy,
 
 static void twl4030_madc_bat_ext_changed(struct power_supply *psy)
 {
-	struct twl4030_madc_battery *bat = container_of(psy,
-					struct twl4030_madc_battery, psy);
-
-	power_supply_changed(&bat->psy);
+	power_supply_changed(psy);
 }
 
+static const struct power_supply_desc twl4030_madc_bat_desc = {
+	.name			= "twl4030_battery",
+	.type			= POWER_SUPPLY_TYPE_BATTERY,
+	.properties		= twl4030_madc_bat_props,
+	.num_properties		= ARRAY_SIZE(twl4030_madc_bat_props),
+	.get_property		= twl4030_madc_bat_get_property,
+	.external_power_changed	= twl4030_madc_bat_ext_changed,
+
+};
+
 static int twl4030_cmp(const void *a, const void *b)
 {
 	return ((struct twl4030_madc_bat_calibration *)b)->voltage -
@@ -192,21 +198,13 @@ static int twl4030_madc_battery_probe(struct platform_device *pdev)
 {
 	struct twl4030_madc_battery *twl4030_madc_bat;
 	struct twl4030_madc_bat_platform_data *pdata = pdev->dev.platform_data;
+	struct power_supply_config psy_cfg = {};
 	int ret = 0;
 
 	twl4030_madc_bat = kzalloc(sizeof(*twl4030_madc_bat), GFP_KERNEL);
 	if (!twl4030_madc_bat)
 		return -ENOMEM;
 
-	twl4030_madc_bat->psy.name = "twl4030_battery";
-	twl4030_madc_bat->psy.type = POWER_SUPPLY_TYPE_BATTERY;
-	twl4030_madc_bat->psy.properties = twl4030_madc_bat_props;
-	twl4030_madc_bat->psy.num_properties =
-					ARRAY_SIZE(twl4030_madc_bat_props);
-	twl4030_madc_bat->psy.get_property = twl4030_madc_bat_get_property;
-	twl4030_madc_bat->psy.external_power_changed =
-					twl4030_madc_bat_ext_changed;
-
 	/* sort charging and discharging calibration data */
 	sort(pdata->charging, pdata->charging_size,
 		sizeof(struct twl4030_madc_bat_calibration),
@@ -217,9 +215,14 @@ static int twl4030_madc_battery_probe(struct platform_device *pdev)
 
 	twl4030_madc_bat->pdata = pdata;
 	platform_set_drvdata(pdev, twl4030_madc_bat);
-	ret = power_supply_register(&pdev->dev, &twl4030_madc_bat->psy, NULL);
-	if (ret < 0)
+	psy_cfg.drv_data = twl4030_madc_bat;
+	twl4030_madc_bat->psy = power_supply_register(&pdev->dev,
+						      &twl4030_madc_bat_desc,
+						      &psy_cfg);
+	if (IS_ERR(twl4030_madc_bat->psy)) {
+		ret = PTR_ERR(twl4030_madc_bat->psy);
 		kfree(twl4030_madc_bat);
+	}
 
 	return ret;
 }
@@ -228,7 +231,7 @@ static int twl4030_madc_battery_remove(struct platform_device *pdev)
 {
 	struct twl4030_madc_battery *bat = platform_get_drvdata(pdev);
 
-	power_supply_unregister(&bat->psy);
+	power_supply_unregister(bat->psy);
 	kfree(bat);
 
 	return 0;
diff --git a/drivers/power/wm831x_backup.c b/drivers/power/wm831x_backup.c
index 60ae871148b0..2e33109ca8c7 100644
--- a/drivers/power/wm831x_backup.c
+++ b/drivers/power/wm831x_backup.c
@@ -21,7 +21,8 @@
 
 struct wm831x_backup {
 	struct wm831x *wm831x;
-	struct power_supply backup;
+	struct power_supply *backup;
+	struct power_supply_desc backup_desc;
 	char name[20];
 };
 
@@ -115,7 +116,7 @@ static int wm831x_backup_get_prop(struct power_supply *psy,
 				  enum power_supply_property psp,
 				  union power_supply_propval *val)
 {
-	struct wm831x_backup *devdata = dev_get_drvdata(psy->dev->parent);
+	struct wm831x_backup *devdata = dev_get_drvdata(psy->dev.parent);
 	struct wm831x *wm831x = devdata->wm831x;
 	int ret = 0;
 
@@ -166,8 +167,6 @@ static int wm831x_backup_probe(struct platform_device *pdev)
 	struct wm831x *wm831x = dev_get_drvdata(pdev->dev.parent);
 	struct wm831x_pdata *wm831x_pdata = wm831x->dev->platform_data;
 	struct wm831x_backup *devdata;
-	struct power_supply *backup;
-	int ret;
 
 	devdata = devm_kzalloc(&pdev->dev, sizeof(struct wm831x_backup),
 				GFP_KERNEL);
@@ -177,8 +176,6 @@ static int wm831x_backup_probe(struct platform_device *pdev)
 	devdata->wm831x = wm831x;
 	platform_set_drvdata(pdev, devdata);
 
-	backup = &devdata->backup;
-
 	/* We ignore configuration failures since we can still read
 	 * back the status without enabling the charger (which may
 	 * already be enabled anyway).
@@ -192,21 +189,22 @@ static int wm831x_backup_probe(struct platform_device *pdev)
 		snprintf(devdata->name, sizeof(devdata->name),
 			 "wm831x-backup");
 
-	backup->name = devdata->name;
-	backup->type = POWER_SUPPLY_TYPE_BATTERY;
-	backup->properties = wm831x_backup_props;
-	backup->num_properties = ARRAY_SIZE(wm831x_backup_props);
-	backup->get_property = wm831x_backup_get_prop;
-	ret = power_supply_register(&pdev->dev, backup, NULL);
+	devdata->backup_desc.name = devdata->name;
+	devdata->backup_desc.type = POWER_SUPPLY_TYPE_BATTERY;
+	devdata->backup_desc.properties = wm831x_backup_props;
+	devdata->backup_desc.num_properties = ARRAY_SIZE(wm831x_backup_props);
+	devdata->backup_desc.get_property = wm831x_backup_get_prop;
+	devdata->backup = power_supply_register(&pdev->dev,
+						&devdata->backup_desc, NULL);
 
-	return ret;
+	return PTR_ERR_OR_ZERO(devdata->backup);
 }
 
 static int wm831x_backup_remove(struct platform_device *pdev)
 {
 	struct wm831x_backup *devdata = platform_get_drvdata(pdev);
 
-	power_supply_unregister(&devdata->backup);
+	power_supply_unregister(devdata->backup);
 
 	return 0;
 }
diff --git a/drivers/power/wm831x_power.c b/drivers/power/wm831x_power.c
index a132aae6225d..0161bdabd5a3 100644
--- a/drivers/power/wm831x_power.c
+++ b/drivers/power/wm831x_power.c
@@ -21,9 +21,12 @@
 
 struct wm831x_power {
 	struct wm831x *wm831x;
-	struct power_supply wall;
-	struct power_supply usb;
-	struct power_supply battery;
+	struct power_supply *wall;
+	struct power_supply *usb;
+	struct power_supply *battery;
+	struct power_supply_desc wall_desc;
+	struct power_supply_desc usb_desc;
+	struct power_supply_desc battery_desc;
 	char wall_name[20];
 	char usb_name[20];
 	char battery_name[20];
@@ -67,7 +70,7 @@ static int wm831x_wall_get_prop(struct power_supply *psy,
 				enum power_supply_property psp,
 				union power_supply_propval *val)
 {
-	struct wm831x_power *wm831x_power = dev_get_drvdata(psy->dev->parent);
+	struct wm831x_power *wm831x_power = dev_get_drvdata(psy->dev.parent);
 	struct wm831x *wm831x = wm831x_power->wm831x;
 	int ret = 0;
 
@@ -98,7 +101,7 @@ static int wm831x_usb_get_prop(struct power_supply *psy,
 			       enum power_supply_property psp,
 			       union power_supply_propval *val)
 {
-	struct wm831x_power *wm831x_power = dev_get_drvdata(psy->dev->parent);
+	struct wm831x_power *wm831x_power = dev_get_drvdata(psy->dev.parent);
 	struct wm831x *wm831x = wm831x_power->wm831x;
 	int ret = 0;
 
@@ -393,7 +396,7 @@ static int wm831x_bat_get_prop(struct power_supply *psy,
 			       enum power_supply_property psp,
 			       union power_supply_propval *val)
 {
-	struct wm831x_power *wm831x_power = dev_get_drvdata(psy->dev->parent);
+	struct wm831x_power *wm831x_power = dev_get_drvdata(psy->dev.parent);
 	struct wm831x *wm831x = wm831x_power->wm831x;
 	int ret = 0;
 
@@ -451,7 +454,7 @@ static irqreturn_t wm831x_bat_irq(int irq, void *data)
 	/* The battery charger is autonomous so we don't need to do
 	 * anything except kick user space */
 	if (wm831x_power->have_battery)
-		power_supply_changed(&wm831x_power->battery);
+		power_supply_changed(wm831x_power->battery);
 
 	return IRQ_HANDLED;
 }
@@ -482,9 +485,9 @@ static irqreturn_t wm831x_pwr_src_irq(int irq, void *data)
 
 	/* Just notify for everything - little harm in overnotifying. */
 	if (wm831x_power->have_battery)
-		power_supply_changed(&wm831x_power->battery);
-	power_supply_changed(&wm831x_power->usb);
-	power_supply_changed(&wm831x_power->wall);
+		power_supply_changed(wm831x_power->battery);
+	power_supply_changed(wm831x_power->usb);
+	power_supply_changed(wm831x_power->wall);
 
 	return IRQ_HANDLED;
 }
@@ -494,9 +497,6 @@ static int wm831x_power_probe(struct platform_device *pdev)
 	struct wm831x *wm831x = dev_get_drvdata(pdev->dev.parent);
 	struct wm831x_pdata *wm831x_pdata = wm831x->dev->platform_data;
 	struct wm831x_power *power;
-	struct power_supply *usb;
-	struct power_supply *battery;
-	struct power_supply *wall;
 	int ret, irq, i;
 
 	power = kzalloc(sizeof(struct wm831x_power), GFP_KERNEL);
@@ -506,10 +506,6 @@ static int wm831x_power_probe(struct platform_device *pdev)
 	power->wm831x = wm831x;
 	platform_set_drvdata(pdev, power);
 
-	usb = &power->usb;
-	battery = &power->battery;
-	wall = &power->wall;
-
 	if (wm831x_pdata && wm831x_pdata->wm831x_num) {
 		snprintf(power->wall_name, sizeof(power->wall_name),
 			 "wm831x-wall.%d", wm831x_pdata->wm831x_num);
@@ -531,23 +527,28 @@ static int wm831x_power_probe(struct platform_device *pdev)
 	 */
 	wm831x_config_battery(wm831x);
 
-	wall->name = power->wall_name;
-	wall->type = POWER_SUPPLY_TYPE_MAINS;
-	wall->properties = wm831x_wall_props;
-	wall->num_properties = ARRAY_SIZE(wm831x_wall_props);
-	wall->get_property = wm831x_wall_get_prop;
-	ret = power_supply_register(&pdev->dev, wall, NULL);
-	if (ret)
+	power->wall_desc.name = power->wall_name;
+	power->wall_desc.type = POWER_SUPPLY_TYPE_MAINS;
+	power->wall_desc.properties = wm831x_wall_props;
+	power->wall_desc.num_properties = ARRAY_SIZE(wm831x_wall_props);
+	power->wall_desc.get_property = wm831x_wall_get_prop;
+	power->wall = power_supply_register(&pdev->dev, &power->wall_desc,
+					    NULL);
+	if (IS_ERR(power->wall)) {
+		ret = PTR_ERR(power->wall);
 		goto err_kmalloc;
+	}
 
-	usb->name = power->usb_name,
-	usb->type = POWER_SUPPLY_TYPE_USB;
-	usb->properties = wm831x_usb_props;
-	usb->num_properties = ARRAY_SIZE(wm831x_usb_props);
-	usb->get_property = wm831x_usb_get_prop;
-	ret = power_supply_register(&pdev->dev, usb, NULL);
-	if (ret)
+	power->usb_desc.name = power->usb_name,
+	power->usb_desc.type = POWER_SUPPLY_TYPE_USB;
+	power->usb_desc.properties = wm831x_usb_props;
+	power->usb_desc.num_properties = ARRAY_SIZE(wm831x_usb_props);
+	power->usb_desc.get_property = wm831x_usb_get_prop;
+	power->usb = power_supply_register(&pdev->dev, &power->usb_desc, NULL);
+	if (IS_ERR(power->usb)) {
+		ret = PTR_ERR(power->usb);
 		goto err_wall;
+	}
 
 	ret = wm831x_reg_read(wm831x, WM831X_CHARGER_CONTROL_1);
 	if (ret < 0)
@@ -555,14 +556,18 @@ static int wm831x_power_probe(struct platform_device *pdev)
 	power->have_battery = ret & WM831X_CHG_ENA;
 
 	if (power->have_battery) {
-		    battery->name = power->battery_name;
-		    battery->properties = wm831x_bat_props;
-		    battery->num_properties = ARRAY_SIZE(wm831x_bat_props);
-		    battery->get_property = wm831x_bat_get_prop;
-		    battery->use_for_apm = 1;
-		    ret = power_supply_register(&pdev->dev, battery, NULL);
-		    if (ret)
-			    goto err_usb;
+		power->battery_desc.name = power->battery_name;
+		power->battery_desc.properties = wm831x_bat_props;
+		power->battery_desc.num_properties = ARRAY_SIZE(wm831x_bat_props);
+		power->battery_desc.get_property = wm831x_bat_get_prop;
+		power->battery_desc.use_for_apm = 1;
+		power->battery = power_supply_register(&pdev->dev,
+						       &power->battery_desc,
+						       NULL);
+		if (IS_ERR(power->battery)) {
+			ret = PTR_ERR(power->battery);
+			goto err_usb;
+		}
 	}
 
 	irq = wm831x_irq(wm831x, platform_get_irq_byname(pdev, "SYSLO"));
@@ -615,11 +620,11 @@ err_syslo:
 	free_irq(irq, power);
 err_battery:
 	if (power->have_battery)
-		power_supply_unregister(battery);
+		power_supply_unregister(power->battery);
 err_usb:
-	power_supply_unregister(usb);
+	power_supply_unregister(power->usb);
 err_wall:
-	power_supply_unregister(wall);
+	power_supply_unregister(power->wall);
 err_kmalloc:
 	kfree(power);
 	return ret;
@@ -645,9 +650,9 @@ static int wm831x_power_remove(struct platform_device *pdev)
 	free_irq(irq, wm831x_power);
 
 	if (wm831x_power->have_battery)
-		power_supply_unregister(&wm831x_power->battery);
-	power_supply_unregister(&wm831x_power->wall);
-	power_supply_unregister(&wm831x_power->usb);
+		power_supply_unregister(wm831x_power->battery);
+	power_supply_unregister(wm831x_power->wall);
+	power_supply_unregister(wm831x_power->usb);
 	kfree(wm831x_power);
 	return 0;
 }
diff --git a/drivers/power/wm8350_power.c b/drivers/power/wm8350_power.c
index 261ceca561d5..5c5880664e09 100644
--- a/drivers/power/wm8350_power.c
+++ b/drivers/power/wm8350_power.c
@@ -196,14 +196,14 @@ static irqreturn_t wm8350_charger_handler(int irq, void *data)
 		break;
 	case WM8350_IRQ_CHG_TO:
 		dev_err(wm8350->dev, "charger timeout\n");
-		power_supply_changed(&power->battery);
+		power_supply_changed(power->battery);
 		break;
 
 	case WM8350_IRQ_CHG_BAT_HOT:
 	case WM8350_IRQ_CHG_BAT_COLD:
 	case WM8350_IRQ_CHG_START:
 	case WM8350_IRQ_CHG_END:
-		power_supply_changed(&power->battery);
+		power_supply_changed(power->battery);
 		break;
 
 	case WM8350_IRQ_CHG_FAST_RDY:
@@ -231,9 +231,9 @@ static irqreturn_t wm8350_charger_handler(int irq, void *data)
 	case WM8350_IRQ_EXT_WALL_FB:
 		wm8350_charger_config(wm8350, policy);
 	case WM8350_IRQ_EXT_BAT_FB:   /* Fall through */
-		power_supply_changed(&power->battery);
-		power_supply_changed(&power->usb);
-		power_supply_changed(&power->ac);
+		power_supply_changed(power->battery);
+		power_supply_changed(power->usb);
+		power_supply_changed(power->ac);
 		break;
 
 	default:
@@ -250,7 +250,7 @@ static int wm8350_ac_get_prop(struct power_supply *psy,
 			      enum power_supply_property psp,
 			      union power_supply_propval *val)
 {
-	struct wm8350 *wm8350 = dev_get_drvdata(psy->dev->parent);
+	struct wm8350 *wm8350 = dev_get_drvdata(psy->dev.parent);
 	int ret = 0;
 
 	switch (psp) {
@@ -280,7 +280,7 @@ static int wm8350_usb_get_prop(struct power_supply *psy,
 			       enum power_supply_property psp,
 			       union power_supply_propval *val)
 {
-	struct wm8350 *wm8350 = dev_get_drvdata(psy->dev->parent);
+	struct wm8350 *wm8350 = dev_get_drvdata(psy->dev.parent);
 	int ret = 0;
 
 	switch (psp) {
@@ -346,7 +346,7 @@ static int wm8350_bat_get_property(struct power_supply *psy,
 				   enum power_supply_property psp,
 				   union power_supply_propval *val)
 {
-	struct wm8350 *wm8350 = dev_get_drvdata(psy->dev->parent);
+	struct wm8350 *wm8350 = dev_get_drvdata(psy->dev.parent);
 	int ret = 0;
 
 	switch (psp) {
@@ -382,6 +382,30 @@ static enum power_supply_property wm8350_bat_props[] = {
 	POWER_SUPPLY_PROP_CHARGE_TYPE,
 };
 
+static const struct power_supply_desc wm8350_ac_desc = {
+	.name		= "wm8350-ac",
+	.type		= POWER_SUPPLY_TYPE_MAINS,
+	.properties	= wm8350_ac_props,
+	.num_properties	= ARRAY_SIZE(wm8350_ac_props),
+	.get_property	= wm8350_ac_get_prop,
+};
+
+static const struct power_supply_desc wm8350_battery_desc = {
+	.name		= "wm8350-battery",
+	.properties	= wm8350_bat_props,
+	.num_properties	= ARRAY_SIZE(wm8350_bat_props),
+	.get_property	= wm8350_bat_get_property,
+	.use_for_apm	= 1,
+};
+
+static const struct power_supply_desc wm8350_usb_desc = {
+	.name		= "wm8350-usb",
+	.type		= POWER_SUPPLY_TYPE_USB,
+	.properties	= wm8350_usb_props,
+	.num_properties	= ARRAY_SIZE(wm8350_usb_props),
+	.get_property	= wm8350_usb_get_prop,
+};
+
 /*********************************************************************
  *		Initialisation
  *********************************************************************/
@@ -447,37 +471,24 @@ static int wm8350_power_probe(struct platform_device *pdev)
 	struct wm8350 *wm8350 = platform_get_drvdata(pdev);
 	struct wm8350_power *power = &wm8350->power;
 	struct wm8350_charger_policy *policy = power->policy;
-	struct power_supply *usb = &power->usb;
-	struct power_supply *battery = &power->battery;
-	struct power_supply *ac = &power->ac;
 	int ret;
 
-	ac->name = "wm8350-ac";
-	ac->type = POWER_SUPPLY_TYPE_MAINS;
-	ac->properties = wm8350_ac_props;
-	ac->num_properties = ARRAY_SIZE(wm8350_ac_props);
-	ac->get_property = wm8350_ac_get_prop;
-	ret = power_supply_register(&pdev->dev, ac, NULL);
-	if (ret)
-		return ret;
-
-	battery->name = "wm8350-battery";
-	battery->properties = wm8350_bat_props;
-	battery->num_properties = ARRAY_SIZE(wm8350_bat_props);
-	battery->get_property = wm8350_bat_get_property;
-	battery->use_for_apm = 1;
-	ret = power_supply_register(&pdev->dev, battery, NULL);
-	if (ret)
+	power->ac = power_supply_register(&pdev->dev, &wm8350_ac_desc, NULL);
+	if (IS_ERR(power->ac))
+		return PTR_ERR(power->ac);
+
+	power->battery = power_supply_register(&pdev->dev, &wm8350_battery_desc,
+					       NULL);
+	if (IS_ERR(power->battery)) {
+		ret = PTR_ERR(power->battery);
 		goto battery_failed;
+	}
 
-	usb->name = "wm8350-usb",
-	usb->type = POWER_SUPPLY_TYPE_USB;
-	usb->properties = wm8350_usb_props;
-	usb->num_properties = ARRAY_SIZE(wm8350_usb_props);
-	usb->get_property = wm8350_usb_get_prop;
-	ret = power_supply_register(&pdev->dev, usb, NULL);
-	if (ret)
+	power->usb = power_supply_register(&pdev->dev, &wm8350_usb_desc, NULL);
+	if (IS_ERR(power->usb)) {
+		ret = PTR_ERR(power->usb);
 		goto usb_failed;
+	}
 
 	ret = device_create_file(&pdev->dev, &dev_attr_charger_state);
 	if (ret < 0)
@@ -494,9 +505,9 @@ static int wm8350_power_probe(struct platform_device *pdev)
 	return ret;
 
 usb_failed:
-	power_supply_unregister(battery);
+	power_supply_unregister(power->battery);
 battery_failed:
-	power_supply_unregister(ac);
+	power_supply_unregister(power->ac);
 
 	return ret;
 }
@@ -508,9 +519,9 @@ static int wm8350_power_remove(struct platform_device *pdev)
 
 	free_charger_irq(wm8350);
 	device_remove_file(&pdev->dev, &dev_attr_charger_state);
-	power_supply_unregister(&power->battery);
-	power_supply_unregister(&power->ac);
-	power_supply_unregister(&power->usb);
+	power_supply_unregister(power->battery);
+	power_supply_unregister(power->ac);
+	power_supply_unregister(power->usb);
 	return 0;
 }
 
diff --git a/drivers/power/wm97xx_battery.c b/drivers/power/wm97xx_battery.c
index e81e917bd9d0..c2f09ed35050 100644
--- a/drivers/power/wm97xx_battery.c
+++ b/drivers/power/wm97xx_battery.c
@@ -32,20 +32,20 @@ static enum power_supply_property *prop;
 
 static unsigned long wm97xx_read_bat(struct power_supply *bat_ps)
 {
-	struct wm97xx_pdata *wmdata = bat_ps->dev->parent->platform_data;
+	struct wm97xx_pdata *wmdata = bat_ps->dev.parent->platform_data;
 	struct wm97xx_batt_pdata *pdata = wmdata->batt_pdata;
 
-	return wm97xx_read_aux_adc(dev_get_drvdata(bat_ps->dev->parent),
+	return wm97xx_read_aux_adc(dev_get_drvdata(bat_ps->dev.parent),
 					pdata->batt_aux) * pdata->batt_mult /
 					pdata->batt_div;
 }
 
 static unsigned long wm97xx_read_temp(struct power_supply *bat_ps)
 {
-	struct wm97xx_pdata *wmdata = bat_ps->dev->parent->platform_data;
+	struct wm97xx_pdata *wmdata = bat_ps->dev.parent->platform_data;
 	struct wm97xx_batt_pdata *pdata = wmdata->batt_pdata;
 
-	return wm97xx_read_aux_adc(dev_get_drvdata(bat_ps->dev->parent),
+	return wm97xx_read_aux_adc(dev_get_drvdata(bat_ps->dev.parent),
 					pdata->temp_aux) * pdata->temp_mult /
 					pdata->temp_div;
 }
@@ -54,7 +54,7 @@ static int wm97xx_bat_get_property(struct power_supply *bat_ps,
 			    enum power_supply_property psp,
 			    union power_supply_propval *val)
 {
-	struct wm97xx_pdata *wmdata = bat_ps->dev->parent->platform_data;
+	struct wm97xx_pdata *wmdata = bat_ps->dev.parent->platform_data;
 	struct wm97xx_batt_pdata *pdata = wmdata->batt_pdata;
 
 	switch (psp) {
@@ -105,7 +105,7 @@ static void wm97xx_bat_external_power_changed(struct power_supply *bat_ps)
 static void wm97xx_bat_update(struct power_supply *bat_ps)
 {
 	int old_status = bat_status;
-	struct wm97xx_pdata *wmdata = bat_ps->dev->parent->platform_data;
+	struct wm97xx_pdata *wmdata = bat_ps->dev.parent->platform_data;
 	struct wm97xx_batt_pdata *pdata = wmdata->batt_pdata;
 
 	mutex_lock(&work_lock);
@@ -117,7 +117,7 @@ static void wm97xx_bat_update(struct power_supply *bat_ps)
 			POWER_SUPPLY_STATUS_UNKNOWN;
 
 	if (old_status != bat_status) {
-		pr_debug("%s: %i -> %i\n", bat_ps->name, old_status,
+		pr_debug("%s: %i -> %i\n", bat_ps->desc->name, old_status,
 					bat_status);
 		power_supply_changed(bat_ps);
 	}
@@ -125,7 +125,8 @@ static void wm97xx_bat_update(struct power_supply *bat_ps)
 	mutex_unlock(&work_lock);
 }
 
-static struct power_supply bat_ps = {
+static struct power_supply *bat_psy;
+static struct power_supply_desc bat_psy_desc = {
 	.type			= POWER_SUPPLY_TYPE_BATTERY,
 	.get_property		= wm97xx_bat_get_property,
 	.external_power_changed = wm97xx_bat_external_power_changed,
@@ -134,7 +135,7 @@ static struct power_supply bat_ps = {
 
 static void wm97xx_bat_work(struct work_struct *work)
 {
-	wm97xx_bat_update(&bat_ps);
+	wm97xx_bat_update(bat_psy);
 }
 
 static irqreturn_t wm97xx_chrg_irq(int irq, void *data)
@@ -237,18 +238,20 @@ static int wm97xx_bat_probe(struct platform_device *dev)
 		dev_info(&dev->dev, "Please consider setting proper battery "
 				"name in platform definition file, falling "
 				"back to name \"wm97xx-batt\"\n");
-		bat_ps.name = "wm97xx-batt";
+		bat_psy_desc.name = "wm97xx-batt";
 	} else
-		bat_ps.name = pdata->batt_name;
+		bat_psy_desc.name = pdata->batt_name;
 
-	bat_ps.properties = prop;
-	bat_ps.num_properties = props;
+	bat_psy_desc.properties = prop;
+	bat_psy_desc.num_properties = props;
 
-	ret = power_supply_register(&dev->dev, &bat_ps, NULL);
-	if (!ret)
+	bat_psy = power_supply_register(&dev->dev, &bat_psy_desc, NULL);
+	if (!IS_ERR(bat_psy)) {
 		schedule_work(&bat_work);
-	else
+	} else {
+		ret = PTR_ERR(bat_psy);
 		goto err4;
+	}
 
 	return 0;
 err4:
@@ -273,7 +276,7 @@ static int wm97xx_bat_remove(struct platform_device *dev)
 		gpio_free(pdata->charge_gpio);
 	}
 	cancel_work_sync(&bat_work);
-	power_supply_unregister(&bat_ps);
+	power_supply_unregister(bat_psy);
 	kfree(prop);
 	return 0;
 }
diff --git a/drivers/power/z2_battery.c b/drivers/power/z2_battery.c
index df22364212dd..b201e3facf73 100644
--- a/drivers/power/z2_battery.c
+++ b/drivers/power/z2_battery.c
@@ -21,12 +21,13 @@
 #define	Z2_DEFAULT_NAME	"Z2"
 
 struct z2_charger {
-	struct z2_battery_info	*info;
-	int			bat_status;
-	struct i2c_client	*client;
-	struct power_supply	batt_ps;
-	struct mutex		work_lock;
-	struct work_struct	bat_work;
+	struct z2_battery_info		*info;
+	int				bat_status;
+	struct i2c_client		*client;
+	struct power_supply		*batt_ps;
+	struct power_supply_desc	batt_ps_desc;
+	struct mutex			work_lock;
+	struct work_struct		bat_work;
 };
 
 static unsigned long z2_read_bat(struct z2_charger *charger)
@@ -44,8 +45,7 @@ static int z2_batt_get_property(struct power_supply *batt_ps,
 			    enum power_supply_property psp,
 			    union power_supply_propval *val)
 {
-	struct z2_charger *charger = container_of(batt_ps, struct z2_charger,
-						batt_ps);
+	struct z2_charger *charger = power_supply_get_drvdata(batt_ps);
 	struct z2_battery_info *info = charger->info;
 
 	switch (psp) {
@@ -85,8 +85,8 @@ static int z2_batt_get_property(struct power_supply *batt_ps,
 
 static void z2_batt_ext_power_changed(struct power_supply *batt_ps)
 {
-	struct z2_charger *charger = container_of(batt_ps, struct z2_charger,
-						batt_ps);
+	struct z2_charger *charger = power_supply_get_drvdata(batt_ps);
+
 	schedule_work(&charger->bat_work);
 }
 
@@ -106,9 +106,10 @@ static void z2_batt_update(struct z2_charger *charger)
 		POWER_SUPPLY_STATUS_UNKNOWN;
 
 	if (old_status != charger->bat_status) {
-		pr_debug("%s: %i -> %i\n", charger->batt_ps.name, old_status,
-			charger->bat_status);
-		power_supply_changed(&charger->batt_ps);
+		pr_debug("%s: %i -> %i\n", charger->batt_ps->desc->name,
+				old_status,
+				charger->bat_status);
+		power_supply_changed(charger->batt_ps);
 	}
 
 	mutex_unlock(&charger->work_lock);
@@ -166,16 +167,17 @@ static int z2_batt_ps_init(struct z2_charger *charger, int props)
 				"Please consider setting proper battery "
 				"name in platform definition file, falling "
 				"back to name \" Z2_DEFAULT_NAME \"\n");
-		charger->batt_ps.name = Z2_DEFAULT_NAME;
+		charger->batt_ps_desc.name = Z2_DEFAULT_NAME;
 	} else
-		charger->batt_ps.name = info->batt_name;
+		charger->batt_ps_desc.name = info->batt_name;
 
-	charger->batt_ps.properties		= prop;
-	charger->batt_ps.num_properties		= props;
-	charger->batt_ps.type			= POWER_SUPPLY_TYPE_BATTERY;
-	charger->batt_ps.get_property		= z2_batt_get_property;
-	charger->batt_ps.external_power_changed	= z2_batt_ext_power_changed;
-	charger->batt_ps.use_for_apm		= 1;
+	charger->batt_ps_desc.properties	= prop;
+	charger->batt_ps_desc.num_properties	= props;
+	charger->batt_ps_desc.type		= POWER_SUPPLY_TYPE_BATTERY;
+	charger->batt_ps_desc.get_property	= z2_batt_get_property;
+	charger->batt_ps_desc.external_power_changed =
+						z2_batt_ext_power_changed;
+	charger->batt_ps_desc.use_for_apm	= 1;
 
 	return 0;
 }
@@ -187,6 +189,7 @@ static int z2_batt_probe(struct i2c_client *client,
 	int props = 1;	/* POWER_SUPPLY_PROP_PRESENT */
 	struct z2_charger *charger;
 	struct z2_battery_info *info = client->dev.platform_data;
+	struct power_supply_config psy_cfg = {};
 
 	if (info == NULL) {
 		dev_err(&client->dev,
@@ -203,6 +206,7 @@ static int z2_batt_probe(struct i2c_client *client,
 	charger->info = info;
 	charger->client = client;
 	i2c_set_clientdata(client, charger);
+	psy_cfg.drv_data = charger;
 
 	mutex_init(&charger->work_lock);
 
@@ -230,16 +234,20 @@ static int z2_batt_probe(struct i2c_client *client,
 
 	INIT_WORK(&charger->bat_work, z2_batt_work);
 
-	ret = power_supply_register(&client->dev, &charger->batt_ps, NULL);
-	if (ret)
+	charger->batt_ps = power_supply_register(&client->dev,
+						 &charger->batt_ps_desc,
+						 &psy_cfg);
+	if (IS_ERR(charger->batt_ps)) {
+		ret = PTR_ERR(charger->batt_ps);
 		goto err4;
+	}
 
 	schedule_work(&charger->bat_work);
 
 	return 0;
 
 err4:
-	kfree(charger->batt_ps.properties);
+	kfree(charger->batt_ps_desc.properties);
 err3:
 	if (info->charge_gpio >= 0 && gpio_is_valid(info->charge_gpio))
 		free_irq(gpio_to_irq(info->charge_gpio), charger);
@@ -257,9 +265,9 @@ static int z2_batt_remove(struct i2c_client *client)
 	struct z2_battery_info *info = charger->info;
 
 	cancel_work_sync(&charger->bat_work);
-	power_supply_unregister(&charger->batt_ps);
+	power_supply_unregister(charger->batt_ps);
 
-	kfree(charger->batt_ps.properties);
+	kfree(charger->batt_ps_desc.properties);
 	if (info->charge_gpio >= 0 && gpio_is_valid(info->charge_gpio)) {
 		free_irq(gpio_to_irq(info->charge_gpio), charger);
 		gpio_free(info->charge_gpio);
diff --git a/drivers/staging/nvec/nvec_power.c b/drivers/staging/nvec/nvec_power.c
index 4bfa84672818..30b66c3c9b73 100644
--- a/drivers/staging/nvec/nvec_power.c
+++ b/drivers/staging/nvec/nvec_power.c
@@ -82,8 +82,8 @@ struct bat_response {
 	};
 };
 
-static struct power_supply nvec_bat_psy;
-static struct power_supply nvec_psy;
+static struct power_supply *nvec_bat_psy;
+static struct power_supply *nvec_psy;
 
 static int nvec_power_notifier(struct notifier_block *nb,
 			       unsigned long event_type, void *data)
@@ -98,7 +98,7 @@ static int nvec_power_notifier(struct notifier_block *nb,
 	if (res->sub_type == 0) {
 		if (power->on != res->plu) {
 			power->on = res->plu;
-			power_supply_changed(&nvec_psy);
+			power_supply_changed(nvec_psy);
 		}
 		return NOTIFY_STOP;
 	}
@@ -167,7 +167,7 @@ static int nvec_power_bat_notifier(struct notifier_block *nb,
 		}
 		power->bat_cap = res->plc[1];
 		if (status_changed)
-			power_supply_changed(&nvec_bat_psy);
+			power_supply_changed(nvec_bat_psy);
 		break;
 	case VOLTAGE:
 		power->bat_voltage_now = res->plu * 1000;
@@ -225,7 +225,7 @@ static int nvec_power_get_property(struct power_supply *psy,
 				   enum power_supply_property psp,
 				   union power_supply_propval *val)
 {
-	struct nvec_power *power = dev_get_drvdata(psy->dev->parent);
+	struct nvec_power *power = dev_get_drvdata(psy->dev.parent);
 
 	switch (psp) {
 	case POWER_SUPPLY_PROP_ONLINE:
@@ -241,7 +241,7 @@ static int nvec_battery_get_property(struct power_supply *psy,
 				     enum power_supply_property psp,
 				     union power_supply_propval *val)
 {
-	struct nvec_power *power = dev_get_drvdata(psy->dev->parent);
+	struct nvec_power *power = dev_get_drvdata(psy->dev.parent);
 
 	switch (psp) {
 	case POWER_SUPPLY_PROP_STATUS:
@@ -323,7 +323,7 @@ static char *nvec_power_supplied_to[] = {
 	"battery",
 };
 
-static struct power_supply nvec_bat_psy = {
+static const struct power_supply_desc nvec_bat_psy_desc = {
 	.name = "battery",
 	.type = POWER_SUPPLY_TYPE_BATTERY,
 	.properties = nvec_battery_props,
@@ -331,7 +331,7 @@ static struct power_supply nvec_bat_psy = {
 	.get_property = nvec_battery_get_property,
 };
 
-static struct power_supply nvec_psy = {
+static const struct power_supply_desc nvec_psy_desc = {
 	.name = "ac",
 	.type = POWER_SUPPLY_TYPE_MAINS,
 	.properties = nvec_power_props,
@@ -371,7 +371,8 @@ static void nvec_power_poll(struct work_struct *work)
 
 static int nvec_power_probe(struct platform_device *pdev)
 {
-	struct power_supply *psy;
+	struct power_supply **psy;
+	const struct power_supply_desc *psy_desc;
 	struct nvec_power *power;
 	struct nvec_chip *nvec = dev_get_drvdata(pdev->dev.parent);
 	struct power_supply_config psy_cfg = {};
@@ -386,6 +387,7 @@ static int nvec_power_probe(struct platform_device *pdev)
 	switch (pdev->id) {
 	case AC:
 		psy = &nvec_psy;
+		psy_desc = &nvec_psy_desc;
 		psy_cfg.supplied_to = nvec_power_supplied_to;
 		psy_cfg.num_supplicants = ARRAY_SIZE(nvec_power_supplied_to);
 
@@ -396,6 +398,7 @@ static int nvec_power_probe(struct platform_device *pdev)
 		break;
 	case BAT:
 		psy = &nvec_bat_psy;
+		psy_desc = &nvec_bat_psy_desc;
 
 		power->notifier.notifier_call = nvec_power_bat_notifier;
 		break;
@@ -408,7 +411,9 @@ static int nvec_power_probe(struct platform_device *pdev)
 	if (pdev->id == BAT)
 		get_bat_mfg_data(power);
 
-	return power_supply_register(&pdev->dev, psy, &psy_cfg);
+	*psy = power_supply_register(&pdev->dev, psy_desc, &psy_cfg);
+
+	return PTR_ERR_OR_ZERO(*psy);
 }
 
 static int nvec_power_remove(struct platform_device *pdev)
@@ -419,10 +424,10 @@ static int nvec_power_remove(struct platform_device *pdev)
 	nvec_unregister_notifier(power->nvec, &power->notifier);
 	switch (pdev->id) {
 	case AC:
-		power_supply_unregister(&nvec_psy);
+		power_supply_unregister(nvec_psy);
 		break;
 	case BAT:
-		power_supply_unregister(&nvec_bat_psy);
+		power_supply_unregister(nvec_bat_psy);
 	}
 
 	return 0;
diff --git a/include/linux/hid.h b/include/linux/hid.h
index efc7787a41a8..f94cf28e4b7c 100644
--- a/include/linux/hid.h
+++ b/include/linux/hid.h
@@ -514,10 +514,10 @@ struct hid_device {							/* device report descriptor */
 #ifdef CONFIG_HID_BATTERY_STRENGTH
 	/*
 	 * Power supply information for HID devices which report
-	 * battery strength. power_supply is registered iff
-	 * battery.name is non-NULL.
+	 * battery strength. power_supply was successfully registered if
+	 * battery is non-NULL.
 	 */
-	struct power_supply battery;
+	struct power_supply *battery;
 	__s32 battery_min;
 	__s32 battery_max;
 	__s32 battery_report_type;
diff --git a/include/linux/mfd/abx500/ux500_chargalg.h b/include/linux/mfd/abx500/ux500_chargalg.h
index 234c99143bf7..67703f23e7ba 100644
--- a/include/linux/mfd/abx500/ux500_chargalg.h
+++ b/include/linux/mfd/abx500/ux500_chargalg.h
@@ -9,8 +9,13 @@
 
 #include <linux/power_supply.h>
 
-#define psy_to_ux500_charger(x) container_of((x), \
-		struct ux500_charger, psy)
+/*
+ * Valid only for supplies of type:
+ * - POWER_SUPPLY_TYPE_MAINS,
+ * - POWER_SUPPLY_TYPE_USB,
+ * because only them store as drv_data pointer to struct ux500_charger.
+ */
+#define psy_to_ux500_charger(x) power_supply_get_drvdata(psy)
 
 /* Forward declaration */
 struct ux500_charger;
@@ -35,7 +40,7 @@ struct ux500_charger_ops {
  * @power_path		USB power path support
  */
 struct ux500_charger {
-	struct power_supply psy;
+	struct power_supply *psy;
 	struct ux500_charger_ops ops;
 	int max_out_volt;
 	int max_out_curr;
diff --git a/include/linux/mfd/rt5033.h b/include/linux/mfd/rt5033.h
index 010cff49a98e..6cff5cf458d2 100644
--- a/include/linux/mfd/rt5033.h
+++ b/include/linux/mfd/rt5033.h
@@ -39,7 +39,7 @@ struct rt5033_battery {
 	struct i2c_client	*client;
 	struct rt5033_dev	*rt5033;
 	struct regmap		*regmap;
-	struct power_supply	psy;
+	struct power_supply	*psy;
 };
 
 /* RT5033 charger platform data */
diff --git a/include/linux/mfd/wm8350/supply.h b/include/linux/mfd/wm8350/supply.h
index 2b9479310bbd..8dc93673e34a 100644
--- a/include/linux/mfd/wm8350/supply.h
+++ b/include/linux/mfd/wm8350/supply.h
@@ -123,9 +123,9 @@ struct wm8350_charger_policy {
 
 struct wm8350_power {
 	struct platform_device *pdev;
-	struct power_supply battery;
-	struct power_supply usb;
-	struct power_supply ac;
+	struct power_supply *battery;
+	struct power_supply *usb;
+	struct power_supply *ac;
 	struct wm8350_charger_policy *policy;
 
 	int rev_g_coeff;
diff --git a/include/linux/power/charger-manager.h b/include/linux/power/charger-manager.h
index 416ebeb6ee1e..eadf28cb2fc9 100644
--- a/include/linux/power/charger-manager.h
+++ b/include/linux/power/charger-manager.h
@@ -242,7 +242,8 @@ struct charger_manager {
 	int emergency_stop;
 
 	char psy_name_buf[PSY_NAME_MAX + 1];
-	struct power_supply charger_psy;
+	struct power_supply_desc charger_psy_desc;
+	struct power_supply *charger_psy;
 
 	u64 charging_start_time;
 	u64 charging_end_time;
diff --git a/include/linux/power_supply.h b/include/linux/power_supply.h
index 7ae60346465f..ea15eb68f609 100644
--- a/include/linux/power_supply.h
+++ b/include/linux/power_supply.h
@@ -13,6 +13,7 @@
 #ifndef __LINUX_POWER_SUPPLY_H__
 #define __LINUX_POWER_SUPPLY_H__
 
+#include <linux/device.h>
 #include <linux/workqueue.h>
 #include <linux/leds.h>
 #include <linux/spinlock.h>
@@ -173,10 +174,10 @@ union power_supply_propval {
 	const char *strval;
 };
 
-struct device;
 struct device_node;
+struct power_supply;
 
-/* Power supply instance specific configuration */
+/* Run-time specific power supply configuration */
 struct power_supply_config {
 	struct device_node *of_node;
 	/* Driver private data */
@@ -186,19 +187,13 @@ struct power_supply_config {
 	size_t num_supplicants;
 };
 
-struct power_supply {
+/* Description of power supply */
+struct power_supply_desc {
 	const char *name;
 	enum power_supply_type type;
 	enum power_supply_property *properties;
 	size_t num_properties;
 
-	char **supplied_to;
-	size_t num_supplicants;
-
-	char **supplied_from;
-	size_t num_supplies;
-	struct device_node *of_node;
-
 	/*
 	 * Functions for drivers implementing power supply class.
 	 * These shouldn't be called directly by other drivers for accessing
@@ -224,12 +219,23 @@ struct power_supply {
 	bool no_thermal;
 	/* For APM emulation, think legacy userspace. */
 	int use_for_apm;
+};
+
+struct power_supply {
+	const struct power_supply_desc *desc;
+
+	char **supplied_to;
+	size_t num_supplicants;
+
+	char **supplied_from;
+	size_t num_supplies;
+	struct device_node *of_node;
 
 	/* Driver private data */
 	void *drv_data;
 
 	/* private */
-	struct device *dev;
+	struct device dev;
 	struct work_struct changed_work;
 	spinlock_t changed_lock;
 	bool changed;
@@ -303,17 +309,22 @@ extern int power_supply_set_property(struct power_supply *psy,
 extern int power_supply_property_is_writeable(struct power_supply *psy,
 					enum power_supply_property psp);
 extern void power_supply_external_power_changed(struct power_supply *psy);
-extern int power_supply_register(struct device *parent,
-				 struct power_supply *psy,
+
+extern struct power_supply *__must_check
+power_supply_register(struct device *parent,
+				 const struct power_supply_desc *desc,
 				 const struct power_supply_config *cfg);
-extern int power_supply_register_no_ws(struct device *parent,
-				 struct power_supply *psy,
+extern struct power_supply *__must_check
+power_supply_register_no_ws(struct device *parent,
+				 const struct power_supply_desc *desc,
 				 const struct power_supply_config *cfg);
-extern int devm_power_supply_register(struct device *parent,
-				 struct power_supply *psy,
+extern struct power_supply *__must_check
+devm_power_supply_register(struct device *parent,
+				 const struct power_supply_desc *desc,
 				 const struct power_supply_config *cfg);
-extern int devm_power_supply_register_no_ws(struct device *parent,
-				 struct power_supply *psy,
+extern struct power_supply *__must_check
+devm_power_supply_register_no_ws(struct device *parent,
+				 const struct power_supply_desc *desc,
 				 const struct power_supply_config *cfg);
 extern void power_supply_unregister(struct power_supply *psy);
 extern int power_supply_powers(struct power_supply *psy, struct device *dev);
-- 
cgit v1.2.3


From ed6dad52298152a5c493223234e431f206c5a46b Mon Sep 17 00:00:00 2001
From: Krzysztof Kozlowski
Date: Thu, 12 Mar 2015 08:44:15 +0100
Subject: x86/olpc/xo1/sci: Use newly added power_supply_put API

Replace direct usage of put_device() with new API: power_supply_put().

Signed-off-by: Krzysztof Kozlowski <k.kozlowski@samsung.com>
Acked-by: Pavel Machek <pavel@ucw.cz>
Reviewed-by: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
Reviewed-by: Sebastian Reichel <sre@kernel.org>
Acked-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Sebastian Reichel <sre@kernel.org>
---
 arch/x86/platform/olpc/olpc-xo1-sci.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/platform/olpc/olpc-xo1-sci.c b/arch/x86/platform/olpc/olpc-xo1-sci.c
index e4ed28bbf79d..7fa8b3b53bc0 100644
--- a/arch/x86/platform/olpc/olpc-xo1-sci.c
+++ b/arch/x86/platform/olpc/olpc-xo1-sci.c
@@ -61,7 +61,7 @@ static void battery_status_changed(void)
 
 	if (psy) {
 		power_supply_changed(psy);
-		put_device(&psy->dev);
+		power_supply_put(psy);
 	}
 }
 
@@ -71,7 +71,7 @@ static void ac_status_changed(void)
 
 	if (psy) {
 		power_supply_changed(psy);
-		put_device(&psy->dev);
+		power_supply_put(psy);
 	}
 }
 
-- 
cgit v1.2.3


From 67273a1b421a12ef77bcf08b027d165f399054ae Mon Sep 17 00:00:00 2001
From: Krzysztof Kozlowski
Date: Thu, 12 Mar 2015 08:44:16 +0100
Subject: x86/olpc/xo15/sci: Use newly added power_supply_put API

Replace direct usage of put_device() with new API: power_supply_put().

Signed-off-by: Krzysztof Kozlowski <k.kozlowski@samsung.com>
Acked-by: Pavel Machek <pavel@ucw.cz>
Reviewed-by: Bartlomiej Zolnierkiewicz <b.zolnierkie@samsung.com>
Reviewed-by: Sebastian Reichel <sre@kernel.org>
Acked-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Sebastian Reichel <sre@kernel.org>
---
 arch/x86/platform/olpc/olpc-xo15-sci.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/platform/olpc/olpc-xo15-sci.c b/arch/x86/platform/olpc/olpc-xo15-sci.c
index 186634e9021d..55130846ac87 100644
--- a/arch/x86/platform/olpc/olpc-xo15-sci.c
+++ b/arch/x86/platform/olpc/olpc-xo15-sci.c
@@ -83,7 +83,7 @@ static void battery_status_changed(void)
 
 	if (psy) {
 		power_supply_changed(psy);
-		put_device(&psy->dev);
+		power_supply_put(psy);
 	}
 }
 
@@ -93,7 +93,7 @@ static void ac_status_changed(void)
 
 	if (psy) {
 		power_supply_changed(psy);
-		put_device(&psy->dev);
+		power_supply_put(psy);
 	}
 }
 
-- 
cgit v1.2.3


From b253149b843f89cd300cbdbea27ce1f847506f99 Mon Sep 17 00:00:00 2001
From: Len Brown
Date: Wed, 15 Jan 2014 00:37:34 -0500
Subject: sched/idle/x86: Restore mwait_idle() to fix boot hangs, to improve
 power savings and to improve performance

In Linux-3.9 we removed the mwait_idle() loop:

  69fb3676df33 ("x86 idle: remove mwait_idle() and "idle=mwait" cmdline param")

The reasoning was that modern machines should be sufficiently
happy during the boot process using the default_idle() HALT
loop, until cpuidle loads and either acpi_idle or intel_idle
invoke the newer MWAIT-with-hints idle loop.

But two machines reported problems:

 1. Certain Core2-era machines support MWAIT-C1 and HALT only.
    MWAIT-C1 is preferred for optimal power and performance.
    But if they support just C1, cpuidle never loads and
    so they use the boot-time default idle loop forever.

 2. Some laptops will boot-hang if HALT is used,
    but will boot successfully if MWAIT is used.
    This appears to be a hidden assumption in BIOS SMI,
    that is presumably valid on the proprietary OS
    where the BIOS was validated.

       https://bugzilla.kernel.org/show_bug.cgi?id=60770

So here we effectively revert the patch above, restoring
the mwait_idle() loop.  However, we don't bother restoring
the idle=mwait cmdline parameter, since it appears to add
no value.

Maintainer notes:

  For 3.9, simply revert 69fb3676df
  for 3.10, patch -F3 applies, fuzz needed due to __cpuinit use in
  context For 3.11, 3.12, 3.13, this patch applies cleanly

Tested-by: Mike Galbraith <bitbucket@online.de>
Signed-off-by: Len Brown <len.brown@intel.com>
Acked-by: Mike Galbraith <bitbucket@online.de>
Cc: <stable@vger.kernel.org> # 3.9+
Cc: Borislav Petkov <bp@alien8.de>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Ian Malone <ibmalone@gmail.com>
Cc: Josh Boyer <jwboyer@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/345254a551eb5a6a866e048d7ab570fd2193aca4.1389763084.git.len.brown@intel.com
[ Ported to recent kernels. ]
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/mwait.h |  8 ++++++++
 arch/x86/kernel/process.c    | 47 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 55 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h
index a1410db38a1a..653dfa7662e1 100644
--- a/arch/x86/include/asm/mwait.h
+++ b/arch/x86/include/asm/mwait.h
@@ -30,6 +30,14 @@ static inline void __mwait(unsigned long eax, unsigned long ecx)
 		     :: "a" (eax), "c" (ecx));
 }
 
+static inline void __sti_mwait(unsigned long eax, unsigned long ecx)
+{
+	trace_hardirqs_on();
+	/* "mwait %eax, %ecx;" */
+	asm volatile("sti; .byte 0x0f, 0x01, 0xc9;"
+		     :: "a" (eax), "c" (ecx));
+}
+
 /*
  * This uses new MONITOR/MWAIT instructions on P4 processors with PNI,
  * which can obviate IPI to trigger checking of need_resched.
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index e127ddaa2d5a..da06f741d2a6 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -24,6 +24,7 @@
 #include <asm/syscalls.h>
 #include <asm/idle.h>
 #include <asm/uaccess.h>
+#include <asm/mwait.h>
 #include <asm/i387.h>
 #include <asm/fpu-internal.h>
 #include <asm/debugreg.h>
@@ -398,6 +399,49 @@ static void amd_e400_idle(void)
 		default_idle();
 }
 
+/*
+ * Intel Core2 and older machines prefer MWAIT over HALT for C1.
+ * We can't rely on cpuidle installing MWAIT, because it will not load
+ * on systems that support only C1 -- so the boot default must be MWAIT.
+ *
+ * Some AMD machines are the opposite, they depend on using HALT.
+ *
+ * So for default C1, which is used during boot until cpuidle loads,
+ * use MWAIT-C1 on Intel HW that has it, else use HALT.
+ */
+static int prefer_mwait_c1_over_halt(const struct cpuinfo_x86 *c)
+{
+	if (c->x86_vendor != X86_VENDOR_INTEL)
+		return 0;
+
+	if (!cpu_has(c, X86_FEATURE_MWAIT))
+		return 0;
+
+	return 1;
+}
+
+/*
+ * MONITOR/MWAIT with no hints, used for default default C1 state.
+ * This invokes MWAIT with interrutps enabled and no flags,
+ * which is backwards compatible with the original MWAIT implementation.
+ */
+
+static void mwait_idle(void)
+{
+	if (!need_resched()) {
+		if (this_cpu_has(X86_BUG_CLFLUSH_MONITOR))
+			clflush((void *)&current_thread_info()->flags);
+
+		__monitor((void *)&current_thread_info()->flags, 0, 0);
+		smp_mb();
+		if (!need_resched())
+			__sti_mwait(0, 0);
+		else
+			local_irq_enable();
+	} else
+		local_irq_enable();
+}
+
 void select_idle_routine(const struct cpuinfo_x86 *c)
 {
 #ifdef CONFIG_SMP
@@ -411,6 +455,9 @@ void select_idle_routine(const struct cpuinfo_x86 *c)
 		/* E400: APIC timer interrupt does not wake up CPU from C1e */
 		pr_info("using AMD E400 aware idle routine\n");
 		x86_idle = amd_e400_idle;
+	} else if (prefer_mwait_c1_over_halt(c)) {
+		pr_info("using mwait in idle threads\n");
+		x86_idle = mwait_idle;
 	} else
 		x86_idle = default_idle;
 }
-- 
cgit v1.2.3


From f8e617f4582995f7c25ef25b4167213120ad122b Mon Sep 17 00:00:00 2001
From: Mike Galbraith
Date: Sat, 18 Jan 2014 17:14:44 +0100
Subject: sched/idle/x86: Optimize unnecessary mwait_idle() resched IPIs

To fully take advantage of MWAIT, apparently the CLFLUSH instruction needs
another quirk on certain CPUs: proper barriers around it on certain machines.

On a Q6600 SMP system, pipe-test scheduling performance, cross core,
improves significantly:

  3.8.13                   487.2 KHz    1.000
  3.13.0-master            415.5 KHz     .852
  3.13.0-master+           415.2 KHz     .852     + restore mwait_idle
  3.13.0-master++          488.5 KHz    1.002     + restore mwait_idle + IPI fix

Since X86_BUG_CLFLUSH_MONITOR is already a quirk, don't create a separate
quirk for the extra smp_mb()s.

Signed-off-by: Mike Galbraith <bitbucket@online.de>
Cc: <stable@vger.kernel.org> # 3.10+
Cc: Borislav Petkov <bp@alien8.de>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Ian Malone <ibmalone@gmail.com>
Cc: Josh Boyer <jwboyer@redhat.com>
Cc: Len Brown <len.brown@intel.com>
Cc: Len Brown <lenb@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1390061684.5566.4.camel@marge.simpson.net
[ Ported to recent kernel, added comments about the quirk. ]
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/process.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index da06f741d2a6..6ad8a6396b75 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -428,18 +428,22 @@ static int prefer_mwait_c1_over_halt(const struct cpuinfo_x86 *c)
 
 static void mwait_idle(void)
 {
-	if (!need_resched()) {
-		if (this_cpu_has(X86_BUG_CLFLUSH_MONITOR))
+	if (!current_set_polling_and_test()) {
+		if (this_cpu_has(X86_BUG_CLFLUSH_MONITOR)) {
+			smp_mb(); /* quirk */
 			clflush((void *)&current_thread_info()->flags);
+			smp_mb(); /* quirk */
+		}
 
 		__monitor((void *)&current_thread_info()->flags, 0, 0);
-		smp_mb();
 		if (!need_resched())
 			__sti_mwait(0, 0);
 		else
 			local_irq_enable();
-	} else
+	} else {
 		local_irq_enable();
+	}
+	__current_clr_polling();
 }
 
 void select_idle_routine(const struct cpuinfo_x86 *c)
-- 
cgit v1.2.3


From 69797dafe35541bfff1989c0b37c66ed785faf0e Mon Sep 17 00:00:00 2001
From: Borislav Petkov
Date: Mon, 16 Mar 2015 11:06:28 +0100
Subject: Revert "x86/mm/ASLR: Propagate base load address calculation"

This reverts commit:

  f47233c2d34f ("x86/mm/ASLR: Propagate base load address calculation")

The main reason for the revert is that the new boot flag does not work
at all currently, and in order to make this work, we need non-trivial
changes to the x86 boot code which we didn't manage to get done in
time for merging.

And even if we did, they would've been too risky so instead of
rushing things and break booting 4.1 on boxes left and right, we
will be very strict and conservative and will take our time with
this to fix and test it properly.

Reported-by: Yinghai Lu <yinghai@kernel.org>
Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Cc: Baoquan He <bhe@redhat.com>
Cc: H. Peter Anvin <hpa@linux.intel.com
Cc: Jiri Kosina <jkosina@suse.cz>
Cc: Josh Triplett <josh@joshtriplett.org>
Cc: Junjie Mao <eternal.n08@gmail.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Matt Fleming <matt.fleming@intel.com>
Link: http://lkml.kernel.org/r/20150316100628.GD22995@pd.tnic
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/boot/compressed/aslr.c       | 34 +---------------------------------
 arch/x86/boot/compressed/misc.c       |  3 +--
 arch/x86/boot/compressed/misc.h       |  6 ++----
 arch/x86/include/asm/page_types.h     |  2 --
 arch/x86/include/uapi/asm/bootparam.h |  1 -
 arch/x86/kernel/module.c              | 10 +++++++++-
 arch/x86/kernel/setup.c               | 22 ++++------------------
 7 files changed, 17 insertions(+), 61 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/boot/compressed/aslr.c b/arch/x86/boot/compressed/aslr.c
index 7083c16cccba..bb1376381985 100644
--- a/arch/x86/boot/compressed/aslr.c
+++ b/arch/x86/boot/compressed/aslr.c
@@ -14,13 +14,6 @@
 static const char build_str[] = UTS_RELEASE " (" LINUX_COMPILE_BY "@"
 		LINUX_COMPILE_HOST ") (" LINUX_COMPILER ") " UTS_VERSION;
 
-struct kaslr_setup_data {
-	__u64 next;
-	__u32 type;
-	__u32 len;
-	__u8 data[1];
-} kaslr_setup_data;
-
 #define I8254_PORT_CONTROL	0x43
 #define I8254_PORT_COUNTER0	0x40
 #define I8254_CMD_READBACK	0xC0
@@ -302,29 +295,7 @@ static unsigned long find_random_addr(unsigned long minimum,
 	return slots_fetch_random();
 }
 
-static void add_kaslr_setup_data(struct boot_params *params, __u8 enabled)
-{
-	struct setup_data *data;
-
-	kaslr_setup_data.type = SETUP_KASLR;
-	kaslr_setup_data.len = 1;
-	kaslr_setup_data.next = 0;
-	kaslr_setup_data.data[0] = enabled;
-
-	data = (struct setup_data *)(unsigned long)params->hdr.setup_data;
-
-	while (data && data->next)
-		data = (struct setup_data *)(unsigned long)data->next;
-
-	if (data)
-		data->next = (unsigned long)&kaslr_setup_data;
-	else
-		params->hdr.setup_data = (unsigned long)&kaslr_setup_data;
-
-}
-
-unsigned char *choose_kernel_location(struct boot_params *params,
-				      unsigned char *input,
+unsigned char *choose_kernel_location(unsigned char *input,
 				      unsigned long input_size,
 				      unsigned char *output,
 				      unsigned long output_size)
@@ -335,17 +306,14 @@ unsigned char *choose_kernel_location(struct boot_params *params,
 #ifdef CONFIG_HIBERNATION
 	if (!cmdline_find_option_bool("kaslr")) {
 		debug_putstr("KASLR disabled by default...\n");
-		add_kaslr_setup_data(params, 0);
 		goto out;
 	}
 #else
 	if (cmdline_find_option_bool("nokaslr")) {
 		debug_putstr("KASLR disabled by cmdline...\n");
-		add_kaslr_setup_data(params, 0);
 		goto out;
 	}
 #endif
-	add_kaslr_setup_data(params, 1);
 
 	/* Record the various known unsafe memory ranges. */
 	mem_avoid_init((unsigned long)input, input_size,
diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index 5903089c818f..a950864a64da 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -401,8 +401,7 @@ asmlinkage __visible void *decompress_kernel(void *rmode, memptr heap,
 	 * the entire decompressed kernel plus relocation table, or the
 	 * entire decompressed kernel plus .bss and .brk sections.
 	 */
-	output = choose_kernel_location(real_mode, input_data, input_len,
-					output,
+	output = choose_kernel_location(input_data, input_len, output,
 					output_len > run_size ? output_len
 							      : run_size);
 
diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h
index ee3576b2666b..04477d68403f 100644
--- a/arch/x86/boot/compressed/misc.h
+++ b/arch/x86/boot/compressed/misc.h
@@ -57,8 +57,7 @@ int cmdline_find_option_bool(const char *option);
 
 #if CONFIG_RANDOMIZE_BASE
 /* aslr.c */
-unsigned char *choose_kernel_location(struct boot_params *params,
-				      unsigned char *input,
+unsigned char *choose_kernel_location(unsigned char *input,
 				      unsigned long input_size,
 				      unsigned char *output,
 				      unsigned long output_size);
@@ -66,8 +65,7 @@ unsigned char *choose_kernel_location(struct boot_params *params,
 bool has_cpuflag(int flag);
 #else
 static inline
-unsigned char *choose_kernel_location(struct boot_params *params,
-				      unsigned char *input,
+unsigned char *choose_kernel_location(unsigned char *input,
 				      unsigned long input_size,
 				      unsigned char *output,
 				      unsigned long output_size)
diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h
index 95e11f79f123..f97fbe3abb67 100644
--- a/arch/x86/include/asm/page_types.h
+++ b/arch/x86/include/asm/page_types.h
@@ -51,8 +51,6 @@ extern int devmem_is_allowed(unsigned long pagenr);
 extern unsigned long max_low_pfn_mapped;
 extern unsigned long max_pfn_mapped;
 
-extern bool kaslr_enabled;
-
 static inline phys_addr_t get_max_mapped(void)
 {
 	return (phys_addr_t)max_pfn_mapped << PAGE_SHIFT;
diff --git a/arch/x86/include/uapi/asm/bootparam.h b/arch/x86/include/uapi/asm/bootparam.h
index 44e6dd7e36a2..225b0988043a 100644
--- a/arch/x86/include/uapi/asm/bootparam.h
+++ b/arch/x86/include/uapi/asm/bootparam.h
@@ -7,7 +7,6 @@
 #define SETUP_DTB			2
 #define SETUP_PCI			3
 #define SETUP_EFI			4
-#define SETUP_KASLR			5
 
 /* ram_size flags */
 #define RAMDISK_IMAGE_START_MASK	0x07FF
diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
index 9bbb9b35c144..d1ac80b72c72 100644
--- a/arch/x86/kernel/module.c
+++ b/arch/x86/kernel/module.c
@@ -47,13 +47,21 @@ do {							\
 
 #ifdef CONFIG_RANDOMIZE_BASE
 static unsigned long module_load_offset;
+static int randomize_modules = 1;
 
 /* Mutex protects the module_load_offset. */
 static DEFINE_MUTEX(module_kaslr_mutex);
 
+static int __init parse_nokaslr(char *p)
+{
+	randomize_modules = 0;
+	return 0;
+}
+early_param("nokaslr", parse_nokaslr);
+
 static unsigned long int get_module_load_offset(void)
 {
-	if (kaslr_enabled) {
+	if (randomize_modules) {
 		mutex_lock(&module_kaslr_mutex);
 		/*
 		 * Calculate the module_load_offset the first time this
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 98dc9317286e..0a2421cca01f 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -122,8 +122,6 @@
 unsigned long max_low_pfn_mapped;
 unsigned long max_pfn_mapped;
 
-bool __read_mostly kaslr_enabled = false;
-
 #ifdef CONFIG_DMI
 RESERVE_BRK(dmi_alloc, 65536);
 #endif
@@ -427,11 +425,6 @@ static void __init reserve_initrd(void)
 }
 #endif /* CONFIG_BLK_DEV_INITRD */
 
-static void __init parse_kaslr_setup(u64 pa_data, u32 data_len)
-{
-	kaslr_enabled = (bool)(pa_data + sizeof(struct setup_data));
-}
-
 static void __init parse_setup_data(void)
 {
 	struct setup_data *data;
@@ -457,9 +450,6 @@ static void __init parse_setup_data(void)
 		case SETUP_EFI:
 			parse_efi_setup(pa_data, data_len);
 			break;
-		case SETUP_KASLR:
-			parse_kaslr_setup(pa_data, data_len);
-			break;
 		default:
 			break;
 		}
@@ -842,14 +832,10 @@ static void __init trim_low_memory_range(void)
 static int
 dump_kernel_offset(struct notifier_block *self, unsigned long v, void *p)
 {
-	if (kaslr_enabled)
-		pr_emerg("Kernel Offset: 0x%lx from 0x%lx (relocation range: 0x%lx-0x%lx)\n",
-			 (unsigned long)&_text - __START_KERNEL,
-			 __START_KERNEL,
-			 __START_KERNEL_map,
-			 MODULES_VADDR-1);
-	else
-		pr_emerg("Kernel Offset: disabled\n");
+	pr_emerg("Kernel Offset: 0x%lx from 0x%lx "
+		 "(relocation range: 0x%lx-0x%lx)\n",
+		 (unsigned long)&_text - __START_KERNEL, __START_KERNEL,
+		 __START_KERNEL_map, MODULES_VADDR-1);
 
 	return 0;
 }
-- 
cgit v1.2.3


From c42e9902f3f1e731a7b83397acdf402eeb3237ca Mon Sep 17 00:00:00 2001
From: Ameen Ali
Date: Fri, 13 Mar 2015 23:38:21 +0200
Subject: crypto: sha1-mb - Syntax error

fixing a syntax-error .

Signed-off-by: Ameen Ali <AmeenAli023@gmail.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/sha-mb/sha1_mb.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/crypto/sha-mb/sha1_mb.c b/arch/x86/crypto/sha-mb/sha1_mb.c
index fd9f6b035b16..9414fd964ecd 100644
--- a/arch/x86/crypto/sha-mb/sha1_mb.c
+++ b/arch/x86/crypto/sha-mb/sha1_mb.c
@@ -828,7 +828,7 @@ static unsigned long sha1_mb_flusher(struct mcryptd_alg_cstate *cstate)
 	while (!list_empty(&cstate->work_list)) {
 		rctx = list_entry(cstate->work_list.next,
 				struct mcryptd_hash_request_ctx, waiter);
-		if time_before(cur_time, rctx->tag.expire)
+		if (time_before(cur_time, rctx->tag.expire))
 			break;
 		kernel_fpu_begin();
 		sha_ctx = (struct sha1_hash_ctx *) sha1_ctx_mgr_flush(cstate->mgr);
-- 
cgit v1.2.3


From 9b4ade226f7468bb26f98b6cd01cb5b8a05fc96d Mon Sep 17 00:00:00 2001
From: Juergen Gross
Date: Wed, 21 Jan 2015 08:49:22 +0100
Subject: xen: build infrastructure for generating hypercall depending symbols

Today there are several places in the kernel which build tables
containing one entry for each possible Xen hypercall. Create an
infrastructure to be able to generate these tables at build time.

Based-on-patch-by: Jan Beulich <jbeulich@suse.com>
Signed-off-by: Juergen Gross <jgross@suse.com>
Reviewed-by: David Vrabel <david.vrabel@citrix.com>
Acked-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: David Vrabel <david.vrabel@citrix.com>
---
 arch/x86/syscalls/Makefile |  9 +++++++++
 scripts/xen-hypercalls.sh  | 12 ++++++++++++
 2 files changed, 21 insertions(+)
 create mode 100644 scripts/xen-hypercalls.sh

(limited to 'arch/x86')

diff --git a/arch/x86/syscalls/Makefile b/arch/x86/syscalls/Makefile
index 3323c2745248..a55abb9f6c5e 100644
--- a/arch/x86/syscalls/Makefile
+++ b/arch/x86/syscalls/Makefile
@@ -19,6 +19,9 @@ quiet_cmd_syshdr = SYSHDR  $@
 quiet_cmd_systbl = SYSTBL  $@
       cmd_systbl = $(CONFIG_SHELL) '$(systbl)' $< $@
 
+quiet_cmd_hypercalls = HYPERCALLS $@
+      cmd_hypercalls = $(CONFIG_SHELL) '$<' $@ $(filter-out $<,$^)
+
 syshdr_abi_unistd_32 := i386
 $(uapi)/unistd_32.h: $(syscall32) $(syshdr)
 	$(call if_changed,syshdr)
@@ -47,10 +50,16 @@ $(out)/syscalls_32.h: $(syscall32) $(systbl)
 $(out)/syscalls_64.h: $(syscall64) $(systbl)
 	$(call if_changed,systbl)
 
+$(out)/xen-hypercalls.h: $(srctree)/scripts/xen-hypercalls.sh
+	$(call if_changed,hypercalls)
+
+$(out)/xen-hypercalls.h: $(srctree)/include/xen/interface/xen*.h
+
 uapisyshdr-y			+= unistd_32.h unistd_64.h unistd_x32.h
 syshdr-y			+= syscalls_32.h
 syshdr-$(CONFIG_X86_64)		+= unistd_32_ia32.h unistd_64_x32.h
 syshdr-$(CONFIG_X86_64)		+= syscalls_64.h
+syshdr-$(CONFIG_XEN)		+= xen-hypercalls.h
 
 targets	+= $(uapisyshdr-y) $(syshdr-y)
 
diff --git a/scripts/xen-hypercalls.sh b/scripts/xen-hypercalls.sh
new file mode 100644
index 000000000000..676d9226814f
--- /dev/null
+++ b/scripts/xen-hypercalls.sh
@@ -0,0 +1,12 @@
+#!/bin/sh
+out="$1"
+shift
+in="$@"
+
+for i in $in; do
+	eval $CPP $LINUXINCLUDE -dD -imacros "$i" -x c /dev/null
+done | \
+awk '$1 == "#define" && $2 ~ /__HYPERVISOR_[a-z][a-z_0-9]*/ { v[$3] = $2 }
+	END {   print "/* auto-generated by scripts/xen-hypercall.sh */"
+		for (i in v) if (!(v[i] in v))
+			print "HYPERCALL("substr(v[i], 14)")"}' | sort -u >$out
-- 
cgit v1.2.3


From 16b12d605795e775cd80b5901c85dd2e320a6ec2 Mon Sep 17 00:00:00 2001
From: Juergen Gross
Date: Wed, 21 Jan 2015 08:49:23 +0100
Subject: xen: synchronize include/xen/interface/xen.h with xen

The header include/xen/interface/xen.h doesn't contain all definitions
from Xen's version of that header. Update it accordingly.

Signed-off-by: Juergen Gross <jgross@suse.com>
Reviewed-by: David Vrabel <david.vrabel@citrix.com>
Signed-off-by: David Vrabel <david.vrabel@citrix.com>
---
 arch/x86/xen/trace.c        | 2 +-
 include/xen/interface/xen.h | 6 +++++-
 2 files changed, 6 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/xen/trace.c b/arch/x86/xen/trace.c
index 520022d1a181..8296cdf4e581 100644
--- a/arch/x86/xen/trace.c
+++ b/arch/x86/xen/trace.c
@@ -29,7 +29,7 @@ static const char *xen_hypercall_names[] = {
 	N(vcpu_op),
 	N(set_segment_base),
 	N(mmuext_op),
-	N(acm_op),
+	N(xsm_op),
 	N(nmi_op),
 	N(sched_op),
 	N(callback_op),
diff --git a/include/xen/interface/xen.h b/include/xen/interface/xen.h
index f68719f405af..a48378958062 100644
--- a/include/xen/interface/xen.h
+++ b/include/xen/interface/xen.h
@@ -67,7 +67,7 @@
 #define __HYPERVISOR_vcpu_op              24
 #define __HYPERVISOR_set_segment_base     25 /* x86/64 only */
 #define __HYPERVISOR_mmuext_op            26
-#define __HYPERVISOR_acm_op               27
+#define __HYPERVISOR_xsm_op               27
 #define __HYPERVISOR_nmi_op               28
 #define __HYPERVISOR_sched_op             29
 #define __HYPERVISOR_callback_op          30
@@ -75,7 +75,11 @@
 #define __HYPERVISOR_event_channel_op     32
 #define __HYPERVISOR_physdev_op           33
 #define __HYPERVISOR_hvm_op               34
+#define __HYPERVISOR_sysctl               35
+#define __HYPERVISOR_domctl               36
+#define __HYPERVISOR_kexec_op             37
 #define __HYPERVISOR_tmem_op              38
+#define __HYPERVISOR_xc_reserved_op       39 /* reserved for XenClient */
 
 /* Architecture-specific hypercall definitions. */
 #define __HYPERVISOR_arch_0               48
-- 
cgit v1.2.3


From fc903f87364d1b00890fa991a6f116e2bb38ec1e Mon Sep 17 00:00:00 2001
From: Juergen Gross
Date: Wed, 21 Jan 2015 08:49:24 +0100
Subject: xen: use generated hypervisor symbols in arch/x86/xen/trace.c

Instead of manually list all hypervisor calls in arch/x86/xen/trace.c
use the auto generated list.

Signed-off-by: Juergen Gross <jgross@suse.com>
Reviewed-by: David Vrabel <david.vrabel@citrix.com>
Signed-off-by: David Vrabel <david.vrabel@citrix.com>
---
 arch/x86/xen/trace.c | 50 ++++----------------------------------------------
 1 file changed, 4 insertions(+), 46 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/xen/trace.c b/arch/x86/xen/trace.c
index 8296cdf4e581..a702ec2f5931 100644
--- a/arch/x86/xen/trace.c
+++ b/arch/x86/xen/trace.c
@@ -1,54 +1,12 @@
 #include <linux/ftrace.h>
 #include <xen/interface/xen.h>
+#include <xen/interface/xen-mca.h>
 
-#define N(x)	[__HYPERVISOR_##x] = "("#x")"
+#define HYPERCALL(x)	[__HYPERVISOR_##x] = "("#x")",
 static const char *xen_hypercall_names[] = {
-	N(set_trap_table),
-	N(mmu_update),
-	N(set_gdt),
-	N(stack_switch),
-	N(set_callbacks),
-	N(fpu_taskswitch),
-	N(sched_op_compat),
-	N(dom0_op),
-	N(set_debugreg),
-	N(get_debugreg),
-	N(update_descriptor),
-	N(memory_op),
-	N(multicall),
-	N(update_va_mapping),
-	N(set_timer_op),
-	N(event_channel_op_compat),
-	N(xen_version),
-	N(console_io),
-	N(physdev_op_compat),
-	N(grant_table_op),
-	N(vm_assist),
-	N(update_va_mapping_otherdomain),
-	N(iret),
-	N(vcpu_op),
-	N(set_segment_base),
-	N(mmuext_op),
-	N(xsm_op),
-	N(nmi_op),
-	N(sched_op),
-	N(callback_op),
-	N(xenoprof_op),
-	N(event_channel_op),
-	N(physdev_op),
-	N(hvm_op),
-
-/* Architecture-specific hypercall definitions. */
-	N(arch_0),
-	N(arch_1),
-	N(arch_2),
-	N(arch_3),
-	N(arch_4),
-	N(arch_5),
-	N(arch_6),
-	N(arch_7),
+#include <asm/xen-hypercalls.h>
 };
-#undef N
+#undef HYPERCALL
 
 static const char *xen_hypercall_name(unsigned op)
 {
-- 
cgit v1.2.3


From 526abeaed4971def462f209632b88fd7b8c4a09c Mon Sep 17 00:00:00 2001
From: Juergen Gross
Date: Wed, 21 Jan 2015 08:49:25 +0100
Subject: xen: use generated hypercall symbols in arch/x86/xen/xen-head.S

Instead of manually list each hypercall in arch/x86/xen/xen-head.S
use the auto generated symbol list.

This also corrects the wrong address of xen_hypercall_mca which was
located 32 bytes higher than it should.

Symbol addresses have been verified to match the correct ones via
objdump output.

Based-on-patch-by: Jan Beulich <jbeulich@suse.com>
Signed-off-by: Juergen Gross <jgross@suse.com>
Reviewed-by: David Vrabel <david.vrabel@citrix.com>
Signed-off-by: David Vrabel <david.vrabel@citrix.com>
---
 arch/x86/xen/xen-head.S | 63 ++++++++-----------------------------------------
 1 file changed, 10 insertions(+), 53 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S
index 674b222544b7..8afdfccf6086 100644
--- a/arch/x86/xen/xen-head.S
+++ b/arch/x86/xen/xen-head.S
@@ -12,6 +12,8 @@
 
 #include <xen/interface/elfnote.h>
 #include <xen/interface/features.h>
+#include <xen/interface/xen.h>
+#include <xen/interface/xen-mca.h>
 #include <asm/xen/interface.h>
 
 #ifdef CONFIG_XEN_PVH
@@ -85,59 +87,14 @@ ENTRY(xen_pvh_early_cpu_init)
 .pushsection .text
 	.balign PAGE_SIZE
 ENTRY(hypercall_page)
-#define NEXT_HYPERCALL(x) \
-	ENTRY(xen_hypercall_##x) \
-	.skip 32
-
-NEXT_HYPERCALL(set_trap_table)
-NEXT_HYPERCALL(mmu_update)
-NEXT_HYPERCALL(set_gdt)
-NEXT_HYPERCALL(stack_switch)
-NEXT_HYPERCALL(set_callbacks)
-NEXT_HYPERCALL(fpu_taskswitch)
-NEXT_HYPERCALL(sched_op_compat)
-NEXT_HYPERCALL(platform_op)
-NEXT_HYPERCALL(set_debugreg)
-NEXT_HYPERCALL(get_debugreg)
-NEXT_HYPERCALL(update_descriptor)
-NEXT_HYPERCALL(ni)
-NEXT_HYPERCALL(memory_op)
-NEXT_HYPERCALL(multicall)
-NEXT_HYPERCALL(update_va_mapping)
-NEXT_HYPERCALL(set_timer_op)
-NEXT_HYPERCALL(event_channel_op_compat)
-NEXT_HYPERCALL(xen_version)
-NEXT_HYPERCALL(console_io)
-NEXT_HYPERCALL(physdev_op_compat)
-NEXT_HYPERCALL(grant_table_op)
-NEXT_HYPERCALL(vm_assist)
-NEXT_HYPERCALL(update_va_mapping_otherdomain)
-NEXT_HYPERCALL(iret)
-NEXT_HYPERCALL(vcpu_op)
-NEXT_HYPERCALL(set_segment_base)
-NEXT_HYPERCALL(mmuext_op)
-NEXT_HYPERCALL(xsm_op)
-NEXT_HYPERCALL(nmi_op)
-NEXT_HYPERCALL(sched_op)
-NEXT_HYPERCALL(callback_op)
-NEXT_HYPERCALL(xenoprof_op)
-NEXT_HYPERCALL(event_channel_op)
-NEXT_HYPERCALL(physdev_op)
-NEXT_HYPERCALL(hvm_op)
-NEXT_HYPERCALL(sysctl)
-NEXT_HYPERCALL(domctl)
-NEXT_HYPERCALL(kexec_op)
-NEXT_HYPERCALL(tmem_op) /* 38 */
-ENTRY(xen_hypercall_rsvr)
-	.skip 320
-NEXT_HYPERCALL(mca) /* 48 */
-NEXT_HYPERCALL(arch_1)
-NEXT_HYPERCALL(arch_2)
-NEXT_HYPERCALL(arch_3)
-NEXT_HYPERCALL(arch_4)
-NEXT_HYPERCALL(arch_5)
-NEXT_HYPERCALL(arch_6)
-	.balign PAGE_SIZE
+	.skip PAGE_SIZE
+
+#define HYPERCALL(n) \
+	.equ xen_hypercall_##n, hypercall_page + __HYPERVISOR_##n * 32; \
+	.type xen_hypercall_##n, @function; .size xen_hypercall_##n, 32
+#include <asm/xen-hypercalls.h>
+#undef HYPERCALL
+
 .popsection
 
 	ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS,       .asciz "linux")
-- 
cgit v1.2.3


From feb44f1f7a4ac299d1ab1c3606860e70b9b89d69 Mon Sep 17 00:00:00 2001
From: Konrad Rzeszutek Wilk
Date: Mon, 2 Mar 2015 12:06:23 -0500
Subject: x86/xen: Provide a "Xen PV" APIC driver to support >255 VCPUs

Instead of mangling the default APIC driver, provide a Xen PV guest
specific one that explicitly provides appropriate methods.

This allows use to report that all APIC IDs are valid, allowing dom0
to boot with more than 255 VCPUs.

Since the probe order of APIC drivers is link dependent, we add in an
late probe function to change to the Xen PV if it hadn't been done
during bootup.

Suggested-by: David Vrabel <david.vrabel@citrix.com>
Reported-by: Cathy Avery <cathy.avery@oracle.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Signed-off-by: David Vrabel <david.vrabel@citrix.com>
---
 arch/x86/xen/apic.c      | 180 +++++++++++++++++++++++++++++++++++++++++++++++
 arch/x86/xen/enlighten.c |  90 +-----------------------
 2 files changed, 181 insertions(+), 89 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/xen/apic.c b/arch/x86/xen/apic.c
index 7005ced5d1ad..5e0ecaf4c336 100644
--- a/arch/x86/xen/apic.c
+++ b/arch/x86/xen/apic.c
@@ -7,6 +7,7 @@
 #include <xen/xen.h>
 #include <xen/interface/physdev.h>
 #include "xen-ops.h"
+#include "smp.h"
 
 static unsigned int xen_io_apic_read(unsigned apic, unsigned reg)
 {
@@ -28,7 +29,186 @@ static unsigned int xen_io_apic_read(unsigned apic, unsigned reg)
 	return 0xfd;
 }
 
+static unsigned long xen_set_apic_id(unsigned int x)
+{
+	WARN_ON(1);
+	return x;
+}
+
+static unsigned int xen_get_apic_id(unsigned long x)
+{
+	return ((x)>>24) & 0xFFu;
+}
+
+static u32 xen_apic_read(u32 reg)
+{
+	struct xen_platform_op op = {
+		.cmd = XENPF_get_cpuinfo,
+		.interface_version = XENPF_INTERFACE_VERSION,
+		.u.pcpu_info.xen_cpuid = 0,
+	};
+	int ret = 0;
+
+	/* Shouldn't need this as APIC is turned off for PV, and we only
+	 * get called on the bootup processor. But just in case. */
+	if (!xen_initial_domain() || smp_processor_id())
+		return 0;
+
+	if (reg == APIC_LVR)
+		return 0x10;
+#ifdef CONFIG_X86_32
+	if (reg == APIC_LDR)
+		return SET_APIC_LOGICAL_ID(1UL << smp_processor_id());
+#endif
+	if (reg != APIC_ID)
+		return 0;
+
+	ret = HYPERVISOR_dom0_op(&op);
+	if (ret)
+		return 0;
+
+	return op.u.pcpu_info.apic_id << 24;
+}
+
+static void xen_apic_write(u32 reg, u32 val)
+{
+	/* Warn to see if there's any stray references */
+	WARN_ON(1);
+}
+
+static u64 xen_apic_icr_read(void)
+{
+	return 0;
+}
+
+static void xen_apic_icr_write(u32 low, u32 id)
+{
+	/* Warn to see if there's any stray references */
+	WARN_ON(1);
+}
+
+static u32 xen_safe_apic_wait_icr_idle(void)
+{
+        return 0;
+}
+
+static int xen_apic_probe_pv(void)
+{
+	if (xen_pv_domain())
+		return 1;
+
+	return 0;
+}
+
+static int xen_madt_oem_check(char *oem_id, char *oem_table_id)
+{
+	return xen_pv_domain();
+}
+
+static int xen_id_always_valid(int apicid)
+{
+	return 1;
+}
+
+static int xen_id_always_registered(void)
+{
+	return 1;
+}
+
+static int xen_phys_pkg_id(int initial_apic_id, int index_msb)
+{
+	return initial_apic_id >> index_msb;
+}
+
+#ifdef CONFIG_X86_32
+static int xen_x86_32_early_logical_apicid(int cpu)
+{
+	/* Match with APIC_LDR read. Otherwise setup_local_APIC complains. */
+	return 1 << cpu;
+}
+#endif
+
+static void xen_noop(void)
+{
+}
+
+static void xen_silent_inquire(int apicid)
+{
+}
+
+static struct apic xen_pv_apic = {
+	.name 				= "Xen PV",
+	.probe 				= xen_apic_probe_pv,
+	.acpi_madt_oem_check		= xen_madt_oem_check,
+	.apic_id_valid 			= xen_id_always_valid,
+	.apic_id_registered 		= xen_id_always_registered,
+
+	/* .irq_delivery_mode - used in native_compose_msi_msg only */
+	/* .irq_dest_mode     - used in native_compose_msi_msg only */
+
+	.target_cpus			= default_target_cpus,
+	.disable_esr			= 0,
+	/* .dest_logical      -  default_send_IPI_ use it but we use our own. */
+	.check_apicid_used		= default_check_apicid_used, /* Used on 32-bit */
+
+	.vector_allocation_domain	= flat_vector_allocation_domain,
+	.init_apic_ldr			= xen_noop, /* setup_local_APIC calls it */
+
+	.ioapic_phys_id_map		= default_ioapic_phys_id_map, /* Used on 32-bit */
+	.setup_apic_routing		= NULL,
+	.cpu_present_to_apicid		= default_cpu_present_to_apicid,
+	.apicid_to_cpu_present		= physid_set_mask_of_physid, /* Used on 32-bit */
+	.check_phys_apicid_present	= default_check_phys_apicid_present, /* smp_sanity_check needs it */
+	.phys_pkg_id			= xen_phys_pkg_id, /* detect_ht */
+
+	.get_apic_id 			= xen_get_apic_id,
+	.set_apic_id 			= xen_set_apic_id, /* Can be NULL on 32-bit. */
+	.apic_id_mask			= 0xFF << 24, /* Used by verify_local_APIC. Match with what xen_get_apic_id does. */
+
+	.cpu_mask_to_apicid_and		= flat_cpu_mask_to_apicid_and,
+
+#ifdef CONFIG_SMP
+	.send_IPI_mask 			= xen_send_IPI_mask,
+	.send_IPI_mask_allbutself 	= xen_send_IPI_mask_allbutself,
+	.send_IPI_allbutself 		= xen_send_IPI_allbutself,
+	.send_IPI_all 			= xen_send_IPI_all,
+	.send_IPI_self 			= xen_send_IPI_self,
+#endif
+	/* .wait_for_init_deassert- used  by AP bootup - smp_callin which we don't use */
+	.inquire_remote_apic		= xen_silent_inquire,
+
+	.read				= xen_apic_read,
+	.write				= xen_apic_write,
+	.eoi_write			= xen_apic_write,
+
+	.icr_read 			= xen_apic_icr_read,
+	.icr_write 			= xen_apic_icr_write,
+	.wait_icr_idle 			= xen_noop,
+	.safe_wait_icr_idle 		= xen_safe_apic_wait_icr_idle,
+
+#ifdef CONFIG_X86_32
+	/* generic_processor_info and setup_local_APIC. */
+	.x86_32_early_logical_apicid	= xen_x86_32_early_logical_apicid,
+#endif
+};
+
+static void __init xen_apic_check(void)
+{
+	if (apic == &xen_pv_apic)
+		return;
+
+	pr_info("Switched APIC routing from %s to %s.\n", apic->name,
+		xen_pv_apic.name);
+	apic = &xen_pv_apic;
+}
 void __init xen_init_apic(void)
 {
 	x86_io_apic_ops.read = xen_io_apic_read;
+	/* On PV guests the APIC CPUID bit is disabled so none of the
+	 * routines end up executing. */
+	if (!xen_initial_domain())
+		apic = &xen_pv_apic;
+
+	x86_platform.apic_post_init = xen_apic_check;
 }
+apic_driver(xen_pv_apic);
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 5240f563076d..b9a227284149 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -927,92 +927,6 @@ static void xen_io_delay(void)
 {
 }
 
-#ifdef CONFIG_X86_LOCAL_APIC
-static unsigned long xen_set_apic_id(unsigned int x)
-{
-	WARN_ON(1);
-	return x;
-}
-static unsigned int xen_get_apic_id(unsigned long x)
-{
-	return ((x)>>24) & 0xFFu;
-}
-static u32 xen_apic_read(u32 reg)
-{
-	struct xen_platform_op op = {
-		.cmd = XENPF_get_cpuinfo,
-		.interface_version = XENPF_INTERFACE_VERSION,
-		.u.pcpu_info.xen_cpuid = 0,
-	};
-	int ret = 0;
-
-	/* Shouldn't need this as APIC is turned off for PV, and we only
-	 * get called on the bootup processor. But just in case. */
-	if (!xen_initial_domain() || smp_processor_id())
-		return 0;
-
-	if (reg == APIC_LVR)
-		return 0x10;
-
-	if (reg != APIC_ID)
-		return 0;
-
-	ret = HYPERVISOR_dom0_op(&op);
-	if (ret)
-		return 0;
-
-	return op.u.pcpu_info.apic_id << 24;
-}
-
-static void xen_apic_write(u32 reg, u32 val)
-{
-	/* Warn to see if there's any stray references */
-	WARN_ON(1);
-}
-
-static u64 xen_apic_icr_read(void)
-{
-	return 0;
-}
-
-static void xen_apic_icr_write(u32 low, u32 id)
-{
-	/* Warn to see if there's any stray references */
-	WARN_ON(1);
-}
-
-static void xen_apic_wait_icr_idle(void)
-{
-        return;
-}
-
-static u32 xen_safe_apic_wait_icr_idle(void)
-{
-        return 0;
-}
-
-static void set_xen_basic_apic_ops(void)
-{
-	apic->read = xen_apic_read;
-	apic->write = xen_apic_write;
-	apic->icr_read = xen_apic_icr_read;
-	apic->icr_write = xen_apic_icr_write;
-	apic->wait_icr_idle = xen_apic_wait_icr_idle;
-	apic->safe_wait_icr_idle = xen_safe_apic_wait_icr_idle;
-	apic->set_apic_id = xen_set_apic_id;
-	apic->get_apic_id = xen_get_apic_id;
-
-#ifdef CONFIG_SMP
-	apic->send_IPI_allbutself = xen_send_IPI_allbutself;
-	apic->send_IPI_mask_allbutself = xen_send_IPI_mask_allbutself;
-	apic->send_IPI_mask = xen_send_IPI_mask;
-	apic->send_IPI_all = xen_send_IPI_all;
-	apic->send_IPI_self = xen_send_IPI_self;
-#endif
-}
-
-#endif
-
 static void xen_clts(void)
 {
 	struct multicall_space mcs;
@@ -1618,7 +1532,7 @@ asmlinkage __visible void __init xen_start_kernel(void)
 	/*
 	 * set up the basic apic ops.
 	 */
-	set_xen_basic_apic_ops();
+	xen_init_apic();
 #endif
 
 	if (xen_feature(XENFEAT_mmu_pt_update_preserve_ad)) {
@@ -1731,8 +1645,6 @@ asmlinkage __visible void __init xen_start_kernel(void)
 		if (HYPERVISOR_dom0_op(&op) == 0)
 			boot_params.kbd_status = op.u.firmware_info.u.kbd_shift_flags;
 
-		xen_init_apic();
-
 		/* Make sure ACS will be enabled */
 		pci_request_acs();
 
-- 
cgit v1.2.3


From b3b06c7eb7820cea5c15f9faa4964044284c5399 Mon Sep 17 00:00:00 2001
From: Konrad Rzeszutek Wilk
Date: Tue, 3 Mar 2015 16:12:12 -0500
Subject: x86/xen/apic: WARN with details.

We should not be writting to the APIC registers under
Xen PV. But if we do instead of just giving an blanket
warning - include some details to help troubleshoot.

Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 arch/x86/xen/apic.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/xen/apic.c b/arch/x86/xen/apic.c
index 5e0ecaf4c336..70e060ad879a 100644
--- a/arch/x86/xen/apic.c
+++ b/arch/x86/xen/apic.c
@@ -73,7 +73,7 @@ static u32 xen_apic_read(u32 reg)
 static void xen_apic_write(u32 reg, u32 val)
 {
 	/* Warn to see if there's any stray references */
-	WARN_ON(1);
+	WARN(1,"register: %x, value: %x\n", reg, val);
 }
 
 static u64 xen_apic_icr_read(void)
-- 
cgit v1.2.3


From 628c28eefd6f2cef03b212081b466ae43fd093a3 Mon Sep 17 00:00:00 2001
From: David Vrabel
Date: Wed, 11 Mar 2015 14:49:56 +0000
Subject: xen: unify foreign GFN map/unmap for auto-xlated physmap guests

Auto-translated physmap guests (arm, arm64 and x86 PVHVM/PVH) map and
unmap foreign GFNs using the same method (updating the physmap).
Unify the two arm and x86 implementations into one commont one.

Note that on arm and arm64, the correct error code will be returned
(instead of always -EFAULT) and map/unmap failure warnings are no
longer printed.  These changes are required if the foreign domain is
paging (-ENOENT failures are expected and must be propagated up to the
caller).

Signed-off-by: David Vrabel <david.vrabel@citrix.com>
Reviewed-by: Stefano Stabellini <stefano.stabellini@eu.citrix.com>
---
 arch/arm/xen/enlighten.c |  90 ++------------------------------
 arch/x86/xen/mmu.c       | 110 ++-------------------------------------
 drivers/xen/Kconfig      |   6 +++
 drivers/xen/Makefile     |   1 +
 drivers/xen/xlate_mmu.c  | 133 +++++++++++++++++++++++++++++++++++++++++++++++
 include/xen/xen-ops.h    |   8 +++
 6 files changed, 154 insertions(+), 194 deletions(-)
 create mode 100644 drivers/xen/xlate_mmu.c

(limited to 'arch/x86')

diff --git a/arch/arm/xen/enlighten.c b/arch/arm/xen/enlighten.c
index 263a2044c65b..5c04389fc9ef 100644
--- a/arch/arm/xen/enlighten.c
+++ b/arch/arm/xen/enlighten.c
@@ -53,105 +53,21 @@ EXPORT_SYMBOL_GPL(xen_platform_pci_unplug);
 
 static __read_mostly int xen_events_irq = -1;
 
-/* map fgmfn of domid to lpfn in the current domain */
-static int map_foreign_page(unsigned long lpfn, unsigned long fgmfn,
-			    unsigned int domid)
-{
-	int rc;
-	struct xen_add_to_physmap_range xatp = {
-		.domid = DOMID_SELF,
-		.foreign_domid = domid,
-		.size = 1,
-		.space = XENMAPSPACE_gmfn_foreign,
-	};
-	xen_ulong_t idx = fgmfn;
-	xen_pfn_t gpfn = lpfn;
-	int err = 0;
-
-	set_xen_guest_handle(xatp.idxs, &idx);
-	set_xen_guest_handle(xatp.gpfns, &gpfn);
-	set_xen_guest_handle(xatp.errs, &err);
-
-	rc = HYPERVISOR_memory_op(XENMEM_add_to_physmap_range, &xatp);
-	if (rc || err) {
-		pr_warn("Failed to map pfn to mfn rc:%d:%d pfn:%lx mfn:%lx\n",
-			rc, err, lpfn, fgmfn);
-		return 1;
-	}
-	return 0;
-}
-
-struct remap_data {
-	xen_pfn_t fgmfn; /* foreign domain's gmfn */
-	pgprot_t prot;
-	domid_t  domid;
-	struct vm_area_struct *vma;
-	int index;
-	struct page **pages;
-	struct xen_remap_mfn_info *info;
-};
-
-static int remap_pte_fn(pte_t *ptep, pgtable_t token, unsigned long addr,
-			void *data)
-{
-	struct remap_data *info = data;
-	struct page *page = info->pages[info->index++];
-	unsigned long pfn = page_to_pfn(page);
-	pte_t pte = pte_mkspecial(pfn_pte(pfn, info->prot));
-
-	if (map_foreign_page(pfn, info->fgmfn, info->domid))
-		return -EFAULT;
-	set_pte_at(info->vma->vm_mm, addr, ptep, pte);
-
-	return 0;
-}
-
 int xen_remap_domain_mfn_range(struct vm_area_struct *vma,
 			       unsigned long addr,
 			       xen_pfn_t mfn, int nr,
 			       pgprot_t prot, unsigned domid,
 			       struct page **pages)
 {
-	int err;
-	struct remap_data data;
-
-	/* TBD: Batching, current sole caller only does page at a time */
-	if (nr > 1)
-		return -EINVAL;
-
-	data.fgmfn = mfn;
-	data.prot = prot;
-	data.domid = domid;
-	data.vma = vma;
-	data.index = 0;
-	data.pages = pages;
-	err = apply_to_page_range(vma->vm_mm, addr, nr << PAGE_SHIFT,
-				  remap_pte_fn, &data);
-	return err;
+	return xen_xlate_remap_gfn_range(vma, addr, mfn, nr,
+					 prot, domid, pages);
 }
 EXPORT_SYMBOL_GPL(xen_remap_domain_mfn_range);
 
 int xen_unmap_domain_mfn_range(struct vm_area_struct *vma,
 			       int nr, struct page **pages)
 {
-	int i;
-
-	for (i = 0; i < nr; i++) {
-		struct xen_remove_from_physmap xrp;
-		unsigned long rc, pfn;
-
-		pfn = page_to_pfn(pages[i]);
-
-		xrp.domid = DOMID_SELF;
-		xrp.gpfn = pfn;
-		rc = HYPERVISOR_memory_op(XENMEM_remove_from_physmap, &xrp);
-		if (rc) {
-			pr_warn("Failed to unmap pfn:%lx rc:%ld\n",
-				pfn, rc);
-			return rc;
-		}
-	}
-	return 0;
+	return xen_xlate_unmap_gfn_range(vma, nr, pages);
 }
 EXPORT_SYMBOL_GPL(xen_unmap_domain_mfn_range);
 
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index adca9e2b6553..3d536a56ddf1 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -2436,95 +2436,6 @@ void __init xen_hvm_init_mmu_ops(void)
 }
 #endif
 
-#ifdef CONFIG_XEN_PVH
-/*
- * Map foreign gfn (fgfn), to local pfn (lpfn). This for the user
- * space creating new guest on pvh dom0 and needing to map domU pages.
- */
-static int xlate_add_to_p2m(unsigned long lpfn, unsigned long fgfn,
-			    unsigned int domid)
-{
-	int rc, err = 0;
-	xen_pfn_t gpfn = lpfn;
-	xen_ulong_t idx = fgfn;
-
-	struct xen_add_to_physmap_range xatp = {
-		.domid = DOMID_SELF,
-		.foreign_domid = domid,
-		.size = 1,
-		.space = XENMAPSPACE_gmfn_foreign,
-	};
-	set_xen_guest_handle(xatp.idxs, &idx);
-	set_xen_guest_handle(xatp.gpfns, &gpfn);
-	set_xen_guest_handle(xatp.errs, &err);
-
-	rc = HYPERVISOR_memory_op(XENMEM_add_to_physmap_range, &xatp);
-	if (rc < 0)
-		return rc;
-	return err;
-}
-
-static int xlate_remove_from_p2m(unsigned long spfn, int count)
-{
-	struct xen_remove_from_physmap xrp;
-	int i, rc;
-
-	for (i = 0; i < count; i++) {
-		xrp.domid = DOMID_SELF;
-		xrp.gpfn = spfn+i;
-		rc = HYPERVISOR_memory_op(XENMEM_remove_from_physmap, &xrp);
-		if (rc)
-			break;
-	}
-	return rc;
-}
-
-struct xlate_remap_data {
-	unsigned long fgfn; /* foreign domain's gfn */
-	pgprot_t prot;
-	domid_t  domid;
-	int index;
-	struct page **pages;
-};
-
-static int xlate_map_pte_fn(pte_t *ptep, pgtable_t token, unsigned long addr,
-			    void *data)
-{
-	int rc;
-	struct xlate_remap_data *remap = data;
-	unsigned long pfn = page_to_pfn(remap->pages[remap->index++]);
-	pte_t pteval = pte_mkspecial(pfn_pte(pfn, remap->prot));
-
-	rc = xlate_add_to_p2m(pfn, remap->fgfn, remap->domid);
-	if (rc)
-		return rc;
-	native_set_pte(ptep, pteval);
-
-	return 0;
-}
-
-static int xlate_remap_gfn_range(struct vm_area_struct *vma,
-				 unsigned long addr, unsigned long mfn,
-				 int nr, pgprot_t prot, unsigned domid,
-				 struct page **pages)
-{
-	int err;
-	struct xlate_remap_data pvhdata;
-
-	BUG_ON(!pages);
-
-	pvhdata.fgfn = mfn;
-	pvhdata.prot = prot;
-	pvhdata.domid = domid;
-	pvhdata.index = 0;
-	pvhdata.pages = pages;
-	err = apply_to_page_range(vma->vm_mm, addr, nr << PAGE_SHIFT,
-				  xlate_map_pte_fn, &pvhdata);
-	flush_tlb_all();
-	return err;
-}
-#endif
-
 #define REMAP_BATCH_SIZE 16
 
 struct remap_data {
@@ -2564,8 +2475,8 @@ int xen_remap_domain_mfn_range(struct vm_area_struct *vma,
 	if (xen_feature(XENFEAT_auto_translated_physmap)) {
 #ifdef CONFIG_XEN_PVH
 		/* We need to update the local page tables and the xen HAP */
-		return xlate_remap_gfn_range(vma, addr, mfn, nr, prot,
-					     domid, pages);
+		return xen_xlate_remap_gfn_range(vma, addr, mfn, nr, prot,
+						 domid, pages);
 #else
 		return -EINVAL;
 #endif
@@ -2609,22 +2520,7 @@ int xen_unmap_domain_mfn_range(struct vm_area_struct *vma,
 		return 0;
 
 #ifdef CONFIG_XEN_PVH
-	while (numpgs--) {
-		/*
-		 * The mmu has already cleaned up the process mmu
-		 * resources at this point (lookup_address will return
-		 * NULL).
-		 */
-		unsigned long pfn = page_to_pfn(pages[numpgs]);
-
-		xlate_remove_from_p2m(pfn, 1);
-	}
-	/*
-	 * We don't need to flush tlbs because as part of
-	 * xlate_remove_from_p2m, the hypervisor will do tlb flushes
-	 * after removing the p2m entries from the EPT/NPT
-	 */
-	return 0;
+	return xen_xlate_unmap_gfn_range(vma, numpgs, pages);
 #else
 	return -EINVAL;
 #endif
diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig
index b812462083fc..afc39ca5cc4f 100644
--- a/drivers/xen/Kconfig
+++ b/drivers/xen/Kconfig
@@ -253,4 +253,10 @@ config XEN_EFI
 	def_bool y
 	depends on X86_64 && EFI
 
+config XEN_AUTO_XLATE
+	def_bool y
+	depends on ARM || ARM64 || XEN_PVHVM
+	help
+	  Support for auto-translated physmap guests.
+
 endmenu
diff --git a/drivers/xen/Makefile b/drivers/xen/Makefile
index 2ccd3592d41f..40edd1cbb60d 100644
--- a/drivers/xen/Makefile
+++ b/drivers/xen/Makefile
@@ -37,6 +37,7 @@ obj-$(CONFIG_XEN_ACPI_HOTPLUG_CPU)	+= xen-acpi-cpuhotplug.o
 obj-$(CONFIG_XEN_ACPI_PROCESSOR)	+= xen-acpi-processor.o
 obj-$(CONFIG_XEN_EFI)			+= efi.o
 obj-$(CONFIG_XEN_SCSI_BACKEND)		+= xen-scsiback.o
+obj-$(CONFIG_XEN_AUTO_XLATE)		+= xlate_mmu.o
 xen-evtchn-y				:= evtchn.o
 xen-gntdev-y				:= gntdev.o
 xen-gntalloc-y				:= gntalloc.o
diff --git a/drivers/xen/xlate_mmu.c b/drivers/xen/xlate_mmu.c
new file mode 100644
index 000000000000..7724d90fc697
--- /dev/null
+++ b/drivers/xen/xlate_mmu.c
@@ -0,0 +1,133 @@
+/*
+ * MMU operations common to all auto-translated physmap guests.
+ *
+ * Copyright (C) 2015 Citrix Systems R&D Ltd.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#include <linux/kernel.h>
+#include <linux/mm.h>
+
+#include <asm/xen/hypercall.h>
+#include <asm/xen/hypervisor.h>
+
+#include <xen/xen.h>
+#include <xen/page.h>
+#include <xen/interface/xen.h>
+#include <xen/interface/memory.h>
+
+/* map fgmfn of domid to lpfn in the current domain */
+static int map_foreign_page(unsigned long lpfn, unsigned long fgmfn,
+			    unsigned int domid)
+{
+	int rc;
+	struct xen_add_to_physmap_range xatp = {
+		.domid = DOMID_SELF,
+		.foreign_domid = domid,
+		.size = 1,
+		.space = XENMAPSPACE_gmfn_foreign,
+	};
+	xen_ulong_t idx = fgmfn;
+	xen_pfn_t gpfn = lpfn;
+	int err = 0;
+
+	set_xen_guest_handle(xatp.idxs, &idx);
+	set_xen_guest_handle(xatp.gpfns, &gpfn);
+	set_xen_guest_handle(xatp.errs, &err);
+
+	rc = HYPERVISOR_memory_op(XENMEM_add_to_physmap_range, &xatp);
+	return rc < 0 ? rc : err;
+}
+
+struct remap_data {
+	xen_pfn_t fgmfn; /* foreign domain's gmfn */
+	pgprot_t prot;
+	domid_t  domid;
+	struct vm_area_struct *vma;
+	int index;
+	struct page **pages;
+	struct xen_remap_mfn_info *info;
+};
+
+static int remap_pte_fn(pte_t *ptep, pgtable_t token, unsigned long addr,
+			void *data)
+{
+	struct remap_data *info = data;
+	struct page *page = info->pages[info->index++];
+	unsigned long pfn = page_to_pfn(page);
+	pte_t pte = pte_mkspecial(pfn_pte(pfn, info->prot));
+	int rc;
+
+	rc = map_foreign_page(pfn, info->fgmfn, info->domid);
+	if (rc < 0)
+		return rc;
+	set_pte_at(info->vma->vm_mm, addr, ptep, pte);
+
+	return 0;
+}
+
+int xen_xlate_remap_gfn_range(struct vm_area_struct *vma,
+			      unsigned long addr,
+			      xen_pfn_t gfn, int nr,
+			      pgprot_t prot, unsigned domid,
+			      struct page **pages)
+{
+	int err;
+	struct remap_data data;
+
+	/* TBD: Batching, current sole caller only does page at a time */
+	if (nr > 1)
+		return -EINVAL;
+
+	data.fgmfn = gfn;
+	data.prot = prot;
+	data.domid = domid;
+	data.vma = vma;
+	data.index = 0;
+	data.pages = pages;
+	err = apply_to_page_range(vma->vm_mm, addr, nr << PAGE_SHIFT,
+				  remap_pte_fn, &data);
+	return err;
+}
+EXPORT_SYMBOL_GPL(xen_xlate_remap_gfn_range);
+
+int xen_xlate_unmap_gfn_range(struct vm_area_struct *vma,
+			      int nr, struct page **pages)
+{
+	int i;
+
+	for (i = 0; i < nr; i++) {
+		struct xen_remove_from_physmap xrp;
+		unsigned long pfn;
+
+		pfn = page_to_pfn(pages[i]);
+
+		xrp.domid = DOMID_SELF;
+		xrp.gpfn = pfn;
+		(void)HYPERVISOR_memory_op(XENMEM_remove_from_physmap, &xrp);
+	}
+	return 0;
+}
+EXPORT_SYMBOL_GPL(xen_xlate_unmap_gfn_range);
diff --git a/include/xen/xen-ops.h b/include/xen/xen-ops.h
index 83338210ee04..9eb88a4512bd 100644
--- a/include/xen/xen-ops.h
+++ b/include/xen/xen-ops.h
@@ -34,6 +34,14 @@ int xen_remap_domain_mfn_range(struct vm_area_struct *vma,
 			       struct page **pages);
 int xen_unmap_domain_mfn_range(struct vm_area_struct *vma,
 			       int numpgs, struct page **pages);
+int xen_xlate_remap_gfn_range(struct vm_area_struct *vma,
+			      unsigned long addr,
+			      xen_pfn_t gfn, int nr,
+			      pgprot_t prot,
+			      unsigned domid,
+			      struct page **pages);
+int xen_xlate_unmap_gfn_range(struct vm_area_struct *vma,
+			      int nr, struct page **pages);
 
 bool xen_running_on_version_or_later(unsigned int major, unsigned int minor);
 
-- 
cgit v1.2.3


From 4e8c0c8c4bf3a5b5c98046e146ab3884bf7a7d0e Mon Sep 17 00:00:00 2001
From: David Vrabel
Date: Wed, 11 Mar 2015 14:49:57 +0000
Subject: xen/privcmd: improve performance of MMAPBATCH_V2

Make the IOCTL_PRIVCMD_MMAPBATCH_V2 (and older V1 version) map
multiple frames at a time rather than one at a time, despite the pages
being non-consecutive GFNs.

xen_remap_foreign_mfn_array() is added which maps an array of GFNs
(instead of a consecutive range of GFNs).

Since per-frame errors are returned in an array, privcmd must set the
MMAPBATCH_V1 error bits as part of the "report errors" phase, after
all the frames are mapped.

Migrate times are significantly improved (when using a PV toolstack
domain).  For example, for an idle 12 GiB PV guest:

        Before     After
  real  0m38.179s  0m26.868s
  user  0m15.096s  0m13.652s
  sys   0m28.988s  0m18.732s

Signed-off-by: David Vrabel <david.vrabel@citrix.com>
Reviewed-by: Stefano Stabellini <stefano.stabellini@eu.citrix.com>
---
 arch/arm/xen/enlighten.c |  20 ++++++--
 arch/x86/xen/mmu.c       | 101 ++++++++++++++++++++++++++++++++--------
 drivers/xen/privcmd.c    | 117 +++++++++++++++++++++++++++++++++--------------
 drivers/xen/xlate_mmu.c  |  46 +++++++++++--------
 include/xen/xen-ops.h    |  45 ++++++++++++++++--
 5 files changed, 249 insertions(+), 80 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/arm/xen/enlighten.c b/arch/arm/xen/enlighten.c
index 5c04389fc9ef..224081ccc92f 100644
--- a/arch/arm/xen/enlighten.c
+++ b/arch/arm/xen/enlighten.c
@@ -53,15 +53,27 @@ EXPORT_SYMBOL_GPL(xen_platform_pci_unplug);
 
 static __read_mostly int xen_events_irq = -1;
 
-int xen_remap_domain_mfn_range(struct vm_area_struct *vma,
+int xen_remap_domain_mfn_array(struct vm_area_struct *vma,
 			       unsigned long addr,
-			       xen_pfn_t mfn, int nr,
-			       pgprot_t prot, unsigned domid,
+			       xen_pfn_t *mfn, int nr,
+			       int *err_ptr, pgprot_t prot,
+			       unsigned domid,
 			       struct page **pages)
 {
-	return xen_xlate_remap_gfn_range(vma, addr, mfn, nr,
+	return xen_xlate_remap_gfn_array(vma, addr, mfn, nr, err_ptr,
 					 prot, domid, pages);
 }
+EXPORT_SYMBOL_GPL(xen_remap_domain_mfn_array);
+
+/* Not used by XENFEAT_auto_translated guests. */
+int xen_remap_domain_mfn_range(struct vm_area_struct *vma,
+                              unsigned long addr,
+                              xen_pfn_t mfn, int nr,
+                              pgprot_t prot, unsigned domid,
+                              struct page **pages)
+{
+	return -ENOSYS;
+}
 EXPORT_SYMBOL_GPL(xen_remap_domain_mfn_range);
 
 int xen_unmap_domain_mfn_range(struct vm_area_struct *vma,
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 3d536a56ddf1..29b3be230ede 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -2439,7 +2439,8 @@ void __init xen_hvm_init_mmu_ops(void)
 #define REMAP_BATCH_SIZE 16
 
 struct remap_data {
-	unsigned long mfn;
+	xen_pfn_t *mfn;
+	bool contiguous;
 	pgprot_t prot;
 	struct mmu_update *mmu_update;
 };
@@ -2448,7 +2449,14 @@ static int remap_area_mfn_pte_fn(pte_t *ptep, pgtable_t token,
 				 unsigned long addr, void *data)
 {
 	struct remap_data *rmd = data;
-	pte_t pte = pte_mkspecial(mfn_pte(rmd->mfn++, rmd->prot));
+	pte_t pte = pte_mkspecial(mfn_pte(*rmd->mfn, rmd->prot));
+
+	/* If we have a contigious range, just update the mfn itself,
+	   else update pointer to be "next mfn". */
+	if (rmd->contiguous)
+		(*rmd->mfn)++;
+	else
+		rmd->mfn++;
 
 	rmd->mmu_update->ptr = virt_to_machine(ptep).maddr;
 	rmd->mmu_update->val = pte_val_ma(pte);
@@ -2457,26 +2465,26 @@ static int remap_area_mfn_pte_fn(pte_t *ptep, pgtable_t token,
 	return 0;
 }
 
-int xen_remap_domain_mfn_range(struct vm_area_struct *vma,
-			       unsigned long addr,
-			       xen_pfn_t mfn, int nr,
-			       pgprot_t prot, unsigned domid,
-			       struct page **pages)
-
+static int do_remap_mfn(struct vm_area_struct *vma,
+			unsigned long addr,
+			xen_pfn_t *mfn, int nr,
+			int *err_ptr, pgprot_t prot,
+			unsigned domid,
+			struct page **pages)
 {
+	int err = 0;
 	struct remap_data rmd;
 	struct mmu_update mmu_update[REMAP_BATCH_SIZE];
-	int batch;
 	unsigned long range;
-	int err = 0;
+	int mapped = 0;
 
 	BUG_ON(!((vma->vm_flags & (VM_PFNMAP | VM_IO)) == (VM_PFNMAP | VM_IO)));
 
 	if (xen_feature(XENFEAT_auto_translated_physmap)) {
 #ifdef CONFIG_XEN_PVH
 		/* We need to update the local page tables and the xen HAP */
-		return xen_xlate_remap_gfn_range(vma, addr, mfn, nr, prot,
-						 domid, pages);
+		return xen_xlate_remap_gfn_array(vma, addr, mfn, nr, err_ptr,
+						 prot, domid, pages);
 #else
 		return -EINVAL;
 #endif
@@ -2484,9 +2492,15 @@ int xen_remap_domain_mfn_range(struct vm_area_struct *vma,
 
 	rmd.mfn = mfn;
 	rmd.prot = prot;
+	/* We use the err_ptr to indicate if there we are doing a contigious
+	 * mapping or a discontigious mapping. */
+	rmd.contiguous = !err_ptr;
 
 	while (nr) {
-		batch = min(REMAP_BATCH_SIZE, nr);
+		int index = 0;
+		int done = 0;
+		int batch = min(REMAP_BATCH_SIZE, nr);
+		int batch_left = batch;
 		range = (unsigned long)batch << PAGE_SHIFT;
 
 		rmd.mmu_update = mmu_update;
@@ -2495,23 +2509,72 @@ int xen_remap_domain_mfn_range(struct vm_area_struct *vma,
 		if (err)
 			goto out;
 
-		err = HYPERVISOR_mmu_update(mmu_update, batch, NULL, domid);
-		if (err < 0)
-			goto out;
+		/* We record the error for each page that gives an error, but
+		 * continue mapping until the whole set is done */
+		do {
+			int i;
+
+			err = HYPERVISOR_mmu_update(&mmu_update[index],
+						    batch_left, &done, domid);
+
+			/*
+			 * @err_ptr may be the same buffer as @mfn, so
+			 * only clear it after each chunk of @mfn is
+			 * used.
+			 */
+			if (err_ptr) {
+				for (i = index; i < index + done; i++)
+					err_ptr[i] = 0;
+			}
+			if (err < 0) {
+				if (!err_ptr)
+					goto out;
+				err_ptr[i] = err;
+				done++; /* Skip failed frame. */
+			} else
+				mapped += done;
+			batch_left -= done;
+			index += done;
+		} while (batch_left);
 
 		nr -= batch;
 		addr += range;
+		if (err_ptr)
+			err_ptr += batch;
 	}
-
-	err = 0;
 out:
 
 	xen_flush_tlb_all();
 
-	return err;
+	return err < 0 ? err : mapped;
+}
+
+int xen_remap_domain_mfn_range(struct vm_area_struct *vma,
+			       unsigned long addr,
+			       xen_pfn_t mfn, int nr,
+			       pgprot_t prot, unsigned domid,
+			       struct page **pages)
+{
+	return do_remap_mfn(vma, addr, &mfn, nr, NULL, prot, domid, pages);
 }
 EXPORT_SYMBOL_GPL(xen_remap_domain_mfn_range);
 
+int xen_remap_domain_mfn_array(struct vm_area_struct *vma,
+			       unsigned long addr,
+			       xen_pfn_t *mfn, int nr,
+			       int *err_ptr, pgprot_t prot,
+			       unsigned domid, struct page **pages)
+{
+	/* We BUG_ON because it's a programmer error to pass a NULL err_ptr,
+	 * and the consequences later is quite hard to detect what the actual
+	 * cause of "wrong memory was mapped in".
+	 */
+	BUG_ON(err_ptr == NULL);
+	return do_remap_mfn(vma, addr, mfn, nr, err_ptr, prot, domid, pages);
+}
+EXPORT_SYMBOL_GPL(xen_remap_domain_mfn_array);
+
+
 /* Returns: 0 success */
 int xen_unmap_domain_mfn_range(struct vm_area_struct *vma,
 			       int numpgs, struct page **pages)
diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c
index 59ac71c4a043..5a296161d843 100644
--- a/drivers/xen/privcmd.c
+++ b/drivers/xen/privcmd.c
@@ -159,6 +159,40 @@ static int traverse_pages(unsigned nelem, size_t size,
 	return ret;
 }
 
+/*
+ * Similar to traverse_pages, but use each page as a "block" of
+ * data to be processed as one unit.
+ */
+static int traverse_pages_block(unsigned nelem, size_t size,
+				struct list_head *pos,
+				int (*fn)(void *data, int nr, void *state),
+				void *state)
+{
+	void *pagedata;
+	unsigned pageidx;
+	int ret = 0;
+
+	BUG_ON(size > PAGE_SIZE);
+
+	pageidx = PAGE_SIZE;
+
+	while (nelem) {
+		int nr = (PAGE_SIZE/size);
+		struct page *page;
+		if (nr > nelem)
+			nr = nelem;
+		pos = pos->next;
+		page = list_entry(pos, struct page, lru);
+		pagedata = page_address(page);
+		ret = (*fn)(pagedata, nr, state);
+		if (ret)
+			break;
+		nelem -= nr;
+	}
+
+	return ret;
+}
+
 struct mmap_mfn_state {
 	unsigned long va;
 	struct vm_area_struct *vma;
@@ -274,39 +308,25 @@ struct mmap_batch_state {
 /* auto translated dom0 note: if domU being created is PV, then mfn is
  * mfn(addr on bus). If it's auto xlated, then mfn is pfn (input to HAP).
  */
-static int mmap_batch_fn(void *data, void *state)
+static int mmap_batch_fn(void *data, int nr, void *state)
 {
 	xen_pfn_t *mfnp = data;
 	struct mmap_batch_state *st = state;
 	struct vm_area_struct *vma = st->vma;
 	struct page **pages = vma->vm_private_data;
-	struct page *cur_page = NULL;
+	struct page **cur_pages = NULL;
 	int ret;
 
 	if (xen_feature(XENFEAT_auto_translated_physmap))
-		cur_page = pages[st->index++];
+		cur_pages = &pages[st->index];
 
-	ret = xen_remap_domain_mfn_range(st->vma, st->va & PAGE_MASK, *mfnp, 1,
-					 st->vma->vm_page_prot, st->domain,
-					 &cur_page);
+	BUG_ON(nr < 0);
+	ret = xen_remap_domain_mfn_array(st->vma, st->va & PAGE_MASK, mfnp, nr,
+					 (int *)mfnp, st->vma->vm_page_prot,
+					 st->domain, cur_pages);
 
-	/* Store error code for second pass. */
-	if (st->version == 1) {
-		if (ret < 0) {
-			/*
-			 * V1 encodes the error codes in the 32bit top nibble of the
-			 * mfn (with its known limitations vis-a-vis 64 bit callers).
-			 */
-			*mfnp |= (ret == -ENOENT) ?
-						PRIVCMD_MMAPBATCH_PAGED_ERROR :
-						PRIVCMD_MMAPBATCH_MFN_ERROR;
-		}
-	} else { /* st->version == 2 */
-		*((int *) mfnp) = ret;
-	}
-
-	/* And see if it affects the global_error. */
-	if (ret < 0) {
+	/* Adjust the global_error? */
+	if (ret != nr) {
 		if (ret == -ENOENT)
 			st->global_error = -ENOENT;
 		else {
@@ -315,23 +335,35 @@ static int mmap_batch_fn(void *data, void *state)
 				st->global_error = 1;
 		}
 	}
-	st->va += PAGE_SIZE;
+	st->va += PAGE_SIZE * nr;
+	st->index += nr;
 
 	return 0;
 }
 
-static int mmap_return_errors(void *data, void *state)
+static int mmap_return_error(int err, struct mmap_batch_state *st)
 {
-	struct mmap_batch_state *st = state;
+	int ret;
 
 	if (st->version == 1) {
-		xen_pfn_t mfnp = *((xen_pfn_t *) data);
-		if (mfnp & PRIVCMD_MMAPBATCH_MFN_ERROR)
-			return __put_user(mfnp, st->user_mfn++);
-		else
+		if (err) {
+			xen_pfn_t mfn;
+
+			ret = get_user(mfn, st->user_mfn);
+			if (ret < 0)
+				return ret;
+			/*
+			 * V1 encodes the error codes in the 32bit top
+			 * nibble of the mfn (with its known
+			 * limitations vis-a-vis 64 bit callers).
+			 */
+			mfn |= (err == -ENOENT) ?
+				PRIVCMD_MMAPBATCH_PAGED_ERROR :
+				PRIVCMD_MMAPBATCH_MFN_ERROR;
+			return __put_user(mfn, st->user_mfn++);
+		} else
 			st->user_mfn++;
 	} else { /* st->version == 2 */
-		int err = *((int *) data);
 		if (err)
 			return __put_user(err, st->user_err++);
 		else
@@ -341,6 +373,21 @@ static int mmap_return_errors(void *data, void *state)
 	return 0;
 }
 
+static int mmap_return_errors(void *data, int nr, void *state)
+{
+	struct mmap_batch_state *st = state;
+	int *errs = data;
+	int i;
+	int ret;
+
+	for (i = 0; i < nr; i++) {
+		ret = mmap_return_error(errs[i], st);
+		if (ret < 0)
+			return ret;
+	}
+	return 0;
+}
+
 /* Allocate pfns that are then mapped with gmfns from foreign domid. Update
  * the vma with the page info to use later.
  * Returns: 0 if success, otherwise -errno
@@ -472,8 +519,8 @@ static long privcmd_ioctl_mmap_batch(void __user *udata, int version)
 	state.version       = version;
 
 	/* mmap_batch_fn guarantees ret == 0 */
-	BUG_ON(traverse_pages(m.num, sizeof(xen_pfn_t),
-			     &pagelist, mmap_batch_fn, &state));
+	BUG_ON(traverse_pages_block(m.num, sizeof(xen_pfn_t),
+				    &pagelist, mmap_batch_fn, &state));
 
 	up_write(&mm->mmap_sem);
 
@@ -481,8 +528,8 @@ static long privcmd_ioctl_mmap_batch(void __user *udata, int version)
 		/* Write back errors in second pass. */
 		state.user_mfn = (xen_pfn_t *)m.arr;
 		state.user_err = m.err;
-		ret = traverse_pages(m.num, sizeof(xen_pfn_t),
-							 &pagelist, mmap_return_errors, &state);
+		ret = traverse_pages_block(m.num, sizeof(xen_pfn_t),
+					   &pagelist, mmap_return_errors, &state);
 	} else
 		ret = 0;
 
diff --git a/drivers/xen/xlate_mmu.c b/drivers/xen/xlate_mmu.c
index 7724d90fc697..58a5389aec89 100644
--- a/drivers/xen/xlate_mmu.c
+++ b/drivers/xen/xlate_mmu.c
@@ -62,13 +62,15 @@ static int map_foreign_page(unsigned long lpfn, unsigned long fgmfn,
 }
 
 struct remap_data {
-	xen_pfn_t fgmfn; /* foreign domain's gmfn */
+	xen_pfn_t *fgmfn; /* foreign domain's gmfn */
 	pgprot_t prot;
 	domid_t  domid;
 	struct vm_area_struct *vma;
 	int index;
 	struct page **pages;
 	struct xen_remap_mfn_info *info;
+	int *err_ptr;
+	int mapped;
 };
 
 static int remap_pte_fn(pte_t *ptep, pgtable_t token, unsigned long addr,
@@ -80,38 +82,46 @@ static int remap_pte_fn(pte_t *ptep, pgtable_t token, unsigned long addr,
 	pte_t pte = pte_mkspecial(pfn_pte(pfn, info->prot));
 	int rc;
 
-	rc = map_foreign_page(pfn, info->fgmfn, info->domid);
-	if (rc < 0)
-		return rc;
-	set_pte_at(info->vma->vm_mm, addr, ptep, pte);
+	rc = map_foreign_page(pfn, *info->fgmfn, info->domid);
+	*info->err_ptr++ = rc;
+	if (!rc) {
+		set_pte_at(info->vma->vm_mm, addr, ptep, pte);
+		info->mapped++;
+	}
+	info->fgmfn++;
 
 	return 0;
 }
 
-int xen_xlate_remap_gfn_range(struct vm_area_struct *vma,
+int xen_xlate_remap_gfn_array(struct vm_area_struct *vma,
 			      unsigned long addr,
-			      xen_pfn_t gfn, int nr,
-			      pgprot_t prot, unsigned domid,
+			      xen_pfn_t *mfn, int nr,
+			      int *err_ptr, pgprot_t prot,
+			      unsigned domid,
 			      struct page **pages)
 {
 	int err;
 	struct remap_data data;
+	unsigned long range = nr << PAGE_SHIFT;
 
-	/* TBD: Batching, current sole caller only does page at a time */
-	if (nr > 1)
-		return -EINVAL;
+	/* Kept here for the purpose of making sure code doesn't break
+	   x86 PVOPS */
+	BUG_ON(!((vma->vm_flags & (VM_PFNMAP | VM_IO)) == (VM_PFNMAP | VM_IO)));
 
-	data.fgmfn = gfn;
-	data.prot = prot;
+	data.fgmfn = mfn;
+	data.prot  = prot;
 	data.domid = domid;
-	data.vma = vma;
-	data.index = 0;
+	data.vma   = vma;
 	data.pages = pages;
-	err = apply_to_page_range(vma->vm_mm, addr, nr << PAGE_SHIFT,
+	data.index = 0;
+	data.err_ptr = err_ptr;
+	data.mapped = 0;
+
+	err = apply_to_page_range(vma->vm_mm, addr, range,
 				  remap_pte_fn, &data);
-	return err;
+	return err < 0 ? err : data.mapped;
 }
-EXPORT_SYMBOL_GPL(xen_xlate_remap_gfn_range);
+EXPORT_SYMBOL_GPL(xen_xlate_remap_gfn_array);
 
 int xen_xlate_unmap_gfn_range(struct vm_area_struct *vma,
 			      int nr, struct page **pages)
diff --git a/include/xen/xen-ops.h b/include/xen/xen-ops.h
index 9eb88a4512bd..c643e6a94c9a 100644
--- a/include/xen/xen-ops.h
+++ b/include/xen/xen-ops.h
@@ -27,17 +27,54 @@ int xen_create_contiguous_region(phys_addr_t pstart, unsigned int order,
 void xen_destroy_contiguous_region(phys_addr_t pstart, unsigned int order);
 
 struct vm_area_struct;
+
+/*
+ * xen_remap_domain_mfn_array() - map an array of foreign frames
+ * @vma:     VMA to map the pages into
+ * @addr:    Address at which to map the pages
+ * @gfn:     Array of GFNs to map
+ * @nr:      Number entries in the GFN array
+ * @err_ptr: Returns per-GFN error status.
+ * @prot:    page protection mask
+ * @domid:   Domain owning the pages
+ * @pages:   Array of pages if this domain has an auto-translated physmap
+ *
+ * @gfn and @err_ptr may point to the same buffer, the GFNs will be
+ * overwritten by the error codes after they are mapped.
+ *
+ * Returns the number of successfully mapped frames, or a -ve error
+ * code.
+ */
+int xen_remap_domain_mfn_array(struct vm_area_struct *vma,
+			       unsigned long addr,
+			       xen_pfn_t *gfn, int nr,
+			       int *err_ptr, pgprot_t prot,
+			       unsigned domid,
+			       struct page **pages);
+
+/* xen_remap_domain_mfn_range() - map a range of foreign frames
+ * @vma:     VMA to map the pages into
+ * @addr:    Address at which to map the pages
+ * @gfn:     First GFN to map.
+ * @nr:      Number frames to map
+ * @prot:    page protection mask
+ * @domid:   Domain owning the pages
+ * @pages:   Array of pages if this domain has an auto-translated physmap
+ *
+ * Returns the number of successfully mapped frames, or a -ve error
+ * code.
+ */
 int xen_remap_domain_mfn_range(struct vm_area_struct *vma,
 			       unsigned long addr,
-			       xen_pfn_t mfn, int nr,
+			       xen_pfn_t gfn, int nr,
 			       pgprot_t prot, unsigned domid,
 			       struct page **pages);
 int xen_unmap_domain_mfn_range(struct vm_area_struct *vma,
 			       int numpgs, struct page **pages);
-int xen_xlate_remap_gfn_range(struct vm_area_struct *vma,
+int xen_xlate_remap_gfn_array(struct vm_area_struct *vma,
 			      unsigned long addr,
-			      xen_pfn_t gfn, int nr,
-			      pgprot_t prot,
+			      xen_pfn_t *gfn, int nr,
+			      int *err_ptr, pgprot_t prot,
 			      unsigned domid,
 			      struct page **pages);
 int xen_xlate_unmap_gfn_range(struct vm_area_struct *vma,
-- 
cgit v1.2.3


From c6f2062935c8fcb31235799eaee8bcd5b649936b Mon Sep 17 00:00:00 2001
From: Andy Lutomirski
Date: Thu, 12 Mar 2015 13:57:51 -0700
Subject: x86/signal/64: Fix SS handling for signals delivered to 64-bit
 programs

The comment in the signal code says that apps can save/restore
other segments on their own.  It's true that apps can *save* SS
on their own, but there's no way for apps to restore it: SYSCALL
effectively resets SS to __USER_DS, so any value that user code
tries to load into SS gets lost on entry to sigreturn.

This recycles two padding bytes in the segment selector area for SS.

While we're at it, we need a second change to make this useful.

If the signal we're delivering is caused by a bad SS value,
saving that value isn't enough.  We need to remove that bad
value from the regs before we try to deliver the signal.  Oddly,
the i386 code already got this right.

I suspect that 64-bit programs that try to run 16-bit code and
use signals will have a lot of trouble without this.

Signed-off-by: Andy Lutomirski <luto@amacapital.net>
Reviewed-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Borislav Petkov <bp@suse.de>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/405594361340a2ec32f8e2b115c142df0e180d8e.1426193719.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/sigcontext.h      |  2 +-
 arch/x86/include/uapi/asm/sigcontext.h |  2 +-
 arch/x86/kernel/signal.c               | 22 +++++++++++++---------
 3 files changed, 15 insertions(+), 11 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/sigcontext.h b/arch/x86/include/asm/sigcontext.h
index 9dfce4e0417d..f910cdcb71fd 100644
--- a/arch/x86/include/asm/sigcontext.h
+++ b/arch/x86/include/asm/sigcontext.h
@@ -59,7 +59,7 @@ struct sigcontext {
 	unsigned short cs;
 	unsigned short gs;
 	unsigned short fs;
-	unsigned short __pad0;
+	unsigned short ss;
 	unsigned long err;
 	unsigned long trapno;
 	unsigned long oldmask;
diff --git a/arch/x86/include/uapi/asm/sigcontext.h b/arch/x86/include/uapi/asm/sigcontext.h
index d8b9f9081e86..076b11fd6fa1 100644
--- a/arch/x86/include/uapi/asm/sigcontext.h
+++ b/arch/x86/include/uapi/asm/sigcontext.h
@@ -179,7 +179,7 @@ struct sigcontext {
 	__u16 cs;
 	__u16 gs;
 	__u16 fs;
-	__u16 __pad0;
+	__u16 ss;
 	__u64 err;
 	__u64 trapno;
 	__u64 oldmask;
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index e5042463c1bc..e2f6061a9003 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -94,15 +94,8 @@ int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
 		COPY(r15);
 #endif /* CONFIG_X86_64 */
 
-#ifdef CONFIG_X86_32
 		COPY_SEG_CPL3(cs);
 		COPY_SEG_CPL3(ss);
-#else /* !CONFIG_X86_32 */
-		/* Kernel saves and restores only the CS segment register on signals,
-		 * which is the bare minimum needed to allow mixed 32/64-bit code.
-		 * App's signal handler can save/restore other segments if needed. */
-		COPY_SEG_CPL3(cs);
-#endif /* CONFIG_X86_32 */
 
 		get_user_ex(tmpflags, &sc->flags);
 		regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS);
@@ -164,6 +157,7 @@ int setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate,
 		put_user_ex(regs->cs, &sc->cs);
 		put_user_ex(0, &sc->gs);
 		put_user_ex(0, &sc->fs);
+		put_user_ex(regs->ss, &sc->ss);
 #endif /* CONFIG_X86_32 */
 
 		put_user_ex(fpstate, &sc->fpstate);
@@ -457,9 +451,19 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig,
 
 	regs->sp = (unsigned long)frame;
 
-	/* Set up the CS register to run signal handlers in 64-bit mode,
-	   even if the handler happens to be interrupting 32-bit code. */
+	/*
+	 * Set up the CS and SS registers to run signal handlers in
+	 * 64-bit mode, even if the handler happens to be interrupting
+	 * 32-bit or 16-bit code.
+	 *
+	 * SS is subtle.  In 64-bit mode, we don't need any particular
+	 * SS descriptor, but we do need SS to be valid.  It's possible
+	 * that the old SS is entirely bogus -- this can happen if the
+	 * signal we're trying to deliver is #GP or #SS caused by a bad
+	 * SS value.
+	 */
 	regs->cs = __USER_CS;
+	regs->ss = __USER_DS;
 
 	return 0;
 }
-- 
cgit v1.2.3


From 9a036b93a344235b7899401d04e97c34f3a2554c Mon Sep 17 00:00:00 2001
From: Andy Lutomirski
Date: Thu, 12 Mar 2015 13:57:52 -0700
Subject: x86/signal/64: Remove 'fs' and 'gs' from sigcontext

As far as I can tell, these fields have been set to zero on save
and ignored on restore since Linux was imported into git.
Rename them '__pad1' and '__pad2' to avoid confusion.  This may
also allow us to recycle them some day.

This also adds a comment clarifying the history of those fields.

I'm intentionally avoiding calling either of them '__pad0': the
field formerly known as '__pad0' is now 'ss'.

Signed-off-by: Andy Lutomirski <luto@amacapital.net>
Reviewed-by: Oleg Nesterov <oleg@redhat.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Borislav Petkov <bp@alien8.de>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/844f8490e938780c03355be4c9b69eb4c494bf4e.1426193719.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/sigcontext.h      |  4 ++--
 arch/x86/include/uapi/asm/sigcontext.h | 19 +++++++++++++++++--
 arch/x86/kernel/signal.c               |  4 ++--
 3 files changed, 21 insertions(+), 6 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/sigcontext.h b/arch/x86/include/asm/sigcontext.h
index f910cdcb71fd..6fe6b182c998 100644
--- a/arch/x86/include/asm/sigcontext.h
+++ b/arch/x86/include/asm/sigcontext.h
@@ -57,8 +57,8 @@ struct sigcontext {
 	unsigned long ip;
 	unsigned long flags;
 	unsigned short cs;
-	unsigned short gs;
-	unsigned short fs;
+	unsigned short __pad2;	/* Was called gs, but was always zero. */
+	unsigned short __pad1;	/* Was called fs, but was always zero. */
 	unsigned short ss;
 	unsigned long err;
 	unsigned long trapno;
diff --git a/arch/x86/include/uapi/asm/sigcontext.h b/arch/x86/include/uapi/asm/sigcontext.h
index 076b11fd6fa1..16dc4e8a2cd3 100644
--- a/arch/x86/include/uapi/asm/sigcontext.h
+++ b/arch/x86/include/uapi/asm/sigcontext.h
@@ -177,8 +177,23 @@ struct sigcontext {
 	__u64 rip;
 	__u64 eflags;		/* RFLAGS */
 	__u16 cs;
-	__u16 gs;
-	__u16 fs;
+
+	/*
+	 * Prior to 2.5.64 ("[PATCH] x86-64 updates for 2.5.64-bk3"),
+	 * Linux saved and restored fs and gs in these slots.  This
+	 * was counterproductive, as fsbase and gsbase were never
+	 * saved, so arch_prctl was presumably unreliable.
+	 *
+	 * If these slots are ever needed for any other purpose, there
+	 * is some risk that very old 64-bit binaries could get
+	 * confused.  I doubt that many such binaries still work,
+	 * though, since the same patch in 2.5.64 also removed the
+	 * 64-bit set_thread_area syscall, so it appears that there is
+	 * no TLS API that works in both pre- and post-2.5.64 kernels.
+	 */
+	__u16 __pad2;		/* Was gs. */
+	__u16 __pad1;		/* Was fs. */
+
 	__u16 ss;
 	__u64 err;
 	__u64 trapno;
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index e2f6061a9003..edcb862cdcae 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -155,8 +155,8 @@ int setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate,
 #else /* !CONFIG_X86_32 */
 		put_user_ex(regs->flags, &sc->flags);
 		put_user_ex(regs->cs, &sc->cs);
-		put_user_ex(0, &sc->gs);
-		put_user_ex(0, &sc->fs);
+		put_user_ex(0, &sc->__pad2);
+		put_user_ex(0, &sc->__pad1);
 		put_user_ex(regs->ss, &sc->ss);
 #endif /* CONFIG_X86_32 */
 
-- 
cgit v1.2.3


From 3ee4298f440c81638cbb5ec06f2497fb7a9a9eb4 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski
Date: Tue, 10 Mar 2015 11:05:58 -0700
Subject: x86/asm/entry: Create and use a 'TOP_OF_KERNEL_STACK_PADDING' macro

x86_32, unlike x86_64, pads the top of the kernel stack, because the
hardware stack frame formats are variable in size.

Document this padding and give it a name.

This should make no change whatsoever to the compiled kernel
image. It also doesn't fix any of the current bugs in this area.

Signed-off-by: Andy Lutomirski <luto@amacapital.net>
Acked-by: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/02bf2f54b8dcb76a62a142b6dfe07d4ef7fc582e.1426009661.git.luto@amacapital.net
[ Fixed small details, such as a missed magic constant in entry_32.S pointed out by Denys Vlasenko. ]
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/processor.h   |  3 ++-
 arch/x86/include/asm/thread_info.h | 27 +++++++++++++++++++++++++++
 arch/x86/kernel/entry_32.S         |  2 +-
 3 files changed, 30 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 48a61c1c626e..88d9aa745898 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -849,7 +849,8 @@ extern unsigned long thread_saved_pc(struct task_struct *tsk);
 #define task_pt_regs(task)                                             \
 ({                                                                     \
        struct pt_regs *__regs__;                                       \
-       __regs__ = (struct pt_regs *)(KSTK_TOP(task_stack_page(task))-8); \
+       __regs__ = (struct pt_regs *)(KSTK_TOP(task_stack_page(task)) - \
+				     TOP_OF_KERNEL_STACK_PADDING);     \
        __regs__ - 1;                                                   \
 })
 
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 7740edd56fed..ba115eb6fbcf 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -12,6 +12,33 @@
 #include <asm/percpu.h>
 #include <asm/types.h>
 
+/*
+ * TOP_OF_KERNEL_STACK_PADDING is a number of unused bytes that we
+ * reserve at the top of the kernel stack.  We do it because of a nasty
+ * 32-bit corner case.  On x86_32, the hardware stack frame is
+ * variable-length.  Except for vm86 mode, struct pt_regs assumes a
+ * maximum-length frame.  If we enter from CPL 0, the top 8 bytes of
+ * pt_regs don't actually exist.  Ordinarily this doesn't matter, but it
+ * does in at least one case:
+ *
+ * If we take an NMI early enough in SYSENTER, then we can end up with
+ * pt_regs that extends above sp0.  On the way out, in the espfix code,
+ * we can read the saved SS value, but that value will be above sp0.
+ * Without this offset, that can result in a page fault.  (We are
+ * careful that, in this case, the value we read doesn't matter.)
+ *
+ * In vm86 mode, the hardware frame is much longer still, but we neither
+ * access the extra members from NMI context, nor do we write such a
+ * frame at sp0 at all.
+ *
+ * x86_64 has a fixed-length stack frame.
+ */
+#ifdef CONFIG_X86_32
+# define TOP_OF_KERNEL_STACK_PADDING 8
+#else
+# define TOP_OF_KERNEL_STACK_PADDING 0
+#endif
+
 /*
  * low level task data that entry.S needs immediate access to
  * - this struct should fit entirely inside of one cache line
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index e33ba51b1069..4c8cc34e6d68 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -398,7 +398,7 @@ sysenter_past_esp:
 	 * A tiny bit of offset fixup is necessary - 4*4 means the 4 words
 	 * pushed above; +8 corresponds to copy_thread's esp0 setting.
 	 */
-	pushl_cfi ((TI_sysenter_return)-THREAD_SIZE+8+4*4)(%esp)
+	pushl_cfi ((TI_sysenter_return)-THREAD_SIZE+TOP_OF_KERNEL_STACK_PADDING+4*4)(%esp)
 	CFI_REL_OFFSET eip, 0
 
 	pushl_cfi %eax
-- 
cgit v1.2.3


From d9e05cc5a53246e074dc2b84956252e4bbe392cd Mon Sep 17 00:00:00 2001
From: Andy Lutomirski
Date: Tue, 10 Mar 2015 11:05:59 -0700
Subject: x86/asm/entry: Unify and fix initial thread_struct::sp0 values

x86_32 and x86_64 need slightly different thread_struct::sp0 values, and
x86_32's was incorrect for init.

This never mattered -- the init thread never runs user code, so we never
used thread_struct::sp0 for anything.

Fix it and mostly unify them.

Signed-off-by: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1b810c1d2e797e27bb4a7708c426101161edd1f6.1426009661.git.luto@amacapital.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/processor.h | 7 +++++--
 arch/x86/kernel/process.c        | 2 +-
 2 files changed, 6 insertions(+), 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 88d9aa745898..fc6d8d0d8d53 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -811,6 +811,9 @@ static inline void spin_lock_prefetch(const void *x)
 	prefetchw(x);
 }
 
+#define TOP_OF_INIT_STACK ((unsigned long)&init_stack + sizeof(init_stack) - \
+			   TOP_OF_KERNEL_STACK_PADDING)
+
 #ifdef CONFIG_X86_32
 /*
  * User space process size: 3GB (default).
@@ -821,7 +824,7 @@ static inline void spin_lock_prefetch(const void *x)
 #define STACK_TOP_MAX		STACK_TOP
 
 #define INIT_THREAD  {							  \
-	.sp0			= sizeof(init_stack) + (long)&init_stack, \
+	.sp0			= TOP_OF_INIT_STACK,			  \
 	.vm86_info		= NULL,					  \
 	.sysenter_cs		= __KERNEL_CS,				  \
 	.io_bitmap_ptr		= NULL,					  \
@@ -883,7 +886,7 @@ extern unsigned long thread_saved_pc(struct task_struct *tsk);
 #define STACK_TOP_MAX		TASK_SIZE_MAX
 
 #define INIT_THREAD  { \
-	.sp0 = (unsigned long)&init_stack + sizeof(init_stack) \
+	.sp0 = TOP_OF_INIT_STACK \
 }
 
 /*
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index f4c0af7fc3a0..12b1cf606ddf 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -39,7 +39,7 @@
  */
 __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = {
 	.x86_tss = {
-		.sp0 = (unsigned long)&init_stack + sizeof(init_stack),
+		.sp0 = TOP_OF_INIT_STACK,
 #ifdef CONFIG_X86_32
 		.ss0 = __KERNEL_DS,
 		.ss1 = __KERNEL_CS,
-- 
cgit v1.2.3


From 76e4c4908a4904a61aa67ae5eb0b2a7588c4a546 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski
Date: Tue, 10 Mar 2015 11:06:00 -0700
Subject: x86/asm/entry/32: Document our abuse of x86_hw_tss::ss1 and
 x86_hw_tss::sp1

This has confused me for a while.  Now that I figured it out, document it.

Signed-off-by: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/b7efc1b7364039824776f68e9ddee9ec1500e894.1426009661.git.luto@amacapital.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/processor.h | 21 ++++++++++++++++++---
 1 file changed, 18 insertions(+), 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index fc6d8d0d8d53..b26208998b7c 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -209,9 +209,24 @@ struct x86_hw_tss {
 	unsigned short		back_link, __blh;
 	unsigned long		sp0;
 	unsigned short		ss0, __ss0h;
-	unsigned long		sp1;
-	/* ss1 caches MSR_IA32_SYSENTER_CS: */
-	unsigned short		ss1, __ss1h;
+
+	/*
+	 * We don't use ring 1, so sp1 and ss1 are convenient scratch
+	 * spaces in the same cacheline as sp0.  We use them to cache
+	 * some MSR values to avoid unnecessary wrmsr instructions.
+	 *
+	 * We use SYSENTER_ESP to find sp0 and for the NMI emergency
+	 * stack, but we need to context switch it because we do
+	 * horrible things to the kernel stack in vm86 mode.
+	 *
+	 * We use SYSENTER_CS to disable sysenter in vm86 mode to avoid
+	 * corrupting the stack if we went through the sysenter path
+	 * from vm86 mode.
+	 */
+	unsigned long		sp1;	/* MSR_IA32_SYSENTER_ESP */
+	unsigned short		ss1;	/* MSR_IA32_SYSENTER_CS */
+
+	unsigned short		__ss1h;
 	unsigned long		sp2;
 	unsigned short		ss2, __ss2h;
 	unsigned long		__cr3;
-- 
cgit v1.2.3


From 5c39403e004bec75ce0c549541be5479595d6ad0 Mon Sep 17 00:00:00 2001
From: Denys Vlasenko
Date: Fri, 13 Mar 2015 15:09:03 +0100
Subject: x86/asm/entry: Simplify task_pt_regs() macro definition

Before this change, task_pt_regs() was using KSTK_TOP(),
and it was the only use of that macro. In turn, KSTK_TOP used
THREAD_SIZE_LONGS, and it was the only use of that macro too.

Fold these macros into task_pt_regs(). Tweak comment
about "- 8" - we now use a symbolic constant, not literal 8.

Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
Reviewed-by: Steven Rostedt <rostedt@goodmis.org>
Cc: Alexei Starovoitov <ast@plumgrid.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Will Drewry <wad@chromium.org>
Link: http://lkml.kernel.org/r/1426255743-5394-1-git-send-email-dvlasenk@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/processor.h | 20 ++++++--------------
 1 file changed, 6 insertions(+), 14 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index b26208998b7c..6a5c0ec5ee0e 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -847,15 +847,8 @@ static inline void spin_lock_prefetch(const void *x)
 
 extern unsigned long thread_saved_pc(struct task_struct *tsk);
 
-#define THREAD_SIZE_LONGS      (THREAD_SIZE/sizeof(unsigned long))
-#define KSTK_TOP(info)                                                 \
-({                                                                     \
-       unsigned long *__ptr = (unsigned long *)(info);                 \
-       (unsigned long)(&__ptr[THREAD_SIZE_LONGS]);                     \
-})
-
 /*
- * The below -8 is to reserve 8 bytes on top of the ring0 stack.
+ * TOP_OF_KERNEL_STACK_PADDING reserves 8 bytes on top of the ring0 stack.
  * This is necessary to guarantee that the entire "struct pt_regs"
  * is accessible even if the CPU haven't stored the SS/ESP registers
  * on the stack (interrupt gate does not save these registers
@@ -864,12 +857,11 @@ extern unsigned long thread_saved_pc(struct task_struct *tsk);
  * "struct pt_regs" is possible, but they may contain the
  * completely wrong values.
  */
-#define task_pt_regs(task)                                             \
-({                                                                     \
-       struct pt_regs *__regs__;                                       \
-       __regs__ = (struct pt_regs *)(KSTK_TOP(task_stack_page(task)) - \
-				     TOP_OF_KERNEL_STACK_PADDING);     \
-       __regs__ - 1;                                                   \
+#define task_pt_regs(task) \
+({									\
+	unsigned long __ptr = (unsigned long)task_stack_page(task);	\
+	__ptr += THREAD_SIZE - TOP_OF_KERNEL_STACK_PADDING;		\
+	((struct pt_regs *)__ptr) - 1;					\
 })
 
 #define KSTK_ESP(task)		(task_pt_regs(task)->sp)
-- 
cgit v1.2.3


From d828c71fba8922b116b4ec56c3e5bca8c822d5ae Mon Sep 17 00:00:00 2001
From: Denys Vlasenko
Date: Mon, 9 Mar 2015 15:52:18 +0100
Subject: x86/asm/entry/32: Document the 32-bit SYSENTER "emergency stack"
 better

Before the patch, the 'tss_struct::stack' field was not referenced anywhere.

It was used only to set SYSENTER's stack to point after the last byte
of tss_struct, thus the trailing field, stack[64], was used.

But grep would not know it. You can comment it out, compile,
and kernel will even run until an unlucky NMI corrupts
io_bitmap[] (which is also not easily detectable).

This patch changes code so that the purpose and usage of this
field is not mysterious anymore, and can be easily grepped for.

This does change generated code, for a subtle reason:
since tss_struct is ____cacheline_aligned, there happens to be
5 longs of padding at the end. Old code was using the padding
too; new code will strictly use it only for SYSENTER_stack[].

Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Alexei Starovoitov <ast@plumgrid.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Will Drewry <wad@chromium.org>
Link: http://lkml.kernel.org/r/1425912738-559-2-git-send-email-dvlasenk@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/processor.h | 4 ++--
 arch/x86/kernel/asm-offsets_32.c | 2 +-
 arch/x86/kernel/cpu/common.c     | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 6a5c0ec5ee0e..5abd9a535a24 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -291,9 +291,9 @@ struct tss_struct {
 	unsigned long		io_bitmap[IO_BITMAP_LONGS + 1];
 
 	/*
-	 * .. and then another 0x100 bytes for the emergency kernel stack:
+	 * Space for the temporary SYSENTER stack:
 	 */
-	unsigned long		stack[64];
+	unsigned long		SYSENTER_stack[64];
 
 } ____cacheline_aligned;
 
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index 3b3b9d33ac1d..47703aed74cf 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -68,7 +68,7 @@ void foo(void)
 
 	/* Offset from the sysenter stack to tss.sp0 */
 	DEFINE(TSS_sysenter_sp0, offsetof(struct tss_struct, x86_tss.sp0) -
-		 sizeof(struct tss_struct));
+	       offsetofend(struct tss_struct, SYSENTER_stack));
 
 #if defined(CONFIG_LGUEST) || defined(CONFIG_LGUEST_GUEST) || defined(CONFIG_LGUEST_MODULE)
 	BLANK();
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 76348334b934..7a3dfb1db78d 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -987,7 +987,7 @@ void enable_sep_cpu(void)
 	}
 
 	tss->x86_tss.ss1 = __KERNEL_CS;
-	tss->x86_tss.sp1 = sizeof(struct tss_struct) + (unsigned long) tss;
+	tss->x86_tss.sp1 = (unsigned long)tss + offsetofend(struct tss_struct, SYSENTER_stack);
 	wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0);
 	wrmsr(MSR_IA32_SYSENTER_ESP, tss->x86_tss.sp1, 0);
 	wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long) ia32_sysenter_target, 0);
-- 
cgit v1.2.3


From 8b6c0ab1a1296ef6922160fa27018f25c60b8940 Mon Sep 17 00:00:00 2001
From: Ingo Molnar
Date: Mon, 16 Mar 2015 10:32:20 +0100
Subject: x86/asm/entry: Document and clean up the enable_sep_cpu() and
 syscall32_cpu_init() functions

Clean up the flow and document the functions a bit better.

Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Alexei Starovoitov <ast@plumgrid.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Will Drewry <wad@chromium.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/common.c | 42 +++++++++++++++++++++++++++++-------------
 1 file changed, 29 insertions(+), 13 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 7a3dfb1db78d..d79f139871e0 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -960,37 +960,53 @@ static void identify_cpu(struct cpuinfo_x86 *c)
 }
 
 #ifdef CONFIG_X86_64
-#ifdef CONFIG_IA32_EMULATION
+# ifdef CONFIG_IA32_EMULATION
 /* May not be __init: called during resume */
 static void syscall32_cpu_init(void)
 {
-	/* Load these always in case some future AMD CPU supports
-	   SYSENTER from compat mode too. */
+	/*
+	 * Always load these, in case some future 64-bit CPU supports
+	 * SYSENTER from compat mode too:
+	 */
 	wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS);
 	wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
 	wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)ia32_sysenter_target);
 
 	wrmsrl(MSR_CSTAR, ia32_cstar_target);
 }
-#endif		/* CONFIG_IA32_EMULATION */
-#endif		/* CONFIG_X86_64 */
+# endif
+#endif
 
+/*
+ * Set up the CPU state needed to execute SYSENTER/SYSEXIT instructions
+ * on 32-bit kernels:
+ */
 #ifdef CONFIG_X86_32
 void enable_sep_cpu(void)
 {
-	int cpu = get_cpu();
-	struct tss_struct *tss = &per_cpu(cpu_tss, cpu);
+	struct tss_struct *tss;
+	int cpu;
 
-	if (!boot_cpu_has(X86_FEATURE_SEP)) {
-		put_cpu();
-		return;
-	}
+	cpu = get_cpu();
+	tss = &per_cpu(cpu_tss, cpu);
+
+	if (!boot_cpu_has(X86_FEATURE_SEP))
+		goto out;
+
+	/*
+	 * The struct::SS1 and tss_struct::SP1 fields are not used by the hardware,
+	 * we cache the SYSENTER CS and ESP values there for easy access:
+	 */
 
 	tss->x86_tss.ss1 = __KERNEL_CS;
+	wrmsr(MSR_IA32_SYSENTER_CS, tss->x86_tss.ss1, 0);
+
 	tss->x86_tss.sp1 = (unsigned long)tss + offsetofend(struct tss_struct, SYSENTER_stack);
-	wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0);
 	wrmsr(MSR_IA32_SYSENTER_ESP, tss->x86_tss.sp1, 0);
-	wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long) ia32_sysenter_target, 0);
+
+	wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long)ia32_sysenter_target, 0);
+
+out:
 	put_cpu();
 }
 #endif
-- 
cgit v1.2.3


From c80e5c0c23ce2282476fdc64c4b5e3d3a40723fd Mon Sep 17 00:00:00 2001
From: Eugene Shatokhin
Date: Tue, 17 Mar 2015 19:09:18 +0900
Subject: kprobes/x86: Return correct length in __copy_instruction()

On x86-64, __copy_instruction() always returns 0 (error) if the
instruction uses %rip-relative addressing. This is because
kernel_insn_init() is called the second time for 'insn' instance
in such cases and sets all its fields to 0.

Because of this, trying to place a kprobe on such instruction
will fail, register_kprobe() will return -EINVAL.

This patch fixes the problem.

Signed-off-by: Eugene Shatokhin <eugene.shatokhin@rosalab.ru>
Signed-off-by: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Link: http://lkml.kernel.org/r/20150317100918.28349.94654.stgit@localhost.localdomain
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/kprobes/core.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index 4e3d5a9621fe..03189d86357d 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -354,6 +354,7 @@ int __copy_instruction(u8 *dest, u8 *src)
 {
 	struct insn insn;
 	kprobe_opcode_t buf[MAX_INSN_SIZE];
+	int length;
 	unsigned long recovered_insn =
 		recover_probed_instruction(buf, (unsigned long)src);
 
@@ -361,16 +362,18 @@ int __copy_instruction(u8 *dest, u8 *src)
 		return 0;
 	kernel_insn_init(&insn, (void *)recovered_insn, MAX_INSN_SIZE);
 	insn_get_length(&insn);
+	length = insn.length;
+
 	/* Another subsystem puts a breakpoint, failed to recover */
 	if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION)
 		return 0;
-	memcpy(dest, insn.kaddr, insn.length);
+	memcpy(dest, insn.kaddr, length);
 
 #ifdef CONFIG_X86_64
 	if (insn_rip_relative(&insn)) {
 		s64 newdisp;
 		u8 *disp;
-		kernel_insn_init(&insn, dest, insn.length);
+		kernel_insn_init(&insn, dest, length);
 		insn_get_displacement(&insn);
 		/*
 		 * The copied instruction uses the %rip-relative addressing
@@ -394,7 +397,7 @@ int __copy_instruction(u8 *dest, u8 *src)
 		*(s32 *) disp = (s32) newdisp;
 	}
 #endif
-	return insn.length;
+	return length;
 }
 
 static int arch_copy_kprobe(struct kprobe *p)
-- 
cgit v1.2.3


From 91d8f0416f3989e248d3a3d3efb821eda10a85d2 Mon Sep 17 00:00:00 2001
From: Alexander Kuleshov
Date: Tue, 17 Mar 2015 19:00:05 +0600
Subject: x86/boot/64: Remove pointless early_printk() message

earlyprintk is not initialised yet by the setup_early_printk() function
so we can remove it.

Signed-off-by: Alexander Kuleshov <kuleshovmail@gmail.com>
Cc: Borislav Petkov <bp@suse.de>
Link: http://lkml.kernel.org/r/1426597205-5142-1-git-send-email-kuleshovmail@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/head64.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index eda1a865641e..8c58135b0fa4 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -172,9 +172,6 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data)
 	 */
 	load_ucode_bsp();
 
-	if (console_loglevel >= CONSOLE_LOGLEVEL_DEBUG)
-		early_printk("Kernel alive\n");
-
 	clear_page(init_level4_pgt);
 	/* set init_level4_pgt kernel high mapping*/
 	init_level4_pgt[511] = early_level4_pgt[511];
-- 
cgit v1.2.3


From 33db1fd48ac3d90385b412b41a8a6525096ac6d5 Mon Sep 17 00:00:00 2001
From: Denys Vlasenko
Date: Tue, 17 Mar 2015 14:52:24 +0100
Subject: x86/asm/entry/64: Enable interrupts *after* we fetch
 PER_CPU_VAR(old_rsp)

We want to use PER_CPU_VAR(old_rsp) as a simple temporary register,
to shuffle user-space RSP into (and from) when we set up the system
call stack frame. At that point we cannot shuffle values into general
purpose registers, because we have not saved them yet.

To be able to do this shuffling into a memory location, we must be
atomic and must not be preempted while we do the shuffling, otherwise
the 'temporary' register gets overwritten by some other task's
temporary register contents ...

Tested-by: Borislav Petkov <bp@alien8.de>
Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
Acked-by: Borislav Petkov <bp@alien8.de>
Cc: Alexei Starovoitov <ast@plumgrid.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Will Drewry <wad@chromium.org>
Link: http://lkml.kernel.org/r/1426600344-8254-1-git-send-email-dvlasenk@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/entry_64.S | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index d86788c3257b..aed3f11c373b 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -241,16 +241,16 @@ GLOBAL(system_call_after_swapgs)
 	movq	%rsp,PER_CPU_VAR(old_rsp)
 	/* kernel_stack is set so that 5 slots (iret frame) are preallocated */
 	movq	PER_CPU_VAR(kernel_stack),%rsp
-	/*
-	 * No need to follow this irqs off/on section - it's straight
-	 * and short:
-	 */
-	ENABLE_INTERRUPTS(CLBR_NONE)
 	ALLOC_PT_GPREGS_ON_STACK 8		/* +8: space for orig_ax */
 	movq	%rcx,RIP(%rsp)
 	movq	PER_CPU_VAR(old_rsp),%rcx
 	movq	%r11,EFLAGS(%rsp)
 	movq	%rcx,RSP(%rsp)
+	/*
+	 * No need to follow this irqs off/on section - it's straight
+	 * and short:
+	 */
+	ENABLE_INTERRUPTS(CLBR_NONE)
 	movq_cfi rax,ORIG_RAX
 	SAVE_C_REGS_EXCEPT_RAX_RCX_R11
 	movq	$-ENOSYS,RAX(%rsp)
-- 
cgit v1.2.3


From 9854dd74c3f6af8d9d527de86c6074b7ed0495f1 Mon Sep 17 00:00:00 2001
From: Ingo Molnar
Date: Tue, 17 Mar 2015 14:42:59 +0100
Subject: x86/asm/entry/64: Simplify 'old_rsp' usage

Remove all manipulations of PER_CPU(old_rsp) in C code:

 - it is not used on SYSRET return anymore, and system entries
   are atomic, so updating it from the fork and context switch
   paths is pointless.

 - Tweak a few related comments as well: we no longer have a
   "partial stack frame" on entry, ever.

Based on (split out of) patch from Denys Vlasenko.

Originally-from: Denys Vlasenko <dvlasenk@redhat.com>
Tested-by: Borislav Petkov <bp@alien8.de>
Acked-by: Borislav Petkov <bp@alien8.de>
Cc: Alexei Starovoitov <ast@plumgrid.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Will Drewry <wad@chromium.org>
Link: http://lkml.kernel.org/r/1426599779-8010-2-git-send-email-dvlasenk@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/processor.h | 5 -----
 arch/x86/kernel/process_64.c     | 2 --
 2 files changed, 7 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 5abd9a535a24..3ac5092ec113 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -905,11 +905,6 @@ extern unsigned long thread_saved_pc(struct task_struct *tsk);
 #define task_pt_regs(tsk)	((struct pt_regs *)(tsk)->thread.sp0 - 1)
 extern unsigned long KSTK_ESP(struct task_struct *task);
 
-/*
- * User space RSP while inside the SYSCALL fast path
- */
-DECLARE_PER_CPU(unsigned long, old_rsp);
-
 #endif /* CONFIG_X86_64 */
 
 extern void start_thread(struct pt_regs *regs, unsigned long new_ip,
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index e8c124a1f885..59696d76a4e3 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -238,7 +238,6 @@ start_thread_common(struct pt_regs *regs, unsigned long new_ip,
 	current->thread.usersp	= new_sp;
 	regs->ip		= new_ip;
 	regs->sp		= new_sp;
-	this_cpu_write(old_rsp, new_sp);
 	regs->cs		= _cs;
 	regs->ss		= _ss;
 	regs->flags		= X86_EFLAGS_IF;
@@ -399,7 +398,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 	 * Switch the PDA and FPU contexts.
 	 */
 	prev->usersp = this_cpu_read(old_rsp);
-	this_cpu_write(old_rsp, next->usersp);
 	this_cpu_write(current_task, next_p);
 
 	/*
-- 
cgit v1.2.3


From ac9af4983e77765a642b5a21086bc1fdc55418c4 Mon Sep 17 00:00:00 2001
From: Ingo Molnar
Date: Tue, 17 Mar 2015 14:42:59 +0100
Subject: x86/asm/entry/64: Remove thread_struct::usersp

Nothing uses thread_struct::usersp anymore, so remove it.

Originally-from: Denys Vlasenko <dvlasenk@redhat.com>
Tested-by: Borislav Petkov <bp@alien8.de>
Acked-by: Borislav Petkov <bp@alien8.de>
Cc: Alexei Starovoitov <ast@plumgrid.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Will Drewry <wad@chromium.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/processor.h | 1 -
 arch/x86/kernel/process_64.c     | 3 ---
 2 files changed, 4 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 3ac5092ec113..572099710ba2 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -493,7 +493,6 @@ struct thread_struct {
 #ifdef CONFIG_X86_32
 	unsigned long		sysenter_cs;
 #else
-	unsigned long		usersp;	/* Copy from PDA */
 	unsigned short		es;
 	unsigned short		ds;
 	unsigned short		fsindex;
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 59696d76a4e3..14df2be4711f 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -161,7 +161,6 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
 	p->thread.sp0 = (unsigned long)task_stack_page(p) + THREAD_SIZE;
 	childregs = task_pt_regs(p);
 	p->thread.sp = (unsigned long) childregs;
-	p->thread.usersp = me->thread.usersp;
 	set_tsk_thread_flag(p, TIF_FORK);
 	p->thread.io_bitmap_ptr = NULL;
 
@@ -235,7 +234,6 @@ start_thread_common(struct pt_regs *regs, unsigned long new_ip,
 	loadsegment(es, _ds);
 	loadsegment(ds, _ds);
 	load_gs_index(0);
-	current->thread.usersp	= new_sp;
 	regs->ip		= new_ip;
 	regs->sp		= new_sp;
 	regs->cs		= _cs;
@@ -397,7 +395,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 	/*
 	 * Switch the PDA and FPU contexts.
 	 */
-	prev->usersp = this_cpu_read(old_rsp);
 	this_cpu_write(current_task, next_p);
 
 	/*
-- 
cgit v1.2.3


From 7fcb3bc361c724a75bc642dbdd9d9bf0bdf07260 Mon Sep 17 00:00:00 2001
From: Ingo Molnar
Date: Tue, 17 Mar 2015 14:42:59 +0100
Subject: x86/asm/entry/64: Update comments about stack frames

Tweak a few outdated comments that were obsoleted by recent changes
to syscall entry code:

 - we no longer have a "partial stack frame" on
   entry, ever.

 - explain the syscall entry usage of old_rsp.

Partially based on a (split out of) patch from Denys Vlasenko.

Originally-from: Denys Vlasenko <dvlasenk@redhat.com>
Acked-by: Borislav Petkov <bp@alien8.de>
Cc: Alexei Starovoitov <ast@plumgrid.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Will Drewry <wad@chromium.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/entry_64.S | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index aed3f11c373b..d287f785089e 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -15,10 +15,8 @@
  * after an interrupt and after each system call.
  *
  * A note on terminology:
- * - top of stack: Architecture defined interrupt frame from SS to RIP
+ * - iret frame: Architecture defined interrupt frame from SS to RIP
  * at the top of the kernel process stack.
- * - partial stack frame: partially saved registers up to R11.
- * - full stack frame: Like partial stack frame, but all register saved.
  *
  * Some macro usage:
  * - CFI macros are used to generate dwarf2 unwind information for better
@@ -219,7 +217,7 @@ ENDPROC(native_usergs_sysret64)
  * Interrupts are off on entry.
  * Only called from user space.
  *
- * When user can change the frames always force IRET. That is because
+ * When user can change pt_regs->foo always force IRET. That is because
  * it deals with uncanonical addresses better. SYSRET has trouble
  * with them due to bugs in both AMD and Intel CPUs.
  */
@@ -238,6 +236,11 @@ ENTRY(system_call)
 	 */
 GLOBAL(system_call_after_swapgs)
 
+	/*
+	 * We use 'old_rsp' as a scratch register, hence this block must execute
+	 * atomically in the face of possible interrupt-driven task preemption,
+	 * so we can enable interrupts only after we're done with using old_rsp:
+	 */
 	movq	%rsp,PER_CPU_VAR(old_rsp)
 	/* kernel_stack is set so that 5 slots (iret frame) are preallocated */
 	movq	PER_CPU_VAR(kernel_stack),%rsp
@@ -303,7 +306,7 @@ int_ret_from_sys_call_fixup:
 	FIXUP_TOP_OF_STACK %r11
 	jmp int_ret_from_sys_call
 
-	/* Do syscall tracing */
+	/* Do syscall entry tracing */
 tracesys:
 	movq %rsp, %rdi
 	movq $AUDIT_ARCH_X86_64, %rsi
@@ -339,11 +342,11 @@ tracesys_phase2:
 	movq %r10,%rcx	/* fixup for C */
 	call *sys_call_table(,%rax,8)
 	movq %rax,RAX(%rsp)
-	/* Use IRET because user could have changed frame */
+	/* Use IRET because user could have changed pt_regs->foo */
 
 /*
  * Syscall return path ending with IRET.
- * Has correct top of stack, but partial stack frame.
+ * Has correct iret frame.
  */
 GLOBAL(int_ret_from_sys_call)
 	DISABLE_INTERRUPTS(CLBR_NONE)
@@ -374,7 +377,7 @@ int_careful:
 	TRACE_IRQS_OFF
 	jmp int_with_check
 
-	/* handle signals and tracing -- both require a full stack frame */
+	/* handle signals and tracing -- both require a full pt_regs */
 int_very_careful:
 	TRACE_IRQS_ON
 	ENABLE_INTERRUPTS(CLBR_NONE)
-- 
cgit v1.2.3


From c38e503804b0402c510f82437069f7769fa0cea9 Mon Sep 17 00:00:00 2001
From: Ingo Molnar
Date: Tue, 17 Mar 2015 14:42:59 +0100
Subject: x86/asm/entry/64: Rename 'old_rsp' to 'rsp_scratch'

Make clear that the usage of PER_CPU(old_rsp) is purely temporary,
by renaming it to 'rsp_scratch'.

Cc: Alexei Starovoitov <ast@plumgrid.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Will Drewry <wad@chromium.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/entry_64.S   | 10 +++++-----
 arch/x86/kernel/process_64.c |  2 +-
 arch/x86/xen/xen-asm_64.S    |  8 ++++----
 3 files changed, 10 insertions(+), 10 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index d287f785089e..0c91256d73df 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -237,16 +237,16 @@ ENTRY(system_call)
 GLOBAL(system_call_after_swapgs)
 
 	/*
-	 * We use 'old_rsp' as a scratch register, hence this block must execute
+	 * We use 'rsp_scratch' as a scratch register, hence this block must execute
 	 * atomically in the face of possible interrupt-driven task preemption,
-	 * so we can enable interrupts only after we're done with using old_rsp:
+	 * so we can enable interrupts only after we're done with using rsp_scratch:
 	 */
-	movq	%rsp,PER_CPU_VAR(old_rsp)
+	movq	%rsp,PER_CPU_VAR(rsp_scratch)
 	/* kernel_stack is set so that 5 slots (iret frame) are preallocated */
 	movq	PER_CPU_VAR(kernel_stack),%rsp
 	ALLOC_PT_GPREGS_ON_STACK 8		/* +8: space for orig_ax */
 	movq	%rcx,RIP(%rsp)
-	movq	PER_CPU_VAR(old_rsp),%rcx
+	movq	PER_CPU_VAR(rsp_scratch),%rcx
 	movq	%r11,EFLAGS(%rsp)
 	movq	%rcx,RSP(%rsp)
 	/*
@@ -657,7 +657,7 @@ common_interrupt:
 	ASM_CLAC
 	addq $-0x80,(%rsp)		/* Adjust vector to [-256,-1] range */
 	interrupt do_IRQ
-	/* 0(%rsp): old_rsp */
+	/* 0(%rsp): rsp_scratch */
 ret_from_intr:
 	DISABLE_INTERRUPTS(CLBR_NONE)
 	TRACE_IRQS_OFF
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 14df2be4711f..97f5658290b7 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -52,7 +52,7 @@
 
 asmlinkage extern void ret_from_fork(void);
 
-__visible DEFINE_PER_CPU(unsigned long, old_rsp);
+__visible DEFINE_PER_CPU(unsigned long, rsp_scratch);
 
 /* Prints also some state that isn't saved in the pt_regs */
 void __show_regs(struct pt_regs *regs, int all)
diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S
index 53adefda4275..985fc3ee0973 100644
--- a/arch/x86/xen/xen-asm_64.S
+++ b/arch/x86/xen/xen-asm_64.S
@@ -68,11 +68,11 @@ ENTRY(xen_sysret64)
 	 * We're already on the usermode stack at this point, but
 	 * still with the kernel gs, so we can easily switch back
 	 */
-	movq %rsp, PER_CPU_VAR(old_rsp)
+	movq %rsp, PER_CPU_VAR(rsp_scratch)
 	movq PER_CPU_VAR(kernel_stack), %rsp
 
 	pushq $__USER_DS
-	pushq PER_CPU_VAR(old_rsp)
+	pushq PER_CPU_VAR(rsp_scratch)
 	pushq %r11
 	pushq $__USER_CS
 	pushq %rcx
@@ -87,11 +87,11 @@ ENTRY(xen_sysret32)
 	 * We're already on the usermode stack at this point, but
 	 * still with the kernel gs, so we can easily switch back
 	 */
-	movq %rsp, PER_CPU_VAR(old_rsp)
+	movq %rsp, PER_CPU_VAR(rsp_scratch)
 	movq PER_CPU_VAR(kernel_stack), %rsp
 
 	pushq $__USER32_DS
-	pushq PER_CPU_VAR(old_rsp)
+	pushq PER_CPU_VAR(rsp_scratch)
 	pushq %r11
 	pushq $__USER32_CS
 	pushq %rcx
-- 
cgit v1.2.3


From 0790ec172de1bd2e23f1dbd4925426b6cc3c1b72 Mon Sep 17 00:00:00 2001
From: Radim Krčmář
Date: Tue, 17 Mar 2015 14:02:32 +0100
Subject: KVM: nVMX: mask unrestricted_guest if disabled on L0

If EPT was enabled, unrestricted_guest was allowed in L1 regardless of
L0.  L1 triple faulted when running L2 guest that required emulation.

Another side effect was 'WARN_ON_ONCE(vmx->nested.nested_run_pending)'
in L0's dmesg:
  WARNING: CPU: 0 PID: 0 at arch/x86/kvm/vmx.c:9190 nested_vmx_vmexit+0x96e/0xb00 [kvm_intel] ()

Prevent this scenario by masking SECONDARY_EXEC_UNRESTRICTED_GUEST when
the host doesn't have it enabled.

Fixes: 78051e3b7e35 ("KVM: nVMX: Disable unrestricted mode if ept=0")
Cc: stable@vger.kernel.org
Tested-By: Kashyap Chamarthy <kchamart@redhat.com>
Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/vmx.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 10a481b7674d..ae4f6d35d19c 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2479,8 +2479,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
 	if (enable_ept) {
 		/* nested EPT: emulate EPT also to L1 */
 		vmx->nested.nested_vmx_secondary_ctls_high |=
-			SECONDARY_EXEC_ENABLE_EPT |
-			SECONDARY_EXEC_UNRESTRICTED_GUEST;
+			SECONDARY_EXEC_ENABLE_EPT;
 		vmx->nested.nested_vmx_ept_caps = VMX_EPT_PAGE_WALK_4_BIT |
 			 VMX_EPTP_WB_BIT | VMX_EPT_2MB_PAGE_BIT |
 			 VMX_EPT_INVEPT_BIT;
@@ -2494,6 +2493,10 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
 	} else
 		vmx->nested.nested_vmx_ept_caps = 0;
 
+	if (enable_unrestricted_guest)
+		vmx->nested.nested_vmx_secondary_ctls_high |=
+			SECONDARY_EXEC_UNRESTRICTED_GUEST;
+
 	/* miscellaneous data */
 	rdmsr(MSR_IA32_VMX_MISC,
 		vmx->nested.nested_vmx_misc_low,
-- 
cgit v1.2.3


From 795a149e78f49c0e260c56cee9978c5d001a84f1 Mon Sep 17 00:00:00 2001
From: Xiubo Li
Date: Fri, 13 Mar 2015 17:39:44 +0800
Subject: KVM: x86: Avoid using plain integer as NULL pointer warning

This patch fix the following sparse warning:

for file arch/x86/kvm/x86.c:
warning: Using plain integer as NULL pointer

Signed-off-by: Xiubo Li <lixiubo@cmss.chinamobile.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/x86.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index d1a1feaa522b..130926392a25 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5919,7 +5919,7 @@ static void kvm_pv_kick_cpu_op(struct kvm *kvm, unsigned long flags, int apicid)
 	lapic_irq.dest_id = apicid;
 
 	lapic_irq.delivery_mode = APIC_DM_REMRD;
-	kvm_irq_delivery_to_apic(kvm, 0, &lapic_irq, NULL);
+	kvm_irq_delivery_to_apic(kvm, NULL, &lapic_irq, NULL);
 }
 
 int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
-- 
cgit v1.2.3


From 52eb5a6d576b5bca14797a4085abdd68ad8c0b3f Mon Sep 17 00:00:00 2001
From: Xiubo Li
Date: Fri, 13 Mar 2015 17:39:45 +0800
Subject: KVM: x86: For the symbols used locally only should be static type

This patch fix the following sparse warnings:

for arch/x86/kvm/x86.c:
warning: symbol 'emulator_read_write' was not declared. Should it be static?
warning: symbol 'emulator_write_emulated' was not declared. Should it be static?
warning: symbol 'emulator_get_dr' was not declared. Should it be static?
warning: symbol 'emulator_set_dr' was not declared. Should it be static?

for arch/x86/kvm/pmu.c:
warning: symbol 'fixed_pmc_events' was not declared. Should it be static?

Signed-off-by: Xiubo Li <lixiubo@cmss.chinamobile.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/pmu.c |  2 +-
 arch/x86/kvm/svm.c |  3 ++-
 arch/x86/kvm/x86.c | 11 +++++++----
 3 files changed, 10 insertions(+), 6 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index 8e6b7d869d2f..29fbf9dfdc54 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -38,7 +38,7 @@ static struct kvm_arch_event_perf_mapping {
 };
 
 /* mapping between fixed pmc index and arch_events array */
-int fixed_pmc_events[] = {1, 0, 7};
+static int fixed_pmc_events[] = {1, 0, 7};
 
 static bool pmc_is_gp(struct kvm_pmc *pmc)
 {
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 001e630ea7c6..3ef203a45c3e 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -2906,7 +2906,8 @@ static int rdpmc_interception(struct vcpu_svm *svm)
 	return 1;
 }
 
-bool check_selective_cr0_intercepted(struct vcpu_svm *svm, unsigned long val)
+static bool check_selective_cr0_intercepted(struct vcpu_svm *svm,
+					    unsigned long val)
 {
 	unsigned long cr0 = svm->vcpu.arch.cr0;
 	bool ret = false;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 130926392a25..cc2c759f69a3 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4476,7 +4476,8 @@ mmio:
 	return X86EMUL_CONTINUE;
 }
 
-int emulator_read_write(struct x86_emulate_ctxt *ctxt, unsigned long addr,
+static int emulator_read_write(struct x86_emulate_ctxt *ctxt,
+			unsigned long addr,
 			void *val, unsigned int bytes,
 			struct x86_exception *exception,
 			const struct read_write_emulator_ops *ops)
@@ -4539,7 +4540,7 @@ static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt,
 				   exception, &read_emultor);
 }
 
-int emulator_write_emulated(struct x86_emulate_ctxt *ctxt,
+static int emulator_write_emulated(struct x86_emulate_ctxt *ctxt,
 			    unsigned long addr,
 			    const void *val,
 			    unsigned int bytes,
@@ -4738,12 +4739,14 @@ static void emulator_wbinvd(struct x86_emulate_ctxt *ctxt)
 	kvm_emulate_wbinvd_noskip(emul_to_vcpu(ctxt));
 }
 
-int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest)
+static int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr,
+			   unsigned long *dest)
 {
 	return kvm_get_dr(emul_to_vcpu(ctxt), dr, dest);
 }
 
-int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value)
+static int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr,
+			   unsigned long value)
 {
 
 	return __kvm_set_dr(emul_to_vcpu(ctxt), dr, value);
-- 
cgit v1.2.3


From 431d452af13720463dda498999b2e9a08729c03a Mon Sep 17 00:00:00 2001
From: Zhonghui Fu
Date: Wed, 18 Mar 2015 15:54:27 +0100
Subject: PM / sleep: add pm-trace support for suspending phase

Occasionally, the system can't come back up after suspend/resume
due to problems of device suspending phase. This patch make
PM_TRACE infrastructure cover device suspending phase of
suspend/resume process, and the information in RTC can tell
developers which device suspending function make system hang.

Signed-off-by: Zhonghui Fu <zhonghui.fu@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 arch/x86/include/asm/pm-trace.h     | 23 +++++++++++++++++++++++
 arch/x86/include/asm/resume-trace.h | 21 ---------------------
 drivers/base/power/main.c           | 20 ++++++++++++++++----
 drivers/base/power/trace.c          |  6 +++---
 include/linux/pm-trace.h            | 35 +++++++++++++++++++++++++++++++++++
 include/linux/resume-trace.h        | 34 ----------------------------------
 kernel/power/main.c                 |  2 +-
 7 files changed, 78 insertions(+), 63 deletions(-)
 create mode 100644 arch/x86/include/asm/pm-trace.h
 delete mode 100644 arch/x86/include/asm/resume-trace.h
 create mode 100644 include/linux/pm-trace.h
 delete mode 100644 include/linux/resume-trace.h

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/pm-trace.h b/arch/x86/include/asm/pm-trace.h
new file mode 100644
index 000000000000..7b7ac42c3661
--- /dev/null
+++ b/arch/x86/include/asm/pm-trace.h
@@ -0,0 +1,23 @@
+#ifndef _ASM_X86_PM_TRACE_H
+#define _ASM_X86_PM_TRACE_H
+
+#include <asm/asm.h>
+
+#define TRACE_RESUME(user)					\
+do {								\
+	if (pm_trace_enabled) {					\
+		const void *tracedata;				\
+		asm volatile(_ASM_MOV " $1f,%0\n"		\
+			     ".section .tracedata,\"a\"\n"	\
+			     "1:\t.word %c1\n\t"		\
+			     _ASM_PTR " %c2\n"			\
+			     ".previous"			\
+			     :"=r" (tracedata)			\
+			     : "i" (__LINE__), "i" (__FILE__));	\
+		generate_pm_trace(tracedata, user);		\
+	}							\
+} while (0)
+
+#define TRACE_SUSPEND(user)	TRACE_RESUME(user)
+
+#endif /* _ASM_X86_PM_TRACE_H */
diff --git a/arch/x86/include/asm/resume-trace.h b/arch/x86/include/asm/resume-trace.h
deleted file mode 100644
index 3ff1c2cb1da5..000000000000
--- a/arch/x86/include/asm/resume-trace.h
+++ /dev/null
@@ -1,21 +0,0 @@
-#ifndef _ASM_X86_RESUME_TRACE_H
-#define _ASM_X86_RESUME_TRACE_H
-
-#include <asm/asm.h>
-
-#define TRACE_RESUME(user)					\
-do {								\
-	if (pm_trace_enabled) {					\
-		const void *tracedata;				\
-		asm volatile(_ASM_MOV " $1f,%0\n"		\
-			     ".section .tracedata,\"a\"\n"	\
-			     "1:\t.word %c1\n\t"		\
-			     _ASM_PTR " %c2\n"			\
-			     ".previous"			\
-			     :"=r" (tracedata)			\
-			     : "i" (__LINE__), "i" (__FILE__));	\
-		generate_resume_trace(tracedata, user);		\
-	}							\
-} while (0)
-
-#endif /* _ASM_X86_RESUME_TRACE_H */
diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c
index 9717d5f20139..3d874eca7104 100644
--- a/drivers/base/power/main.c
+++ b/drivers/base/power/main.c
@@ -23,7 +23,7 @@
 #include <linux/mutex.h>
 #include <linux/pm.h>
 #include <linux/pm_runtime.h>
-#include <linux/resume-trace.h>
+#include <linux/pm-trace.h>
 #include <linux/interrupt.h>
 #include <linux/sched.h>
 #include <linux/async.h>
@@ -1017,6 +1017,9 @@ static int __device_suspend_noirq(struct device *dev, pm_message_t state, bool a
 	char *info = NULL;
 	int error = 0;
 
+	TRACE_DEVICE(dev);
+	TRACE_SUSPEND(0);
+
 	if (async_error)
 		goto Complete;
 
@@ -1057,6 +1060,7 @@ static int __device_suspend_noirq(struct device *dev, pm_message_t state, bool a
 
 Complete:
 	complete_all(&dev->power.completion);
+	TRACE_SUSPEND(error);
 	return error;
 }
 
@@ -1078,7 +1082,7 @@ static int device_suspend_noirq(struct device *dev)
 {
 	reinit_completion(&dev->power.completion);
 
-	if (pm_async_enabled && dev->power.async_suspend) {
+	if (is_async(dev)) {
 		get_device(dev);
 		async_schedule(async_suspend_noirq, dev);
 		return 0;
@@ -1157,6 +1161,9 @@ static int __device_suspend_late(struct device *dev, pm_message_t state, bool as
 	char *info = NULL;
 	int error = 0;
 
+	TRACE_DEVICE(dev);
+	TRACE_SUSPEND(0);
+
 	__pm_runtime_disable(dev, false);
 
 	if (async_error)
@@ -1198,6 +1205,7 @@ static int __device_suspend_late(struct device *dev, pm_message_t state, bool as
 		async_error = error;
 
 Complete:
+	TRACE_SUSPEND(error);
 	complete_all(&dev->power.completion);
 	return error;
 }
@@ -1219,7 +1227,7 @@ static int device_suspend_late(struct device *dev)
 {
 	reinit_completion(&dev->power.completion);
 
-	if (pm_async_enabled && dev->power.async_suspend) {
+	if (is_async(dev)) {
 		get_device(dev);
 		async_schedule(async_suspend_late, dev);
 		return 0;
@@ -1338,6 +1346,9 @@ static int __device_suspend(struct device *dev, pm_message_t state, bool async)
 	int error = 0;
 	DECLARE_DPM_WATCHDOG_ON_STACK(wd);
 
+	TRACE_DEVICE(dev);
+	TRACE_SUSPEND(0);
+
 	dpm_wait_for_children(dev, async);
 
 	if (async_error)
@@ -1444,6 +1455,7 @@ static int __device_suspend(struct device *dev, pm_message_t state, bool async)
 	if (error)
 		async_error = error;
 
+	TRACE_SUSPEND(error);
 	return error;
 }
 
@@ -1465,7 +1477,7 @@ static int device_suspend(struct device *dev)
 {
 	reinit_completion(&dev->power.completion);
 
-	if (pm_async_enabled && dev->power.async_suspend) {
+	if (is_async(dev)) {
 		get_device(dev);
 		async_schedule(async_suspend, dev);
 		return 0;
diff --git a/drivers/base/power/trace.c b/drivers/base/power/trace.c
index d94a1f5121cf..a311cfa4c5bd 100644
--- a/drivers/base/power/trace.c
+++ b/drivers/base/power/trace.c
@@ -7,7 +7,7 @@
  * devices may be working.
  */
 
-#include <linux/resume-trace.h>
+#include <linux/pm-trace.h>
 #include <linux/export.h>
 #include <linux/rtc.h>
 
@@ -154,7 +154,7 @@ EXPORT_SYMBOL(set_trace_device);
  * it's not any guarantee, but it's a high _likelihood_ that
  * the match is valid).
  */
-void generate_resume_trace(const void *tracedata, unsigned int user)
+void generate_pm_trace(const void *tracedata, unsigned int user)
 {
 	unsigned short lineno = *(unsigned short *)tracedata;
 	const char *file = *(const char **)(tracedata + 2);
@@ -164,7 +164,7 @@ void generate_resume_trace(const void *tracedata, unsigned int user)
 	file_hash_value = hash_string(lineno, file, FILEHASH);
 	set_magic_time(user_hash_value, file_hash_value, dev_hash_value);
 }
-EXPORT_SYMBOL(generate_resume_trace);
+EXPORT_SYMBOL(generate_pm_trace);
 
 extern char __tracedata_start, __tracedata_end;
 static int show_file_hash(unsigned int value)
diff --git a/include/linux/pm-trace.h b/include/linux/pm-trace.h
new file mode 100644
index 000000000000..ecbde7a5548e
--- /dev/null
+++ b/include/linux/pm-trace.h
@@ -0,0 +1,35 @@
+#ifndef PM_TRACE_H
+#define PM_TRACE_H
+
+#ifdef CONFIG_PM_TRACE
+#include <asm/pm-trace.h>
+#include <linux/types.h>
+
+extern int pm_trace_enabled;
+
+static inline int pm_trace_is_enabled(void)
+{
+       return pm_trace_enabled;
+}
+
+struct device;
+extern void set_trace_device(struct device *);
+extern void generate_pm_trace(const void *tracedata, unsigned int user);
+extern int show_trace_dev_match(char *buf, size_t size);
+
+#define TRACE_DEVICE(dev) do { \
+	if (pm_trace_enabled) \
+		set_trace_device(dev); \
+	} while(0)
+
+#else
+
+static inline int pm_trace_is_enabled(void) { return 0; }
+
+#define TRACE_DEVICE(dev) do { } while (0)
+#define TRACE_RESUME(dev) do { } while (0)
+#define TRACE_SUSPEND(dev) do { } while (0)
+
+#endif
+
+#endif
diff --git a/include/linux/resume-trace.h b/include/linux/resume-trace.h
deleted file mode 100644
index f31db2368782..000000000000
--- a/include/linux/resume-trace.h
+++ /dev/null
@@ -1,34 +0,0 @@
-#ifndef RESUME_TRACE_H
-#define RESUME_TRACE_H
-
-#ifdef CONFIG_PM_TRACE
-#include <asm/resume-trace.h>
-#include <linux/types.h>
-
-extern int pm_trace_enabled;
-
-static inline int pm_trace_is_enabled(void)
-{
-       return pm_trace_enabled;
-}
-
-struct device;
-extern void set_trace_device(struct device *);
-extern void generate_resume_trace(const void *tracedata, unsigned int user);
-extern int show_trace_dev_match(char *buf, size_t size);
-
-#define TRACE_DEVICE(dev) do { \
-	if (pm_trace_enabled) \
-		set_trace_device(dev); \
-	} while(0)
-
-#else
-
-static inline int pm_trace_is_enabled(void) { return 0; }
-
-#define TRACE_DEVICE(dev) do { } while (0)
-#define TRACE_RESUME(dev) do { } while (0)
-
-#endif
-
-#endif
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 9a59d042ea84..86e8157a450f 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -11,7 +11,7 @@
 #include <linux/export.h>
 #include <linux/kobject.h>
 #include <linux/string.h>
-#include <linux/resume-trace.h>
+#include <linux/pm-trace.h>
 #include <linux/workqueue.h>
 #include <linux/debugfs.h>
 #include <linux/seq_file.h>
-- 
cgit v1.2.3


From faac2458518e20130664d77b657303758f1aaf5a Mon Sep 17 00:00:00 2001
From: Bandan Das
Date: Mon, 16 Mar 2015 17:18:25 -0400
Subject: KVM: SVM: Fix confusing message if no exit handlers are installed

I hit this path on a AMD box and thought
someone was playing a April Fool's joke on me.

Signed-off-by: Bandan Das <bsd@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/svm.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 3ef203a45c3e..155534c0f5e8 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -3562,7 +3562,7 @@ static int handle_exit(struct kvm_vcpu *vcpu)
 
 	if (exit_code >= ARRAY_SIZE(svm_exit_handlers)
 	    || !svm_exit_handlers[exit_code]) {
-		WARN_ONCE(1, "vmx: unexpected exit reason 0x%x\n", exit_code);
+		WARN_ONCE(1, "svm: unexpected exit reason 0x%x\n", exit_code);
 		kvm_queue_exception(vcpu, UD_VECTOR);
 		return 1;
 	}
-- 
cgit v1.2.3


From b97ea289cf6aff8d4cbcefe2b707bb9b00a73c73 Mon Sep 17 00:00:00 2001
From: Yijing Wang
Date: Mon, 16 Mar 2015 11:18:56 +0800
Subject: PCI: Assign resources before drivers claim devices
 (pci_scan_root_bus())

Previously, pci_scan_root_bus() created a root PCI bus, enumerated the
devices on it, and called pci_bus_add_devices(), which made the devices
available for drivers to claim them.

Most callers assigned resources to devices after pci_scan_root_bus()
returns, which may be after drivers have claimed the devices.  This is
incorrect; the PCI core should not change device resources while a driver
is managing the device.

Remove pci_bus_add_devices() from pci_scan_root_bus() and do it after any
resource assignment in the callers.

Note that ARM's pci_common_init_dev() already called pci_bus_add_devices()
after pci_scan_root_bus(), so we only need to remove the first call:

  pci_common_init_dev
    pcibios_init_hw
      pci_scan_root_bus
        pci_bus_add_devices        # first call
    pci_bus_assign_resources
    pci_bus_add_devices            # second call

[bhelgaas: changelog, drop "root_bus" var in alpha common_init_pci(),
return failure earlier in mn10300, add "return" in x86 pcibios_scan_root(),
return early if xtensa platform_pcibios_fixup() fails]
Signed-off-by: Yijing Wang <wangyijing@huawei.com>
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
CC: Richard Henderson <rth@twiddle.net>
CC: Ivan Kokshaysky <ink@jurassic.park.msu.ru>
CC: Matt Turner <mattst88@gmail.com>
CC: David Howells <dhowells@redhat.com>
CC: Tony Luck <tony.luck@intel.com>
CC: Michal Simek <monstr@monstr.eu>
CC: Ralf Baechle <ralf@linux-mips.org>
CC: Koichi Yasutake <yasutake.koichi@jp.panasonic.com>
CC: Sebastian Ott <sebott@linux.vnet.ibm.com>
CC: "David S. Miller" <davem@davemloft.net>
CC: Chris Metcalf <cmetcalf@ezchip.com>
CC: Chris Zankel <chris@zankel.net>
CC: Max Filippov <jcmvbkbc@gmail.com>
CC: Thomas Gleixner <tglx@linutronix.de>
---
 arch/alpha/kernel/pci.c          |  7 +++++++
 arch/frv/mb93090-mb00/pci-vdk.c  |  6 +++++-
 arch/ia64/sn/kernel/io_init.c    |  2 ++
 arch/microblaze/pci/pci-common.c |  4 ++++
 arch/mips/pci/pci.c              |  1 +
 arch/mn10300/unit-asb2305/pci.c  |  6 +++++-
 arch/s390/pci/pci.c              |  2 +-
 arch/sh/drivers/pci/pci.c        |  1 +
 arch/sparc/kernel/leon_pci.c     |  1 +
 arch/tile/kernel/pci.c           |  2 ++
 arch/tile/kernel/pci_gx.c        |  2 ++
 arch/x86/pci/common.c            |  2 ++
 arch/xtensa/kernel/pci.c         | 15 +++++++++++++--
 drivers/pci/host/pci-versatile.c |  1 +
 drivers/pci/probe.c              |  1 -
 15 files changed, 47 insertions(+), 6 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/alpha/kernel/pci.c b/arch/alpha/kernel/pci.c
index 98a1525fa164..82f738e5d54c 100644
--- a/arch/alpha/kernel/pci.c
+++ b/arch/alpha/kernel/pci.c
@@ -338,6 +338,8 @@ common_init_pci(void)
 
 		bus = pci_scan_root_bus(NULL, next_busno, alpha_mv.pci_ops,
 					hose, &resources);
+		if (!bus)
+			continue;
 		hose->bus = bus;
 		hose->need_domain_info = need_domain_info;
 		next_busno = bus->busn_res.end + 1;
@@ -353,6 +355,11 @@ common_init_pci(void)
 
 	pci_assign_unassigned_resources();
 	pci_fixup_irqs(alpha_mv.pci_swizzle, alpha_mv.pci_map_irq);
+	for (hose = hose_head; hose; hose = hose->next) {
+		bus = hose->bus;
+		if (bus)
+			pci_bus_add_devices(bus);
+	}
 }
 
 
diff --git a/arch/frv/mb93090-mb00/pci-vdk.c b/arch/frv/mb93090-mb00/pci-vdk.c
index b073f4d771a5..f211839e2cae 100644
--- a/arch/frv/mb93090-mb00/pci-vdk.c
+++ b/arch/frv/mb93090-mb00/pci-vdk.c
@@ -316,6 +316,7 @@ void pcibios_fixup_bus(struct pci_bus *bus)
 
 int __init pcibios_init(void)
 {
+	struct pci_bus *bus;
 	struct pci_ops *dir = NULL;
 	LIST_HEAD(resources);
 
@@ -383,12 +384,15 @@ int __init pcibios_init(void)
 	printk("PCI: Probing PCI hardware\n");
 	pci_add_resource(&resources, &pci_ioport_resource);
 	pci_add_resource(&resources, &pci_iomem_resource);
-	pci_scan_root_bus(NULL, 0, pci_root_ops, NULL, &resources);
+	bus = pci_scan_root_bus(NULL, 0, pci_root_ops, NULL, &resources);
 
 	pcibios_irq_init();
 	pcibios_fixup_irqs();
 	pcibios_resource_survey();
+	if (!bus)
+		return 0;
 
+	pci_bus_add_devices(bus);
 	return 0;
 }
 
diff --git a/arch/ia64/sn/kernel/io_init.c b/arch/ia64/sn/kernel/io_init.c
index 0b5ce82d203d..1be65eb074ec 100644
--- a/arch/ia64/sn/kernel/io_init.c
+++ b/arch/ia64/sn/kernel/io_init.c
@@ -271,7 +271,9 @@ sn_pci_controller_fixup(int segment, int busnum, struct pci_bus *bus)
  	if (bus == NULL) {
 		kfree(res);
 		kfree(controller);
+		return;
 	}
+	pci_bus_add_devices(bus);
 }
 
 /*
diff --git a/arch/microblaze/pci/pci-common.c b/arch/microblaze/pci/pci-common.c
index 48528fb81eff..ae838ed5fcf2 100644
--- a/arch/microblaze/pci/pci-common.c
+++ b/arch/microblaze/pci/pci-common.c
@@ -1382,6 +1382,10 @@ static int __init pcibios_init(void)
 
 	/* Call common code to handle resource allocation */
 	pcibios_resource_survey();
+	list_for_each_entry_safe(hose, tmp, &hose_list, list_node) {
+		if (hose->bus)
+			pci_bus_add_devices(hose->bus);
+	}
 
 	return 0;
 }
diff --git a/arch/mips/pci/pci.c b/arch/mips/pci/pci.c
index 1bf60b127377..9eb54b557c9f 100644
--- a/arch/mips/pci/pci.c
+++ b/arch/mips/pci/pci.c
@@ -114,6 +114,7 @@ static void pcibios_scanbus(struct pci_controller *hose)
 			pci_bus_size_bridges(bus);
 			pci_bus_assign_resources(bus);
 		}
+		pci_bus_add_devices(bus);
 	}
 }
 
diff --git a/arch/mn10300/unit-asb2305/pci.c b/arch/mn10300/unit-asb2305/pci.c
index 613ca1e55b4b..3dfe2d31c67b 100644
--- a/arch/mn10300/unit-asb2305/pci.c
+++ b/arch/mn10300/unit-asb2305/pci.c
@@ -342,6 +342,7 @@ static int __init pcibios_init(void)
 {
 	resource_size_t io_offset, mem_offset;
 	LIST_HEAD(resources);
+	struct pci_bus *bus;
 
 	ioport_resource.start	= 0xA0000000;
 	ioport_resource.end	= 0xDFFFFFFF;
@@ -371,11 +372,14 @@ static int __init pcibios_init(void)
 
 	pci_add_resource_offset(&resources, &pci_ioport_resource, io_offset);
 	pci_add_resource_offset(&resources, &pci_iomem_resource, mem_offset);
-	pci_scan_root_bus(NULL, 0, &pci_direct_ampci, NULL, &resources);
+	bus = pci_scan_root_bus(NULL, 0, &pci_direct_ampci, NULL, &resources);
+	if (!bus)
+		return 0;
 
 	pcibios_irq_init();
 	pcibios_fixup_irqs();
 	pcibios_resource_survey();
+	pci_bus_add_devices(bus);
 	return 0;
 }
 
diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c
index 753a56731951..a2a7391c0b9a 100644
--- a/arch/s390/pci/pci.c
+++ b/arch/s390/pci/pci.c
@@ -776,8 +776,8 @@ static int zpci_scan_bus(struct zpci_dev *zdev)
 		zpci_cleanup_bus_resources(zdev);
 		return -EIO;
 	}
-
 	zdev->bus->max_bus_speed = zdev->max_bus_speed;
+	pci_bus_add_devices(zdev->bus);
 	return 0;
 }
 
diff --git a/arch/sh/drivers/pci/pci.c b/arch/sh/drivers/pci/pci.c
index 1bc09ee7948f..efc10519916a 100644
--- a/arch/sh/drivers/pci/pci.c
+++ b/arch/sh/drivers/pci/pci.c
@@ -69,6 +69,7 @@ static void pcibios_scanbus(struct pci_channel *hose)
 
 		pci_bus_size_bridges(bus);
 		pci_bus_assign_resources(bus);
+		pci_bus_add_devices(bus);
 	} else {
 		pci_free_resource_list(&resources);
 	}
diff --git a/arch/sparc/kernel/leon_pci.c b/arch/sparc/kernel/leon_pci.c
index 899b7203a4e4..297107679fdf 100644
--- a/arch/sparc/kernel/leon_pci.c
+++ b/arch/sparc/kernel/leon_pci.c
@@ -40,6 +40,7 @@ void leon_pci_init(struct platform_device *ofdev, struct leon_pci_info *info)
 
 		/* Assign devices with resources */
 		pci_assign_unassigned_resources();
+		pci_bus_add_devices(root_bus);
 	} else {
 		pci_free_resource_list(&resources);
 	}
diff --git a/arch/tile/kernel/pci.c b/arch/tile/kernel/pci.c
index 325df47f114d..9475a74cd53a 100644
--- a/arch/tile/kernel/pci.c
+++ b/arch/tile/kernel/pci.c
@@ -339,6 +339,8 @@ int __init pcibios_init(void)
 			struct pci_bus *next_bus;
 			struct pci_dev *dev;
 
+			pci_bus_add_devices(root_bus);
+
 			list_for_each_entry(dev, &root_bus->devices, bus_list) {
 				/*
 				 * Find the PCI host controller, ie. the 1st
diff --git a/arch/tile/kernel/pci_gx.c b/arch/tile/kernel/pci_gx.c
index 2c95f37ebbed..b1df847d0686 100644
--- a/arch/tile/kernel/pci_gx.c
+++ b/arch/tile/kernel/pci_gx.c
@@ -1030,6 +1030,8 @@ int __init pcibios_init(void)
 alloc_mem_map_failed:
 			break;
 		}
+
+		pci_bus_add_devices(root_bus);
 	}
 
 	return 0;
diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c
index 3d2612b68694..95a0ba70376b 100644
--- a/arch/x86/pci/common.c
+++ b/arch/x86/pci/common.c
@@ -490,7 +490,9 @@ void pcibios_scan_root(int busnum)
 	if (!bus) {
 		pci_free_resource_list(&resources);
 		kfree(sd);
+		return;
 	}
+	pci_bus_add_devices(bus);
 }
 
 void __init pcibios_set_cache_line_size(void)
diff --git a/arch/xtensa/kernel/pci.c b/arch/xtensa/kernel/pci.c
index 5b3403388d7f..b848cc3dc913 100644
--- a/arch/xtensa/kernel/pci.c
+++ b/arch/xtensa/kernel/pci.c
@@ -174,7 +174,7 @@ static int __init pcibios_init(void)
 	struct pci_controller *pci_ctrl;
 	struct list_head resources;
 	struct pci_bus *bus;
-	int next_busno = 0;
+	int next_busno = 0, ret;
 
 	printk("PCI: Probing PCI hardware\n");
 
@@ -185,14 +185,25 @@ static int __init pcibios_init(void)
 		pci_controller_apertures(pci_ctrl, &resources);
 		bus = pci_scan_root_bus(NULL, pci_ctrl->first_busno,
 					pci_ctrl->ops, pci_ctrl, &resources);
+		if (!bus)
+			continue;
+
 		pci_ctrl->bus = bus;
 		pci_ctrl->last_busno = bus->busn_res.end;
 		if (next_busno <= pci_ctrl->last_busno)
 			next_busno = pci_ctrl->last_busno+1;
 	}
 	pci_bus_count = next_busno;
+	ret = platform_pcibios_fixup();
+	if (ret)
+		return ret;
 
-	return platform_pcibios_fixup();
+	for (pci_ctrl = pci_ctrl_head; pci_ctrl; pci_ctrl = pci_ctrl->next) {
+		if (pci_ctrl->bus)
+			pci_bus_add_devices(pci_ctrl->bus);
+	}
+
+	return 0;
 }
 
 subsys_initcall(pcibios_init);
diff --git a/drivers/pci/host/pci-versatile.c b/drivers/pci/host/pci-versatile.c
index 1ec694a52379..e3a2450db2b8 100644
--- a/drivers/pci/host/pci-versatile.c
+++ b/drivers/pci/host/pci-versatile.c
@@ -214,6 +214,7 @@ static int versatile_pci_probe(struct platform_device *pdev)
 
 	pci_fixup_irqs(pci_common_swizzle, of_irq_parse_and_map_pci);
 	pci_assign_unassigned_bus_resources(bus);
+	pci_bus_add_devices(bus);
 
 	return 0;
 }
diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index 88604f29d140..8ef0375ea314 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -2087,7 +2087,6 @@ struct pci_bus *pci_scan_root_bus(struct device *parent, int bus,
 	if (!found)
 		pci_bus_update_busn_res_end(b, max);
 
-	pci_bus_add_devices(b);
 	return b;
 }
 EXPORT_SYMBOL(pci_scan_root_bus);
-- 
cgit v1.2.3


From 9e8ce4b96b781b003e3174fbbc62e1d4388c8b8f Mon Sep 17 00:00:00 2001
From: Rafael J. Wysocki
Date: Fri, 20 Mar 2015 14:56:19 +0100
Subject: Revert "x86/PCI: Refine the way to release PCI IRQ resources"

Commit b4b55cda5874 (Refine the way to release PCI IRQ resources)
introduced a regression in the PCI IRQ resource management by causing
the IRQ resource of a device, established when pci_enabled_device()
is called on a fully disabled device, to be released when the driver
is unbound from the device, regardless of the enable_cnt.

This leads to the situation that an ill-behaved driver can now make a
device unusable to subsequent drivers by an imbalance in their use of
pci_enable/disable_device().  That is a serious problem for secondary
drivers like vfio-pci, which are innocent of the transgressions of
the previous driver.

Since the solution of this problem is not immediate and requires
further discussion, revert commit b4b55cda5874 and the issue it was
supposed to address (a bug related to xen-pciback) will be taken
care of in a different way going forward.

Reported-by: Alex Williamson <alex.williamson@redhat.com>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
---
 arch/x86/include/asm/pci_x86.h |  2 ++
 arch/x86/pci/common.c          | 34 ++++++----------------------------
 arch/x86/pci/intel_mid_pci.c   |  4 ++--
 arch/x86/pci/irq.c             | 15 ++++++++++++++-
 drivers/acpi/pci_irq.c         |  9 ++++++++-
 5 files changed, 32 insertions(+), 32 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h
index fa1195dae425..164e3f8d3c3d 100644
--- a/arch/x86/include/asm/pci_x86.h
+++ b/arch/x86/include/asm/pci_x86.h
@@ -93,6 +93,8 @@ extern raw_spinlock_t pci_config_lock;
 extern int (*pcibios_enable_irq)(struct pci_dev *dev);
 extern void (*pcibios_disable_irq)(struct pci_dev *dev);
 
+extern bool mp_should_keep_irq(struct device *dev);
+
 struct pci_raw_ops {
 	int (*read)(unsigned int domain, unsigned int bus, unsigned int devfn,
 						int reg, int len, u32 *val);
diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c
index 3d2612b68694..2fb384724ebb 100644
--- a/arch/x86/pci/common.c
+++ b/arch/x86/pci/common.c
@@ -513,31 +513,6 @@ void __init pcibios_set_cache_line_size(void)
 	}
 }
 
-/*
- * Some device drivers assume dev->irq won't change after calling
- * pci_disable_device(). So delay releasing of IRQ resource to driver
- * unbinding time. Otherwise it will break PM subsystem and drivers
- * like xen-pciback etc.
- */
-static int pci_irq_notifier(struct notifier_block *nb, unsigned long action,
-			    void *data)
-{
-	struct pci_dev *dev = to_pci_dev(data);
-
-	if (action != BUS_NOTIFY_UNBOUND_DRIVER)
-		return NOTIFY_DONE;
-
-	if (pcibios_disable_irq)
-		pcibios_disable_irq(dev);
-
-	return NOTIFY_OK;
-}
-
-static struct notifier_block pci_irq_nb = {
-	.notifier_call = pci_irq_notifier,
-	.priority = INT_MIN,
-};
-
 int __init pcibios_init(void)
 {
 	if (!raw_pci_ops) {
@@ -550,9 +525,6 @@ int __init pcibios_init(void)
 
 	if (pci_bf_sort >= pci_force_bf)
 		pci_sort_breadthfirst();
-
-	bus_register_notifier(&pci_bus_type, &pci_irq_nb);
-
 	return 0;
 }
 
@@ -711,6 +683,12 @@ int pcibios_enable_device(struct pci_dev *dev, int mask)
 	return 0;
 }
 
+void pcibios_disable_device (struct pci_dev *dev)
+{
+	if (!pci_dev_msi_enabled(dev) && pcibios_disable_irq)
+		pcibios_disable_irq(dev);
+}
+
 int pci_ext_cfg_avail(void)
 {
 	if (raw_pci_ext_ops)
diff --git a/arch/x86/pci/intel_mid_pci.c b/arch/x86/pci/intel_mid_pci.c
index efb849323c74..852aa4c92da0 100644
--- a/arch/x86/pci/intel_mid_pci.c
+++ b/arch/x86/pci/intel_mid_pci.c
@@ -234,10 +234,10 @@ static int intel_mid_pci_irq_enable(struct pci_dev *dev)
 
 static void intel_mid_pci_irq_disable(struct pci_dev *dev)
 {
-	if (dev->irq_managed && dev->irq > 0) {
+	if (!mp_should_keep_irq(&dev->dev) && dev->irq_managed &&
+	    dev->irq > 0) {
 		mp_unmap_irq(dev->irq);
 		dev->irq_managed = 0;
-		dev->irq = 0;
 	}
 }
 
diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c
index e71b3dbd87b8..5dc6ca5e1741 100644
--- a/arch/x86/pci/irq.c
+++ b/arch/x86/pci/irq.c
@@ -1256,9 +1256,22 @@ static int pirq_enable_irq(struct pci_dev *dev)
 	return 0;
 }
 
+bool mp_should_keep_irq(struct device *dev)
+{
+	if (dev->power.is_prepared)
+		return true;
+#ifdef CONFIG_PM
+	if (dev->power.runtime_status == RPM_SUSPENDING)
+		return true;
+#endif
+
+	return false;
+}
+
 static void pirq_disable_irq(struct pci_dev *dev)
 {
-	if (io_apic_assign_pci_irqs && dev->irq_managed && dev->irq) {
+	if (io_apic_assign_pci_irqs && !mp_should_keep_irq(&dev->dev) &&
+	    dev->irq_managed && dev->irq) {
 		mp_unmap_irq(dev->irq);
 		dev->irq = 0;
 		dev->irq_managed = 0;
diff --git a/drivers/acpi/pci_irq.c b/drivers/acpi/pci_irq.c
index e7f718d6918a..b1def411c0b8 100644
--- a/drivers/acpi/pci_irq.c
+++ b/drivers/acpi/pci_irq.c
@@ -485,6 +485,14 @@ void acpi_pci_irq_disable(struct pci_dev *dev)
 	if (!pin || !dev->irq_managed || dev->irq <= 0)
 		return;
 
+	/* Keep IOAPIC pin configuration when suspending */
+	if (dev->dev.power.is_prepared)
+		return;
+#ifdef	CONFIG_PM
+	if (dev->dev.power.runtime_status == RPM_SUSPENDING)
+		return;
+#endif
+
 	entry = acpi_pci_irq_lookup(dev, pin);
 	if (!entry)
 		return;
@@ -505,6 +513,5 @@ void acpi_pci_irq_disable(struct pci_dev *dev)
 	if (gsi >= 0) {
 		acpi_unregister_gsi(gsi);
 		dev->irq_managed = 0;
-		dev->irq = 0;
 	}
 }
-- 
cgit v1.2.3


From 1daeaa315164c60b937f56fe3848d4328c358eba Mon Sep 17 00:00:00 2001
From: Brian Gerst
Date: Sat, 21 Mar 2015 18:54:21 -0400
Subject: x86/asm/entry: Fix execve() and sigreturn() syscalls to always return
 via IRET

Both the execve() and sigreturn() family of syscalls have the
ability to change registers in ways that may not be compatabile
with the syscall path they were called from.

In particular, SYSRET and SYSEXIT can't handle non-default %cs and %ss,
and some bits in eflags.

These syscalls have stubs that are hardcoded to jump to the IRET path,
and not return to the original syscall path.

The following commit:

   76f5df43cab5e76 ("Always allocate a complete "struct pt_regs" on the kernel stack")

recently changed this for some 32-bit compat syscalls, but introduced a bug where
execve from a 32-bit program to a 64-bit program would fail because it still returned
via SYSRETL. This caused Wine to fail when built for both 32-bit and 64-bit.

This patch sets TIF_NOTIFY_RESUME for execve() and sigreturn() so
that the IRET path is always taken on exit to userspace.

Signed-off-by: Brian Gerst <brgerst@gmail.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Link: http://lkml.kernel.org/r/1426978461-32089-1-git-send-email-brgerst@gmail.com
[ Improved the changelog and comments. ]
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/ia32/ia32_signal.c        |  2 ++
 arch/x86/include/asm/ptrace.h      |  2 +-
 arch/x86/include/asm/thread_info.h | 10 ++++++++++
 arch/x86/kernel/process_32.c       |  6 +-----
 arch/x86/kernel/process_64.c       |  1 +
 arch/x86/kernel/signal.c           |  2 ++
 6 files changed, 17 insertions(+), 6 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c
index d0165c9a2932..1f5e2b0e09ff 100644
--- a/arch/x86/ia32/ia32_signal.c
+++ b/arch/x86/ia32/ia32_signal.c
@@ -203,6 +203,8 @@ static int ia32_restore_sigcontext(struct pt_regs *regs,
 
 	err |= restore_xstate_sig(buf, 1);
 
+	force_iret();
+
 	return err;
 }
 
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index 74bb2e0f3030..83b874da2762 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -251,7 +251,7 @@ static inline unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs,
  */
 #define arch_ptrace_stop_needed(code, info)				\
 ({									\
-	set_thread_flag(TIF_NOTIFY_RESUME);				\
+	force_iret();							\
 	false;								\
 })
 
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index ba115eb6fbcf..0abf7ab20ce2 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -260,6 +260,16 @@ static inline bool is_ia32_task(void)
 #endif
 	return false;
 }
+
+/*
+ * Force syscall return via IRET by making it look as if there was
+ * some work pending. IRET is our most capable (but slowest) syscall
+ * return path, which is able to restore modified SS, CS and certain
+ * EFLAGS values that other (fast) syscall return instructions
+ * are not able to restore properly.
+ */
+#define force_iret() set_thread_flag(TIF_NOTIFY_RESUME)
+
 #endif	/* !__ASSEMBLY__ */
 
 #ifndef __ASSEMBLY__
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 1b9963faf4eb..26c596d1ee07 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -206,11 +206,7 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
 	regs->ip		= new_ip;
 	regs->sp		= new_sp;
 	regs->flags		= X86_EFLAGS_IF;
-	/*
-	 * force it to the iret return path by making it look as if there was
-	 * some work pending.
-	 */
-	set_thread_flag(TIF_NOTIFY_RESUME);
+	force_iret();
 }
 EXPORT_SYMBOL_GPL(start_thread);
 
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 97f5658290b7..da8b74598d90 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -239,6 +239,7 @@ start_thread_common(struct pt_regs *regs, unsigned long new_ip,
 	regs->cs		= _cs;
 	regs->ss		= _ss;
 	regs->flags		= X86_EFLAGS_IF;
+	force_iret();
 }
 
 void
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index edcb862cdcae..eaa2c5e3f2cd 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -108,6 +108,8 @@ int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
 
 	err |= restore_xstate_sig(buf, config_enabled(CONFIG_X86_32));
 
+	force_iret();
+
 	return err;
 }
 
-- 
cgit v1.2.3


From d31bf07f71a5568b48c5ed448e4299050469f615 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski
Date: Wed, 18 Mar 2015 18:33:27 -0700
Subject: x86/mm/fault: Use TASK_SIZE_MAX in is_prefetch()

This is slightly shorter and slightly faster.  It's also more
correct: the split between user and kernel addresses is
TASK_SIZE_MAX, regardless of ti->flags.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brad Spengler <spender@grsecurity.net>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/09156b63bad90a327827003c9e53faa82ef4c56e.1426728647.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/mm/fault.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index ede025fb46f1..ae340d3761ca 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -148,7 +148,7 @@ is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr)
 	instr = (void *)convert_ip_to_linear(current, regs);
 	max_instr = instr + 15;
 
-	if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)
+	if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE_MAX)
 		return 0;
 
 	while (instr < max_instr) {
-- 
cgit v1.2.3


From c56716af8d27ca8dd6e45445ae1c0a05fd9753a6 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski
Date: Wed, 18 Mar 2015 18:33:28 -0700
Subject: x86/asm/entry, perf: Fix incorrect TIF_IA32 check in
 code_segment_base()

We want to check whether user code is in 32-bit mode, not
whether the task is nominally 32-bit.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brad Spengler <spender@grsecurity.net>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/33e5107085ce347a8303560302b15c2cadd62c4c.1426728647.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index b71a7f86d68a..979963bb3977 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -2161,10 +2161,9 @@ static unsigned long code_segment_base(struct pt_regs *regs)
 	if (user_mode(regs) && regs->cs != __USER_CS)
 		return get_segment_base(regs->cs);
 #else
-	if (test_thread_flag(TIF_IA32)) {
-		if (user_mode(regs) && regs->cs != __USER32_CS)
-			return get_segment_base(regs->cs);
-	}
+	if (user_mode(regs) && !user_64bit_mode(regs) &&
+	    regs->cs != __USER32_CS)
+		return get_segment_base(regs->cs);
 #endif
 	return 0;
 }
-- 
cgit v1.2.3


From fb14b4eadf73500d3b2104f031472a268562c047 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov
Date: Wed, 11 Mar 2015 18:34:09 +0100
Subject: x86/fpu: Document user_fpu_begin()

Currently, user_fpu_begin() has a single caller and it is not clear why
do we actually need it and why we should not worry about preemption
right after preempt_enable().

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Pekka Riikonen <priikone@iki.fi>
Cc: Quentin Casasnovas <quentin.casasnovas@oracle.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Suresh Siddha <sbsiddha@gmail.com>
Link: http://lkml.kernel.org/r/20150311173409.GC5032@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/fpu-internal.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/fpu-internal.h b/arch/x86/include/asm/fpu-internal.h
index 810f20fd4e4e..c58c9302152b 100644
--- a/arch/x86/include/asm/fpu-internal.h
+++ b/arch/x86/include/asm/fpu-internal.h
@@ -508,10 +508,12 @@ static inline int restore_xstate_sig(void __user *buf, int ia32_frame)
 }
 
 /*
- * Need to be preemption-safe.
+ * Needs to be preemption-safe.
  *
  * NOTE! user_fpu_begin() must be used only immediately before restoring
- * it. This function does not do any save/restore on their own.
+ * the save state. It does not do any saving/restoring on its own. In
+ * lazy FPU mode, it is just an optimization to avoid a #NM exception,
+ * the task can lose the FPU right after preempt_enable().
  */
 static inline void user_fpu_begin(void)
 {
-- 
cgit v1.2.3


From 8f4d81863ba4e8dfee93bd50840f1099a296251f Mon Sep 17 00:00:00 2001
From: Oleg Nesterov
Date: Wed, 11 Mar 2015 18:34:29 +0100
Subject: x86/fpu: Introduce restore_init_xstate()

Extract the "use_eager_fpu()" code from drop_init_fpu() into a new,
simple helper restore_init_xstate(). The next patch adds another user.

- It is not clear why we do not check use_fxsr() like fpu_restore_checking()
  does. eager_fpu_init_bp() calls setup_init_fpu_buf() too, and we have the
  "eagerfpu=on" kernel option.

- Ignoring the fact that init_xstate_buf is "struct xsave_struct *", not
  "union thread_xstate *", it is not clear why we can not simply use
  fpu_restore_checking() and avoid the code duplication.

- It is not clear why we can't call setup_init_fpu_buf() unconditionally
  to always create init_xstate_buf(). Then do_device_not_available() path
  (at least) could use restore_init_xstate() too. It doesn't need to init
  fpu->state, its content doesn't matter until unlazy_fpu()/__switch_to()/etc
  which overwrites this memory anyway.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Pekka Riikonen <priikone@iki.fi>
Cc: Quentin Casasnovas <quentin.casasnovas@oracle.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Suresh Siddha <sbsiddha@gmail.com>
Link: http://lkml.kernel.org/r/20150311173429.GD5032@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/fpu-internal.h | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/fpu-internal.h b/arch/x86/include/asm/fpu-internal.h
index c58c9302152b..7d2f7fa6b2dd 100644
--- a/arch/x86/include/asm/fpu-internal.h
+++ b/arch/x86/include/asm/fpu-internal.h
@@ -401,16 +401,20 @@ static inline void drop_fpu(struct task_struct *tsk)
 	preempt_enable();
 }
 
+static inline void restore_init_xstate(void)
+{
+	if (use_xsave())
+		xrstor_state(init_xstate_buf, -1);
+	else
+		fxrstor_checking(&init_xstate_buf->i387);
+}
+
 static inline void drop_init_fpu(struct task_struct *tsk)
 {
 	if (!use_eager_fpu())
 		drop_fpu(tsk);
-	else {
-		if (use_xsave())
-			xrstor_state(init_xstate_buf, -1);
-		else
-			fxrstor_checking(&init_xstate_buf->i387);
-	}
+	else
+		restore_init_xstate();
 }
 
 /*
-- 
cgit v1.2.3


From 9cb6ce823bbd1adbe15e30bd1435c84c2e271767 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov
Date: Wed, 11 Mar 2015 18:34:49 +0100
Subject: x86/fpu: Use restore_init_xstate() instead of math_state_restore() on
 kthread exec

Change flush_thread() to do user_fpu_begin() and restore_init_xstate()
instead of math_state_restore().

Note: "TODO: cleanup this horror" is still valid. We do not need
init_fpu() at all, we only need fpu_alloc() and memset(0). But this
needs other changes, in particular user_fpu_begin() should set
used_math().

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Pekka Riikonen <priikone@iki.fi>
Cc: Quentin Casasnovas <quentin.casasnovas@oracle.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Suresh Siddha <sbsiddha@gmail.com>
Link: http://lkml.kernel.org/r/20150311173449.GE5032@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/process.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index dcaf4b00d0b4..6b058296a456 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -143,7 +143,8 @@ void flush_thread(void)
 		/* kthread execs. TODO: cleanup this horror. */
 		if (WARN_ON(init_fpu(current)))
 			force_sig(SIGKILL, current);
-		math_state_restore();
+		user_fpu_begin();
+		restore_init_xstate();
 	}
 }
 
-- 
cgit v1.2.3


From f893959b0898bd876673adbeb6798bdf25c034d7 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov
Date: Fri, 13 Mar 2015 18:30:30 +0100
Subject: x86/fpu: Don't abuse drop_init_fpu() in flush_thread()

flush_thread() -> drop_init_fpu() is suboptimal and confusing. It does
drop_fpu() or restore_init_xstate() depending on !use_eager_fpu(). But
flush_thread() too checks eagerfpu right after that, and if it is true
then restore_init_xstate() just burns CPU for no reason. We are going to
load init_xstate_buf again after we set used_math()/user_has_fpu(), until
then the FPU state can't survive after switch_to().

Remove it, and change the "if (!use_eager_fpu())" to call drop_fpu().
While at it, clean up the tsk/current usage.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Pekka Riikonen <priikone@iki.fi>
Cc: Quentin Casasnovas <quentin.casasnovas@oracle.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Suresh Siddha <sbsiddha@gmail.com>
Link: http://lkml.kernel.org/r/20150313173030.GA31217@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/process.c | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 6b058296a456..1d2ebadba7ac 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -132,17 +132,14 @@ void flush_thread(void)
 	flush_ptrace_hw_breakpoint(tsk);
 	memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
 
-	drop_init_fpu(tsk);
-	/*
-	 * Free the FPU state for non xsave platforms. They get reallocated
-	 * lazily at the first use.
-	 */
-	if (!use_eager_fpu())
+	if (!use_eager_fpu()) {
+		/* FPU state will be reallocated lazily at the first use. */
+		drop_fpu(tsk);
 		free_thread_xstate(tsk);
-	else if (!used_math()) {
+	} else if (!used_math()) {
 		/* kthread execs. TODO: cleanup this horror. */
-		if (WARN_ON(init_fpu(current)))
-			force_sig(SIGKILL, current);
+		if (WARN_ON(init_fpu(tsk)))
+			force_sig(SIGKILL, tsk);
 		user_fpu_begin();
 		restore_init_xstate();
 	}
-- 
cgit v1.2.3


From d2d0ac9a4644e00120bb9b7427a512a99d2cacc5 Mon Sep 17 00:00:00 2001
From: Borislav Petkov
Date: Sat, 14 Mar 2015 11:52:12 +0100
Subject: x86/fpu: Fold __drop_fpu() into its sole user

Fold it into drop_fpu(). Phew, one less FPU function to pay attention
to.

No functionality change.

Signed-off-by: Borislav Petkov <bp@suse.de>
Acked-by: Oleg Nesterov <oleg@redhat.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Pekka Riikonen <priikone@iki.fi>
Cc: Quentin Casasnovas <quentin.casasnovas@oracle.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Suresh Siddha <sbsiddha@gmail.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/fpu-internal.h | 17 +++++++----------
 1 file changed, 7 insertions(+), 10 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/fpu-internal.h b/arch/x86/include/asm/fpu-internal.h
index 7d2f7fa6b2dd..2d4adff428ac 100644
--- a/arch/x86/include/asm/fpu-internal.h
+++ b/arch/x86/include/asm/fpu-internal.h
@@ -378,8 +378,14 @@ static inline void __thread_fpu_begin(struct task_struct *tsk)
 	__thread_set_has_fpu(tsk);
 }
 
-static inline void __drop_fpu(struct task_struct *tsk)
+static inline void drop_fpu(struct task_struct *tsk)
 {
+	/*
+	 * Forget coprocessor state..
+	 */
+	preempt_disable();
+	tsk->thread.fpu_counter = 0;
+
 	if (__thread_has_fpu(tsk)) {
 		/* Ignore delayed exceptions from user space */
 		asm volatile("1: fwait\n"
@@ -387,16 +393,7 @@ static inline void __drop_fpu(struct task_struct *tsk)
 			     _ASM_EXTABLE(1b, 2b));
 		__thread_fpu_end(tsk);
 	}
-}
 
-static inline void drop_fpu(struct task_struct *tsk)
-{
-	/*
-	 * Forget coprocessor state..
-	 */
-	preempt_disable();
-	tsk->thread.fpu_counter = 0;
-	__drop_fpu(tsk);
 	clear_stopped_child_used_math(tsk);
 	preempt_enable();
 }
-- 
cgit v1.2.3


From b85e67d1483c72b77d1bdc265aa8ba91590794c1 Mon Sep 17 00:00:00 2001
From: Borislav Petkov
Date: Mon, 16 Mar 2015 10:21:55 +0100
Subject: x86/fpu: Rename drop_init_fpu() to fpu_reset_state()

Call it what it does and in accordance with the context where it is
used: we reset the FPU state either because we were unable to restore it
from the one saved in the task or because we simply want to reset it.

Signed-off-by: Borislav Petkov <bp@suse.de>
Acked-by: Oleg Nesterov <oleg@redhat.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Rik van Riel <riel@redhat.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/fpu-internal.h | 8 ++++++--
 arch/x86/kernel/i387.c              | 2 +-
 arch/x86/kernel/signal.c            | 2 +-
 arch/x86/kernel/traps.c             | 2 +-
 arch/x86/kernel/xsave.c             | 4 ++--
 5 files changed, 11 insertions(+), 7 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/fpu-internal.h b/arch/x86/include/asm/fpu-internal.h
index 2d4adff428ac..da5e96756570 100644
--- a/arch/x86/include/asm/fpu-internal.h
+++ b/arch/x86/include/asm/fpu-internal.h
@@ -406,7 +406,11 @@ static inline void restore_init_xstate(void)
 		fxrstor_checking(&init_xstate_buf->i387);
 }
 
-static inline void drop_init_fpu(struct task_struct *tsk)
+/*
+ * Reset the FPU state in the eager case and drop it in the lazy case (later use
+ * will reinit it).
+ */
+static inline void fpu_reset_state(struct task_struct *tsk)
 {
 	if (!use_eager_fpu())
 		drop_fpu(tsk);
@@ -480,7 +484,7 @@ static inline void switch_fpu_finish(struct task_struct *new, fpu_switch_t fpu)
 {
 	if (fpu.preload) {
 		if (unlikely(restore_fpu_checking(new)))
-			drop_init_fpu(new);
+			fpu_reset_state(new);
 	}
 }
 
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index 29e982ada854..41575b9b1021 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -108,7 +108,7 @@ void __kernel_fpu_end(void)
 
 	if (__thread_has_fpu(me)) {
 		if (WARN_ON(restore_fpu_checking(me)))
-			drop_init_fpu(me);
+			fpu_reset_state(me);
 	} else if (!use_eager_fpu()) {
 		stts();
 	}
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index e5042463c1bc..59eaae6185e2 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -679,7 +679,7 @@ handle_signal(struct ksignal *ksig, struct pt_regs *regs)
 		 * Ensure the signal handler starts with the new fpu state.
 		 */
 		if (used_math())
-			drop_init_fpu(current);
+			fpu_reset_state(current);
 	}
 	signal_setup_done(failed, ksig, test_thread_flag(TIF_SINGLESTEP));
 }
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 7ee7369d5aec..edf66c066da9 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -863,7 +863,7 @@ void math_state_restore(void)
 	kernel_fpu_disable();
 	__thread_fpu_begin(tsk);
 	if (unlikely(restore_fpu_checking(tsk))) {
-		drop_init_fpu(tsk);
+		fpu_reset_state(tsk);
 		force_sig_info(SIGSEGV, SEND_SIG_PRIV, tsk);
 	} else {
 		tsk->thread.fpu_counter++;
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
index 0bf82c5ac529..65c29b070e09 100644
--- a/arch/x86/kernel/xsave.c
+++ b/arch/x86/kernel/xsave.c
@@ -342,7 +342,7 @@ int __restore_xstate_sig(void __user *buf, void __user *buf_fx, int size)
 			 config_enabled(CONFIG_IA32_EMULATION));
 
 	if (!buf) {
-		drop_init_fpu(tsk);
+		fpu_reset_state(tsk);
 		return 0;
 	}
 
@@ -416,7 +416,7 @@ int __restore_xstate_sig(void __user *buf, void __user *buf_fx, int size)
 		 */
 		user_fpu_begin();
 		if (restore_user_xstate(buf_fx, xstate_bv, fx_only)) {
-			drop_init_fpu(tsk);
+			fpu_reset_state(tsk);
 			return -1;
 		}
 	}
-- 
cgit v1.2.3


From 4bd5bf8c85e6bca5be9e7c4b3d7ad1942ae323f3 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov
Date: Fri, 13 Mar 2015 19:27:16 +0100
Subject: x86/fpu: Don't allocate fpu->state for swapper/0

Now that kthreads do not use FPU until they get executed, swapper/0
doesn't need to allocate fpu->state.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Pekka Riikonen <priikone@iki.fi>
Cc: Quentin Casasnovas <quentin.casasnovas@oracle.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Suresh Siddha <sbsiddha@gmail.com>
Link: http://lkml.kernel.org/r/20150313182716.GB8249@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/xsave.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
index 65c29b070e09..ada8df7b89c0 100644
--- a/arch/x86/kernel/xsave.c
+++ b/arch/x86/kernel/xsave.c
@@ -680,8 +680,6 @@ void xsave_init(void)
 
 static inline void __init eager_fpu_init_bp(void)
 {
-	current->thread.fpu.state =
-	    alloc_bootmem_align(xstate_size, __alignof__(struct xsave_struct));
 	if (!init_xstate_buf)
 		setup_init_fpu_buf();
 }
-- 
cgit v1.2.3


From 7fc253e277ecf1ea57c2d670bdbcda3dffd19453 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov
Date: Sat, 14 Mar 2015 16:13:34 +0100
Subject: x86/fpu: Kill eager_fpu_init_bp()

Now that eager_fpu_init_bp() does setup_init_fpu_buf() only and
nothing else, we can remove it and move this code into its "caller",
eager_fpu_init().

This avoids the confusing games with "static __refdata void (*boot_func)":

init_xstate_buf can be NULL only during boot, so it is safe to call the
__init-annotated setup_init_fpu_buf() function in eager_fpu_init(), we
just need to mark it as __init_refok.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Pekka Riikonen <priikone@iki.fi>
Cc: Quentin Casasnovas <quentin.casasnovas@oracle.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Suresh Siddha <sbsiddha@gmail.com>
Link: http://lkml.kernel.org/r/20150314151334.GC13029@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/xsave.c | 20 +++++++-------------
 1 file changed, 7 insertions(+), 13 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
index ada8df7b89c0..87a815b85f3e 100644
--- a/arch/x86/kernel/xsave.c
+++ b/arch/x86/kernel/xsave.c
@@ -678,16 +678,12 @@ void xsave_init(void)
 	this_func();
 }
 
-static inline void __init eager_fpu_init_bp(void)
-{
-	if (!init_xstate_buf)
-		setup_init_fpu_buf();
-}
-
-void eager_fpu_init(void)
+/*
+ * setup_init_fpu_buf() is __init and it is OK to call it here because
+ * init_xstate_buf will be unset only once during boot.
+ */
+void __init_refok eager_fpu_init(void)
 {
-	static __refdata void (*boot_func)(void) = eager_fpu_init_bp;
-
 	WARN_ON(used_math());
 	current_thread_info()->status = 0;
 
@@ -699,10 +695,8 @@ void eager_fpu_init(void)
 		return;
 	}
 
-	if (boot_func) {
-		boot_func();
-		boot_func = NULL;
-	}
+	if (!init_xstate_buf)
+		setup_init_fpu_buf();
 }
 
 /*
-- 
cgit v1.2.3


From f77ac507f893fc00c1b9ea0076f3c9e664b0f9ab Mon Sep 17 00:00:00 2001
From: Jesse Larrew
Date: Fri, 13 Mar 2015 11:03:39 -0500
Subject: x86/mce: Use safe MSR accesses for AMD quirk

Certain MSRs are only relevant to a kernel in host mode, and kvm had
chosen not to implement these MSRs at all for guests. If a guest kernel
ever tried to access these MSRs, the result was a general protection
fault.

KVM will be separately patched to return 0 when these MSRs are read,
and this patch ensures that MSR accesses are tolerant of exceptions.

Signed-off-by: Jesse Larrew <jesse.larrew@amd.com>
[ Drop {} braces around loop ]
Signed-off-by: Borislav Petkov <bp@suse.de>
Reviewed-by: Joel Schopp <joel.schopp@amd.com>
Acked-by: Tony Luck <tony.luck@intel.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-edac@vger.kernel.org
Link: http://lkml.kernel.org/r/1426262619-5016-1-git-send-email-jesse.larrew@amd.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/mcheck/mce.c | 14 ++++----------
 1 file changed, 4 insertions(+), 10 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index d760931a4546..196a1e34fe39 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -1541,7 +1541,7 @@ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
 		 if (c->x86 == 0x15 &&
 		     (c->x86_model >= 0x10 && c->x86_model <= 0x1f)) {
 			 int i;
-			 u64 val, hwcr;
+			 u64 hwcr;
 			 bool need_toggle;
 			 u32 msrs[] = {
 				0x00000413, /* MC4_MISC0 */
@@ -1556,15 +1556,9 @@ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
 			 if (need_toggle)
 				 wrmsrl(MSR_K7_HWCR, hwcr | BIT(18));
 
-			 for (i = 0; i < ARRAY_SIZE(msrs); i++) {
-				 rdmsrl(msrs[i], val);
-
-				 /* CntP bit set? */
-				 if (val & BIT_64(62)) {
-					val &= ~BIT_64(62);
-					wrmsrl(msrs[i], val);
-				 }
-			 }
+			 /* Clear CntP bit safely */
+			 for (i = 0; i < ARRAY_SIZE(msrs); i++)
+				 msr_clear_bit(msrs[i], 62);
 
 			 /* restore old settings */
 			 if (need_toggle)
-- 
cgit v1.2.3


From c9ce8712838e48bf356144122c5ecdcdac5d1829 Mon Sep 17 00:00:00 2001
From: Borislav Petkov
Date: Fri, 13 Mar 2015 23:30:47 +0100
Subject: x86/mce: Reindent __mcheck_cpu_apply_quirks() properly

Had some strange 3 tabs + 2 chars indentation, probably from me. Fix it.

No code changed:

  # arch/x86/kernel/cpu/mcheck/mce.o:

   text    data     bss     dec     hex filename
  21371    5923     264   27558    6ba6 mce.o.before
  21371    5923     264   27558    6ba6 mce.o.after

md5:
   eb3996c84d15e08ed836f043df2cbb01  mce.o.before.asm
   eb3996c84d15e08ed836f043df2cbb01  mce.o.after.asm

Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Tony Luck <tony.luck@intel.com>
Cc: linux-edac@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/mcheck/mce.c | 48 ++++++++++++++++++++--------------------
 1 file changed, 24 insertions(+), 24 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 196a1e34fe39..8548b714a16b 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -1531,39 +1531,39 @@ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
 		 * Various K7s with broken bank 0 around. Always disable
 		 * by default.
 		 */
-		 if (c->x86 == 6 && cfg->banks > 0)
+		if (c->x86 == 6 && cfg->banks > 0)
 			mce_banks[0].ctl = 0;
 
-		 /*
-		  * Turn off MC4_MISC thresholding banks on those models since
-		  * they're not supported there.
-		  */
-		 if (c->x86 == 0x15 &&
-		     (c->x86_model >= 0x10 && c->x86_model <= 0x1f)) {
-			 int i;
-			 u64 hwcr;
-			 bool need_toggle;
-			 u32 msrs[] = {
+		/*
+		 * Turn off MC4_MISC thresholding banks on those models since
+		 * they're not supported there.
+		 */
+		if (c->x86 == 0x15 &&
+		    (c->x86_model >= 0x10 && c->x86_model <= 0x1f)) {
+			int i;
+			u64 hwcr;
+			bool need_toggle;
+			u32 msrs[] = {
 				0x00000413, /* MC4_MISC0 */
 				0xc0000408, /* MC4_MISC1 */
-			 };
+			};
 
-			 rdmsrl(MSR_K7_HWCR, hwcr);
+			rdmsrl(MSR_K7_HWCR, hwcr);
 
-			 /* McStatusWrEn has to be set */
-			 need_toggle = !(hwcr & BIT(18));
+			/* McStatusWrEn has to be set */
+			need_toggle = !(hwcr & BIT(18));
 
-			 if (need_toggle)
-				 wrmsrl(MSR_K7_HWCR, hwcr | BIT(18));
+			if (need_toggle)
+				wrmsrl(MSR_K7_HWCR, hwcr | BIT(18));
 
-			 /* Clear CntP bit safely */
-			 for (i = 0; i < ARRAY_SIZE(msrs); i++)
-				 msr_clear_bit(msrs[i], 62);
+			/* Clear CntP bit safely */
+			for (i = 0; i < ARRAY_SIZE(msrs); i++)
+				msr_clear_bit(msrs[i], 62);
 
-			 /* restore old settings */
-			 if (need_toggle)
-				 wrmsrl(MSR_K7_HWCR, hwcr);
-		 }
+			/* restore old settings */
+			if (need_toggle)
+				wrmsrl(MSR_K7_HWCR, hwcr);
+		}
 	}
 
 	if (c->x86_vendor == X86_VENDOR_INTEL) {
-- 
cgit v1.2.3


From 37dea8c52cacb4439eb183ad736747bb678d2a5d Mon Sep 17 00:00:00 2001
From: Sudeep Holla
Date: Wed, 11 Mar 2015 11:54:29 +0100
Subject: x86/cpu/cacheinfo: Fix cache_get_priv_group() for Intel processors

The private pointer provided by the cacheinfo code is used to implement
the AMD L3 cache-specific attributes using a pointer to the northbridge
descriptor. It is needed for performing L3-specific operations and for
that we need a couple of PCI devices and other service information, all
contained in the northbridge descriptor.

This results in failure of cacheinfo setup as shown below as
cache_get_priv_group() returns the uninitialised private attributes which
are not valid for Intel processors.

  ------------[ cut here ]------------
  WARNING: CPU: 3 PID: 1 at fs/sysfs/group.c:102
  internal_create_group+0x151/0x280()
  sysfs: (bin_)attrs not set by subsystem for group: index3/
  Modules linked in:
  CPU: 3 PID: 1 Comm: swapper/0 Not tainted 4.0.0-rc3+ #1
  Hardware name: Dell Inc. Precision T3600/0PTTT9, BIOS A13 05/11/2014
  ...
  Call Trace:
    dump_stack
    warn_slowpath_common
    warn_slowpath_fmt
    internal_create_group
    sysfs_create_groups
    device_add
    cpu_device_create
    ? __kmalloc
    cache_add_dev
    cacheinfo_sysfs_init
    ? container_dev_init
    do_one_initcall
    kernel_init_freeable
    ? rest_init
    kernel_init
    ret_from_fork
    ? rest_init

This patch fixes the issue by checking if the L3 cache indices are
populated correctly (AMD-specific) before initializing the private
attributes.

Reported-by: Borislav Petkov <bp@suse.de>
Signed-off-by: Sudeep Holla <sudeep.holla@arm.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/intel_cacheinfo.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 8008bc2dd2d0..edcb0e28c336 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -556,7 +556,7 @@ cache_get_priv_group(struct cacheinfo *this_leaf)
 {
 	struct amd_northbridge *nb = this_leaf->priv;
 
-	if (this_leaf->level < 3)
+	if (this_leaf->level < 3 || !nb)
 		return NULL;
 
 	if (nb && nb->l3_cache.indices)
-- 
cgit v1.2.3


From 1c1d046be692493d00a4831d4fbc266745008e09 Mon Sep 17 00:00:00 2001
From: Arjun Sreedharan
Date: Mon, 16 Mar 2015 21:07:47 +0530
Subject: x86/boot: Standardize strcmp()

strcmp() is always expected to return 0 when arguments are equal,
negative when its first argument @str1 is less than its second argument
@str2 and a positive value otherwise. Previously strcmp("a", "b")
returned 1. Now it gives -1, as it is supposed to.

Until now this bug never triggered, because all uses for strcmp() in the
boot code tested for nonzero:

  triton:~/tip> git grep strcmp arch/x86/boot/
  arch/x86/boot/boot.h:int strcmp(const char *str1, const char *str2);
  arch/x86/boot/edd.c:            if (!strcmp(eddarg, "skipmbr") || !strcmp(eddarg, "skip")) {
  arch/x86/boot/edd.c:            else if (!strcmp(eddarg, "off"))
  arch/x86/boot/edd.c:            else if (!strcmp(eddarg, "on"))

should in the future strcmp() be used in a comparative way in the boot
code, it might have led to (not so subtle) bugs.

Signed-off-by: Arjun Sreedharan <arjun024@gmail.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1426520267-1803-1-git-send-email-arjun024@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/boot/string.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/boot/string.c b/arch/x86/boot/string.c
index 493f3fd9f139..318b8465d302 100644
--- a/arch/x86/boot/string.c
+++ b/arch/x86/boot/string.c
@@ -30,7 +30,7 @@ int strcmp(const char *str1, const char *str2)
 	int delta = 0;
 
 	while (*s1 || *s2) {
-		delta = *s2 - *s1;
+		delta = *s1 - *s2;
 		if (delta)
 			return delta;
 		s1++;
-- 
cgit v1.2.3


From 4e16ed99416ef569a89782a7234f95007919fadd Mon Sep 17 00:00:00 2001
From: Matt Fleming
Date: Thu, 26 Feb 2015 18:47:00 +0000
Subject: perf/x86/intel: Fix Makefile to actually build the cqm driver

Someone fat fingered a merge conflict and lost the Makefile hunk.

Signed-off-by: Matt Fleming <matt.fleming@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: <acme@redhat.com>
Cc: <hpa@zytor.com>
Cc: <jolsa@redhat.com>
Cc: <kanaka.d.juvva@intel.com>
Cc: <tglx@linutronix.de>
Cc: <torvalds@linux-foundation.org>
Cc: <vikas.shivappa@linux.intel.com>
Link: http://lkml.kernel.org/r/1424976420.15321.35.camel@mfleming-mobl1.ger.corp.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 80091ae54c2b..6c1ca139f736 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -39,7 +39,7 @@ obj-$(CONFIG_CPU_SUP_AMD)		+= perf_event_amd_iommu.o
 endif
 obj-$(CONFIG_CPU_SUP_INTEL)		+= perf_event_p6.o perf_event_knc.o perf_event_p4.o
 obj-$(CONFIG_CPU_SUP_INTEL)		+= perf_event_intel_lbr.o perf_event_intel_ds.o perf_event_intel.o
-obj-$(CONFIG_CPU_SUP_INTEL)		+= perf_event_intel_rapl.o
+obj-$(CONFIG_CPU_SUP_INTEL)		+= perf_event_intel_rapl.o perf_event_intel_cqm.o
 
 obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE)	+= perf_event_intel_uncore.o \
 					   perf_event_intel_uncore_snb.o \
-- 
cgit v1.2.3


From 50f16a8bf9d7a92c437ed1867d0f7e1dc6a9aca9 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Thu, 5 Mar 2015 22:10:19 +0100
Subject: perf: Remove type specific target pointers

The only reason CQM had to use a hard-coded pmu type was so it could use
cqm_target in hw_perf_event.

Do away with the {tp,bp,cqm}_target pointers and provide a non type
specific one.

This allows us to do away with that silly pmu type as well.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Vince Weaver <vince@deater.net>
Cc: acme@kernel.org
Cc: acme@redhat.com
Cc: hpa@zytor.com
Cc: jolsa@redhat.com
Cc: kanaka.d.juvva@intel.com
Cc: matt.fleming@intel.com
Cc: tglx@linutronix.de
Cc: torvalds@linux-foundation.org
Cc: vikas.shivappa@linux.intel.com
Link: http://lkml.kernel.org/r/20150305211019.GU21418@twins.programming.kicks-ass.net
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/arm/kernel/hw_breakpoint.c            |  2 +-
 arch/arm64/kernel/hw_breakpoint.c          |  2 +-
 arch/x86/kernel/cpu/perf_event_intel_cqm.c |  7 +++----
 include/linux/perf_event.h                 |  4 +---
 include/uapi/linux/perf_event.h            |  1 -
 kernel/events/core.c                       | 14 ++++----------
 kernel/events/hw_breakpoint.c              |  8 ++++----
 kernel/trace/trace_uprobe.c                | 10 +++++-----
 8 files changed, 19 insertions(+), 29 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/arm/kernel/hw_breakpoint.c b/arch/arm/kernel/hw_breakpoint.c
index 7fc70ae21185..dc7d0a95bd36 100644
--- a/arch/arm/kernel/hw_breakpoint.c
+++ b/arch/arm/kernel/hw_breakpoint.c
@@ -648,7 +648,7 @@ int arch_validate_hwbkpt_settings(struct perf_event *bp)
 		 * Per-cpu breakpoints are not supported by our stepping
 		 * mechanism.
 		 */
-		if (!bp->hw.bp_target)
+		if (!bp->hw.target)
 			return -EINVAL;
 
 		/*
diff --git a/arch/arm64/kernel/hw_breakpoint.c b/arch/arm64/kernel/hw_breakpoint.c
index df1cf15377b4..d062f35911c2 100644
--- a/arch/arm64/kernel/hw_breakpoint.c
+++ b/arch/arm64/kernel/hw_breakpoint.c
@@ -527,7 +527,7 @@ int arch_validate_hwbkpt_settings(struct perf_event *bp)
 	 * Disallow per-task kernel breakpoints since these would
 	 * complicate the stepping code.
 	 */
-	if (info->ctrl.privilege == AARCH64_BREAKPOINT_EL1 && bp->hw.bp_target)
+	if (info->ctrl.privilege == AARCH64_BREAKPOINT_EL1 && bp->hw.target)
 		return -EINVAL;
 
 	return 0;
diff --git a/arch/x86/kernel/cpu/perf_event_intel_cqm.c b/arch/x86/kernel/cpu/perf_event_intel_cqm.c
index 9a8ef8376fcd..e4d1b8b738fa 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_cqm.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_cqm.c
@@ -263,7 +263,7 @@ static bool __match_event(struct perf_event *a, struct perf_event *b)
 	/*
 	 * Events that target same task are placed into the same cache group.
 	 */
-	if (a->hw.cqm_target == b->hw.cqm_target)
+	if (a->hw.target == b->hw.target)
 		return true;
 
 	/*
@@ -279,7 +279,7 @@ static bool __match_event(struct perf_event *a, struct perf_event *b)
 static inline struct perf_cgroup *event_to_cgroup(struct perf_event *event)
 {
 	if (event->attach_state & PERF_ATTACH_TASK)
-		return perf_cgroup_from_task(event->hw.cqm_target);
+		return perf_cgroup_from_task(event->hw.target);
 
 	return event->cgrp;
 }
@@ -1365,8 +1365,7 @@ static int __init intel_cqm_init(void)
 
 	__perf_cpu_notifier(intel_cqm_cpu_notifier);
 
-	ret = perf_pmu_register(&intel_cqm_pmu, "intel_cqm",
-				PERF_TYPE_INTEL_CQM);
+	ret = perf_pmu_register(&intel_cqm_pmu, "intel_cqm", -1);
 	if (ret)
 		pr_err("Intel CQM perf registration failed: %d\n", ret);
 	else
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index dac4c2831d82..5aa49d7bfd07 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -119,7 +119,6 @@ struct hw_perf_event {
 			struct hrtimer	hrtimer;
 		};
 		struct { /* tracepoint */
-			struct task_struct	*tp_target;
 			/* for tp_event->class */
 			struct list_head	tp_list;
 		};
@@ -129,7 +128,6 @@ struct hw_perf_event {
 			struct list_head	cqm_events_entry;
 			struct list_head	cqm_groups_entry;
 			struct list_head	cqm_group_entry;
-			struct task_struct	*cqm_target;
 		};
 #ifdef CONFIG_HAVE_HW_BREAKPOINT
 		struct { /* breakpoint */
@@ -138,12 +136,12 @@ struct hw_perf_event {
 			 * problem hw_breakpoint has with context
 			 * creation and event initalization.
 			 */
-			struct task_struct		*bp_target;
 			struct arch_hw_breakpoint	info;
 			struct list_head		bp_list;
 		};
 #endif
 	};
+	struct task_struct		*target;
 	int				state;
 	local64_t			prev_count;
 	u64				sample_period;
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 3c8b45de57ec..1e3cd07cf76e 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -32,7 +32,6 @@ enum perf_type_id {
 	PERF_TYPE_HW_CACHE			= 3,
 	PERF_TYPE_RAW				= 4,
 	PERF_TYPE_BREAKPOINT			= 5,
-	PERF_TYPE_INTEL_CQM			= 6,
 
 	PERF_TYPE_MAX,				/* non-ABI */
 };
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 71109a045450..525062b6fba1 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -7171,18 +7171,12 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
 
 	if (task) {
 		event->attach_state = PERF_ATTACH_TASK;
-
-		if (attr->type == PERF_TYPE_TRACEPOINT)
-			event->hw.tp_target = task;
-#ifdef CONFIG_HAVE_HW_BREAKPOINT
 		/*
-		 * hw_breakpoint is a bit difficult here..
+		 * XXX pmu::event_init needs to know what task to account to
+		 * and we cannot use the ctx information because we need the
+		 * pmu before we get a ctx.
 		 */
-		else if (attr->type == PERF_TYPE_BREAKPOINT)
-			event->hw.bp_target = task;
-#endif
-		else if (attr->type == PERF_TYPE_INTEL_CQM)
-			event->hw.cqm_target = task;
+		event->hw.target = task;
 	}
 
 	if (!overflow_handler && parent_event) {
diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
index 9803a6600d49..92ce5f4ccc26 100644
--- a/kernel/events/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
@@ -116,12 +116,12 @@ static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type)
  */
 static int task_bp_pinned(int cpu, struct perf_event *bp, enum bp_type_idx type)
 {
-	struct task_struct *tsk = bp->hw.bp_target;
+	struct task_struct *tsk = bp->hw.target;
 	struct perf_event *iter;
 	int count = 0;
 
 	list_for_each_entry(iter, &bp_task_head, hw.bp_list) {
-		if (iter->hw.bp_target == tsk &&
+		if (iter->hw.target == tsk &&
 		    find_slot_idx(iter) == type &&
 		    (iter->cpu < 0 || cpu == iter->cpu))
 			count += hw_breakpoint_weight(iter);
@@ -153,7 +153,7 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp,
 		int nr;
 
 		nr = info->cpu_pinned;
-		if (!bp->hw.bp_target)
+		if (!bp->hw.target)
 			nr += max_task_bp_pinned(cpu, type);
 		else
 			nr += task_bp_pinned(cpu, bp, type);
@@ -210,7 +210,7 @@ toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type,
 		weight = -weight;
 
 	/* Pinned counter cpu profiling */
-	if (!bp->hw.bp_target) {
+	if (!bp->hw.target) {
 		get_bp_info(bp->cpu, type)->cpu_pinned += weight;
 		return;
 	}
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index b11441321e7a..93fdc7791eaa 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -1005,7 +1005,7 @@ __uprobe_perf_filter(struct trace_uprobe_filter *filter, struct mm_struct *mm)
 		return true;
 
 	list_for_each_entry(event, &filter->perf_events, hw.tp_list) {
-		if (event->hw.tp_target->mm == mm)
+		if (event->hw.target->mm == mm)
 			return true;
 	}
 
@@ -1015,7 +1015,7 @@ __uprobe_perf_filter(struct trace_uprobe_filter *filter, struct mm_struct *mm)
 static inline bool
 uprobe_filter_event(struct trace_uprobe *tu, struct perf_event *event)
 {
-	return __uprobe_perf_filter(&tu->filter, event->hw.tp_target->mm);
+	return __uprobe_perf_filter(&tu->filter, event->hw.target->mm);
 }
 
 static int uprobe_perf_close(struct trace_uprobe *tu, struct perf_event *event)
@@ -1023,10 +1023,10 @@ static int uprobe_perf_close(struct trace_uprobe *tu, struct perf_event *event)
 	bool done;
 
 	write_lock(&tu->filter.rwlock);
-	if (event->hw.tp_target) {
+	if (event->hw.target) {
 		list_del(&event->hw.tp_list);
 		done = tu->filter.nr_systemwide ||
-			(event->hw.tp_target->flags & PF_EXITING) ||
+			(event->hw.target->flags & PF_EXITING) ||
 			uprobe_filter_event(tu, event);
 	} else {
 		tu->filter.nr_systemwide--;
@@ -1046,7 +1046,7 @@ static int uprobe_perf_open(struct trace_uprobe *tu, struct perf_event *event)
 	int err;
 
 	write_lock(&tu->filter.rwlock);
-	if (event->hw.tp_target) {
+	if (event->hw.target) {
 		/*
 		 * event->parent != NULL means copy_process(), we can avoid
 		 * uprobe_apply(). current->mm must be probed and we can rely
-- 
cgit v1.2.3


From a67e7277d01ccfd39b0db5a198c2643cc19dd79c Mon Sep 17 00:00:00 2001
From: Andy Lutomirski
Date: Wed, 18 Mar 2015 18:33:29 -0700
Subject: x86/asm/entry: Add user_mode_ignore_vm86()

user_mode() is dangerous and user_mode_vm() has a confusing name.

Add user_mode_ignore_vm86() (equivalent to current user_mode()).
We'll change the small number of legitimate users of user_mode()
to user_mode_ignore_vm86().

Inspired by grsec, although this works rather differently.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brad Spengler <spender@grsecurity.net>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/202c56ca63823c338af8e2e54948dbe222da6343.1426728647.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/ptrace.h | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index 83b874da2762..4a040f0078f2 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -121,6 +121,23 @@ static inline int user_mode_vm(struct pt_regs *regs)
 #endif
 }
 
+/*
+ * This is the fastest way to check whether regs come from user space.
+ * It is unsafe if regs might come from vm86 mode, though -- in vm86
+ * mode, all bits of CS and SS are completely under the user's control.
+ * The CPU considers vm86 mode to be CPL 3 regardless of CS and SS.
+ *
+ * Do NOT use this function unless you have already ruled out the
+ * possibility that regs came from vm86 mode.
+ *
+ * We check for RPL != 0 instead of RPL == 3 because we don't use rings
+ * 1 or 2 and this is more efficient.
+ */
+static inline int user_mode_ignore_vm86(struct pt_regs *regs)
+{
+	return (regs->cs & SEGMENT_RPL_MASK) != 0;
+}
+
 static inline int v8086_mode(struct pt_regs *regs)
 {
 #ifdef CONFIG_X86_32
-- 
cgit v1.2.3


From 383f3af3f88aadafe1fcf1948987ad538683fb8c Mon Sep 17 00:00:00 2001
From: Andy Lutomirski
Date: Wed, 18 Mar 2015 18:33:30 -0700
Subject: x86/asm/entry, perf: Explicitly optimize vm86 handling in
 code_segment_base()

There's no point in checking the VM bit on 64-bit, and, since
we're explicitly checking it, we can use user_mode_ignore_vm86()
after the check.

While we're at it, rearrange the #ifdef slightly to make the code
flow a bit clearer.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brad Spengler <spender@grsecurity.net>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/dc1457a734feccd03a19bb3538a7648582f57cdd.1426728647.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event.c | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 979963bb3977..56f7e60ad732 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -2146,6 +2146,12 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
  */
 static unsigned long code_segment_base(struct pt_regs *regs)
 {
+	/*
+	 * For IA32 we look at the GDT/LDT segment base to convert the
+	 * effective IP to a linear address.
+	 */
+
+#ifdef CONFIG_X86_32
 	/*
 	 * If we are in VM86 mode, add the segment offset to convert to a
 	 * linear address.
@@ -2153,12 +2159,7 @@ static unsigned long code_segment_base(struct pt_regs *regs)
 	if (regs->flags & X86_VM_MASK)
 		return 0x10 * regs->cs;
 
-	/*
-	 * For IA32 we look at the GDT/LDT segment base to convert the
-	 * effective IP to a linear address.
-	 */
-#ifdef CONFIG_X86_32
-	if (user_mode(regs) && regs->cs != __USER_CS)
+	if (user_mode_ignore_vm86(regs) && regs->cs != __USER_CS)
 		return get_segment_base(regs->cs);
 #else
 	if (user_mode(regs) && !user_64bit_mode(regs) &&
-- 
cgit v1.2.3


From ae60f0710ae6b33092267ef8ac853c498f6d3e5d Mon Sep 17 00:00:00 2001
From: Andy Lutomirski
Date: Wed, 18 Mar 2015 18:33:31 -0700
Subject: x86/asm/entry: Use user_mode_ignore_vm86() where appropriate

A few of the user_mode() checks in traps.c are immediately after
explicit checks for vm86 mode.  Change them to user_mode_ignore_vm86().

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brad Spengler <spender@grsecurity.net>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/0b324d5b75c3402be07f8d3c6245ed7f4995029e.1426728647.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/traps.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 277341128b47..1136961a679b 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -208,7 +208,7 @@ do_trap_no_signal(struct task_struct *tsk, int trapnr, char *str,
 		return -1;
 	}
 #endif
-	if (!user_mode(regs)) {
+	if (!user_mode_ignore_vm86(regs)) {
 		if (!fixup_exception(regs)) {
 			tsk->thread.error_code = error_code;
 			tsk->thread.trap_nr = trapnr;
@@ -471,7 +471,7 @@ do_general_protection(struct pt_regs *regs, long error_code)
 #endif
 
 	tsk = current;
-	if (!user_mode(regs)) {
+	if (!user_mode_ignore_vm86(regs)) {
 		if (fixup_exception(regs))
 			goto exit;
 
@@ -688,7 +688,7 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code)
 	 * We already checked v86 mode above, so we can check for kernel mode
 	 * by just checking the CPL of CS.
 	 */
-	if ((dr6 & DR_STEP) && !user_mode(regs)) {
+	if ((dr6 & DR_STEP) && !user_mode_ignore_vm86(regs)) {
 		tsk->thread.debugreg6 &= ~DR_STEP;
 		set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
 		regs->flags &= ~X86_EFLAGS_TF;
-- 
cgit v1.2.3


From efa704510342b81ae58d7b8a0c7f676a4289b603 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski
Date: Wed, 18 Mar 2015 18:33:32 -0700
Subject: x86/asm/entry: Make user_mode() work correctly if regs came from VM86
 mode

user_mode() is now identical to user_mode_vm().  Subsequent patches
will change all callers of user_mode_vm() to user_mode() and then
delete user_mode_vm().

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brad Spengler <spender@grsecurity.net>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/0dd03eacb5f0a2b5ba0240de25347a31b493c289.1426728647.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/ptrace.h | 17 +++++++----------
 1 file changed, 7 insertions(+), 10 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index 4a040f0078f2..70c439f9071b 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -96,11 +96,13 @@ static inline unsigned long regs_return_value(struct pt_regs *regs)
 }
 
 /*
- * user_mode_vm(regs) determines whether a register set came from user mode.
- * This is true if V8086 mode was enabled OR if the register set was from
- * protected mode with RPL-3 CS value.  This tricky test checks that with
- * one comparison.  Many places in the kernel can bypass this full check
- * if they have already ruled out V8086 mode, so user_mode(regs) can be used.
+ * user_mode(regs) determines whether a register set came from user
+ * mode.  On x86_32, this is true if V8086 mode was enabled OR if the
+ * register set was from protected mode with RPL-3 CS value.  This
+ * tricky test checks that with one comparison.
+ *
+ * On x86_64, vm86 mode is mercifully nonexistent, and we don't need
+ * the extra check.
  */
 static inline int user_mode(struct pt_regs *regs)
 {
@@ -113,12 +115,7 @@ static inline int user_mode(struct pt_regs *regs)
 
 static inline int user_mode_vm(struct pt_regs *regs)
 {
-#ifdef CONFIG_X86_32
-	return ((regs->cs & SEGMENT_RPL_MASK) | (regs->flags & X86_VM_MASK)) >=
-		USER_RPL;
-#else
 	return user_mode(regs);
-#endif
 }
 
 /*
-- 
cgit v1.2.3


From f39b6f0ef855a38ea17329a4e621ff97750dfcc2 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski
Date: Wed, 18 Mar 2015 18:33:33 -0700
Subject: x86/asm/entry: Change all 'user_mode_vm()' calls to 'user_mode()'

user_mode_vm() and user_mode() are now the same.  Change all callers
of user_mode_vm() to user_mode().

The next patch will remove the definition of user_mode_vm.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brad Spengler <spender@grsecurity.net>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/43b1f57f3df70df5a08b0925897c660725015554.1426728647.git.luto@kernel.org
[ Merged to a more recent kernel. ]
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/alternative.c  |  2 +-
 arch/x86/kernel/crash.c        |  2 +-
 arch/x86/kernel/dumpstack.c    |  4 ++--
 arch/x86/kernel/dumpstack_32.c |  4 ++--
 arch/x86/kernel/i387.c         |  2 +-
 arch/x86/kernel/irq_32.c       |  2 +-
 arch/x86/kernel/irq_64.c       |  2 +-
 arch/x86/kernel/kgdb.c         |  4 ++--
 arch/x86/kernel/kprobes/core.c |  4 ++--
 arch/x86/kernel/process_32.c   |  2 +-
 arch/x86/kernel/ptrace.c       |  2 +-
 arch/x86/kernel/time.c         |  2 +-
 arch/x86/kernel/traps.c        | 16 ++++++++--------
 arch/x86/kernel/uprobes.c      |  2 +-
 arch/x86/mm/fault.c            |  6 +++---
 arch/x86/oprofile/backtrace.c  |  2 +-
 drivers/misc/sgi-xp/xpc_main.c |  2 +-
 17 files changed, 30 insertions(+), 30 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index af397cc98d05..5c993c94255e 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -715,7 +715,7 @@ int poke_int3_handler(struct pt_regs *regs)
 	if (likely(!bp_patching_in_progress))
 		return 0;
 
-	if (user_mode_vm(regs) || regs->ip != (unsigned long)bp_int3_addr)
+	if (user_mode(regs) || regs->ip != (unsigned long)bp_int3_addr)
 		return 0;
 
 	/* set up the specified breakpoint handler */
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index aceb2f90c716..c76d3e37c6e1 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -105,7 +105,7 @@ static void kdump_nmi_callback(int cpu, struct pt_regs *regs)
 #ifdef CONFIG_X86_32
 	struct pt_regs fixed_regs;
 
-	if (!user_mode_vm(regs)) {
+	if (!user_mode(regs)) {
 		crash_fixup_ss_esp(&fixed_regs, regs);
 		regs = &fixed_regs;
 	}
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index cf3df1d8d039..ab3b65639a3e 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -278,7 +278,7 @@ int __die(const char *str, struct pt_regs *regs, long err)
 	print_modules();
 	show_regs(regs);
 #ifdef CONFIG_X86_32
-	if (user_mode_vm(regs)) {
+	if (user_mode(regs)) {
 		sp = regs->sp;
 		ss = regs->ss & 0xffff;
 	} else {
@@ -307,7 +307,7 @@ void die(const char *str, struct pt_regs *regs, long err)
 	unsigned long flags = oops_begin();
 	int sig = SIGSEGV;
 
-	if (!user_mode_vm(regs))
+	if (!user_mode(regs))
 		report_bug(regs->ip, regs);
 
 	if (__die(str, regs, err))
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
index 5abd4cd4230c..39891ff50d03 100644
--- a/arch/x86/kernel/dumpstack_32.c
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -123,13 +123,13 @@ void show_regs(struct pt_regs *regs)
 	int i;
 
 	show_regs_print_info(KERN_EMERG);
-	__show_regs(regs, !user_mode_vm(regs));
+	__show_regs(regs, !user_mode(regs));
 
 	/*
 	 * When in-kernel, we also print out the stack and code at the
 	 * time of the fault..
 	 */
-	if (!user_mode_vm(regs)) {
+	if (!user_mode(regs)) {
 		unsigned int code_prologue = code_bytes * 43 / 64;
 		unsigned int code_len = code_bytes;
 		unsigned char c;
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index d5651fce0b71..29c740deafec 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -68,7 +68,7 @@ static inline bool interrupted_kernel_fpu_idle(void)
 static inline bool interrupted_user_mode(void)
 {
 	struct pt_regs *regs = get_irq_regs();
-	return regs && user_mode_vm(regs);
+	return regs && user_mode(regs);
 }
 
 /*
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 28d28f5eb8f4..f9fd86a7fcc7 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -165,7 +165,7 @@ bool handle_irq(unsigned irq, struct pt_regs *regs)
 	if (unlikely(!desc))
 		return false;
 
-	if (user_mode_vm(regs) || !execute_on_irq_stack(overflow, desc, irq)) {
+	if (user_mode(regs) || !execute_on_irq_stack(overflow, desc, irq)) {
 		if (unlikely(overflow))
 			print_stack_overflow();
 		desc->handle_irq(irq, desc);
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index e4b503d5558c..394e643d7830 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -44,7 +44,7 @@ static inline void stack_overflow_check(struct pt_regs *regs)
 	u64 estack_top, estack_bottom;
 	u64 curbase = (u64)task_stack_page(current);
 
-	if (user_mode_vm(regs))
+	if (user_mode(regs))
 		return;
 
 	if (regs->sp >= curbase + sizeof(struct thread_info) +
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index 7ec1d5f8d283..7fe3a9d377ea 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -126,11 +126,11 @@ char *dbg_get_reg(int regno, void *mem, struct pt_regs *regs)
 #ifdef CONFIG_X86_32
 	switch (regno) {
 	case GDB_SS:
-		if (!user_mode_vm(regs))
+		if (!user_mode(regs))
 			*(unsigned long *)mem = __KERNEL_DS;
 		break;
 	case GDB_SP:
-		if (!user_mode_vm(regs))
+		if (!user_mode(regs))
 			*(unsigned long *)mem = kernel_stack_pointer(regs);
 		break;
 	case GDB_GS:
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index 4e3d5a9621fe..24d079604fd5 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -602,7 +602,7 @@ int kprobe_int3_handler(struct pt_regs *regs)
 	struct kprobe *p;
 	struct kprobe_ctlblk *kcb;
 
-	if (user_mode_vm(regs))
+	if (user_mode(regs))
 		return 0;
 
 	addr = (kprobe_opcode_t *)(regs->ip - sizeof(kprobe_opcode_t));
@@ -1007,7 +1007,7 @@ int kprobe_exceptions_notify(struct notifier_block *self, unsigned long val,
 	struct die_args *args = data;
 	int ret = NOTIFY_DONE;
 
-	if (args->regs && user_mode_vm(args->regs))
+	if (args->regs && user_mode(args->regs))
 		return ret;
 
 	if (val == DIE_GPF) {
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 26c596d1ee07..c5e987022ca0 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -73,7 +73,7 @@ void __show_regs(struct pt_regs *regs, int all)
 	unsigned long sp;
 	unsigned short ss, gs;
 
-	if (user_mode_vm(regs)) {
+	if (user_mode(regs)) {
 		sp = regs->sp;
 		ss = regs->ss & 0xffff;
 		gs = get_user_gs(regs);
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 1e125817cf9f..a7bc79480719 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -1415,7 +1415,7 @@ static void fill_sigtrap_info(struct task_struct *tsk,
 	memset(info, 0, sizeof(*info));
 	info->si_signo = SIGTRAP;
 	info->si_code = si_code;
-	info->si_addr = user_mode_vm(regs) ? (void __user *)regs->ip : NULL;
+	info->si_addr = user_mode(regs) ? (void __user *)regs->ip : NULL;
 }
 
 void user_single_step_siginfo(struct task_struct *tsk,
diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c
index 25adc0e16eaa..d39c09119db6 100644
--- a/arch/x86/kernel/time.c
+++ b/arch/x86/kernel/time.c
@@ -30,7 +30,7 @@ unsigned long profile_pc(struct pt_regs *regs)
 {
 	unsigned long pc = instruction_pointer(regs);
 
-	if (!user_mode_vm(regs) && in_lock_functions(pc)) {
+	if (!user_mode(regs) && in_lock_functions(pc)) {
 #ifdef CONFIG_FRAME_POINTER
 		return *(unsigned long *)(regs->bp + sizeof(long));
 #else
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 1136961a679b..d4e265952102 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -112,7 +112,7 @@ enum ctx_state ist_enter(struct pt_regs *regs)
 {
 	enum ctx_state prev_state;
 
-	if (user_mode_vm(regs)) {
+	if (user_mode(regs)) {
 		/* Other than that, we're just an exception. */
 		prev_state = exception_enter();
 	} else {
@@ -146,7 +146,7 @@ void ist_exit(struct pt_regs *regs, enum ctx_state prev_state)
 	/* Must be before exception_exit. */
 	preempt_count_sub(HARDIRQ_OFFSET);
 
-	if (user_mode_vm(regs))
+	if (user_mode(regs))
 		return exception_exit(prev_state);
 	else
 		rcu_nmi_exit();
@@ -158,7 +158,7 @@ void ist_exit(struct pt_regs *regs, enum ctx_state prev_state)
  *
  * IST exception handlers normally cannot schedule.  As a special
  * exception, if the exception interrupted userspace code (i.e.
- * user_mode_vm(regs) would return true) and the exception was not
+ * user_mode(regs) would return true) and the exception was not
  * a double fault, it can be safe to schedule.  ist_begin_non_atomic()
  * begins a non-atomic section within an ist_enter()/ist_exit() region.
  * Callers are responsible for enabling interrupts themselves inside
@@ -167,7 +167,7 @@ void ist_exit(struct pt_regs *regs, enum ctx_state prev_state)
  */
 void ist_begin_non_atomic(struct pt_regs *regs)
 {
-	BUG_ON(!user_mode_vm(regs));
+	BUG_ON(!user_mode(regs));
 
 	/*
 	 * Sanity check: we need to be on the normal thread stack.  This
@@ -384,7 +384,7 @@ dotraplinkage void do_bounds(struct pt_regs *regs, long error_code)
 		goto exit;
 	conditional_sti(regs);
 
-	if (!user_mode_vm(regs))
+	if (!user_mode(regs))
 		die("bounds", regs, error_code);
 
 	if (!cpu_feature_enabled(X86_FEATURE_MPX)) {
@@ -587,7 +587,7 @@ struct bad_iret_stack *fixup_bad_iret(struct bad_iret_stack *s)
 	/* Copy the remainder of the stack from the current stack. */
 	memmove(new_stack, s, offsetof(struct bad_iret_stack, regs.ip));
 
-	BUG_ON(!user_mode_vm(&new_stack->regs));
+	BUG_ON(!user_mode(&new_stack->regs));
 	return new_stack;
 }
 NOKPROBE_SYMBOL(fixup_bad_iret);
@@ -637,7 +637,7 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code)
 	 * then it's very likely the result of an icebp/int01 trap.
 	 * User wants a sigtrap for that.
 	 */
-	if (!dr6 && user_mode_vm(regs))
+	if (!dr6 && user_mode(regs))
 		user_icebp = 1;
 
 	/* Catch kmemcheck conditions first of all! */
@@ -721,7 +721,7 @@ static void math_error(struct pt_regs *regs, int error_code, int trapnr)
 		return;
 	conditional_sti(regs);
 
-	if (!user_mode_vm(regs))
+	if (!user_mode(regs))
 	{
 		if (!fixup_exception(regs)) {
 			task->thread.error_code = error_code;
diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
index 81f8adb0679e..0b81ad67da07 100644
--- a/arch/x86/kernel/uprobes.c
+++ b/arch/x86/kernel/uprobes.c
@@ -912,7 +912,7 @@ int arch_uprobe_exception_notify(struct notifier_block *self, unsigned long val,
 	int ret = NOTIFY_DONE;
 
 	/* We are only interested in userspace traps */
-	if (regs && !user_mode_vm(regs))
+	if (regs && !user_mode(regs))
 		return NOTIFY_DONE;
 
 	switch (val) {
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index ae340d3761ca..181c53bac3a7 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -59,7 +59,7 @@ static nokprobe_inline int kprobes_fault(struct pt_regs *regs)
 	int ret = 0;
 
 	/* kprobe_running() needs smp_processor_id() */
-	if (kprobes_built_in() && !user_mode_vm(regs)) {
+	if (kprobes_built_in() && !user_mode(regs)) {
 		preempt_disable();
 		if (kprobe_running() && kprobe_fault_handler(regs, 14))
 			ret = 1;
@@ -1035,7 +1035,7 @@ static inline bool smap_violation(int error_code, struct pt_regs *regs)
 	if (error_code & PF_USER)
 		return false;
 
-	if (!user_mode_vm(regs) && (regs->flags & X86_EFLAGS_AC))
+	if (!user_mode(regs) && (regs->flags & X86_EFLAGS_AC))
 		return false;
 
 	return true;
@@ -1140,7 +1140,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
 	 * User-mode registers count as a user access even for any
 	 * potential system fault or CPU buglet:
 	 */
-	if (user_mode_vm(regs)) {
+	if (user_mode(regs)) {
 		local_irq_enable();
 		error_code |= PF_USER;
 		flags |= FAULT_FLAG_USER;
diff --git a/arch/x86/oprofile/backtrace.c b/arch/x86/oprofile/backtrace.c
index 5d04be5efb64..4e664bdb535a 100644
--- a/arch/x86/oprofile/backtrace.c
+++ b/arch/x86/oprofile/backtrace.c
@@ -111,7 +111,7 @@ x86_backtrace(struct pt_regs * const regs, unsigned int depth)
 {
 	struct stack_frame *head = (struct stack_frame *)frame_pointer(regs);
 
-	if (!user_mode_vm(regs)) {
+	if (!user_mode(regs)) {
 		unsigned long stack = kernel_stack_pointer(regs);
 		if (depth)
 			dump_trace(NULL, regs, (unsigned long *)stack, 0,
diff --git a/drivers/misc/sgi-xp/xpc_main.c b/drivers/misc/sgi-xp/xpc_main.c
index 82dc5748f873..7f327121e6d7 100644
--- a/drivers/misc/sgi-xp/xpc_main.c
+++ b/drivers/misc/sgi-xp/xpc_main.c
@@ -1210,7 +1210,7 @@ xpc_system_die(struct notifier_block *nb, unsigned long event, void *_die_args)
 
 		if (((die_args->trapnr == X86_TRAP_MF) ||
 		     (die_args->trapnr == X86_TRAP_XF)) &&
-		    !user_mode_vm(die_args->regs))
+		    !user_mode(die_args->regs))
 			xpc_die_deactivate();
 
 		break;
-- 
cgit v1.2.3


From 7a2806741e7327a6b20ccef42e8d56588cb2fef5 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski
Date: Wed, 18 Mar 2015 18:33:34 -0700
Subject: x86/asm/entry: Remove user_mode_vm()

It has no callers anymore.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brad Spengler <spender@grsecurity.net>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/a594afd6a0bddb1311bd7c92a15201c87fbb8681.1426728647.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/ptrace.h | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index 70c439f9071b..d20bae298852 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -113,11 +113,6 @@ static inline int user_mode(struct pt_regs *regs)
 #endif
 }
 
-static inline int user_mode_vm(struct pt_regs *regs)
-{
-	return user_mode(regs);
-}
-
 /*
  * This is the fastest way to check whether regs come from user space.
  * It is unsafe if regs might come from vm86 mode, though -- in vm86
-- 
cgit v1.2.3


From d74ef1118a146ae1135c8b26fff2bfee980fd7a4 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski
Date: Wed, 18 Mar 2015 18:33:35 -0700
Subject: x86/asm/entry: Replace some open-coded VM86 checks with v8086_mode()
 checks

This allows us to remove some unnecessary ifdefs.  There should
be no change to the generated code.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brad Spengler <spender@grsecurity.net>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/f7e00f0d668e253abf0bd8bf36491ac47bd761ff.1426728647.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/traps.c | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index d4e265952102..c8eb469a94a4 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -194,8 +194,7 @@ static nokprobe_inline int
 do_trap_no_signal(struct task_struct *tsk, int trapnr, char *str,
 		  struct pt_regs *regs,	long error_code)
 {
-#ifdef CONFIG_X86_32
-	if (regs->flags & X86_VM_MASK) {
+	if (v8086_mode(regs)) {
 		/*
 		 * Traps 0, 1, 3, 4, and 5 should be forwarded to vm86.
 		 * On nmi (interrupt 2), do_trap should not be called.
@@ -207,7 +206,7 @@ do_trap_no_signal(struct task_struct *tsk, int trapnr, char *str,
 		}
 		return -1;
 	}
-#endif
+
 	if (!user_mode_ignore_vm86(regs)) {
 		if (!fixup_exception(regs)) {
 			tsk->thread.error_code = error_code;
@@ -462,13 +461,11 @@ do_general_protection(struct pt_regs *regs, long error_code)
 	prev_state = exception_enter();
 	conditional_sti(regs);
 
-#ifdef CONFIG_X86_32
-	if (regs->flags & X86_VM_MASK) {
+	if (v8086_mode(regs)) {
 		local_irq_enable();
 		handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code);
 		goto exit;
 	}
-#endif
 
 	tsk = current;
 	if (!user_mode_ignore_vm86(regs)) {
@@ -673,7 +670,7 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code)
 	/* It's safe to allow irq's after DR6 has been saved */
 	preempt_conditional_sti(regs);
 
-	if (regs->flags & X86_VM_MASK) {
+	if (v8086_mode(regs)) {
 		handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code,
 					X86_TRAP_DB);
 		preempt_conditional_cli(regs);
-- 
cgit v1.2.3


From 34061f134f70d33296fa56678cee122dd7010401 Mon Sep 17 00:00:00 2001
From: Denys Vlasenko
Date: Mon, 23 Mar 2015 14:03:59 +0100
Subject: x86/asm/entry/64: Fix incorrect comment

The recent old_rsp -> rsp_scratch rename also changed this
comment, but in this case "old_rsp" was not referring to
PER_CPU(old_rsp).

Fix this.

Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Alexei Starovoitov <ast@plumgrid.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Will Drewry <wad@chromium.org>
Link: http://lkml.kernel.org/r/1427115839-6397-1-git-send-email-dvlasenk@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/entry_64.S | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 0c91256d73df..9184392a47b3 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -657,7 +657,7 @@ common_interrupt:
 	ASM_CLAC
 	addq $-0x80,(%rsp)		/* Adjust vector to [-256,-1] range */
 	interrupt do_IRQ
-	/* 0(%rsp): rsp_scratch */
+	/* 0(%rsp): old RSP */
 ret_from_intr:
 	DISABLE_INTERRUPTS(CLBR_NONE)
 	TRACE_IRQS_OFF
-- 
cgit v1.2.3


From 633d6f17cd91ad5bf2370265946f716e42d388c6 Mon Sep 17 00:00:00 2001
From: Juergen Gross
Date: Fri, 20 Mar 2015 13:55:38 +0100
Subject: x86/xen: prepare p2m list for memory hotplug

Commit 054954eb051f35e74b75a566a96fe756015352c8 ("xen: switch to linear
virtual mapped sparse p2m list") introduced a regression regarding to
memory hotplug for a pv-domain: as the virtual space for the p2m list
is allocated for the to be expected memory size of the domain only,
hotplugged memory above that size will not be usable by the domain.

Correct this by using a configurable size for the p2m list in case of
memory hotplug enabled (default supported memory size is 512 GB for
64 bit domains and 4 GB for 32 bit domains).

Signed-off-by: Juergen Gross <jgross@suse.com>
Cc: <stable@vger.kernel.org> # 3.19+
Reviewed-by: Daniel Kiper <daniel.kiper@oracle.com>
Signed-off-by: David Vrabel <david.vrabel@citrix.com>
---
 arch/x86/xen/p2m.c  | 10 +++++++++-
 drivers/xen/Kconfig | 17 +++++++++++++++++
 2 files changed, 26 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c
index 9f93af56a5fc..b47124d4cd67 100644
--- a/arch/x86/xen/p2m.c
+++ b/arch/x86/xen/p2m.c
@@ -91,6 +91,12 @@ EXPORT_SYMBOL_GPL(xen_p2m_size);
 unsigned long xen_max_p2m_pfn __read_mostly;
 EXPORT_SYMBOL_GPL(xen_max_p2m_pfn);
 
+#ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG_LIMIT
+#define P2M_LIMIT CONFIG_XEN_BALLOON_MEMORY_HOTPLUG_LIMIT
+#else
+#define P2M_LIMIT 0
+#endif
+
 static DEFINE_SPINLOCK(p2m_update_lock);
 
 static unsigned long *p2m_mid_missing_mfn;
@@ -385,9 +391,11 @@ static void __init xen_rebuild_p2m_list(unsigned long *p2m)
 void __init xen_vmalloc_p2m_tree(void)
 {
 	static struct vm_struct vm;
+	unsigned long p2m_limit;
 
+	p2m_limit = (phys_addr_t)P2M_LIMIT * 1024 * 1024 * 1024 / PAGE_SIZE;
 	vm.flags = VM_ALLOC;
-	vm.size = ALIGN(sizeof(unsigned long) * xen_max_p2m_pfn,
+	vm.size = ALIGN(sizeof(unsigned long) * max(xen_max_p2m_pfn, p2m_limit),
 			PMD_SIZE * PMDS_PER_MID_PAGE);
 	vm_area_register_early(&vm, PMD_SIZE * PMDS_PER_MID_PAGE);
 	pr_notice("p2m virtual area at %p, size is %lx\n", vm.addr, vm.size);
diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig
index b812462083fc..94d96809e686 100644
--- a/drivers/xen/Kconfig
+++ b/drivers/xen/Kconfig
@@ -55,6 +55,23 @@ config XEN_BALLOON_MEMORY_HOTPLUG
 
 	  In that case step 3 should be omitted.
 
+config XEN_BALLOON_MEMORY_HOTPLUG_LIMIT
+	int "Hotplugged memory limit (in GiB) for a PV guest"
+	default 512 if X86_64
+	default 4 if X86_32
+	range 0 64 if X86_32
+	depends on XEN_HAVE_PVMMU
+	depends on XEN_BALLOON_MEMORY_HOTPLUG
+	help
+	  Maxmium amount of memory (in GiB) that a PV guest can be
+	  expanded to when using memory hotplug.
+
+	  A PV guest can have more memory than this limit if is
+	  started with a larger maximum.
+
+	  This value is used to allocate enough space in internal
+	  tables needed for physical memory administration.
+
 config XEN_SCRUB_PAGES
 	bool "Scrub pages before returning them to system"
 	depends on XEN_BALLOON
-- 
cgit v1.2.3


From 0a4e6be9ca17c54817cf814b4b5aa60478c6df27 Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti
Date: Mon, 23 Mar 2015 20:21:51 -0300
Subject: x86: kvm: Revert "remove sched notifier for cross-cpu migrations"

The following point:

    2. per-CPU pvclock time info is updated if the
       underlying CPU changes.

Is not true anymore since "KVM: x86: update pvclock area conditionally,
on cpu migration".

Add task migration notification back.

Problem noticed by Andy Lutomirski.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
CC: stable@kernel.org # 3.11+
---
 arch/x86/include/asm/pvclock.h |  1 +
 arch/x86/kernel/pvclock.c      | 44 ++++++++++++++++++++++++++++++++++++++++++
 arch/x86/vdso/vclock_gettime.c | 16 +++++++--------
 include/linux/sched.h          |  8 ++++++++
 kernel/sched/core.c            | 15 ++++++++++++++
 5 files changed, 76 insertions(+), 8 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h
index d6b078e9fa28..25b1cc07d496 100644
--- a/arch/x86/include/asm/pvclock.h
+++ b/arch/x86/include/asm/pvclock.h
@@ -95,6 +95,7 @@ unsigned __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src,
 
 struct pvclock_vsyscall_time_info {
 	struct pvclock_vcpu_time_info pvti;
+	u32 migrate_count;
 } __attribute__((__aligned__(SMP_CACHE_BYTES)));
 
 #define PVTI_SIZE sizeof(struct pvclock_vsyscall_time_info)
diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c
index 2f355d229a58..e5ecd20e72dd 100644
--- a/arch/x86/kernel/pvclock.c
+++ b/arch/x86/kernel/pvclock.c
@@ -141,7 +141,46 @@ void pvclock_read_wallclock(struct pvclock_wall_clock *wall_clock,
 	set_normalized_timespec(ts, now.tv_sec, now.tv_nsec);
 }
 
+static struct pvclock_vsyscall_time_info *pvclock_vdso_info;
+
+static struct pvclock_vsyscall_time_info *
+pvclock_get_vsyscall_user_time_info(int cpu)
+{
+	if (!pvclock_vdso_info) {
+		BUG();
+		return NULL;
+	}
+
+	return &pvclock_vdso_info[cpu];
+}
+
+struct pvclock_vcpu_time_info *pvclock_get_vsyscall_time_info(int cpu)
+{
+	return &pvclock_get_vsyscall_user_time_info(cpu)->pvti;
+}
+
 #ifdef CONFIG_X86_64
+static int pvclock_task_migrate(struct notifier_block *nb, unsigned long l,
+			        void *v)
+{
+	struct task_migration_notifier *mn = v;
+	struct pvclock_vsyscall_time_info *pvti;
+
+	pvti = pvclock_get_vsyscall_user_time_info(mn->from_cpu);
+
+	/* this is NULL when pvclock vsyscall is not initialized */
+	if (unlikely(pvti == NULL))
+		return NOTIFY_DONE;
+
+	pvti->migrate_count++;
+
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block pvclock_migrate = {
+	.notifier_call = pvclock_task_migrate,
+};
+
 /*
  * Initialize the generic pvclock vsyscall state.  This will allocate
  * a/some page(s) for the per-vcpu pvclock information, set up a
@@ -155,12 +194,17 @@ int __init pvclock_init_vsyscall(struct pvclock_vsyscall_time_info *i,
 
 	WARN_ON (size != PVCLOCK_VSYSCALL_NR_PAGES*PAGE_SIZE);
 
+	pvclock_vdso_info = i;
+
 	for (idx = 0; idx <= (PVCLOCK_FIXMAP_END-PVCLOCK_FIXMAP_BEGIN); idx++) {
 		__set_fixmap(PVCLOCK_FIXMAP_BEGIN + idx,
 			     __pa(i) + (idx*PAGE_SIZE),
 			     PAGE_KERNEL_VVAR);
 	}
 
+
+	register_task_migration_notifier(&pvclock_migrate);
+
 	return 0;
 }
 #endif
diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c
index 9793322751e0..30933760ee5f 100644
--- a/arch/x86/vdso/vclock_gettime.c
+++ b/arch/x86/vdso/vclock_gettime.c
@@ -82,18 +82,15 @@ static notrace cycle_t vread_pvclock(int *mode)
 	cycle_t ret;
 	u64 last;
 	u32 version;
+	u32 migrate_count;
 	u8 flags;
 	unsigned cpu, cpu1;
 
 
 	/*
-	 * Note: hypervisor must guarantee that:
-	 * 1. cpu ID number maps 1:1 to per-CPU pvclock time info.
-	 * 2. that per-CPU pvclock time info is updated if the
-	 *    underlying CPU changes.
-	 * 3. that version is increased whenever underlying CPU
-	 *    changes.
-	 *
+	 * When looping to get a consistent (time-info, tsc) pair, we
+	 * also need to deal with the possibility we can switch vcpus,
+	 * so make sure we always re-fetch time-info for the current vcpu.
 	 */
 	do {
 		cpu = __getcpu() & VGETCPU_CPU_MASK;
@@ -104,6 +101,8 @@ static notrace cycle_t vread_pvclock(int *mode)
 
 		pvti = get_pvti(cpu);
 
+		migrate_count = pvti->migrate_count;
+
 		version = __pvclock_read_cycles(&pvti->pvti, &ret, &flags);
 
 		/*
@@ -115,7 +114,8 @@ static notrace cycle_t vread_pvclock(int *mode)
 		cpu1 = __getcpu() & VGETCPU_CPU_MASK;
 	} while (unlikely(cpu != cpu1 ||
 			  (pvti->pvti.version & 1) ||
-			  pvti->pvti.version != version));
+			  pvti->pvti.version != version ||
+			  pvti->migrate_count != migrate_count));
 
 	if (unlikely(!(flags & PVCLOCK_TSC_STABLE_BIT)))
 		*mode = VCLOCK_NONE;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 6d77432e14ff..be98910cc1e2 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -176,6 +176,14 @@ extern void get_iowait_load(unsigned long *nr_waiters, unsigned long *load);
 extern void calc_global_load(unsigned long ticks);
 extern void update_cpu_load_nohz(void);
 
+/* Notifier for when a task gets migrated to a new CPU */
+struct task_migration_notifier {
+	struct task_struct *task;
+	int from_cpu;
+	int to_cpu;
+};
+extern void register_task_migration_notifier(struct notifier_block *n);
+
 extern unsigned long get_parent_ip(unsigned long addr);
 
 extern void dump_cpu_task(int cpu);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index f0f831e8a345..d0c4209bb836 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -996,6 +996,13 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
 		rq_clock_skip_update(rq, true);
 }
 
+static ATOMIC_NOTIFIER_HEAD(task_migration_notifier);
+
+void register_task_migration_notifier(struct notifier_block *n)
+{
+	atomic_notifier_chain_register(&task_migration_notifier, n);
+}
+
 #ifdef CONFIG_SMP
 void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 {
@@ -1026,10 +1033,18 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 	trace_sched_migrate_task(p, new_cpu);
 
 	if (task_cpu(p) != new_cpu) {
+		struct task_migration_notifier tmn;
+
 		if (p->sched_class->migrate_task_rq)
 			p->sched_class->migrate_task_rq(p, new_cpu);
 		p->se.nr_migrations++;
 		perf_sw_event_sched(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 0);
+
+		tmn.task = p;
+		tmn.from_cpu = task_cpu(p);
+		tmn.to_cpu = new_cpu;
+
+		atomic_notifier_call_chain(&task_migration_notifier, 0, &tmn);
 	}
 
 	__set_task_cpu(p, new_cpu);
-- 
cgit v1.2.3


From c806a6ad35bfa6c92249cd0ca4772d5ac3f8cb68 Mon Sep 17 00:00:00 2001
From: Radim Krčmář
Date: Wed, 18 Mar 2015 19:38:22 +0100
Subject: KVM: x86: call irq notifiers with directed EOI

kvm_ioapic_update_eoi() wasn't called if directed EOI was enabled.
We need to do that for irq notifiers.  (Like with edge interrupts.)

Fix it by skipping EOI broadcast only.

Bug: https://bugzilla.kernel.org/show_bug.cgi?id=82211
Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
Reviewed-by: Paolo Bonzini <pbonzini@redhat.com>
Tested-by: Bandan Das <bsd@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/ioapic.c | 4 +++-
 arch/x86/kvm/lapic.c  | 3 +--
 2 files changed, 4 insertions(+), 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c
index b1947e0f3e10..46d4449772bc 100644
--- a/arch/x86/kvm/ioapic.c
+++ b/arch/x86/kvm/ioapic.c
@@ -422,6 +422,7 @@ static void __kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu,
 			struct kvm_ioapic *ioapic, int vector, int trigger_mode)
 {
 	int i;
+	struct kvm_lapic *apic = vcpu->arch.apic;
 
 	for (i = 0; i < IOAPIC_NUM_PINS; i++) {
 		union kvm_ioapic_redirect_entry *ent = &ioapic->redirtbl[i];
@@ -443,7 +444,8 @@ static void __kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu,
 		kvm_notify_acked_irq(ioapic->kvm, KVM_IRQCHIP_IOAPIC, i);
 		spin_lock(&ioapic->lock);
 
-		if (trigger_mode != IOAPIC_LEVEL_TRIG)
+		if (trigger_mode != IOAPIC_LEVEL_TRIG ||
+		    kvm_apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI)
 			continue;
 
 		ASSERT(ent->fields.trig_mode == IOAPIC_LEVEL_TRIG);
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index bd4e34de24c7..4ee827d7bf36 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -833,8 +833,7 @@ int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2)
 
 static void kvm_ioapic_send_eoi(struct kvm_lapic *apic, int vector)
 {
-	if (!(kvm_apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI) &&
-	    kvm_ioapic_handles_vector(apic->vcpu->kvm, vector)) {
+	if (kvm_ioapic_handles_vector(apic->vcpu->kvm, vector)) {
 		int trigger_mode;
 		if (apic_test_vector(vector, apic->regs + APIC_TMR))
 			trigger_mode = IOAPIC_LEVEL_TRIG;
-- 
cgit v1.2.3


From a123374ff3c6850e1340b6da010bb43668d710e1 Mon Sep 17 00:00:00 2001
From: Radim Krčmář
Date: Thu, 19 Mar 2015 21:52:41 +0100
Subject: KVM: x86: inline kvm_ioapic_handles_vector()

An overhead from function call is not appropriate for its size and
frequency of execution.

Suggested-by: Paolo Bonzini <pbonzini@redhat.com>
Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/ioapic.c | 7 -------
 arch/x86/kvm/ioapic.h | 8 +++++++-
 2 files changed, 7 insertions(+), 8 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c
index a2e9d961c7fe..24f0f17639d6 100644
--- a/arch/x86/kvm/ioapic.c
+++ b/arch/x86/kvm/ioapic.c
@@ -473,13 +473,6 @@ static void __kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu,
 	}
 }
 
-bool kvm_ioapic_handles_vector(struct kvm *kvm, int vector)
-{
-	struct kvm_ioapic *ioapic = kvm->arch.vioapic;
-	smp_rmb();
-	return test_bit(vector, ioapic->handled_vectors);
-}
-
 void kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu, int vector, int trigger_mode)
 {
 	struct kvm_ioapic *ioapic = vcpu->kvm->arch.vioapic;
diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h
index 38d8402ea65c..6e265cfcd86a 100644
--- a/arch/x86/kvm/ioapic.h
+++ b/arch/x86/kvm/ioapic.h
@@ -98,13 +98,19 @@ static inline struct kvm_ioapic *ioapic_irqchip(struct kvm *kvm)
 	return kvm->arch.vioapic;
 }
 
+static inline bool kvm_ioapic_handles_vector(struct kvm *kvm, int vector)
+{
+	struct kvm_ioapic *ioapic = kvm->arch.vioapic;
+	smp_rmb();
+	return test_bit(vector, ioapic->handled_vectors);
+}
+
 void kvm_rtc_eoi_tracking_restore_one(struct kvm_vcpu *vcpu);
 bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
 		int short_hand, unsigned int dest, int dest_mode);
 int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2);
 void kvm_ioapic_update_eoi(struct kvm_vcpu *vcpu, int vector,
 			int trigger_mode);
-bool kvm_ioapic_handles_vector(struct kvm *kvm, int vector);
 int kvm_ioapic_init(struct kvm *kvm);
 void kvm_ioapic_destroy(struct kvm *kvm);
 int kvm_ioapic_set_irq(struct kvm_ioapic *ioapic, int irq, int irq_source_id,
-- 
cgit v1.2.3


From a76c7f4604937bc781bfc411ef92c59474ddadda Mon Sep 17 00:00:00 2001
From: Denys Vlasenko
Date: Sun, 22 Mar 2015 20:48:14 +0100
Subject: x86/asm/entry/64: Fold syscall32_cpu_init() into its sole user

Having syscall32/sysenter32 initialization in a separate tiny
function, called from within a function that is already syscall
init specific, serves no real purpose.

Its existense also caused an unintended effect of having
wrmsrl(MSR_CSTAR) performed twice: once we set it to a dummy
function returning -ENOSYS, and immediately after
(if CONFIG_IA32_EMULATION), we set it to point to the proper
syscall32 entry point, ia32_cstar_target.

Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
Acked-by: Andy Lutomirski <luto@amacapital.net>
Cc: Alexei Starovoitov <ast@plumgrid.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Will Drewry <wad@chromium.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/common.c | 31 ++++++++++---------------------
 1 file changed, 10 insertions(+), 21 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index d79f139871e0..66d62aef7214 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -959,24 +959,6 @@ static void identify_cpu(struct cpuinfo_x86 *c)
 #endif
 }
 
-#ifdef CONFIG_X86_64
-# ifdef CONFIG_IA32_EMULATION
-/* May not be __init: called during resume */
-static void syscall32_cpu_init(void)
-{
-	/*
-	 * Always load these, in case some future 64-bit CPU supports
-	 * SYSENTER from compat mode too:
-	 */
-	wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS);
-	wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
-	wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)ia32_sysenter_target);
-
-	wrmsrl(MSR_CSTAR, ia32_cstar_target);
-}
-# endif
-#endif
-
 /*
  * Set up the CPU state needed to execute SYSENTER/SYSEXIT instructions
  * on 32-bit kernels:
@@ -1187,10 +1169,17 @@ void syscall_init(void)
 	 */
 	wrmsrl(MSR_STAR,  ((u64)__USER32_CS)<<48  | ((u64)__KERNEL_CS)<<32);
 	wrmsrl(MSR_LSTAR, system_call);
+#ifndef CONFIG_IA32_EMULATION
 	wrmsrl(MSR_CSTAR, ignore_sysret);
-
-#ifdef CONFIG_IA32_EMULATION
-	syscall32_cpu_init();
+#else
+	wrmsrl(MSR_CSTAR, ia32_cstar_target);
+	/*
+	 * Always load these, in case some future 64-bit CPU supports
+	 * SYSENTER from compat mode too:
+	 */
+	wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS);
+	wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
+	wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)ia32_sysenter_target);
 #endif
 
 	/* Flags to clear on syscall */
-- 
cgit v1.2.3


From bf80bbd7dcf525e41e0673fbaa8cd21d2344b460 Mon Sep 17 00:00:00 2001
From: Aravind Gopalakrishnan
Date: Mon, 23 Mar 2015 10:42:52 -0500
Subject: x86/mce: Add an AMD severities-grading function

Add a severities function that caters to AMD processors. This allows us
to do some vendor-specific work within the function if necessary.

Also, introduce a vendor flag bitfield for vendor-specific settings. The
severities code uses this to define error scope based on the prescence
of the flags field.

This is based off of work by Boris Petkov.

Testing details:
Fam10h, Model 9h (Greyhound)
Fam15h: Models 0h-0fh (Orochi), 30h-3fh (Kaveri) and 60h-6fh (Carrizo),
Fam16h Model 00h-0fh (Kabini)

Boris:
Intel SNB
AMD K8 (JH-E0)

Signed-off-by: Aravind Gopalakrishnan <aravind.gopalakrishnan@amd.com>
Acked-by: Tony Luck <tony.luck@intel.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Chen Yucong <slaoub@gmail.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: linux-edac@vger.kernel.org
Link: http://lkml.kernel.org/r/1427125373-2918-2-git-send-email-Aravind.Gopalakrishnan@amd.com
[ Fixup build, clean up comments. ]
Signed-off-by: Borislav Petkov <bp@suse.de>
---
 arch/x86/include/asm/mce.h                |  6 ++++
 arch/x86/kernel/cpu/mcheck/mce-severity.c | 56 +++++++++++++++++++++++++++++++
 arch/x86/kernel/cpu/mcheck/mce.c          |  9 +++++
 3 files changed, 71 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index fd38a23e729f..b574fbf62d39 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -116,6 +116,12 @@ struct mca_config {
 	u32 rip_msr;
 };
 
+struct mce_vendor_flags {
+	__u64		overflow_recov	: 1, /* cpuid_ebx(80000007) */
+			__reserved_0	: 63;
+};
+extern struct mce_vendor_flags mce_flags;
+
 extern struct mca_config mca_cfg;
 extern void mce_register_decode_chain(struct notifier_block *nb);
 extern void mce_unregister_decode_chain(struct notifier_block *nb);
diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c
index 8bb433043a7f..e16f3f201e06 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-severity.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c
@@ -186,12 +186,68 @@ static int error_context(struct mce *m)
 	return ((m->cs & 3) == 3) ? IN_USER : IN_KERNEL;
 }
 
+/*
+ * See AMD Error Scope Hierarchy table in a newer BKDG. For example
+ * 49125_15h_Models_30h-3Fh_BKDG.pdf, section "RAS Features"
+ */
+static int mce_severity_amd(struct mce *m, enum context ctx)
+{
+	/* Processor Context Corrupt, no need to fumble too much, die! */
+	if (m->status & MCI_STATUS_PCC)
+		return MCE_PANIC_SEVERITY;
+
+	if (m->status & MCI_STATUS_UC) {
+
+		/*
+		 * On older systems where overflow_recov flag is not present, we
+		 * should simply panic if an error overflow occurs. If
+		 * overflow_recov flag is present and set, then software can try
+		 * to at least kill process to prolong system operation.
+		 */
+		if (mce_flags.overflow_recov) {
+			/* software can try to contain */
+			if (!(m->mcgstatus & MCG_STATUS_RIPV))
+				if (ctx == IN_KERNEL)
+					return MCE_PANIC_SEVERITY;
+
+				/* kill current process */
+				return MCE_AR_SEVERITY;
+		} else {
+			/* at least one error was not logged */
+			if (m->status & MCI_STATUS_OVER)
+				return MCE_PANIC_SEVERITY;
+		}
+
+		/*
+		 * For any other case, return MCE_UC_SEVERITY so that we log the
+		 * error and exit #MC handler.
+		 */
+		return MCE_UC_SEVERITY;
+	}
+
+	/*
+	 * deferred error: poll handler catches these and adds to mce_ring so
+	 * memory-failure can take recovery actions.
+	 */
+	if (m->status & MCI_STATUS_DEFERRED)
+		return MCE_DEFERRED_SEVERITY;
+
+	/*
+	 * corrected error: poll handler catches these and passes responsibility
+	 * of decoding the error to EDAC
+	 */
+	return MCE_KEEP_SEVERITY;
+}
+
 int mce_severity(struct mce *m, int tolerant, char **msg, bool is_excp)
 {
 	enum exception excp = (is_excp ? EXCP_CONTEXT : NO_EXCP);
 	enum context ctx = error_context(m);
 	struct severity *s;
 
+	if (m->cpuvendor == X86_VENDOR_AMD)
+		return mce_severity_amd(m, ctx);
+
 	for (s = severities;; s++) {
 		if ((m->status & s->mask) != s->result)
 			continue;
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 8548b714a16b..1189f1150a19 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -64,6 +64,7 @@ static DEFINE_MUTEX(mce_chrdev_read_mutex);
 DEFINE_PER_CPU(unsigned, mce_exception_count);
 
 struct mce_bank *mce_banks __read_mostly;
+struct mce_vendor_flags mce_flags __read_mostly;
 
 struct mca_config mca_cfg __read_mostly = {
 	.bootlog  = -1,
@@ -1534,6 +1535,13 @@ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
 		if (c->x86 == 6 && cfg->banks > 0)
 			mce_banks[0].ctl = 0;
 
+		/*
+		 * overflow_recov is supported for F15h Models 00h-0fh
+		 * even though we don't have a CPUID bit for it.
+		 */
+		if (c->x86 == 0x15 && c->x86_model <= 0xf)
+			mce_flags.overflow_recov = 1;
+
 		/*
 		 * Turn off MC4_MISC thresholding banks on those models since
 		 * they're not supported there.
@@ -1633,6 +1641,7 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
 		break;
 	case X86_VENDOR_AMD:
 		mce_amd_feature_init(c);
+		mce_flags.overflow_recov = cpuid_ebx(0x80000007) & 0x1;
 		break;
 	default:
 		break;
-- 
cgit v1.2.3


From 43eaa2a1ad70d72876cdbb2eb5450a2665e4770f Mon Sep 17 00:00:00 2001
From: Aravind Gopalakrishnan
Date: Mon, 23 Mar 2015 10:42:53 -0500
Subject: x86/mce: Define mce_severity function pointer

Rename mce_severity() to mce_severity_intel() and assign the
mce_severity function pointer to mce_severity_amd() during init on AMD.
This way, we can avoid a test to call mce_severity_amd every time we get
into mce_severity(). And it's cleaner to do it this way.

Signed-off-by: Aravind Gopalakrishnan <Aravind.Gopalakrishnan@amd.com>
Suggested-by: Tony Luck <tony.luck@intel.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Chen Yucong <slaoub@gmail.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: linux-edac <linux-edac@vger.kernel.org>
Link: http://lkml.kernel.org/r/1427125373-2918-3-git-send-email-Aravind.Gopalakrishnan@amd.com
Signed-off-by: Borislav Petkov <bp@suse.de>
---
 arch/x86/include/asm/mce.h                |  2 ++
 arch/x86/kernel/cpu/mcheck/mce-internal.h |  2 +-
 arch/x86/kernel/cpu/mcheck/mce-severity.c | 19 ++++++++++++++-----
 arch/x86/kernel/cpu/mcheck/mce.c          |  1 +
 4 files changed, 18 insertions(+), 6 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index b574fbf62d39..1f5a86d518db 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -134,9 +134,11 @@ extern int mce_p5_enabled;
 #ifdef CONFIG_X86_MCE
 int mcheck_init(void);
 void mcheck_cpu_init(struct cpuinfo_x86 *c);
+void mcheck_vendor_init_severity(void);
 #else
 static inline int mcheck_init(void) { return 0; }
 static inline void mcheck_cpu_init(struct cpuinfo_x86 *c) {}
+static inline void mcheck_vendor_init_severity(void) {}
 #endif
 
 #ifdef CONFIG_X86_ANCIENT_MCE
diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h
index e12f0bfb45c1..fe32074b865b 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-internal.h
+++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h
@@ -24,7 +24,7 @@ struct mce_bank {
 	char			attrname[ATTR_LEN];	/* attribute name */
 };
 
-int mce_severity(struct mce *a, int tolerant, char **msg, bool is_excp);
+extern int (*mce_severity)(struct mce *a, int tolerant, char **msg, bool is_excp);
 struct dentry *mce_get_debugfs_dir(void);
 
 extern struct mce_bank *mce_banks;
diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c
index e16f3f201e06..155c9261d3ef 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-severity.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c
@@ -190,8 +190,10 @@ static int error_context(struct mce *m)
  * See AMD Error Scope Hierarchy table in a newer BKDG. For example
  * 49125_15h_Models_30h-3Fh_BKDG.pdf, section "RAS Features"
  */
-static int mce_severity_amd(struct mce *m, enum context ctx)
+static int mce_severity_amd(struct mce *m, int tolerant, char **msg, bool is_excp)
 {
+	enum context ctx = error_context(m);
+
 	/* Processor Context Corrupt, no need to fumble too much, die! */
 	if (m->status & MCI_STATUS_PCC)
 		return MCE_PANIC_SEVERITY;
@@ -239,15 +241,12 @@ static int mce_severity_amd(struct mce *m, enum context ctx)
 	return MCE_KEEP_SEVERITY;
 }
 
-int mce_severity(struct mce *m, int tolerant, char **msg, bool is_excp)
+static int mce_severity_intel(struct mce *m, int tolerant, char **msg, bool is_excp)
 {
 	enum exception excp = (is_excp ? EXCP_CONTEXT : NO_EXCP);
 	enum context ctx = error_context(m);
 	struct severity *s;
 
-	if (m->cpuvendor == X86_VENDOR_AMD)
-		return mce_severity_amd(m, ctx);
-
 	for (s = severities;; s++) {
 		if ((m->status & s->mask) != s->result)
 			continue;
@@ -272,6 +271,16 @@ int mce_severity(struct mce *m, int tolerant, char **msg, bool is_excp)
 	}
 }
 
+/* Default to mce_severity_intel */
+int (*mce_severity)(struct mce *m, int tolerant, char **msg, bool is_excp) =
+		    mce_severity_intel;
+
+void __init mcheck_vendor_init_severity(void)
+{
+	if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
+		mce_severity = mce_severity_amd;
+}
+
 #ifdef CONFIG_DEBUG_FS
 static void *s_start(struct seq_file *f, loff_t *pos)
 {
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 1189f1150a19..c7df30748629 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -2026,6 +2026,7 @@ __setup("mce", mcheck_enable);
 int __init mcheck_init(void)
 {
 	mcheck_intel_therm_init();
+	mcheck_vendor_init_severity();
 
 	return 0;
 }
-- 
cgit v1.2.3


From b3fe8ba320ace38cee6859b4c015d81627254ddb Mon Sep 17 00:00:00 2001
From: Denys Vlasenko
Date: Thu, 19 Mar 2015 18:17:45 +0100
Subject: x86/asm/entry/64: Change the THREAD_INFO() definition to not depend
 on KERNEL_STACK_OFFSET

This changes the THREAD_INFO() definition and all its callsites
so that they do not count stack position from
(top of stack - KERNEL_STACK_OFFSET), but from top of stack.

Semi-mysterious expressions THREAD_INFO(%rsp,RIP) - "why RIP??"
are now replaced by more logical THREAD_INFO(%rsp,SIZEOF_PTREGS)
- "calculate thread_info's address using information that
rsp is SIZEOF_PTREGS bytes below top of stack".

While at it, replace "(off)-THREAD_SIZE(reg)" with equivalent
"((off)-THREAD_SIZE)(reg)". The form without parentheses
falsely looks like we invoke THREAD_SIZE() macro.

Improve comment atop THREAD_INFO macro definition.

This patch does not change generated code (verified by objdump).

Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
Acked-by: Borislav Petkov <bp@suse.de>
Acked-by: Andy Lutomirski <luto@kernel.org>
Cc: Alexei Starovoitov <ast@plumgrid.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Will Drewry <wad@chromium.org>
Link: http://lkml.kernel.org/r/1426785469-15125-1-git-send-email-dvlasenk@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/ia32/ia32entry.S          | 30 +++++++++++++++---------------
 arch/x86/include/asm/thread_info.h |  8 +++++---
 arch/x86/kernel/entry_64.S         |  4 ++--
 3 files changed, 22 insertions(+), 20 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index ad9efef65a6b..50190e15c1b6 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -127,7 +127,7 @@ ENTRY(ia32_sysenter_target)
 	CFI_REL_OFFSET rsp,0
 	pushfq_cfi
 	/*CFI_REL_OFFSET rflags,0*/
-	movl	TI_sysenter_return+THREAD_INFO(%rsp,3*8-KERNEL_STACK_OFFSET),%r10d
+	movl	TI_sysenter_return+THREAD_INFO(%rsp,3*8),%r10d
 	CFI_REGISTER rip,r10
 	pushq_cfi $__USER32_CS
 	/*CFI_REL_OFFSET cs,0*/
@@ -159,8 +159,8 @@ ENTRY(ia32_sysenter_target)
 	jnz sysenter_fix_flags
 sysenter_flags_fixed:
 
-	orl     $TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP)
-	testl   $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP)
+	orl     $TS_COMPAT,TI_status+THREAD_INFO(%rsp,SIZEOF_PTREGS)
+	testl   $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,SIZEOF_PTREGS)
 	CFI_REMEMBER_STATE
 	jnz  sysenter_tracesys
 	cmpq	$(IA32_NR_syscalls-1),%rax
@@ -177,10 +177,10 @@ sysenter_dispatch:
 	movq	%rax,RAX(%rsp)
 	DISABLE_INTERRUPTS(CLBR_NONE)
 	TRACE_IRQS_OFF
-	testl	$_TIF_ALLWORK_MASK,TI_flags+THREAD_INFO(%rsp,RIP)
+	testl	$_TIF_ALLWORK_MASK,TI_flags+THREAD_INFO(%rsp,SIZEOF_PTREGS)
 	jnz	sysexit_audit
 sysexit_from_sys_call:
-	andl    $~TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP)
+	andl    $~TS_COMPAT,TI_status+THREAD_INFO(%rsp,SIZEOF_PTREGS)
 	/* clear IF, that popfq doesn't enable interrupts early */
 	andl	$~0x200,EFLAGS(%rsp)
 	movl	RIP(%rsp),%edx		/* User %eip */
@@ -225,7 +225,7 @@ sysexit_from_sys_call:
 	.endm
 
 	.macro auditsys_exit exit
-	testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP)
+	testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,SIZEOF_PTREGS)
 	jnz ia32_ret_from_sys_call
 	TRACE_IRQS_ON
 	ENABLE_INTERRUPTS(CLBR_NONE)
@@ -240,7 +240,7 @@ sysexit_from_sys_call:
 	movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi
 	DISABLE_INTERRUPTS(CLBR_NONE)
 	TRACE_IRQS_OFF
-	testl %edi,TI_flags+THREAD_INFO(%rsp,RIP)
+	testl %edi,TI_flags+THREAD_INFO(%rsp,SIZEOF_PTREGS)
 	jz \exit
 	CLEAR_RREGS
 	jmp int_with_check
@@ -262,7 +262,7 @@ sysenter_fix_flags:
 
 sysenter_tracesys:
 #ifdef CONFIG_AUDITSYSCALL
-	testl	$(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP)
+	testl	$(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,SIZEOF_PTREGS)
 	jz	sysenter_auditsys
 #endif
 	SAVE_EXTRA_REGS
@@ -346,8 +346,8 @@ ENTRY(ia32_cstar_target)
 1:	movl	(%r8),%r9d
 	_ASM_EXTABLE(1b,ia32_badarg)
 	ASM_CLAC
-	orl     $TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP)
-	testl   $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP)
+	orl     $TS_COMPAT,TI_status+THREAD_INFO(%rsp,SIZEOF_PTREGS)
+	testl   $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,SIZEOF_PTREGS)
 	CFI_REMEMBER_STATE
 	jnz   cstar_tracesys
 	cmpq $IA32_NR_syscalls-1,%rax
@@ -364,10 +364,10 @@ cstar_dispatch:
 	movq %rax,RAX(%rsp)
 	DISABLE_INTERRUPTS(CLBR_NONE)
 	TRACE_IRQS_OFF
-	testl $_TIF_ALLWORK_MASK,TI_flags+THREAD_INFO(%rsp,RIP)
+	testl $_TIF_ALLWORK_MASK,TI_flags+THREAD_INFO(%rsp,SIZEOF_PTREGS)
 	jnz sysretl_audit
 sysretl_from_sys_call:
-	andl $~TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP)
+	andl $~TS_COMPAT,TI_status+THREAD_INFO(%rsp,SIZEOF_PTREGS)
 	RESTORE_RSI_RDI_RDX
 	movl RIP(%rsp),%ecx
 	CFI_REGISTER rip,rcx
@@ -402,7 +402,7 @@ sysretl_audit:
 
 cstar_tracesys:
 #ifdef CONFIG_AUDITSYSCALL
-	testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP)
+	testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,SIZEOF_PTREGS)
 	jz cstar_auditsys
 #endif
 	xchgl %r9d,%ebp
@@ -469,8 +469,8 @@ ENTRY(ia32_syscall)
 	   this could be a problem. */
 	ALLOC_PT_GPREGS_ON_STACK
 	SAVE_C_REGS_EXCEPT_R891011
-	orl $TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP)
-	testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP)
+	orl $TS_COMPAT,TI_status+THREAD_INFO(%rsp,SIZEOF_PTREGS)
+	testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,SIZEOF_PTREGS)
 	jnz ia32_tracesys
 	cmpq $(IA32_NR_syscalls-1),%rax
 	ja ia32_badsys
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 0abf7ab20ce2..ae9c2f13b476 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -207,10 +207,12 @@ static inline unsigned long current_stack_pointer(void)
 	_ASM_SUB $(THREAD_SIZE-KERNEL_STACK_OFFSET),reg ;
 
 /*
- * Same if PER_CPU_VAR(kernel_stack) is, perhaps with some offset, already in
- * a certain register (to be used in assembler memory operands).
+ * ASM operand which evaluates to thread_info address
+ * if it is known that "reg" is exactly "off" bytes below stack top.
+ * Example (fetch thread_info->fieldname):
+ *  mov TI_fieldname+THREAD_INFO(reg, off),%eax
  */
-#define THREAD_INFO(reg, off) KERNEL_STACK_OFFSET+(off)-THREAD_SIZE(reg)
+#define THREAD_INFO(reg, off) ((off)-THREAD_SIZE)(reg)
 
 #endif
 
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 9184392a47b3..8076df982a18 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -258,7 +258,7 @@ GLOBAL(system_call_after_swapgs)
 	SAVE_C_REGS_EXCEPT_RAX_RCX_R11
 	movq	$-ENOSYS,RAX(%rsp)
 	CFI_REL_OFFSET rip,RIP
-	testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP)
+	testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,SIZEOF_PTREGS)
 	jnz tracesys
 system_call_fastpath:
 #if __SYSCALL_MASK == ~0
@@ -276,7 +276,7 @@ system_call_fastpath:
  * Has incompletely filled pt_regs, iret frame is also incomplete.
  */
 ret_from_sys_call:
-	testl $_TIF_ALLWORK_MASK,TI_flags+THREAD_INFO(%rsp,RIP)
+	testl $_TIF_ALLWORK_MASK,TI_flags+THREAD_INFO(%rsp,SIZEOF_PTREGS)
 	jnz int_ret_from_sys_call_fixup	/* Go the the slow path */
 
 	LOCKDEP_SYS_EXIT
-- 
cgit v1.2.3


From ef593260f0cae2699874f098fb5b19fb46502cb3 Mon Sep 17 00:00:00 2001
From: Denys Vlasenko
Date: Thu, 19 Mar 2015 18:17:46 +0100
Subject: x86/asm/entry: Get rid of KERNEL_STACK_OFFSET

PER_CPU_VAR(kernel_stack) was set up in a way where it points
five stack slots below the top of stack.

Presumably, it was done to avoid one "sub $5*8,%rsp"
in syscall/sysenter code paths, where iret frame needs to be
created by hand.

Ironically, none of them benefits from this optimization,
since all of them need to allocate additional data on stack
(struct pt_regs), so they still have to perform subtraction.

This patch eliminates KERNEL_STACK_OFFSET.

PER_CPU_VAR(kernel_stack) now points directly to top of stack.
pt_regs allocations are adjusted to allocate iret frame as well.
Hopefully we can merge it later with 32-bit specific
PER_CPU_VAR(cpu_current_top_of_stack) variable...

Net result in generated code is that constants in several insns
are changed.

This change is necessary for changing struct pt_regs creation
in SYSCALL64 code path from MOV to PUSH instructions.

Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
Acked-by: Borislav Petkov <bp@suse.de>
Acked-by: Andy Lutomirski <luto@kernel.org>
Cc: Alexei Starovoitov <ast@plumgrid.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Will Drewry <wad@chromium.org>
Link: http://lkml.kernel.org/r/1426785469-15125-2-git-send-email-dvlasenk@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/ia32/ia32entry.S          | 4 ++--
 arch/x86/include/asm/thread_info.h | 5 ++---
 arch/x86/kernel/cpu/common.c       | 2 +-
 arch/x86/kernel/entry_64.S         | 5 ++---
 arch/x86/kernel/process_32.c       | 2 +-
 arch/x86/kernel/process_64.c       | 3 +--
 arch/x86/kernel/smpboot.c          | 3 +--
 arch/x86/xen/smp.c                 | 3 +--
 8 files changed, 11 insertions(+), 16 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index 50190e15c1b6..acbff3fb96a1 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -311,7 +311,7 @@ ENDPROC(ia32_sysenter_target)
 ENTRY(ia32_cstar_target)
 	CFI_STARTPROC32	simple
 	CFI_SIGNAL_FRAME
-	CFI_DEF_CFA	rsp,KERNEL_STACK_OFFSET
+	CFI_DEF_CFA	rsp,0
 	CFI_REGISTER	rip,rcx
 	/*CFI_REGISTER	rflags,r11*/
 	SWAPGS_UNSAFE_STACK
@@ -323,7 +323,7 @@ ENTRY(ia32_cstar_target)
 	 * disabled irqs and here we enable it straight after entry:
 	 */
 	ENABLE_INTERRUPTS(CLBR_NONE)
-	ALLOC_PT_GPREGS_ON_STACK 8	/* +8: space for orig_ax */
+	ALLOC_PT_GPREGS_ON_STACK 6*8 /* 6*8: space for orig_ax and iret frame */
 	SAVE_C_REGS_EXCEPT_RCX_R891011
 	movl 	%eax,%eax	/* zero extension */
 	movq	%rax,ORIG_RAX(%rsp)
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index ae9c2f13b476..ad0ee3423da5 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -172,7 +172,6 @@ struct thread_info {
 #define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW)
 
 #define STACK_WARN		(THREAD_SIZE/8)
-#define KERNEL_STACK_OFFSET	(5*(BITS_PER_LONG/8))
 
 /*
  * macros/functions for gaining access to the thread information structure
@@ -201,10 +200,10 @@ static inline unsigned long current_stack_pointer(void)
 
 #else /* !__ASSEMBLY__ */
 
-/* how to get the thread information struct from ASM */
+/* Load thread_info address into "reg" */
 #define GET_THREAD_INFO(reg) \
 	_ASM_MOV PER_CPU_VAR(kernel_stack),reg ; \
-	_ASM_SUB $(THREAD_SIZE-KERNEL_STACK_OFFSET),reg ;
+	_ASM_SUB $(THREAD_SIZE),reg ;
 
 /*
  * ASM operand which evaluates to thread_info address
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 66d62aef7214..002216ab9145 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1116,7 +1116,7 @@ static __init int setup_disablecpuid(char *arg)
 __setup("clearcpuid=", setup_disablecpuid);
 
 DEFINE_PER_CPU(unsigned long, kernel_stack) =
-	(unsigned long)&init_thread_union - KERNEL_STACK_OFFSET + THREAD_SIZE;
+	(unsigned long)&init_thread_union + THREAD_SIZE;
 EXPORT_PER_CPU_SYMBOL(kernel_stack);
 
 #ifdef CONFIG_X86_64
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 8076df982a18..eae69bba3a2c 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -225,7 +225,7 @@ ENDPROC(native_usergs_sysret64)
 ENTRY(system_call)
 	CFI_STARTPROC	simple
 	CFI_SIGNAL_FRAME
-	CFI_DEF_CFA	rsp,KERNEL_STACK_OFFSET
+	CFI_DEF_CFA	rsp,0
 	CFI_REGISTER	rip,rcx
 	/*CFI_REGISTER	rflags,r11*/
 	SWAPGS_UNSAFE_STACK
@@ -242,9 +242,8 @@ GLOBAL(system_call_after_swapgs)
 	 * so we can enable interrupts only after we're done with using rsp_scratch:
 	 */
 	movq	%rsp,PER_CPU_VAR(rsp_scratch)
-	/* kernel_stack is set so that 5 slots (iret frame) are preallocated */
 	movq	PER_CPU_VAR(kernel_stack),%rsp
-	ALLOC_PT_GPREGS_ON_STACK 8		/* +8: space for orig_ax */
+	ALLOC_PT_GPREGS_ON_STACK 6*8 /* 6*8: space for orig_ax and iret frame */
 	movq	%rcx,RIP(%rsp)
 	movq	PER_CPU_VAR(rsp_scratch),%rcx
 	movq	%r11,EFLAGS(%rsp)
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index c5e987022ca0..8ed2106b06da 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -308,7 +308,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 	load_sp0(tss, next);
 	this_cpu_write(kernel_stack,
 		       (unsigned long)task_stack_page(next_p) +
-		       THREAD_SIZE - KERNEL_STACK_OFFSET);
+		       THREAD_SIZE);
 	this_cpu_write(cpu_current_top_of_stack,
 		       (unsigned long)task_stack_page(next_p) +
 		       THREAD_SIZE);
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index da8b74598d90..4baaa972f52a 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -410,8 +410,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 	load_sp0(tss, next);
 
 	this_cpu_write(kernel_stack,
-		  (unsigned long)task_stack_page(next_p) +
-		  THREAD_SIZE - KERNEL_STACK_OFFSET);
+		(unsigned long)task_stack_page(next_p) + THREAD_SIZE);
 
 	/*
 	 * Now maybe reload the debug registers and handle I/O bitmaps
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 759388c538cf..7b20ffd2fffc 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -813,8 +813,7 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
 	initial_gs = per_cpu_offset(cpu);
 #endif
 	per_cpu(kernel_stack, cpu) =
-		(unsigned long)task_stack_page(idle) -
-		KERNEL_STACK_OFFSET + THREAD_SIZE;
+		(unsigned long)task_stack_page(idle) + THREAD_SIZE;
 	early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu);
 	initial_code = (unsigned long)start_secondary;
 	stack_start  = idle->thread.sp;
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index 08e8489c47f1..765b7684f858 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -452,8 +452,7 @@ static int xen_cpu_up(unsigned int cpu, struct task_struct *idle)
 	clear_tsk_thread_flag(idle, TIF_FORK);
 #endif
 	per_cpu(kernel_stack, cpu) =
-		(unsigned long)task_stack_page(idle) -
-		KERNEL_STACK_OFFSET + THREAD_SIZE;
+		(unsigned long)task_stack_page(idle) + THREAD_SIZE;
 
 	xen_setup_runstate_info(cpu);
 	xen_setup_timer(cpu);
-- 
cgit v1.2.3


From 9ed8e7d86061e7c3fb3855358d51ba4abb19ceb1 Mon Sep 17 00:00:00 2001
From: Denys Vlasenko
Date: Thu, 19 Mar 2015 18:17:47 +0100
Subject: x86/asm/entry/64: Use PUSH instructions to build pt_regs on stack

With this change, on SYSCALL64 code path we are now populating
pt_regs->cs, pt_regs->ss and pt_regs->rcx unconditionally and
therefore don't need to do that in FIXUP_TOP_OF_STACK.

We lose a number of large instructions there:

    text    data     bss     dec     hex filename
   13298       0       0   13298    33f2 entry_64_before.o
   12978       0       0   12978    32b2 entry_64.o

What's more important, we convert two "MOVQ $imm,off(%rsp)" to
"PUSH $imm" (the ones which fill pt_regs->cs,ss).

Before this patch, placing them on fast path was slowing it down
by two cycles: this form of MOV is very large, 12 bytes, and
this probably reduces decode bandwidth to one instruction per cycle
when CPU sees them.

Therefore they were living in FIXUP_TOP_OF_STACK instead (away
from fast path).

"PUSH $imm" is a small 2-byte instruction. Moving it to fast path does
not slow it down in my measurements.

Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
Acked-by: Borislav Petkov <bp@suse.de>
Acked-by: Andy Lutomirski <luto@kernel.org>
Cc: Alexei Starovoitov <ast@plumgrid.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Will Drewry <wad@chromium.org>
Link: http://lkml.kernel.org/r/1426785469-15125-3-git-send-email-dvlasenk@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/entry_64.S | 54 +++++++++++++++++++++++++++-------------------
 1 file changed, 32 insertions(+), 22 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index eae69bba3a2c..3ea4f6d8bc98 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -126,11 +126,8 @@ ENDPROC(native_usergs_sysret64)
  * manipulation.
  */
 	.macro FIXUP_TOP_OF_STACK tmp offset=0
-	movq $__USER_DS,SS+\offset(%rsp)
-	movq $__USER_CS,CS+\offset(%rsp)
-	movq RIP+\offset(%rsp),\tmp  /* get rip */
-	movq \tmp,RCX+\offset(%rsp)  /* copy it to rcx as sysret would do */
-	movq EFLAGS+\offset(%rsp),\tmp /* ditto for rflags->r11 */
+	/* copy flags to r11 as sysret would do */
+	movq EFLAGS+\offset(%rsp),\tmp
 	movq \tmp,R11+\offset(%rsp)
 	.endm
 
@@ -214,7 +211,6 @@ ENDPROC(native_usergs_sysret64)
  * r9   arg5
  * (note: r12-r15,rbp,rbx are callee-preserved in C ABI)
  *
- * Interrupts are off on entry.
  * Only called from user space.
  *
  * When user can change pt_regs->foo always force IRET. That is because
@@ -228,6 +224,12 @@ ENTRY(system_call)
 	CFI_DEF_CFA	rsp,0
 	CFI_REGISTER	rip,rcx
 	/*CFI_REGISTER	rflags,r11*/
+
+	/*
+	 * Interrupts are off on entry.
+	 * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
+	 * it is too small to ever cause noticeable irq latency.
+	 */
 	SWAPGS_UNSAFE_STACK
 	/*
 	 * A hypervisor implementation might want to use a label
@@ -236,27 +238,35 @@ ENTRY(system_call)
 	 */
 GLOBAL(system_call_after_swapgs)
 
-	/*
-	 * We use 'rsp_scratch' as a scratch register, hence this block must execute
-	 * atomically in the face of possible interrupt-driven task preemption,
-	 * so we can enable interrupts only after we're done with using rsp_scratch:
-	 */
 	movq	%rsp,PER_CPU_VAR(rsp_scratch)
 	movq	PER_CPU_VAR(kernel_stack),%rsp
-	ALLOC_PT_GPREGS_ON_STACK 6*8 /* 6*8: space for orig_ax and iret frame */
-	movq	%rcx,RIP(%rsp)
-	movq	PER_CPU_VAR(rsp_scratch),%rcx
-	movq	%r11,EFLAGS(%rsp)
-	movq	%rcx,RSP(%rsp)
+
+	/* Construct struct pt_regs on stack */
+	pushq_cfi $__USER_DS			/* pt_regs->ss */
+	pushq_cfi PER_CPU_VAR(rsp_scratch)	/* pt_regs->sp */
 	/*
-	 * No need to follow this irqs off/on section - it's straight
-	 * and short:
+	 * Re-enable interrupts.
+	 * We use 'rsp_scratch' as a scratch space, hence irq-off block above
+	 * must execute atomically in the face of possible interrupt-driven
+	 * task preemption. We must enable interrupts only after we're done
+	 * with using rsp_scratch:
 	 */
 	ENABLE_INTERRUPTS(CLBR_NONE)
-	movq_cfi rax,ORIG_RAX
-	SAVE_C_REGS_EXCEPT_RAX_RCX_R11
-	movq	$-ENOSYS,RAX(%rsp)
-	CFI_REL_OFFSET rip,RIP
+	pushq_cfi	%r11			/* pt_regs->flags */
+	pushq_cfi	$__USER_CS		/* pt_regs->cs */
+	pushq_cfi	%rcx			/* pt_regs->ip */
+	CFI_REL_OFFSET rip,0
+	pushq_cfi_reg	rax			/* pt_regs->orig_ax */
+	pushq_cfi_reg	rdi			/* pt_regs->di */
+	pushq_cfi_reg	rsi			/* pt_regs->si */
+	pushq_cfi_reg	rdx			/* pt_regs->dx */
+	pushq_cfi_reg	rcx			/* pt_regs->cx */
+	pushq_cfi	$-ENOSYS		/* pt_regs->ax */
+	pushq_cfi_reg	r8			/* pt_regs->r8 */
+	pushq_cfi_reg	r9			/* pt_regs->r9 */
+	pushq_cfi_reg	r10			/* pt_regs->r10 */
+	sub	$(7*8),%rsp /* pt_regs->r11,bp,bx,r12-15 not saved */
+
 	testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,SIZEOF_PTREGS)
 	jnz tracesys
 system_call_fastpath:
-- 
cgit v1.2.3


From a71ffdd780760dc62c3d4cffb98eaaedaf5068b8 Mon Sep 17 00:00:00 2001
From: Denys Vlasenko
Date: Thu, 19 Mar 2015 18:17:48 +0100
Subject: x86/asm/entry/64: Get rid of the
 FIXUP_TOP_OF_STACK/RESTORE_TOP_OF_STACK macros

The FIXUP_TOP_OF_STACK macro is only necessary because we don't save %r11
to pt_regs->r11 on SYSCALL64 fast path, but we want ptrace to see it populated.

Bite the bullet, add a single additional PUSH instruction, and remove
the FIXUP_TOP_OF_STACK macro.

The RESTORE_TOP_OF_STACK macro is already a nop. Remove it too.

On SandyBridge CPU, it does not get slower:
measured 54.22 ns per getpid syscall before and after last two
changes on defconfig kernel.

Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
Acked-by: Borislav Petkov <bp@suse.de>
Acked-by: Andy Lutomirski <luto@kernel.org>
Cc: Alexei Starovoitov <ast@plumgrid.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Will Drewry <wad@chromium.org>
Link: http://lkml.kernel.org/r/1426785469-15125-4-git-send-email-dvlasenk@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/entry_64.S | 35 ++---------------------------------
 1 file changed, 2 insertions(+), 33 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 3ea4f6d8bc98..3f8daba28eb0 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -22,8 +22,6 @@
  * - CFI macros are used to generate dwarf2 unwind information for better
  * backtraces. They don't change any code.
  * - ENTRY/END Define functions in the symbol table.
- * - FIXUP_TOP_OF_STACK/RESTORE_TOP_OF_STACK - Fix up the hardware stack
- * frame that is otherwise undefined after a SYSCALL
  * - TRACE_IRQ_* - Trace hard interrupt state for lock debugging.
  * - idtentry - Define exception entry points.
  */
@@ -118,23 +116,6 @@ ENDPROC(native_usergs_sysret64)
 # define TRACE_IRQS_IRETQ_DEBUG		TRACE_IRQS_IRETQ
 #endif
 
-/*
- * C code is not supposed to know that the iret frame is not populated.
- * Every time a C function with an pt_regs argument is called from
- * the SYSCALL based fast path FIXUP_TOP_OF_STACK is needed.
- * RESTORE_TOP_OF_STACK syncs the syscall state after any possible ptregs
- * manipulation.
- */
-	.macro FIXUP_TOP_OF_STACK tmp offset=0
-	/* copy flags to r11 as sysret would do */
-	movq EFLAGS+\offset(%rsp),\tmp
-	movq \tmp,R11+\offset(%rsp)
-	.endm
-
-	.macro RESTORE_TOP_OF_STACK tmp offset=0
-	/* nothing to do */
-	.endm
-
 /*
  * empty frame
  */
@@ -265,7 +246,8 @@ GLOBAL(system_call_after_swapgs)
 	pushq_cfi_reg	r8			/* pt_regs->r8 */
 	pushq_cfi_reg	r9			/* pt_regs->r9 */
 	pushq_cfi_reg	r10			/* pt_regs->r10 */
-	sub	$(7*8),%rsp /* pt_regs->r11,bp,bx,r12-15 not saved */
+	pushq_cfi_reg	r11			/* pt_regs->r11 */
+	sub	$(6*8),%rsp /* pt_regs->bp,bx,r12-15 not saved */
 
 	testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,SIZEOF_PTREGS)
 	jnz tracesys
@@ -312,7 +294,6 @@ ret_from_sys_call:
 	CFI_RESTORE_STATE
 
 int_ret_from_sys_call_fixup:
-	FIXUP_TOP_OF_STACK %r11
 	jmp int_ret_from_sys_call
 
 	/* Do syscall entry tracing */
@@ -328,7 +309,6 @@ tracesys:
 
 tracesys_phase2:
 	SAVE_EXTRA_REGS
-	FIXUP_TOP_OF_STACK %rdi
 	movq %rsp, %rdi
 	movq $AUDIT_ARCH_X86_64, %rsi
 	movq %rax,%rdx
@@ -421,9 +401,7 @@ ENTRY(stub_\func)
 	CFI_STARTPROC
 	DEFAULT_FRAME 0, 8		/* offset 8: return address */
 	SAVE_EXTRA_REGS 8
-	FIXUP_TOP_OF_STACK %r11, 8
 	call sys_\func
-	RESTORE_TOP_OF_STACK %r11, 8
 	ret
 	CFI_ENDPROC
 END(stub_\func)
@@ -438,7 +416,6 @@ ENTRY(stub_execve)
 	addq $8, %rsp
 	DEFAULT_FRAME 0
 	SAVE_EXTRA_REGS
-	FIXUP_TOP_OF_STACK %r11
 	call sys_execve
 	movq %rax,RAX(%rsp)
 	RESTORE_EXTRA_REGS
@@ -451,9 +428,7 @@ ENTRY(stub_execveat)
 	addq $8, %rsp
 	DEFAULT_FRAME 0
 	SAVE_EXTRA_REGS
-	FIXUP_TOP_OF_STACK %r11
 	call sys_execveat
-	RESTORE_TOP_OF_STACK %r11
 	movq %rax,RAX(%rsp)
 	RESTORE_EXTRA_REGS
 	jmp int_ret_from_sys_call
@@ -469,7 +444,6 @@ ENTRY(stub_rt_sigreturn)
 	addq $8, %rsp
 	DEFAULT_FRAME 0
 	SAVE_EXTRA_REGS
-	FIXUP_TOP_OF_STACK %r11
 	call sys_rt_sigreturn
 	movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer
 	RESTORE_EXTRA_REGS
@@ -483,7 +457,6 @@ ENTRY(stub_x32_rt_sigreturn)
 	addq $8, %rsp
 	DEFAULT_FRAME 0
 	SAVE_EXTRA_REGS
-	FIXUP_TOP_OF_STACK %r11
 	call sys32_x32_rt_sigreturn
 	movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer
 	RESTORE_EXTRA_REGS
@@ -496,9 +469,7 @@ ENTRY(stub_x32_execve)
 	addq $8, %rsp
 	DEFAULT_FRAME 0
 	SAVE_EXTRA_REGS
-	FIXUP_TOP_OF_STACK %r11
 	call compat_sys_execve
-	RESTORE_TOP_OF_STACK %r11
 	movq %rax,RAX(%rsp)
 	RESTORE_EXTRA_REGS
 	jmp int_ret_from_sys_call
@@ -510,9 +481,7 @@ ENTRY(stub_x32_execveat)
 	addq $8, %rsp
 	DEFAULT_FRAME 0
 	SAVE_EXTRA_REGS
-	FIXUP_TOP_OF_STACK %r11
 	call compat_sys_execveat
-	RESTORE_TOP_OF_STACK %r11
 	movq %rax,RAX(%rsp)
 	RESTORE_EXTRA_REGS
 	jmp int_ret_from_sys_call
-- 
cgit v1.2.3


From 65c2377486c0b68f149f7d8770499a86b15786b6 Mon Sep 17 00:00:00 2001
From: Denys Vlasenko
Date: Thu, 19 Mar 2015 18:17:49 +0100
Subject: x86/asm/entry/64: Get rid of int_ret_from_sys_call_fixup

With the FIXUP_TOP_OF_STACK macro removed, this intermediate jump
is unnecessary.

Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
Acked-by: Borislav Petkov <bp@suse.de>
Acked-by: Andy Lutomirski <luto@kernel.org>
Cc: Alexei Starovoitov <ast@plumgrid.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Will Drewry <wad@chromium.org>
Link: http://lkml.kernel.org/r/1426785469-15125-5-git-send-email-dvlasenk@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/entry_64.S | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 3f8daba28eb0..df04ee069b1f 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -268,7 +268,7 @@ system_call_fastpath:
  */
 ret_from_sys_call:
 	testl $_TIF_ALLWORK_MASK,TI_flags+THREAD_INFO(%rsp,SIZEOF_PTREGS)
-	jnz int_ret_from_sys_call_fixup	/* Go the the slow path */
+	jnz int_ret_from_sys_call	/* Go the slow path */
 
 	LOCKDEP_SYS_EXIT
 	DISABLE_INTERRUPTS(CLBR_NONE)
@@ -293,9 +293,6 @@ ret_from_sys_call:
 
 	CFI_RESTORE_STATE
 
-int_ret_from_sys_call_fixup:
-	jmp int_ret_from_sys_call
-
 	/* Do syscall entry tracing */
 tracesys:
 	movq %rsp, %rdi
-- 
cgit v1.2.3


From 84f53788458c95309b88948b69ff95921e9c74a8 Mon Sep 17 00:00:00 2001
From: Denys Vlasenko
Date: Sun, 22 Mar 2015 22:01:12 +0100
Subject: x86/asm: Deobfuscate segment.h

This file just defines a number of constants, and a few macros
and inline functions. It is particularly badly written.

For example, it is not trivial to see how descriptors are
numbered (you'd expect that should be easy, right?).

This change deobfuscates it via the following changes:

Group all GDT_ENTRY_foo together (move intervening stuff away).

Number them explicitly: use a number, not PREV_DEFINE+1, +2, +3:
I want to immediately see that GDT_ENTRY_PNPBIOS_CS32 is 18.
Seeing (GDT_ENTRY_KERNEL_BASE+6) instead is not useful.

The above change allows to remove GDT_ENTRY_KERNEL_BASE
and GDT_ENTRY_PNPBIOS_BASE, which weren't used anywhere else.

After a group of GDT_ENTRY_foo, define all selector values.

Remove or improve some comments. In particular:
Comment deleted as stating the obvious:
    /*
     * The GDT has 32 entries
     */
    #define GDT_ENTRIES 32

"The segment offset needs to contain a RPL. Grr. -AK"
    changed to
"Selectors need to also have a correct RPL (+3 thingy)"

"GDT layout to get 64bit syscall right (sysret hardcodes gdt
offsets)" expanded into a description *how exactly* sysret
hardcodes them.

Patch was tested to compile and not change vmlinux.o
on 32-bit and 64-bit builds (verified with objdump).

Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Alexei Starovoitov <ast@plumgrid.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Will Drewry <wad@chromium.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/segment.h | 125 ++++++++++++++++++-----------------------
 1 file changed, 54 insertions(+), 71 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h
index db257a58571f..117028f58882 100644
--- a/arch/x86/include/asm/segment.h
+++ b/arch/x86/include/asm/segment.h
@@ -15,12 +15,10 @@
 /* Simple and small GDT entries for booting only */
 
 #define GDT_ENTRY_BOOT_CS	2
+#define GDT_ENTRY_BOOT_DS	3
+#define GDT_ENTRY_BOOT_TSS	4
 #define __BOOT_CS		(GDT_ENTRY_BOOT_CS * 8)
-
-#define GDT_ENTRY_BOOT_DS	(GDT_ENTRY_BOOT_CS + 1)
 #define __BOOT_DS		(GDT_ENTRY_BOOT_DS * 8)
-
-#define GDT_ENTRY_BOOT_TSS	(GDT_ENTRY_BOOT_CS + 2)
 #define __BOOT_TSS		(GDT_ENTRY_BOOT_TSS * 8)
 
 #define SEGMENT_RPL_MASK	0x3 /*
@@ -80,97 +78,86 @@
 #define GDT_ENTRY_TLS_MIN	6
 #define GDT_ENTRY_TLS_MAX 	(GDT_ENTRY_TLS_MIN + GDT_ENTRY_TLS_ENTRIES - 1)
 
+#define GDT_ENTRY_KERNEL_CS		12
+#define GDT_ENTRY_KERNEL_DS		13
 #define GDT_ENTRY_DEFAULT_USER_CS	14
-
 #define GDT_ENTRY_DEFAULT_USER_DS	15
+#define GDT_ENTRY_TSS			16
+#define GDT_ENTRY_LDT			17
+#define GDT_ENTRY_PNPBIOS_CS32		18
+#define GDT_ENTRY_PNPBIOS_CS16		19
+#define GDT_ENTRY_PNPBIOS_DS		20
+#define GDT_ENTRY_PNPBIOS_TS1		21
+#define GDT_ENTRY_PNPBIOS_TS2		22
+#define GDT_ENTRY_APMBIOS_BASE		23
+
+#define GDT_ENTRY_ESPFIX_SS		26
+#define GDT_ENTRY_PERCPU		27
+#define GDT_ENTRY_STACK_CANARY		28
 
-#define GDT_ENTRY_KERNEL_BASE		(12)
-
-#define GDT_ENTRY_KERNEL_CS		(GDT_ENTRY_KERNEL_BASE+0)
-
-#define GDT_ENTRY_KERNEL_DS		(GDT_ENTRY_KERNEL_BASE+1)
-
-#define GDT_ENTRY_TSS			(GDT_ENTRY_KERNEL_BASE+4)
-#define GDT_ENTRY_LDT			(GDT_ENTRY_KERNEL_BASE+5)
-
-#define GDT_ENTRY_PNPBIOS_BASE		(GDT_ENTRY_KERNEL_BASE+6)
-#define GDT_ENTRY_APMBIOS_BASE		(GDT_ENTRY_KERNEL_BASE+11)
+#define GDT_ENTRY_DOUBLEFAULT_TSS	31
 
-#define GDT_ENTRY_ESPFIX_SS		(GDT_ENTRY_KERNEL_BASE+14)
+#define __KERNEL_CS			(GDT_ENTRY_KERNEL_CS*8)
+#define __KERNEL_DS			(GDT_ENTRY_KERNEL_DS*8)
+#define __USER_DS			(GDT_ENTRY_DEFAULT_USER_DS*8+3)
+#define __USER_CS			(GDT_ENTRY_DEFAULT_USER_CS*8+3)
 #define __ESPFIX_SS			(GDT_ENTRY_ESPFIX_SS*8)
-
-#define GDT_ENTRY_PERCPU		(GDT_ENTRY_KERNEL_BASE+15)
+#define PNP_CS32   (GDT_ENTRY_PNPBIOS_CS32 * 8) /* segment for calling fn */
+#define PNP_CS16   (GDT_ENTRY_PNPBIOS_CS16 * 8) /* code segment for BIOS */
+/* "Is this PNP code selector (PNP_CS32 or PNP_CS16)?" */
+#define SEGMENT_IS_PNP_CODE(x)   (((x) & 0xf4) == PNP_CS32)
+#define PNP_DS     (GDT_ENTRY_PNPBIOS_DS * 8)	/* data segment for BIOS */
+#define PNP_TS1    (GDT_ENTRY_PNPBIOS_TS1 * 8)	/* transfer data segment */
+#define PNP_TS2    (GDT_ENTRY_PNPBIOS_TS2 * 8)	/* another data segment */
 #ifdef CONFIG_SMP
-#define __KERNEL_PERCPU (GDT_ENTRY_PERCPU * 8)
+#define __KERNEL_PERCPU			(GDT_ENTRY_PERCPU*8)
 #else
-#define __KERNEL_PERCPU 0
+#define __KERNEL_PERCPU			0
 #endif
-
-#define GDT_ENTRY_STACK_CANARY		(GDT_ENTRY_KERNEL_BASE+16)
 #ifdef CONFIG_CC_STACKPROTECTOR
 #define __KERNEL_STACK_CANARY		(GDT_ENTRY_STACK_CANARY*8)
 #else
 #define __KERNEL_STACK_CANARY		0
 #endif
 
-#define GDT_ENTRY_DOUBLEFAULT_TSS	31
-
-/*
- * The GDT has 32 entries
- */
 #define GDT_ENTRIES 32
 
-/* The PnP BIOS entries in the GDT */
-#define GDT_ENTRY_PNPBIOS_CS32		(GDT_ENTRY_PNPBIOS_BASE + 0)
-#define GDT_ENTRY_PNPBIOS_CS16		(GDT_ENTRY_PNPBIOS_BASE + 1)
-#define GDT_ENTRY_PNPBIOS_DS		(GDT_ENTRY_PNPBIOS_BASE + 2)
-#define GDT_ENTRY_PNPBIOS_TS1		(GDT_ENTRY_PNPBIOS_BASE + 3)
-#define GDT_ENTRY_PNPBIOS_TS2		(GDT_ENTRY_PNPBIOS_BASE + 4)
-
-/* The PnP BIOS selectors */
-#define PNP_CS32   (GDT_ENTRY_PNPBIOS_CS32 * 8)	/* segment for calling fn */
-#define PNP_CS16   (GDT_ENTRY_PNPBIOS_CS16 * 8)	/* code segment for BIOS */
-#define PNP_DS     (GDT_ENTRY_PNPBIOS_DS * 8)	/* data segment for BIOS */
-#define PNP_TS1    (GDT_ENTRY_PNPBIOS_TS1 * 8)	/* transfer data segment */
-#define PNP_TS2    (GDT_ENTRY_PNPBIOS_TS2 * 8)	/* another data segment */
+#else /* 64-bit: */
 
-
-/*
- * Matching rules for certain types of segments.
- */
-
-/* Matches PNP_CS32 and PNP_CS16 (they must be consecutive) */
-#define SEGMENT_IS_PNP_CODE(x)   (((x) & 0xf4) == GDT_ENTRY_PNPBIOS_BASE * 8)
-
-
-#else
 #include <asm/cache.h>
 
-#define GDT_ENTRY_KERNEL32_CS 1
-#define GDT_ENTRY_KERNEL_CS 2
-#define GDT_ENTRY_KERNEL_DS 3
-
-#define __KERNEL32_CS   (GDT_ENTRY_KERNEL32_CS * 8)
-
+#define GDT_ENTRY_KERNEL32_CS	1
+#define GDT_ENTRY_KERNEL_CS	2
+#define GDT_ENTRY_KERNEL_DS	3
 /*
  * we cannot use the same code segment descriptor for user and kernel
  * -- not even in the long flat mode, because of different DPL /kkeil
- * The segment offset needs to contain a RPL. Grr. -AK
- * GDT layout to get 64bit syscall right (sysret hardcodes gdt offsets)
+ * GDT layout to get 64bit syscall/sysret right. sysret hardcodes selectors:
+ * if returning to 32-bit userspace: cs = STAR.SYSRET_CS,
+ * if returning to 64-bit userspace: cs = STAR.SYSRET_CS+16,
+ * ss = STAR.SYSRET_CS+8 (in either case)
+ * thus USER_DS should be between 32-bit and 64-bit code selectors:
  */
 #define GDT_ENTRY_DEFAULT_USER32_CS 4
 #define GDT_ENTRY_DEFAULT_USER_DS 5
 #define GDT_ENTRY_DEFAULT_USER_CS 6
-#define __USER32_CS   (GDT_ENTRY_DEFAULT_USER32_CS*8+3)
-#define __USER32_DS	__USER_DS
 
-#define GDT_ENTRY_TSS 8	/* needs two entries */
-#define GDT_ENTRY_LDT 10 /* needs two entries */
-#define GDT_ENTRY_TLS_MIN 12
-#define GDT_ENTRY_TLS_MAX 14
+#define GDT_ENTRY_TSS		8  /* needs two entries */
+#define GDT_ENTRY_LDT		10 /* needs two entries */
+#define GDT_ENTRY_TLS_MIN	12
+#define GDT_ENTRY_TLS_MAX	14
 
-#define GDT_ENTRY_PER_CPU 15	/* Abused to load per CPU data from limit */
-#define __PER_CPU_SEG	(GDT_ENTRY_PER_CPU * 8 + 3)
+#define GDT_ENTRY_PER_CPU	15 /* abused to load per CPU data from limit */
+
+/* Selectors need to also have a correct RPL (+3 thingy) */
+#define __KERNEL_CS	(GDT_ENTRY_KERNEL_CS*8)
+#define __KERNEL_DS	(GDT_ENTRY_KERNEL_DS*8)
+#define __USER_DS	(GDT_ENTRY_DEFAULT_USER_DS*8+3)
+#define __USER_CS	(GDT_ENTRY_DEFAULT_USER_CS*8+3)
+#define __KERNEL32_CS	(GDT_ENTRY_KERNEL32_CS*8)
+#define __USER32_CS	(GDT_ENTRY_DEFAULT_USER32_CS*8+3)
+#define __USER32_DS	__USER_DS
+#define __PER_CPU_SEG	(GDT_ENTRY_PER_CPU*8+3)
 
 /* TLS indexes for 64bit - hardcoded in arch_prctl */
 #define FS_TLS 0
@@ -183,10 +170,6 @@
 
 #endif
 
-#define __KERNEL_CS	(GDT_ENTRY_KERNEL_CS*8)
-#define __KERNEL_DS	(GDT_ENTRY_KERNEL_DS*8)
-#define __USER_DS	(GDT_ENTRY_DEFAULT_USER_DS*8+3)
-#define __USER_CS	(GDT_ENTRY_DEFAULT_USER_CS*8+3)
 #ifndef CONFIG_PARAVIRT
 #define get_kernel_rpl()  0
 #endif
-- 
cgit v1.2.3


From d56fe4bf5f3cc30d455c80520f7b71da36ae00e6 Mon Sep 17 00:00:00 2001
From: Ingo Molnar
Date: Tue, 24 Mar 2015 14:41:37 +0100
Subject: x86/asm/entry/64: Always set up SYSENTER MSRs

On CONFIG_IA32_EMULATION=y kernels we set up
MSR_IA32_SYSENTER_CS/ESP/EIP, but on !CONFIG_IA32_EMULATION
kernels we leave them unchanged.

Clear them to make sure the instruction is disabled properly.

SYSCALL is set up properly in both cases.

Acked-by: Denys Vlasenko <dvlasenk@redhat.com>
Acked-by: Andy Lutomirski <luto@amacapital.net>
Cc: Alexei Starovoitov <ast@plumgrid.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Will Drewry <wad@chromium.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/common.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 002216ab9145..c928a7ae1099 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1169,9 +1169,8 @@ void syscall_init(void)
 	 */
 	wrmsrl(MSR_STAR,  ((u64)__USER32_CS)<<48  | ((u64)__KERNEL_CS)<<32);
 	wrmsrl(MSR_LSTAR, system_call);
-#ifndef CONFIG_IA32_EMULATION
-	wrmsrl(MSR_CSTAR, ignore_sysret);
-#else
+
+#ifdef CONFIG_IA32_EMULATION
 	wrmsrl(MSR_CSTAR, ia32_cstar_target);
 	/*
 	 * Always load these, in case some future 64-bit CPU supports
@@ -1180,6 +1179,11 @@ void syscall_init(void)
 	wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS);
 	wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
 	wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)ia32_sysenter_target);
+#else
+	wrmsrl(MSR_CSTAR, ignore_sysret);
+	wrmsrl_safe(MSR_IA32_SYSENTER_CS, 0);
+	wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
+	wrmsrl_safe(MSR_IA32_SYSENTER_EIP, 0ULL);
 #endif
 
 	/* Flags to clear on syscall */
-- 
cgit v1.2.3


From 1ddc6f3c60d75a7577dd33bc441e309febe2fc76 Mon Sep 17 00:00:00 2001
From: Ingo Molnar
Date: Tue, 24 Mar 2015 19:43:11 +0100
Subject: x86/asm/entry/64: Improve the THREAD_INFO() macro explanation

Explain the background, and add a real example.

Acked-by: Denys Vlasenko <dvlasenk@redhat.com>
Acked-by: Andy Lutomirski <luto@kernel.org>
Acked-by: Borislav Petkov <bp@suse.de>
Cc: Alexei Starovoitov <ast@plumgrid.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Will Drewry <wad@chromium.org>
Link: http://lkml.kernel.org/r/20150324184311.GA14760@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/thread_info.h | 27 +++++++++++++++++++++++----
 1 file changed, 23 insertions(+), 4 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index ad0ee3423da5..813dfbb867a7 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -206,10 +206,29 @@ static inline unsigned long current_stack_pointer(void)
 	_ASM_SUB $(THREAD_SIZE),reg ;
 
 /*
- * ASM operand which evaluates to thread_info address
- * if it is known that "reg" is exactly "off" bytes below stack top.
- * Example (fetch thread_info->fieldname):
- *  mov TI_fieldname+THREAD_INFO(reg, off),%eax
+ * ASM operand which evaluates to a 'thread_info' address of
+ * the current task, if it is known that "reg" is exactly "off"
+ * bytes below the top of the stack currently.
+ *
+ * ( The kernel stack's size is known at build time, it is usually
+ *   2 or 4 pages, and the bottom  of the kernel stack contains
+ *   the thread_info structure. So to access the thread_info very
+ *   quickly from assembly code we can calculate down from the
+ *   top of the kernel stack to the bottom, using constant,
+ *   build-time calculations only. )
+ *
+ * For example, to fetch the current thread_info->flags value into %eax
+ * on x86-64 defconfig kernels, in syscall entry code where RSP is
+ * currently at exactly SIZEOF_PTREGS bytes away from the top of the
+ * stack:
+ *
+ *      mov TI_flags+THREAD_INFO(%rsp, SIZEOF_PTREGS), %eax
+ *
+ * will translate to:
+ *
+ *      8b 84 24 b8 c0 ff ff      mov    -0x3f48(%rsp), %eax
+ *
+ * which is below the current RSP by almost 16K.
  */
 #define THREAD_INFO(reg, off) ((off)-THREAD_SIZE)(reg)
 
-- 
cgit v1.2.3


From f9d71854b4fe9b22ca199c4676da5a6ece1e5c17 Mon Sep 17 00:00:00 2001
From: Ingo Molnar
Date: Tue, 24 Mar 2015 19:44:11 +0100
Subject: x86/asm/entry/64: Merge the field offset into the THREAD_INFO() macro

Before:

   TI_sysenter_return+THREAD_INFO(%rsp,3*8),%r10d

After:

   movl    THREAD_INFO(TI_sysenter_return, %rsp, 3*8), %r10d

to turn it into a clear thread_info accessor.

No code changed:

 md5:
   fb4cb2b3ce05d89940ca304efc8ff183  ia32entry.o.before.asm
   fb4cb2b3ce05d89940ca304efc8ff183  ia32entry.o.after.asm

   e39f2958a5d1300158e276e4f7663263  entry_64.o.before.asm
   e39f2958a5d1300158e276e4f7663263  entry_64.o.after.asm

Acked-by: Andy Lutomirski <luto@kernel.org>
Acked-by: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Alexei Starovoitov <ast@plumgrid.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Borislav Petkov <bp@suse.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Will Drewry <wad@chromium.org>
Link: http://lkml.kernel.org/r/20150324184411.GB14760@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/ia32/ia32entry.S          | 30 +++++++++++++++---------------
 arch/x86/include/asm/thread_info.h |  4 ++--
 arch/x86/kernel/entry_64.S         |  4 ++--
 3 files changed, 19 insertions(+), 19 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index acbff3fb96a1..32e94aec6073 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -127,7 +127,7 @@ ENTRY(ia32_sysenter_target)
 	CFI_REL_OFFSET rsp,0
 	pushfq_cfi
 	/*CFI_REL_OFFSET rflags,0*/
-	movl	TI_sysenter_return+THREAD_INFO(%rsp,3*8),%r10d
+	movl	THREAD_INFO(TI_sysenter_return, %rsp, 3*8), %r10d
 	CFI_REGISTER rip,r10
 	pushq_cfi $__USER32_CS
 	/*CFI_REL_OFFSET cs,0*/
@@ -159,8 +159,8 @@ ENTRY(ia32_sysenter_target)
 	jnz sysenter_fix_flags
 sysenter_flags_fixed:
 
-	orl     $TS_COMPAT,TI_status+THREAD_INFO(%rsp,SIZEOF_PTREGS)
-	testl   $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,SIZEOF_PTREGS)
+	orl     $TS_COMPAT, THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
+	testl   $_TIF_WORK_SYSCALL_ENTRY, THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
 	CFI_REMEMBER_STATE
 	jnz  sysenter_tracesys
 	cmpq	$(IA32_NR_syscalls-1),%rax
@@ -177,10 +177,10 @@ sysenter_dispatch:
 	movq	%rax,RAX(%rsp)
 	DISABLE_INTERRUPTS(CLBR_NONE)
 	TRACE_IRQS_OFF
-	testl	$_TIF_ALLWORK_MASK,TI_flags+THREAD_INFO(%rsp,SIZEOF_PTREGS)
+	testl	$_TIF_ALLWORK_MASK, THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
 	jnz	sysexit_audit
 sysexit_from_sys_call:
-	andl    $~TS_COMPAT,TI_status+THREAD_INFO(%rsp,SIZEOF_PTREGS)
+	andl    $~TS_COMPAT,THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
 	/* clear IF, that popfq doesn't enable interrupts early */
 	andl	$~0x200,EFLAGS(%rsp)
 	movl	RIP(%rsp),%edx		/* User %eip */
@@ -225,7 +225,7 @@ sysexit_from_sys_call:
 	.endm
 
 	.macro auditsys_exit exit
-	testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,SIZEOF_PTREGS)
+	testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
 	jnz ia32_ret_from_sys_call
 	TRACE_IRQS_ON
 	ENABLE_INTERRUPTS(CLBR_NONE)
@@ -240,7 +240,7 @@ sysexit_from_sys_call:
 	movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi
 	DISABLE_INTERRUPTS(CLBR_NONE)
 	TRACE_IRQS_OFF
-	testl %edi,TI_flags+THREAD_INFO(%rsp,SIZEOF_PTREGS)
+	testl %edi, THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
 	jz \exit
 	CLEAR_RREGS
 	jmp int_with_check
@@ -262,7 +262,7 @@ sysenter_fix_flags:
 
 sysenter_tracesys:
 #ifdef CONFIG_AUDITSYSCALL
-	testl	$(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,SIZEOF_PTREGS)
+	testl	$(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT), THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
 	jz	sysenter_auditsys
 #endif
 	SAVE_EXTRA_REGS
@@ -346,8 +346,8 @@ ENTRY(ia32_cstar_target)
 1:	movl	(%r8),%r9d
 	_ASM_EXTABLE(1b,ia32_badarg)
 	ASM_CLAC
-	orl     $TS_COMPAT,TI_status+THREAD_INFO(%rsp,SIZEOF_PTREGS)
-	testl   $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,SIZEOF_PTREGS)
+	orl     $TS_COMPAT, THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
+	testl   $_TIF_WORK_SYSCALL_ENTRY, THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
 	CFI_REMEMBER_STATE
 	jnz   cstar_tracesys
 	cmpq $IA32_NR_syscalls-1,%rax
@@ -364,10 +364,10 @@ cstar_dispatch:
 	movq %rax,RAX(%rsp)
 	DISABLE_INTERRUPTS(CLBR_NONE)
 	TRACE_IRQS_OFF
-	testl $_TIF_ALLWORK_MASK,TI_flags+THREAD_INFO(%rsp,SIZEOF_PTREGS)
+	testl $_TIF_ALLWORK_MASK, THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
 	jnz sysretl_audit
 sysretl_from_sys_call:
-	andl $~TS_COMPAT,TI_status+THREAD_INFO(%rsp,SIZEOF_PTREGS)
+	andl $~TS_COMPAT, THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
 	RESTORE_RSI_RDI_RDX
 	movl RIP(%rsp),%ecx
 	CFI_REGISTER rip,rcx
@@ -402,7 +402,7 @@ sysretl_audit:
 
 cstar_tracesys:
 #ifdef CONFIG_AUDITSYSCALL
-	testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,SIZEOF_PTREGS)
+	testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT), THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
 	jz cstar_auditsys
 #endif
 	xchgl %r9d,%ebp
@@ -469,8 +469,8 @@ ENTRY(ia32_syscall)
 	   this could be a problem. */
 	ALLOC_PT_GPREGS_ON_STACK
 	SAVE_C_REGS_EXCEPT_R891011
-	orl $TS_COMPAT,TI_status+THREAD_INFO(%rsp,SIZEOF_PTREGS)
-	testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,SIZEOF_PTREGS)
+	orl $TS_COMPAT, THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
+	testl $_TIF_WORK_SYSCALL_ENTRY, THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
 	jnz ia32_tracesys
 	cmpq $(IA32_NR_syscalls-1),%rax
 	ja ia32_badsys
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 813dfbb867a7..224285b674ca 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -222,7 +222,7 @@ static inline unsigned long current_stack_pointer(void)
  * currently at exactly SIZEOF_PTREGS bytes away from the top of the
  * stack:
  *
- *      mov TI_flags+THREAD_INFO(%rsp, SIZEOF_PTREGS), %eax
+ *      mov THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS), %eax
  *
  * will translate to:
  *
@@ -230,7 +230,7 @@ static inline unsigned long current_stack_pointer(void)
  *
  * which is below the current RSP by almost 16K.
  */
-#define THREAD_INFO(reg, off) ((off)-THREAD_SIZE)(reg)
+#define THREAD_INFO(field, reg, off) ((field)+(off)-THREAD_SIZE)(reg)
 
 #endif
 
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index df04ee069b1f..8f01a4f1cf9e 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -249,7 +249,7 @@ GLOBAL(system_call_after_swapgs)
 	pushq_cfi_reg	r11			/* pt_regs->r11 */
 	sub	$(6*8),%rsp /* pt_regs->bp,bx,r12-15 not saved */
 
-	testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,SIZEOF_PTREGS)
+	testl $_TIF_WORK_SYSCALL_ENTRY, THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
 	jnz tracesys
 system_call_fastpath:
 #if __SYSCALL_MASK == ~0
@@ -267,7 +267,7 @@ system_call_fastpath:
  * Has incompletely filled pt_regs, iret frame is also incomplete.
  */
 ret_from_sys_call:
-	testl $_TIF_ALLWORK_MASK,TI_flags+THREAD_INFO(%rsp,SIZEOF_PTREGS)
+	testl $_TIF_ALLWORK_MASK, THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
 	jnz int_ret_from_sys_call	/* Go the slow path */
 
 	LOCKDEP_SYS_EXIT
-- 
cgit v1.2.3


From dca5b52ad76b10c3adc29e2a006d4b1721c44a8d Mon Sep 17 00:00:00 2001
From: Ingo Molnar
Date: Tue, 24 Mar 2015 19:44:42 +0100
Subject: x86/asm/entry/64: Rename THREAD_INFO() to ASM_THREAD_INFO()

The THREAD_INFO() macro has a somewhat confusingly generic name,
defined in a generic .h C header file. It also does not make it
clear that it constructs a memory operand for use in assembly
code.

Rename it to ASM_THREAD_INFO() to make it all glaringly
obvious on first glance.

Acked-by: Borislav Petkov <bp@suse.de>
Cc: Alexei Starovoitov <ast@plumgrid.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Will Drewry <wad@chromium.org>
Link: http://lkml.kernel.org/r/20150324184442.GC14760@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/ia32/ia32entry.S          | 30 +++++++++++++++---------------
 arch/x86/include/asm/thread_info.h |  4 ++--
 arch/x86/kernel/entry_64.S         |  4 ++--
 3 files changed, 19 insertions(+), 19 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index 32e94aec6073..5d2641ce9957 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -127,7 +127,7 @@ ENTRY(ia32_sysenter_target)
 	CFI_REL_OFFSET rsp,0
 	pushfq_cfi
 	/*CFI_REL_OFFSET rflags,0*/
-	movl	THREAD_INFO(TI_sysenter_return, %rsp, 3*8), %r10d
+	movl	ASM_THREAD_INFO(TI_sysenter_return, %rsp, 3*8), %r10d
 	CFI_REGISTER rip,r10
 	pushq_cfi $__USER32_CS
 	/*CFI_REL_OFFSET cs,0*/
@@ -159,8 +159,8 @@ ENTRY(ia32_sysenter_target)
 	jnz sysenter_fix_flags
 sysenter_flags_fixed:
 
-	orl     $TS_COMPAT, THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
-	testl   $_TIF_WORK_SYSCALL_ENTRY, THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
+	orl     $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
+	testl   $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
 	CFI_REMEMBER_STATE
 	jnz  sysenter_tracesys
 	cmpq	$(IA32_NR_syscalls-1),%rax
@@ -177,10 +177,10 @@ sysenter_dispatch:
 	movq	%rax,RAX(%rsp)
 	DISABLE_INTERRUPTS(CLBR_NONE)
 	TRACE_IRQS_OFF
-	testl	$_TIF_ALLWORK_MASK, THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
+	testl	$_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
 	jnz	sysexit_audit
 sysexit_from_sys_call:
-	andl    $~TS_COMPAT,THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
+	andl    $~TS_COMPAT,ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
 	/* clear IF, that popfq doesn't enable interrupts early */
 	andl	$~0x200,EFLAGS(%rsp)
 	movl	RIP(%rsp),%edx		/* User %eip */
@@ -225,7 +225,7 @@ sysexit_from_sys_call:
 	.endm
 
 	.macro auditsys_exit exit
-	testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
+	testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
 	jnz ia32_ret_from_sys_call
 	TRACE_IRQS_ON
 	ENABLE_INTERRUPTS(CLBR_NONE)
@@ -240,7 +240,7 @@ sysexit_from_sys_call:
 	movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi
 	DISABLE_INTERRUPTS(CLBR_NONE)
 	TRACE_IRQS_OFF
-	testl %edi, THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
+	testl %edi, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
 	jz \exit
 	CLEAR_RREGS
 	jmp int_with_check
@@ -262,7 +262,7 @@ sysenter_fix_flags:
 
 sysenter_tracesys:
 #ifdef CONFIG_AUDITSYSCALL
-	testl	$(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT), THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
+	testl	$(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
 	jz	sysenter_auditsys
 #endif
 	SAVE_EXTRA_REGS
@@ -346,8 +346,8 @@ ENTRY(ia32_cstar_target)
 1:	movl	(%r8),%r9d
 	_ASM_EXTABLE(1b,ia32_badarg)
 	ASM_CLAC
-	orl     $TS_COMPAT, THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
-	testl   $_TIF_WORK_SYSCALL_ENTRY, THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
+	orl     $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
+	testl   $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
 	CFI_REMEMBER_STATE
 	jnz   cstar_tracesys
 	cmpq $IA32_NR_syscalls-1,%rax
@@ -364,10 +364,10 @@ cstar_dispatch:
 	movq %rax,RAX(%rsp)
 	DISABLE_INTERRUPTS(CLBR_NONE)
 	TRACE_IRQS_OFF
-	testl $_TIF_ALLWORK_MASK, THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
+	testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
 	jnz sysretl_audit
 sysretl_from_sys_call:
-	andl $~TS_COMPAT, THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
+	andl $~TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
 	RESTORE_RSI_RDI_RDX
 	movl RIP(%rsp),%ecx
 	CFI_REGISTER rip,rcx
@@ -402,7 +402,7 @@ sysretl_audit:
 
 cstar_tracesys:
 #ifdef CONFIG_AUDITSYSCALL
-	testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT), THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
+	testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
 	jz cstar_auditsys
 #endif
 	xchgl %r9d,%ebp
@@ -469,8 +469,8 @@ ENTRY(ia32_syscall)
 	   this could be a problem. */
 	ALLOC_PT_GPREGS_ON_STACK
 	SAVE_C_REGS_EXCEPT_R891011
-	orl $TS_COMPAT, THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
-	testl $_TIF_WORK_SYSCALL_ENTRY, THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
+	orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
+	testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
 	jnz ia32_tracesys
 	cmpq $(IA32_NR_syscalls-1),%rax
 	ja ia32_badsys
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 224285b674ca..ea2dbe82cba3 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -222,7 +222,7 @@ static inline unsigned long current_stack_pointer(void)
  * currently at exactly SIZEOF_PTREGS bytes away from the top of the
  * stack:
  *
- *      mov THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS), %eax
+ *      mov ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS), %eax
  *
  * will translate to:
  *
@@ -230,7 +230,7 @@ static inline unsigned long current_stack_pointer(void)
  *
  * which is below the current RSP by almost 16K.
  */
-#define THREAD_INFO(field, reg, off) ((field)+(off)-THREAD_SIZE)(reg)
+#define ASM_THREAD_INFO(field, reg, off) ((field)+(off)-THREAD_SIZE)(reg)
 
 #endif
 
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 8f01a4f1cf9e..daf5d94c0e78 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -249,7 +249,7 @@ GLOBAL(system_call_after_swapgs)
 	pushq_cfi_reg	r11			/* pt_regs->r11 */
 	sub	$(6*8),%rsp /* pt_regs->bp,bx,r12-15 not saved */
 
-	testl $_TIF_WORK_SYSCALL_ENTRY, THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
+	testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
 	jnz tracesys
 system_call_fastpath:
 #if __SYSCALL_MASK == ~0
@@ -267,7 +267,7 @@ system_call_fastpath:
  * Has incompletely filled pt_regs, iret frame is also incomplete.
  */
 ret_from_sys_call:
-	testl $_TIF_ALLWORK_MASK, THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
+	testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
 	jnz int_ret_from_sys_call	/* Go the slow path */
 
 	LOCKDEP_SYS_EXIT
-- 
cgit v1.2.3


From b3494a4ab20f6bdf74cdf2badf7918bb65ee8a00 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski
Date: Mon, 23 Mar 2015 12:32:54 -0700
Subject: x86/asm/entry: Check for syscall exit work with IRQs disabled

We currently have a race: if we're preempted during syscall
exit, we can fail to process syscall return work that is queued
up while we're preempted in ret_from_sys_call after checking
ti.flags.

Fix it by disabling interrupts before checking ti.flags.

Reported-by: Stefan Seyfried <stefan.seyfried@googlemail.com>
Reported-by: Takashi Iwai <tiwai@suse.de>
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Acked-by: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Jiri Kosina <jkosina@suse.cz>
Cc: Tejun Heo <tj@kernel.org>
Fixes: 96b6352c1271 ("x86_64, entry: Remove the syscall exit audit")
Link: http://lkml.kernel.org/r/189320d42b4d671df78c10555976bb10af1ffc75.1427137498.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/entry_64.S | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 1d74d161687c..2babb393915e 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -364,12 +364,21 @@ system_call_fastpath:
  * Has incomplete stack frame and undefined top of stack.
  */
 ret_from_sys_call:
-	testl $_TIF_ALLWORK_MASK,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
-	jnz int_ret_from_sys_call_fixup	/* Go the the slow path */
-
 	LOCKDEP_SYS_EXIT
 	DISABLE_INTERRUPTS(CLBR_NONE)
 	TRACE_IRQS_OFF
+
+	/*
+	 * We must check ti flags with interrupts (or at least preemption)
+	 * off because we must *never* return to userspace without
+	 * processing exit work that is enqueued if we're preempted here.
+	 * In particular, returning to userspace with any of the one-shot
+	 * flags (TIF_NOTIFY_RESUME, TIF_USER_RETURN_NOTIFY, etc) set is
+	 * very bad.
+	 */
+	testl $_TIF_ALLWORK_MASK,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+	jnz int_ret_from_sys_call_fixup	/* Go the the slow path */
+
 	CFI_REMEMBER_STATE
 	/*
 	 * sysretq will re-enable interrupts:
@@ -386,7 +395,7 @@ ret_from_sys_call:
 
 int_ret_from_sys_call_fixup:
 	FIXUP_TOP_OF_STACK %r11, -ARGOFFSET
-	jmp int_ret_from_sys_call
+	jmp int_ret_from_sys_call_irqs_off
 
 	/* Do syscall tracing */
 tracesys:
@@ -432,6 +441,7 @@ tracesys_phase2:
 GLOBAL(int_ret_from_sys_call)
 	DISABLE_INTERRUPTS(CLBR_NONE)
 	TRACE_IRQS_OFF
+int_ret_from_sys_call_irqs_off:
 	movl $_TIF_ALLWORK_MASK,%edi
 	/* edi:	mask to check */
 GLOBAL(int_with_check)
-- 
cgit v1.2.3


From 72d64cc76941cde45e65e2a5b9fb81d527963645 Mon Sep 17 00:00:00 2001
From: Ingo Molnar
Date: Tue, 24 Mar 2015 20:45:42 +0100
Subject: x86/asm: Further improve segment.h readability

 - extend/clarify explanations where necessary

 - move comments from macro values to before the macro, to
   make them more consistent, and to reduce preprocessor overhead

 - sort GDT index and selector values likewise by number

 - use consistent, modern kernel coding style across the file

 - capitalize consistently

 - use consistent vertical spacing

 - remove the unused get_limit() method (noticed by Andy Lutomirski)

No change in code (verified with objdump -d):

 64-bit defconfig+kvmconfig:

   815a129bc1f80de6445c1d8ca5b97cad  vmlinux.o.before.asm
   815a129bc1f80de6445c1d8ca5b97cad  vmlinux.o.after.asm

 32-bit defconfig+kvmconfig:

   e659ef045159ddf41a0771b33a34aae5  vmlinux.o.before.asm
   e659ef045159ddf41a0771b33a34aae5  vmlinux.o.after.asm

Acked-by: Andy Lutomirski <luto@amacapital.net>
Cc: Alexei Starovoitov <ast@plumgrid.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Will Drewry <wad@chromium.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/segment.h | 242 ++++++++++++++++++++++++-----------------
 1 file changed, 141 insertions(+), 101 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h
index 117028f58882..d394899e055c 100644
--- a/arch/x86/include/asm/segment.h
+++ b/arch/x86/include/asm/segment.h
@@ -3,8 +3,10 @@
 
 #include <linux/const.h>
 
-/* Constructor for a conventional segment GDT (or LDT) entry */
-/* This is a macro so it can be used in initializers */
+/*
+ * Constructor for a conventional segment GDT (or LDT) entry.
+ * This is a macro so it can be used in initializers.
+ */
 #define GDT_ENTRY(flags, base, limit)			\
 	((((base)  & _AC(0xff000000,ULL)) << (56-24)) |	\
 	 (((flags) & _AC(0x0000f0ff,ULL)) << 40) |	\
@@ -12,71 +14,78 @@
 	 (((base)  & _AC(0x00ffffff,ULL)) << 16) |	\
 	 (((limit) & _AC(0x0000ffff,ULL))))
 
-/* Simple and small GDT entries for booting only */
+/* Simple and small GDT entries for booting only: */
 
 #define GDT_ENTRY_BOOT_CS	2
 #define GDT_ENTRY_BOOT_DS	3
 #define GDT_ENTRY_BOOT_TSS	4
-#define __BOOT_CS		(GDT_ENTRY_BOOT_CS * 8)
-#define __BOOT_DS		(GDT_ENTRY_BOOT_DS * 8)
-#define __BOOT_TSS		(GDT_ENTRY_BOOT_TSS * 8)
-
-#define SEGMENT_RPL_MASK	0x3 /*
-				     * Bottom two bits of selector give the ring
-				     * privilege level
-				     */
-#define SEGMENT_TI_MASK		0x4 /* Bit 2 is table indicator (LDT/GDT) */
-#define USER_RPL		0x3 /* User mode is privilege level 3 */
-#define SEGMENT_LDT		0x4 /* LDT segment has TI set... */
-#define SEGMENT_GDT		0x0 /* ... GDT has it cleared */
+#define __BOOT_CS		(GDT_ENTRY_BOOT_CS*8)
+#define __BOOT_DS		(GDT_ENTRY_BOOT_DS*8)
+#define __BOOT_TSS		(GDT_ENTRY_BOOT_TSS*8)
+
+/*
+ * Bottom two bits of selector give the ring
+ * privilege level
+ */
+#define SEGMENT_RPL_MASK	0x3
+
+/* User mode is privilege level 3: */
+#define USER_RPL		0x3
+
+/* Bit 2 is Table Indicator (TI): selects between LDT or GDT */
+#define SEGMENT_TI_MASK		0x4
+/* LDT segment has TI set ... */
+#define SEGMENT_LDT		0x4
+/* ... GDT has it cleared */
+#define SEGMENT_GDT		0x0
 
 #ifdef CONFIG_X86_32
 /*
  * The layout of the per-CPU GDT under Linux:
  *
- *   0 - null
+ *   0 - null								<=== cacheline #1
  *   1 - reserved
  *   2 - reserved
  *   3 - reserved
  *
- *   4 - unused			<==== new cacheline
+ *   4 - unused								<=== cacheline #2
  *   5 - unused
  *
  *  ------- start of TLS (Thread-Local Storage) segments:
  *
  *   6 - TLS segment #1			[ glibc's TLS segment ]
  *   7 - TLS segment #2			[ Wine's %fs Win32 segment ]
- *   8 - TLS segment #3
+ *   8 - TLS segment #3							<=== cacheline #3
  *   9 - reserved
  *  10 - reserved
  *  11 - reserved
  *
  *  ------- start of kernel segments:
  *
- *  12 - kernel code segment		<==== new cacheline
+ *  12 - kernel code segment						<=== cacheline #4
  *  13 - kernel data segment
  *  14 - default user CS
  *  15 - default user DS
- *  16 - TSS
+ *  16 - TSS								<=== cacheline #5
  *  17 - LDT
  *  18 - PNPBIOS support (16->32 gate)
  *  19 - PNPBIOS support
- *  20 - PNPBIOS support
+ *  20 - PNPBIOS support						<=== cacheline #6
  *  21 - PNPBIOS support
  *  22 - PNPBIOS support
  *  23 - APM BIOS support
- *  24 - APM BIOS support
+ *  24 - APM BIOS support						<=== cacheline #7
  *  25 - APM BIOS support
  *
  *  26 - ESPFIX small SS
  *  27 - per-cpu			[ offset to per-cpu data area ]
- *  28 - stack_canary-20		[ for stack protector ]
+ *  28 - stack_canary-20		[ for stack protector ]		<=== cacheline #8
  *  29 - unused
  *  30 - unused
  *  31 - TSS for double fault handler
  */
-#define GDT_ENTRY_TLS_MIN	6
-#define GDT_ENTRY_TLS_MAX 	(GDT_ENTRY_TLS_MIN + GDT_ENTRY_TLS_ENTRIES - 1)
+#define GDT_ENTRY_TLS_MIN		6
+#define GDT_ENTRY_TLS_MAX 		(GDT_ENTRY_TLS_MIN + GDT_ENTRY_TLS_ENTRIES - 1)
 
 #define GDT_ENTRY_KERNEL_CS		12
 #define GDT_ENTRY_KERNEL_DS		13
@@ -97,96 +106,134 @@
 
 #define GDT_ENTRY_DOUBLEFAULT_TSS	31
 
+/*
+ * Number of entries in the GDT table:
+ */
+#define GDT_ENTRIES			32
+
+/*
+ * Segment selector values corresponding to the above entries:
+ */
+
 #define __KERNEL_CS			(GDT_ENTRY_KERNEL_CS*8)
 #define __KERNEL_DS			(GDT_ENTRY_KERNEL_DS*8)
-#define __USER_DS			(GDT_ENTRY_DEFAULT_USER_DS*8+3)
-#define __USER_CS			(GDT_ENTRY_DEFAULT_USER_CS*8+3)
+#define __USER_DS			(GDT_ENTRY_DEFAULT_USER_DS*8 + 3)
+#define __USER_CS			(GDT_ENTRY_DEFAULT_USER_CS*8 + 3)
 #define __ESPFIX_SS			(GDT_ENTRY_ESPFIX_SS*8)
-#define PNP_CS32   (GDT_ENTRY_PNPBIOS_CS32 * 8) /* segment for calling fn */
-#define PNP_CS16   (GDT_ENTRY_PNPBIOS_CS16 * 8) /* code segment for BIOS */
+
+/* segment for calling fn: */
+#define PNP_CS32			(GDT_ENTRY_PNPBIOS_CS32*8)
+/* code segment for BIOS: */
+#define PNP_CS16			(GDT_ENTRY_PNPBIOS_CS16*8)
+
 /* "Is this PNP code selector (PNP_CS32 or PNP_CS16)?" */
-#define SEGMENT_IS_PNP_CODE(x)   (((x) & 0xf4) == PNP_CS32)
-#define PNP_DS     (GDT_ENTRY_PNPBIOS_DS * 8)	/* data segment for BIOS */
-#define PNP_TS1    (GDT_ENTRY_PNPBIOS_TS1 * 8)	/* transfer data segment */
-#define PNP_TS2    (GDT_ENTRY_PNPBIOS_TS2 * 8)	/* another data segment */
+#define SEGMENT_IS_PNP_CODE(x)		(((x) & 0xf4) == PNP_CS32)
+
+/* data segment for BIOS: */
+#define PNP_DS				(GDT_ENTRY_PNPBIOS_DS*8)
+/* transfer data segment: */
+#define PNP_TS1				(GDT_ENTRY_PNPBIOS_TS1*8)
+/* another data segment: */
+#define PNP_TS2				(GDT_ENTRY_PNPBIOS_TS2*8)
+
 #ifdef CONFIG_SMP
-#define __KERNEL_PERCPU			(GDT_ENTRY_PERCPU*8)
+# define __KERNEL_PERCPU		(GDT_ENTRY_PERCPU*8)
 #else
-#define __KERNEL_PERCPU			0
+# define __KERNEL_PERCPU		0
 #endif
+
 #ifdef CONFIG_CC_STACKPROTECTOR
-#define __KERNEL_STACK_CANARY		(GDT_ENTRY_STACK_CANARY*8)
+# define __KERNEL_STACK_CANARY		(GDT_ENTRY_STACK_CANARY*8)
 #else
-#define __KERNEL_STACK_CANARY		0
+# define __KERNEL_STACK_CANARY		0
 #endif
 
-#define GDT_ENTRIES 32
-
 #else /* 64-bit: */
 
 #include <asm/cache.h>
 
-#define GDT_ENTRY_KERNEL32_CS	1
-#define GDT_ENTRY_KERNEL_CS	2
-#define GDT_ENTRY_KERNEL_DS	3
+#define GDT_ENTRY_KERNEL32_CS		1
+#define GDT_ENTRY_KERNEL_CS		2
+#define GDT_ENTRY_KERNEL_DS		3
+
 /*
- * we cannot use the same code segment descriptor for user and kernel
- * -- not even in the long flat mode, because of different DPL /kkeil
- * GDT layout to get 64bit syscall/sysret right. sysret hardcodes selectors:
- * if returning to 32-bit userspace: cs = STAR.SYSRET_CS,
- * if returning to 64-bit userspace: cs = STAR.SYSRET_CS+16,
+ * We cannot use the same code segment descriptor for user and kernel mode,
+ * not even in long flat mode, because of different DPL.
+ *
+ * GDT layout to get 64-bit SYSCALL/SYSRET support right. SYSRET hardcodes
+ * selectors:
+ *
+ *   if returning to 32-bit userspace: cs = STAR.SYSRET_CS,
+ *   if returning to 64-bit userspace: cs = STAR.SYSRET_CS+16,
+ *
  * ss = STAR.SYSRET_CS+8 (in either case)
+ *
  * thus USER_DS should be between 32-bit and 64-bit code selectors:
  */
-#define GDT_ENTRY_DEFAULT_USER32_CS 4
-#define GDT_ENTRY_DEFAULT_USER_DS 5
-#define GDT_ENTRY_DEFAULT_USER_CS 6
+#define GDT_ENTRY_DEFAULT_USER32_CS	4
+#define GDT_ENTRY_DEFAULT_USER_DS	5
+#define GDT_ENTRY_DEFAULT_USER_CS	6
+
+/* Needs two entries */
+#define GDT_ENTRY_TSS			8
+/* Needs two entries */
+#define GDT_ENTRY_LDT			10
 
-#define GDT_ENTRY_TSS		8  /* needs two entries */
-#define GDT_ENTRY_LDT		10 /* needs two entries */
-#define GDT_ENTRY_TLS_MIN	12
-#define GDT_ENTRY_TLS_MAX	14
+#define GDT_ENTRY_TLS_MIN		12
+#define GDT_ENTRY_TLS_MAX		14
 
-#define GDT_ENTRY_PER_CPU	15 /* abused to load per CPU data from limit */
+/* Abused to load per CPU data from limit */
+#define GDT_ENTRY_PER_CPU		15
 
-/* Selectors need to also have a correct RPL (+3 thingy) */
-#define __KERNEL_CS	(GDT_ENTRY_KERNEL_CS*8)
-#define __KERNEL_DS	(GDT_ENTRY_KERNEL_DS*8)
-#define __USER_DS	(GDT_ENTRY_DEFAULT_USER_DS*8+3)
-#define __USER_CS	(GDT_ENTRY_DEFAULT_USER_CS*8+3)
-#define __KERNEL32_CS	(GDT_ENTRY_KERNEL32_CS*8)
-#define __USER32_CS	(GDT_ENTRY_DEFAULT_USER32_CS*8+3)
-#define __USER32_DS	__USER_DS
-#define __PER_CPU_SEG	(GDT_ENTRY_PER_CPU*8+3)
+/*
+ * Number of entries in the GDT table:
+ */
+#define GDT_ENTRIES			16
 
-/* TLS indexes for 64bit - hardcoded in arch_prctl */
-#define FS_TLS 0
-#define GS_TLS 1
+/*
+ * Segment selector values corresponding to the above entries:
+ *
+ * Note, selectors also need to have a correct RPL,
+ * expressed with the +3 value for user-space selectors:
+ */
+#define __KERNEL32_CS			(GDT_ENTRY_KERNEL32_CS*8)
+#define __KERNEL_CS			(GDT_ENTRY_KERNEL_CS*8)
+#define __KERNEL_DS			(GDT_ENTRY_KERNEL_DS*8)
+#define __USER32_CS			(GDT_ENTRY_DEFAULT_USER32_CS*8 + 3)
+#define __USER_DS			(GDT_ENTRY_DEFAULT_USER_DS*8 + 3)
+#define __USER32_DS			__USER_DS
+#define __USER_CS			(GDT_ENTRY_DEFAULT_USER_CS*8 + 3)
+#define __PER_CPU_SEG			(GDT_ENTRY_PER_CPU*8 + 3)
 
-#define GS_TLS_SEL ((GDT_ENTRY_TLS_MIN+GS_TLS)*8 + 3)
-#define FS_TLS_SEL ((GDT_ENTRY_TLS_MIN+FS_TLS)*8 + 3)
+/* TLS indexes for 64-bit - hardcoded in arch_prctl(): */
+#define FS_TLS				0
+#define GS_TLS				1
 
-#define GDT_ENTRIES 16
+#define GS_TLS_SEL			((GDT_ENTRY_TLS_MIN+GS_TLS)*8 + 3)
+#define FS_TLS_SEL			((GDT_ENTRY_TLS_MIN+FS_TLS)*8 + 3)
 
 #endif
 
 #ifndef CONFIG_PARAVIRT
-#define get_kernel_rpl()  0
+# define get_kernel_rpl()		0
 #endif
 
-#define IDT_ENTRIES 256
-#define NUM_EXCEPTION_VECTORS 32
-/* Bitmask of exception vectors which push an error code on the stack */
-#define EXCEPTION_ERRCODE_MASK  0x00027d00
-#define GDT_SIZE (GDT_ENTRIES * 8)
-#define GDT_ENTRY_TLS_ENTRIES 3
-#define TLS_SIZE (GDT_ENTRY_TLS_ENTRIES * 8)
+#define IDT_ENTRIES			256
+#define NUM_EXCEPTION_VECTORS		32
+
+/* Bitmask of exception vectors which push an error code on the stack: */
+#define EXCEPTION_ERRCODE_MASK		0x00027d00
+
+#define GDT_SIZE			(GDT_ENTRIES*8)
+#define GDT_ENTRY_TLS_ENTRIES		3
+#define TLS_SIZE			(GDT_ENTRY_TLS_ENTRIES* 8)
 
 #ifdef __KERNEL__
 #ifndef __ASSEMBLY__
+
 extern const char early_idt_handlers[NUM_EXCEPTION_VECTORS][2+2+5];
 #ifdef CONFIG_TRACING
-#define trace_early_idt_handlers early_idt_handlers
+# define trace_early_idt_handlers early_idt_handlers
 #endif
 
 /*
@@ -211,37 +258,30 @@ do {									\
 } while (0)
 
 /*
- * Save a segment register away
+ * Save a segment register away:
  */
 #define savesegment(seg, value)				\
 	asm("mov %%" #seg ",%0":"=r" (value) : : "memory")
 
 /*
- * x86_32 user gs accessors.
+ * x86-32 user GS accessors:
  */
 #ifdef CONFIG_X86_32
-#ifdef CONFIG_X86_32_LAZY_GS
-#define get_user_gs(regs)	(u16)({unsigned long v; savesegment(gs, v); v;})
-#define set_user_gs(regs, v)	loadsegment(gs, (unsigned long)(v))
-#define task_user_gs(tsk)	((tsk)->thread.gs)
-#define lazy_save_gs(v)		savesegment(gs, (v))
-#define lazy_load_gs(v)		loadsegment(gs, (v))
-#else	/* X86_32_LAZY_GS */
-#define get_user_gs(regs)	(u16)((regs)->gs)
-#define set_user_gs(regs, v)	do { (regs)->gs = (v); } while (0)
-#define task_user_gs(tsk)	(task_pt_regs(tsk)->gs)
-#define lazy_save_gs(v)		do { } while (0)
-#define lazy_load_gs(v)		do { } while (0)
-#endif	/* X86_32_LAZY_GS */
+# ifdef CONFIG_X86_32_LAZY_GS
+#  define get_user_gs(regs)		(u16)({ unsigned long v; savesegment(gs, v); v; })
+#  define set_user_gs(regs, v)		loadsegment(gs, (unsigned long)(v))
+#  define task_user_gs(tsk)		((tsk)->thread.gs)
+#  define lazy_save_gs(v)		savesegment(gs, (v))
+#  define lazy_load_gs(v)		loadsegment(gs, (v))
+# else	/* X86_32_LAZY_GS */
+#  define get_user_gs(regs)		(u16)((regs)->gs)
+#  define set_user_gs(regs, v)		do { (regs)->gs = (v); } while (0)
+#  define task_user_gs(tsk)		(task_pt_regs(tsk)->gs)
+#  define lazy_save_gs(v)		do { } while (0)
+#  define lazy_load_gs(v)		do { } while (0)
+# endif	/* X86_32_LAZY_GS */
 #endif	/* X86_32 */
 
-static inline unsigned long get_limit(unsigned long segment)
-{
-	unsigned long __limit;
-	asm("lsll %1,%0" : "=r" (__limit) : "r" (segment));
-	return __limit + 1;
-}
-
 #endif /* !__ASSEMBLY__ */
 #endif /* __KERNEL__ */
 
-- 
cgit v1.2.3


From e32edf4fd0fa4897e12ca66118ab67bf257e16e4 Mon Sep 17 00:00:00 2001
From: Nikolay Nikolaev
Date: Thu, 26 Mar 2015 14:39:28 +0000
Subject: KVM: Redesign kvm_io_bus_ API to pass VCPU structure to the
 callbacks.

This is needed in e.g. ARM vGIC emulation, where the MMIO handling
depends on the VCPU that does the access.

Signed-off-by: Nikolay Nikolaev <n.nikolaev@virtualopensystems.com>
Signed-off-by: Andre Przywara <andre.przywara@arm.com>
Acked-by: Paolo Bonzini <pbonzini@redhat.com>
Acked-by: Christoffer Dall <christoffer.dall@linaro.org>
Reviewed-by: Marc Zyngier <marc.zyngier@arm.com>
Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
---
 arch/powerpc/kvm/mpic.c    | 10 ++++++----
 arch/powerpc/kvm/powerpc.c |  4 ++--
 arch/s390/kvm/diag.c       |  2 +-
 arch/x86/kvm/i8254.c       | 14 +++++++++-----
 arch/x86/kvm/i8259.c       | 12 ++++++------
 arch/x86/kvm/ioapic.c      |  8 ++++----
 arch/x86/kvm/lapic.c       |  4 ++--
 arch/x86/kvm/vmx.c         |  2 +-
 arch/x86/kvm/x86.c         | 13 +++++++------
 include/linux/kvm_host.h   | 10 +++++-----
 virt/kvm/coalesced_mmio.c  |  5 +++--
 virt/kvm/eventfd.c         |  4 ++--
 virt/kvm/iodev.h           | 23 +++++++++++++++--------
 virt/kvm/kvm_main.c        | 32 ++++++++++++++++----------------
 14 files changed, 79 insertions(+), 64 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/powerpc/kvm/mpic.c b/arch/powerpc/kvm/mpic.c
index 39b3a8f816f2..8542f07491d4 100644
--- a/arch/powerpc/kvm/mpic.c
+++ b/arch/powerpc/kvm/mpic.c
@@ -1374,8 +1374,9 @@ static int kvm_mpic_write_internal(struct openpic *opp, gpa_t addr, u32 val)
 	return -ENXIO;
 }
 
-static int kvm_mpic_read(struct kvm_io_device *this, gpa_t addr,
-			 int len, void *ptr)
+static int kvm_mpic_read(struct kvm_vcpu *vcpu,
+			 struct kvm_io_device *this,
+			 gpa_t addr, int len, void *ptr)
 {
 	struct openpic *opp = container_of(this, struct openpic, mmio);
 	int ret;
@@ -1415,8 +1416,9 @@ static int kvm_mpic_read(struct kvm_io_device *this, gpa_t addr,
 	return ret;
 }
 
-static int kvm_mpic_write(struct kvm_io_device *this, gpa_t addr,
-			  int len, const void *ptr)
+static int kvm_mpic_write(struct kvm_vcpu *vcpu,
+			  struct kvm_io_device *this,
+			  gpa_t addr, int len, const void *ptr)
 {
 	struct openpic *opp = container_of(this, struct openpic, mmio);
 	int ret;
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 27c0face86f4..24bfe401373e 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -807,7 +807,7 @@ int kvmppc_handle_load(struct kvm_run *run, struct kvm_vcpu *vcpu,
 
 	idx = srcu_read_lock(&vcpu->kvm->srcu);
 
-	ret = kvm_io_bus_read(vcpu->kvm, KVM_MMIO_BUS, run->mmio.phys_addr,
+	ret = kvm_io_bus_read(vcpu, KVM_MMIO_BUS, run->mmio.phys_addr,
 			      bytes, &run->mmio.data);
 
 	srcu_read_unlock(&vcpu->kvm->srcu, idx);
@@ -880,7 +880,7 @@ int kvmppc_handle_store(struct kvm_run *run, struct kvm_vcpu *vcpu,
 
 	idx = srcu_read_lock(&vcpu->kvm->srcu);
 
-	ret = kvm_io_bus_write(vcpu->kvm, KVM_MMIO_BUS, run->mmio.phys_addr,
+	ret = kvm_io_bus_write(vcpu, KVM_MMIO_BUS, run->mmio.phys_addr,
 			       bytes, &run->mmio.data);
 
 	srcu_read_unlock(&vcpu->kvm->srcu, idx);
diff --git a/arch/s390/kvm/diag.c b/arch/s390/kvm/diag.c
index 9254afff250c..329ec75e9214 100644
--- a/arch/s390/kvm/diag.c
+++ b/arch/s390/kvm/diag.c
@@ -213,7 +213,7 @@ static int __diag_virtio_hypercall(struct kvm_vcpu *vcpu)
 	 * - gpr 3 contains the virtqueue index (passed as datamatch)
 	 * - gpr 4 contains the index on the bus (optionally)
 	 */
-	ret = kvm_io_bus_write_cookie(vcpu->kvm, KVM_VIRTIO_CCW_NOTIFY_BUS,
+	ret = kvm_io_bus_write_cookie(vcpu, KVM_VIRTIO_CCW_NOTIFY_BUS,
 				      vcpu->run->s.regs.gprs[2] & 0xffffffff,
 				      8, &vcpu->run->s.regs.gprs[3],
 				      vcpu->run->s.regs.gprs[4]);
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index 298781d4cfb4..4dce6f8b6129 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -443,7 +443,8 @@ static inline int pit_in_range(gpa_t addr)
 		(addr < KVM_PIT_BASE_ADDRESS + KVM_PIT_MEM_LENGTH));
 }
 
-static int pit_ioport_write(struct kvm_io_device *this,
+static int pit_ioport_write(struct kvm_vcpu *vcpu,
+				struct kvm_io_device *this,
 			    gpa_t addr, int len, const void *data)
 {
 	struct kvm_pit *pit = dev_to_pit(this);
@@ -519,7 +520,8 @@ static int pit_ioport_write(struct kvm_io_device *this,
 	return 0;
 }
 
-static int pit_ioport_read(struct kvm_io_device *this,
+static int pit_ioport_read(struct kvm_vcpu *vcpu,
+			   struct kvm_io_device *this,
 			   gpa_t addr, int len, void *data)
 {
 	struct kvm_pit *pit = dev_to_pit(this);
@@ -589,7 +591,8 @@ static int pit_ioport_read(struct kvm_io_device *this,
 	return 0;
 }
 
-static int speaker_ioport_write(struct kvm_io_device *this,
+static int speaker_ioport_write(struct kvm_vcpu *vcpu,
+				struct kvm_io_device *this,
 				gpa_t addr, int len, const void *data)
 {
 	struct kvm_pit *pit = speaker_to_pit(this);
@@ -606,8 +609,9 @@ static int speaker_ioport_write(struct kvm_io_device *this,
 	return 0;
 }
 
-static int speaker_ioport_read(struct kvm_io_device *this,
-			       gpa_t addr, int len, void *data)
+static int speaker_ioport_read(struct kvm_vcpu *vcpu,
+				   struct kvm_io_device *this,
+				   gpa_t addr, int len, void *data)
 {
 	struct kvm_pit *pit = speaker_to_pit(this);
 	struct kvm_kpit_state *pit_state = &pit->pit_state;
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index cc31f7c06d3d..8ff4eaa2bf19 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -528,42 +528,42 @@ static int picdev_read(struct kvm_pic *s,
 	return 0;
 }
 
-static int picdev_master_write(struct kvm_io_device *dev,
+static int picdev_master_write(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
 			       gpa_t addr, int len, const void *val)
 {
 	return picdev_write(container_of(dev, struct kvm_pic, dev_master),
 			    addr, len, val);
 }
 
-static int picdev_master_read(struct kvm_io_device *dev,
+static int picdev_master_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
 			      gpa_t addr, int len, void *val)
 {
 	return picdev_read(container_of(dev, struct kvm_pic, dev_master),
 			    addr, len, val);
 }
 
-static int picdev_slave_write(struct kvm_io_device *dev,
+static int picdev_slave_write(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
 			      gpa_t addr, int len, const void *val)
 {
 	return picdev_write(container_of(dev, struct kvm_pic, dev_slave),
 			    addr, len, val);
 }
 
-static int picdev_slave_read(struct kvm_io_device *dev,
+static int picdev_slave_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
 			     gpa_t addr, int len, void *val)
 {
 	return picdev_read(container_of(dev, struct kvm_pic, dev_slave),
 			    addr, len, val);
 }
 
-static int picdev_eclr_write(struct kvm_io_device *dev,
+static int picdev_eclr_write(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
 			     gpa_t addr, int len, const void *val)
 {
 	return picdev_write(container_of(dev, struct kvm_pic, dev_eclr),
 			    addr, len, val);
 }
 
-static int picdev_eclr_read(struct kvm_io_device *dev,
+static int picdev_eclr_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
 			    gpa_t addr, int len, void *val)
 {
 	return picdev_read(container_of(dev, struct kvm_pic, dev_eclr),
diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c
index b1947e0f3e10..8bf2e49708e3 100644
--- a/arch/x86/kvm/ioapic.c
+++ b/arch/x86/kvm/ioapic.c
@@ -498,8 +498,8 @@ static inline int ioapic_in_range(struct kvm_ioapic *ioapic, gpa_t addr)
 		 (addr < ioapic->base_address + IOAPIC_MEM_LENGTH)));
 }
 
-static int ioapic_mmio_read(struct kvm_io_device *this, gpa_t addr, int len,
-			    void *val)
+static int ioapic_mmio_read(struct kvm_vcpu *vcpu, struct kvm_io_device *this,
+				gpa_t addr, int len, void *val)
 {
 	struct kvm_ioapic *ioapic = to_ioapic(this);
 	u32 result;
@@ -541,8 +541,8 @@ static int ioapic_mmio_read(struct kvm_io_device *this, gpa_t addr, int len,
 	return 0;
 }
 
-static int ioapic_mmio_write(struct kvm_io_device *this, gpa_t addr, int len,
-			     const void *val)
+static int ioapic_mmio_write(struct kvm_vcpu *vcpu, struct kvm_io_device *this,
+				 gpa_t addr, int len, const void *val)
 {
 	struct kvm_ioapic *ioapic = to_ioapic(this);
 	u32 data;
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index e55b5fc344eb..ba57bb79e795 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1038,7 +1038,7 @@ static int apic_mmio_in_range(struct kvm_lapic *apic, gpa_t addr)
 	    addr < apic->base_address + LAPIC_MMIO_LENGTH;
 }
 
-static int apic_mmio_read(struct kvm_io_device *this,
+static int apic_mmio_read(struct kvm_vcpu *vcpu, struct kvm_io_device *this,
 			   gpa_t address, int len, void *data)
 {
 	struct kvm_lapic *apic = to_lapic(this);
@@ -1358,7 +1358,7 @@ static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
 	return ret;
 }
 
-static int apic_mmio_write(struct kvm_io_device *this,
+static int apic_mmio_write(struct kvm_vcpu *vcpu, struct kvm_io_device *this,
 			    gpa_t address, int len, const void *data)
 {
 	struct kvm_lapic *apic = to_lapic(this);
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index f7b20b417a3a..317da9bde728 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -5822,7 +5822,7 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
 	gpa_t gpa;
 
 	gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
-	if (!kvm_io_bus_write(vcpu->kvm, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) {
+	if (!kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) {
 		skip_emulated_instruction(vcpu);
 		return 1;
 	}
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index bd7a70be41b3..5573d633144c 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4115,8 +4115,8 @@ static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len,
 	do {
 		n = min(len, 8);
 		if (!(vcpu->arch.apic &&
-		      !kvm_iodevice_write(&vcpu->arch.apic->dev, addr, n, v))
-		    && kvm_io_bus_write(vcpu->kvm, KVM_MMIO_BUS, addr, n, v))
+		      !kvm_iodevice_write(vcpu, &vcpu->arch.apic->dev, addr, n, v))
+		    && kvm_io_bus_write(vcpu, KVM_MMIO_BUS, addr, n, v))
 			break;
 		handled += n;
 		addr += n;
@@ -4135,8 +4135,9 @@ static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v)
 	do {
 		n = min(len, 8);
 		if (!(vcpu->arch.apic &&
-		      !kvm_iodevice_read(&vcpu->arch.apic->dev, addr, n, v))
-		    && kvm_io_bus_read(vcpu->kvm, KVM_MMIO_BUS, addr, n, v))
+		      !kvm_iodevice_read(vcpu, &vcpu->arch.apic->dev,
+					 addr, n, v))
+		    && kvm_io_bus_read(vcpu, KVM_MMIO_BUS, addr, n, v))
 			break;
 		trace_kvm_mmio(KVM_TRACE_MMIO_READ, n, addr, *(u64 *)v);
 		handled += n;
@@ -4630,10 +4631,10 @@ static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)
 	int r;
 
 	if (vcpu->arch.pio.in)
-		r = kvm_io_bus_read(vcpu->kvm, KVM_PIO_BUS, vcpu->arch.pio.port,
+		r = kvm_io_bus_read(vcpu, KVM_PIO_BUS, vcpu->arch.pio.port,
 				    vcpu->arch.pio.size, pd);
 	else
-		r = kvm_io_bus_write(vcpu->kvm, KVM_PIO_BUS,
+		r = kvm_io_bus_write(vcpu, KVM_PIO_BUS,
 				     vcpu->arch.pio.port, vcpu->arch.pio.size,
 				     pd);
 	return r;
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index ae9c72012004..9605e46fce0b 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -165,12 +165,12 @@ enum kvm_bus {
 	KVM_NR_BUSES
 };
 
-int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
+int kvm_io_bus_write(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
 		     int len, const void *val);
-int kvm_io_bus_write_cookie(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
-			    int len, const void *val, long cookie);
-int kvm_io_bus_read(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr, int len,
-		    void *val);
+int kvm_io_bus_write_cookie(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx,
+			    gpa_t addr, int len, const void *val, long cookie);
+int kvm_io_bus_read(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
+		    int len, void *val);
 int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
 			    int len, struct kvm_io_device *dev);
 int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
diff --git a/virt/kvm/coalesced_mmio.c b/virt/kvm/coalesced_mmio.c
index 00d86427af0f..c831a40ffc1a 100644
--- a/virt/kvm/coalesced_mmio.c
+++ b/virt/kvm/coalesced_mmio.c
@@ -60,8 +60,9 @@ static int coalesced_mmio_has_room(struct kvm_coalesced_mmio_dev *dev)
 	return 1;
 }
 
-static int coalesced_mmio_write(struct kvm_io_device *this,
-				gpa_t addr, int len, const void *val)
+static int coalesced_mmio_write(struct kvm_vcpu *vcpu,
+				struct kvm_io_device *this, gpa_t addr,
+				int len, const void *val)
 {
 	struct kvm_coalesced_mmio_dev *dev = to_mmio(this);
 	struct kvm_coalesced_mmio_ring *ring = dev->kvm->coalesced_mmio_ring;
diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
index fc5f43e54a80..26c72f3663f2 100644
--- a/virt/kvm/eventfd.c
+++ b/virt/kvm/eventfd.c
@@ -715,8 +715,8 @@ ioeventfd_in_range(struct _ioeventfd *p, gpa_t addr, int len, const void *val)
 
 /* MMIO/PIO writes trigger an event if the addr/val match */
 static int
-ioeventfd_write(struct kvm_io_device *this, gpa_t addr, int len,
-		const void *val)
+ioeventfd_write(struct kvm_vcpu *vcpu, struct kvm_io_device *this, gpa_t addr,
+		int len, const void *val)
 {
 	struct _ioeventfd *p = to_ioeventfd(this);
 
diff --git a/virt/kvm/iodev.h b/virt/kvm/iodev.h
index 12fd3caffd2b..9ef709cc2cae 100644
--- a/virt/kvm/iodev.h
+++ b/virt/kvm/iodev.h
@@ -20,6 +20,7 @@
 #include <asm/errno.h>
 
 struct kvm_io_device;
+struct kvm_vcpu;
 
 /**
  * kvm_io_device_ops are called under kvm slots_lock.
@@ -27,11 +28,13 @@ struct kvm_io_device;
  * or non-zero to have it passed to the next device.
  **/
 struct kvm_io_device_ops {
-	int (*read)(struct kvm_io_device *this,
+	int (*read)(struct kvm_vcpu *vcpu,
+		    struct kvm_io_device *this,
 		    gpa_t addr,
 		    int len,
 		    void *val);
-	int (*write)(struct kvm_io_device *this,
+	int (*write)(struct kvm_vcpu *vcpu,
+		     struct kvm_io_device *this,
 		     gpa_t addr,
 		     int len,
 		     const void *val);
@@ -49,16 +52,20 @@ static inline void kvm_iodevice_init(struct kvm_io_device *dev,
 	dev->ops = ops;
 }
 
-static inline int kvm_iodevice_read(struct kvm_io_device *dev,
-				    gpa_t addr, int l, void *v)
+static inline int kvm_iodevice_read(struct kvm_vcpu *vcpu,
+				    struct kvm_io_device *dev, gpa_t addr,
+				    int l, void *v)
 {
-	return dev->ops->read ? dev->ops->read(dev, addr, l, v) : -EOPNOTSUPP;
+	return dev->ops->read ? dev->ops->read(vcpu, dev, addr, l, v)
+				: -EOPNOTSUPP;
 }
 
-static inline int kvm_iodevice_write(struct kvm_io_device *dev,
-				     gpa_t addr, int l, const void *v)
+static inline int kvm_iodevice_write(struct kvm_vcpu *vcpu,
+				     struct kvm_io_device *dev, gpa_t addr,
+				     int l, const void *v)
 {
-	return dev->ops->write ? dev->ops->write(dev, addr, l, v) : -EOPNOTSUPP;
+	return dev->ops->write ? dev->ops->write(vcpu, dev, addr, l, v)
+				 : -EOPNOTSUPP;
 }
 
 static inline void kvm_iodevice_destructor(struct kvm_io_device *dev)
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index a1093700f3a4..664d67a099f6 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2997,7 +2997,7 @@ static int kvm_io_bus_get_first_dev(struct kvm_io_bus *bus,
 	return off;
 }
 
-static int __kvm_io_bus_write(struct kvm_io_bus *bus,
+static int __kvm_io_bus_write(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus,
 			      struct kvm_io_range *range, const void *val)
 {
 	int idx;
@@ -3008,7 +3008,7 @@ static int __kvm_io_bus_write(struct kvm_io_bus *bus,
 
 	while (idx < bus->dev_count &&
 		kvm_io_bus_cmp(range, &bus->range[idx]) == 0) {
-		if (!kvm_iodevice_write(bus->range[idx].dev, range->addr,
+		if (!kvm_iodevice_write(vcpu, bus->range[idx].dev, range->addr,
 					range->len, val))
 			return idx;
 		idx++;
@@ -3018,7 +3018,7 @@ static int __kvm_io_bus_write(struct kvm_io_bus *bus,
 }
 
 /* kvm_io_bus_write - called under kvm->slots_lock */
-int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
+int kvm_io_bus_write(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
 		     int len, const void *val)
 {
 	struct kvm_io_bus *bus;
@@ -3030,14 +3030,14 @@ int kvm_io_bus_write(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
 		.len = len,
 	};
 
-	bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
-	r = __kvm_io_bus_write(bus, &range, val);
+	bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
+	r = __kvm_io_bus_write(vcpu, bus, &range, val);
 	return r < 0 ? r : 0;
 }
 
 /* kvm_io_bus_write_cookie - called under kvm->slots_lock */
-int kvm_io_bus_write_cookie(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
-			    int len, const void *val, long cookie)
+int kvm_io_bus_write_cookie(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx,
+			    gpa_t addr, int len, const void *val, long cookie)
 {
 	struct kvm_io_bus *bus;
 	struct kvm_io_range range;
@@ -3047,12 +3047,12 @@ int kvm_io_bus_write_cookie(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
 		.len = len,
 	};
 
-	bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
+	bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
 
 	/* First try the device referenced by cookie. */
 	if ((cookie >= 0) && (cookie < bus->dev_count) &&
 	    (kvm_io_bus_cmp(&range, &bus->range[cookie]) == 0))
-		if (!kvm_iodevice_write(bus->range[cookie].dev, addr, len,
+		if (!kvm_iodevice_write(vcpu, bus->range[cookie].dev, addr, len,
 					val))
 			return cookie;
 
@@ -3060,11 +3060,11 @@ int kvm_io_bus_write_cookie(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
 	 * cookie contained garbage; fall back to search and return the
 	 * correct cookie value.
 	 */
-	return __kvm_io_bus_write(bus, &range, val);
+	return __kvm_io_bus_write(vcpu, bus, &range, val);
 }
 
-static int __kvm_io_bus_read(struct kvm_io_bus *bus, struct kvm_io_range *range,
-			     void *val)
+static int __kvm_io_bus_read(struct kvm_vcpu *vcpu, struct kvm_io_bus *bus,
+			     struct kvm_io_range *range, void *val)
 {
 	int idx;
 
@@ -3074,7 +3074,7 @@ static int __kvm_io_bus_read(struct kvm_io_bus *bus, struct kvm_io_range *range,
 
 	while (idx < bus->dev_count &&
 		kvm_io_bus_cmp(range, &bus->range[idx]) == 0) {
-		if (!kvm_iodevice_read(bus->range[idx].dev, range->addr,
+		if (!kvm_iodevice_read(vcpu, bus->range[idx].dev, range->addr,
 				       range->len, val))
 			return idx;
 		idx++;
@@ -3085,7 +3085,7 @@ static int __kvm_io_bus_read(struct kvm_io_bus *bus, struct kvm_io_range *range,
 EXPORT_SYMBOL_GPL(kvm_io_bus_write);
 
 /* kvm_io_bus_read - called under kvm->slots_lock */
-int kvm_io_bus_read(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
+int kvm_io_bus_read(struct kvm_vcpu *vcpu, enum kvm_bus bus_idx, gpa_t addr,
 		    int len, void *val)
 {
 	struct kvm_io_bus *bus;
@@ -3097,8 +3097,8 @@ int kvm_io_bus_read(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
 		.len = len,
 	};
 
-	bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
-	r = __kvm_io_bus_read(bus, &range, val);
+	bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
+	r = __kvm_io_bus_read(vcpu, bus, &range, val);
 	return r < 0 ? r : 0;
 }
 
-- 
cgit v1.2.3


From af669ac6dc3f66bb56fb9612b9826adac6292794 Mon Sep 17 00:00:00 2001
From: Andre Przywara
Date: Thu, 26 Mar 2015 14:39:29 +0000
Subject: KVM: move iodev.h from virt/kvm/ to include/kvm

iodev.h contains definitions for the kvm_io_bus framework. This is
needed both by the generic KVM code in virt/kvm as well as by
architecture specific code under arch/. Putting the header file in
virt/kvm and using local includes in the architecture part seems at
least dodgy to me, so let's move the file into include/kvm, so that a
more natural "#include <kvm/iodev.h>" can be used by all of the code.
This also solves a problem later when using struct kvm_io_device
in arm_vgic.h.
Fixing up the FSF address in the GPL header and a wrong include path
on the way.

Signed-off-by: Andre Przywara <andre.przywara@arm.com>
Acked-by: Christoffer Dall <christoffer.dall@linaro.org>
Reviewed-by: Marc Zyngier <marc.zyngier@arm.com>
Reviewed-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
---
 arch/powerpc/kvm/mpic.c   |  2 +-
 arch/x86/kvm/i8254.h      |  2 +-
 arch/x86/kvm/ioapic.h     |  2 +-
 arch/x86/kvm/irq.h        |  2 +-
 arch/x86/kvm/lapic.h      |  2 +-
 include/kvm/iodev.h       | 76 ++++++++++++++++++++++++++++++++++++++++++++++
 virt/kvm/coalesced_mmio.c |  2 +-
 virt/kvm/eventfd.c        |  2 +-
 virt/kvm/iodev.h          | 77 -----------------------------------------------
 virt/kvm/kvm_main.c       |  2 +-
 10 files changed, 84 insertions(+), 85 deletions(-)
 create mode 100644 include/kvm/iodev.h
 delete mode 100644 virt/kvm/iodev.h

(limited to 'arch/x86')

diff --git a/arch/powerpc/kvm/mpic.c b/arch/powerpc/kvm/mpic.c
index 8542f07491d4..4703fadd2737 100644
--- a/arch/powerpc/kvm/mpic.c
+++ b/arch/powerpc/kvm/mpic.c
@@ -34,7 +34,7 @@
 #include <asm/kvm_para.h>
 #include <asm/kvm_host.h>
 #include <asm/kvm_ppc.h>
-#include "iodev.h"
+#include <kvm/iodev.h>
 
 #define MAX_CPU     32
 #define MAX_SRC     256
diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h
index dd1b16b611b0..c84990b42b5b 100644
--- a/arch/x86/kvm/i8254.h
+++ b/arch/x86/kvm/i8254.h
@@ -3,7 +3,7 @@
 
 #include <linux/kthread.h>
 
-#include "iodev.h"
+#include <kvm/iodev.h>
 
 struct kvm_kpit_channel_state {
 	u32 count; /* can be 65536 */
diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h
index c2e36d934af4..d9e02cab7b96 100644
--- a/arch/x86/kvm/ioapic.h
+++ b/arch/x86/kvm/ioapic.h
@@ -3,7 +3,7 @@
 
 #include <linux/kvm_host.h>
 
-#include "iodev.h"
+#include <kvm/iodev.h>
 
 struct kvm;
 struct kvm_vcpu;
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index 2d03568e9498..ad68c73008c5 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -27,7 +27,7 @@
 #include <linux/kvm_host.h>
 #include <linux/spinlock.h>
 
-#include "iodev.h"
+#include <kvm/iodev.h>
 #include "ioapic.h"
 #include "lapic.h"
 
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 0bc6c656625b..e284c2880c56 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -1,7 +1,7 @@
 #ifndef __KVM_X86_LAPIC_H
 #define __KVM_X86_LAPIC_H
 
-#include "iodev.h"
+#include <kvm/iodev.h>
 
 #include <linux/kvm_host.h>
 
diff --git a/include/kvm/iodev.h b/include/kvm/iodev.h
new file mode 100644
index 000000000000..a6d208b916f5
--- /dev/null
+++ b/include/kvm/iodev.h
@@ -0,0 +1,76 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef __KVM_IODEV_H__
+#define __KVM_IODEV_H__
+
+#include <linux/kvm_types.h>
+#include <linux/errno.h>
+
+struct kvm_io_device;
+struct kvm_vcpu;
+
+/**
+ * kvm_io_device_ops are called under kvm slots_lock.
+ * read and write handlers return 0 if the transaction has been handled,
+ * or non-zero to have it passed to the next device.
+ **/
+struct kvm_io_device_ops {
+	int (*read)(struct kvm_vcpu *vcpu,
+		    struct kvm_io_device *this,
+		    gpa_t addr,
+		    int len,
+		    void *val);
+	int (*write)(struct kvm_vcpu *vcpu,
+		     struct kvm_io_device *this,
+		     gpa_t addr,
+		     int len,
+		     const void *val);
+	void (*destructor)(struct kvm_io_device *this);
+};
+
+
+struct kvm_io_device {
+	const struct kvm_io_device_ops *ops;
+};
+
+static inline void kvm_iodevice_init(struct kvm_io_device *dev,
+				     const struct kvm_io_device_ops *ops)
+{
+	dev->ops = ops;
+}
+
+static inline int kvm_iodevice_read(struct kvm_vcpu *vcpu,
+				    struct kvm_io_device *dev, gpa_t addr,
+				    int l, void *v)
+{
+	return dev->ops->read ? dev->ops->read(vcpu, dev, addr, l, v)
+				: -EOPNOTSUPP;
+}
+
+static inline int kvm_iodevice_write(struct kvm_vcpu *vcpu,
+				     struct kvm_io_device *dev, gpa_t addr,
+				     int l, const void *v)
+{
+	return dev->ops->write ? dev->ops->write(vcpu, dev, addr, l, v)
+				 : -EOPNOTSUPP;
+}
+
+static inline void kvm_iodevice_destructor(struct kvm_io_device *dev)
+{
+	if (dev->ops->destructor)
+		dev->ops->destructor(dev);
+}
+
+#endif /* __KVM_IODEV_H__ */
diff --git a/virt/kvm/coalesced_mmio.c b/virt/kvm/coalesced_mmio.c
index c831a40ffc1a..571c1ce37d15 100644
--- a/virt/kvm/coalesced_mmio.c
+++ b/virt/kvm/coalesced_mmio.c
@@ -8,7 +8,7 @@
  *
  */
 
-#include "iodev.h"
+#include <kvm/iodev.h>
 
 #include <linux/kvm_host.h>
 #include <linux/slab.h>
diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c
index 26c72f3663f2..9ff4193dfa49 100644
--- a/virt/kvm/eventfd.c
+++ b/virt/kvm/eventfd.c
@@ -36,7 +36,7 @@
 #include <linux/seqlock.h>
 #include <trace/events/kvm.h>
 
-#include "iodev.h"
+#include <kvm/iodev.h>
 
 #ifdef CONFIG_HAVE_KVM_IRQFD
 /*
diff --git a/virt/kvm/iodev.h b/virt/kvm/iodev.h
deleted file mode 100644
index 9ef709cc2cae..000000000000
--- a/virt/kvm/iodev.h
+++ /dev/null
@@ -1,77 +0,0 @@
-/*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
- */
-
-#ifndef __KVM_IODEV_H__
-#define __KVM_IODEV_H__
-
-#include <linux/kvm_types.h>
-#include <asm/errno.h>
-
-struct kvm_io_device;
-struct kvm_vcpu;
-
-/**
- * kvm_io_device_ops are called under kvm slots_lock.
- * read and write handlers return 0 if the transaction has been handled,
- * or non-zero to have it passed to the next device.
- **/
-struct kvm_io_device_ops {
-	int (*read)(struct kvm_vcpu *vcpu,
-		    struct kvm_io_device *this,
-		    gpa_t addr,
-		    int len,
-		    void *val);
-	int (*write)(struct kvm_vcpu *vcpu,
-		     struct kvm_io_device *this,
-		     gpa_t addr,
-		     int len,
-		     const void *val);
-	void (*destructor)(struct kvm_io_device *this);
-};
-
-
-struct kvm_io_device {
-	const struct kvm_io_device_ops *ops;
-};
-
-static inline void kvm_iodevice_init(struct kvm_io_device *dev,
-				     const struct kvm_io_device_ops *ops)
-{
-	dev->ops = ops;
-}
-
-static inline int kvm_iodevice_read(struct kvm_vcpu *vcpu,
-				    struct kvm_io_device *dev, gpa_t addr,
-				    int l, void *v)
-{
-	return dev->ops->read ? dev->ops->read(vcpu, dev, addr, l, v)
-				: -EOPNOTSUPP;
-}
-
-static inline int kvm_iodevice_write(struct kvm_vcpu *vcpu,
-				     struct kvm_io_device *dev, gpa_t addr,
-				     int l, const void *v)
-{
-	return dev->ops->write ? dev->ops->write(vcpu, dev, addr, l, v)
-				 : -EOPNOTSUPP;
-}
-
-static inline void kvm_iodevice_destructor(struct kvm_io_device *dev)
-{
-	if (dev->ops->destructor)
-		dev->ops->destructor(dev);
-}
-
-#endif /* __KVM_IODEV_H__ */
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 664d67a099f6..c5460b645e75 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -16,7 +16,7 @@
  *
  */
 
-#include "iodev.h"
+#include <kvm/iodev.h>
 
 #include <linux/kvm_host.h>
 #include <linux/kvm.h>
-- 
cgit v1.2.3


From f0e4b2776c12c1633ccef17f210733d6e1b6b2b3 Mon Sep 17 00:00:00 2001
From: Andre Przywara
Date: Thu, 26 Mar 2015 14:39:31 +0000
Subject: KVM: x86: remove now unneeded include directory from Makefile

virt/kvm was never really a good include directory for anything else
than locally included headers.
With the move of iodev.h there is no need anymore to add this
directory the compiler's include path, so remove it from the x86 kvm
Makefile.

Signed-off-by: Andre Przywara <andre.przywara@arm.com>
Reviewed-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Marc Zyngier <marc.zyngier@arm.com>
---
 arch/x86/kvm/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index 08f790dfadc9..16e8f962eaad 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -1,5 +1,5 @@
 
-ccflags-y += -Ivirt/kvm -Iarch/x86/kvm
+ccflags-y += -Iarch/x86/kvm
 
 CFLAGS_x86.o := -I.
 CFLAGS_svm.o := -I.
-- 
cgit v1.2.3


From b3a2a9076d3149781c8622d6a98a51045ff946e4 Mon Sep 17 00:00:00 2001
From: Jan Kiszka
Date: Mon, 23 Mar 2015 19:27:19 +0100
Subject: KVM: nVMX: Add support for rdtscp

If the guest CPU is supposed to support rdtscp and the host has rdtscp
enabled in the secondary execution controls, we can also expose this
feature to L1. Just extend nested_vmx_exit_handled to properly route
EXIT_REASON_RDTSCP.

Signed-off-by: Jan Kiszka <jan.kiszka@siemens.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/include/uapi/asm/vmx.h | 1 +
 arch/x86/kvm/vmx.c              | 9 +++++++--
 2 files changed, 8 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h
index c5f1a1deb91a..1fe92181ee9e 100644
--- a/arch/x86/include/uapi/asm/vmx.h
+++ b/arch/x86/include/uapi/asm/vmx.h
@@ -67,6 +67,7 @@
 #define EXIT_REASON_EPT_VIOLATION       48
 #define EXIT_REASON_EPT_MISCONFIG       49
 #define EXIT_REASON_INVEPT              50
+#define EXIT_REASON_RDTSCP              51
 #define EXIT_REASON_PREEMPTION_TIMER    52
 #define EXIT_REASON_INVVPID             53
 #define EXIT_REASON_WBINVD              54
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 50c675b46901..fdd9f8b88e10 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2467,6 +2467,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
 	vmx->nested.nested_vmx_secondary_ctls_low = 0;
 	vmx->nested.nested_vmx_secondary_ctls_high &=
 		SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
+		SECONDARY_EXEC_RDTSCP |
 		SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
 		SECONDARY_EXEC_APIC_REGISTER_VIRT |
 		SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
@@ -7510,7 +7511,7 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
 		return nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING);
 	case EXIT_REASON_RDPMC:
 		return nested_cpu_has(vmcs12, CPU_BASED_RDPMC_EXITING);
-	case EXIT_REASON_RDTSC:
+	case EXIT_REASON_RDTSC: case EXIT_REASON_RDTSCP:
 		return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING);
 	case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR:
 	case EXIT_REASON_VMLAUNCH: case EXIT_REASON_VMPTRLD:
@@ -8517,6 +8518,9 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
 						exec_control);
 			}
 		}
+		if (nested && !vmx->rdtscp_enabled)
+			vmx->nested.nested_vmx_secondary_ctls_high &=
+				~SECONDARY_EXEC_RDTSCP;
 	}
 
 	/* Exposing INVPCID only when PCID is exposed */
@@ -9146,8 +9150,9 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 			exec_control &= ~SECONDARY_EXEC_RDTSCP;
 		/* Take the following fields only from vmcs12 */
 		exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
+				  SECONDARY_EXEC_RDTSCP |
 				  SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
-                                  SECONDARY_EXEC_APIC_REGISTER_VIRT);
+				  SECONDARY_EXEC_APIC_REGISTER_VIRT);
 		if (nested_cpu_has(vmcs12,
 				CPU_BASED_ACTIVATE_SECONDARY_CONTROLS))
 			exec_control |= vmcs12->secondary_vm_exec_control;
-- 
cgit v1.2.3


From 0f1b5ca240c65ed9533f193720f337bf24fb2f2f Mon Sep 17 00:00:00 2001
From: Andi Kleen
Date: Tue, 17 Feb 2015 18:18:04 -0800
Subject: perf/x86/intel: Add new cache events table for Haswell

Haswell offcore events are quite different from Sandy Bridge.
Add a new table to handle Haswell properly.

Note that the offcore bits listed in the SDM are not quite correct
(this is currently being fixed). An uptodate list of bits is
in the patch.

The basic setup is similar to Sandy Bridge. The prefetch columns
have been removed, as prefetch counting is not very reliable
on Haswell. One L1 event that is not in the event list anymore
has been also removed.

- data reads do not include code reads (comparable to earlier Sandy Bridge tables)
- data counts include speculative execution (except L1 write, dtlb, bpu)
- remote node access includes both remote memory, remote cache, remote mmio.
- prefetches are not included in the counts for consistency
  (different from Sandy Bridge, which includes prefetches in the remote node)

Signed-off-by: Andi Kleen <ak@linux.intel.com>
[ Removed the HSM30 comments; we don't have them for SNB/IVB either. ]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1424225886-18652-1-git-send-email-andi@firstfloor.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event_intel.c | 194 ++++++++++++++++++++++++++++++++-
 1 file changed, 192 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 9f1dd18fa395..5ef64bf88ecd 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -415,6 +415,196 @@ static __initconst const u64 snb_hw_cache_event_ids
 
 };
 
+/*
+ * Notes on the events:
+ * - data reads do not include code reads (comparable to earlier tables)
+ * - data counts include speculative execution (except L1 write, dtlb, bpu)
+ * - remote node access includes remote memory, remote cache, remote mmio.
+ * - prefetches are not included in the counts because they are not
+ *   reliably counted.
+ */
+
+#define HSW_DEMAND_DATA_RD		BIT_ULL(0)
+#define HSW_DEMAND_RFO			BIT_ULL(1)
+#define HSW_ANY_RESPONSE		BIT_ULL(16)
+#define HSW_SUPPLIER_NONE		BIT_ULL(17)
+#define HSW_L3_MISS_LOCAL_DRAM		BIT_ULL(22)
+#define HSW_L3_MISS_REMOTE_HOP0		BIT_ULL(27)
+#define HSW_L3_MISS_REMOTE_HOP1		BIT_ULL(28)
+#define HSW_L3_MISS_REMOTE_HOP2P	BIT_ULL(29)
+#define HSW_L3_MISS			(HSW_L3_MISS_LOCAL_DRAM| \
+					 HSW_L3_MISS_REMOTE_HOP0|HSW_L3_MISS_REMOTE_HOP1| \
+					 HSW_L3_MISS_REMOTE_HOP2P)
+#define HSW_SNOOP_NONE			BIT_ULL(31)
+#define HSW_SNOOP_NOT_NEEDED		BIT_ULL(32)
+#define HSW_SNOOP_MISS			BIT_ULL(33)
+#define HSW_SNOOP_HIT_NO_FWD		BIT_ULL(34)
+#define HSW_SNOOP_HIT_WITH_FWD		BIT_ULL(35)
+#define HSW_SNOOP_HITM			BIT_ULL(36)
+#define HSW_SNOOP_NON_DRAM		BIT_ULL(37)
+#define HSW_ANY_SNOOP			(HSW_SNOOP_NONE| \
+					 HSW_SNOOP_NOT_NEEDED|HSW_SNOOP_MISS| \
+					 HSW_SNOOP_HIT_NO_FWD|HSW_SNOOP_HIT_WITH_FWD| \
+					 HSW_SNOOP_HITM|HSW_SNOOP_NON_DRAM)
+#define HSW_SNOOP_DRAM			(HSW_ANY_SNOOP & ~HSW_SNOOP_NON_DRAM)
+#define HSW_DEMAND_READ			HSW_DEMAND_DATA_RD
+#define HSW_DEMAND_WRITE		HSW_DEMAND_RFO
+#define HSW_L3_MISS_REMOTE		(HSW_L3_MISS_REMOTE_HOP0|\
+					 HSW_L3_MISS_REMOTE_HOP1|HSW_L3_MISS_REMOTE_HOP2P)
+#define HSW_LLC_ACCESS			HSW_ANY_RESPONSE
+
+static __initconst const u64 hsw_hw_cache_event_ids
+				[PERF_COUNT_HW_CACHE_MAX]
+				[PERF_COUNT_HW_CACHE_OP_MAX]
+				[PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D ) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x81d0,	/* MEM_UOPS_RETIRED.ALL_LOADS */
+		[ C(RESULT_MISS)   ] = 0x151,	/* L1D.REPLACEMENT */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = 0x82d0,	/* MEM_UOPS_RETIRED.ALL_STORES */
+		[ C(RESULT_MISS)   ] = 0x0,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0,
+		[ C(RESULT_MISS)   ] = 0x0,
+	},
+ },
+ [ C(L1I ) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0,
+		[ C(RESULT_MISS)   ] = 0x280,	/* ICACHE.MISSES */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0,
+		[ C(RESULT_MISS)   ] = 0x0,
+	},
+ },
+ [ C(LL  ) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x1b7,	/* OFFCORE_RESPONSE */
+		[ C(RESULT_MISS)   ] = 0x1b7,	/* OFFCORE_RESPONSE */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = 0x1b7,	/* OFFCORE_RESPONSE */
+		[ C(RESULT_MISS)   ] = 0x1b7,	/* OFFCORE_RESPONSE */
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0,
+		[ C(RESULT_MISS)   ] = 0x0,
+	},
+ },
+ [ C(DTLB) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x81d0,	/* MEM_UOPS_RETIRED.ALL_LOADS */
+		[ C(RESULT_MISS)   ] = 0x108,	/* DTLB_LOAD_MISSES.MISS_CAUSES_A_WALK */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = 0x82d0,	/* MEM_UOPS_RETIRED.ALL_STORES */
+		[ C(RESULT_MISS)   ] = 0x149,	/* DTLB_STORE_MISSES.MISS_CAUSES_A_WALK */
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0,
+		[ C(RESULT_MISS)   ] = 0x0,
+	},
+ },
+ [ C(ITLB) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x6085,	/* ITLB_MISSES.STLB_HIT */
+		[ C(RESULT_MISS)   ] = 0x185,	/* ITLB_MISSES.MISS_CAUSES_A_WALK */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+ },
+ [ C(BPU ) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0xc4,	/* BR_INST_RETIRED.ALL_BRANCHES */
+		[ C(RESULT_MISS)   ] = 0xc5,	/* BR_MISP_RETIRED.ALL_BRANCHES */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+ },
+ [ C(NODE) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x1b7,	/* OFFCORE_RESPONSE */
+		[ C(RESULT_MISS)   ] = 0x1b7,	/* OFFCORE_RESPONSE */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = 0x1b7,	/* OFFCORE_RESPONSE */
+		[ C(RESULT_MISS)   ] = 0x1b7,	/* OFFCORE_RESPONSE */
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0,
+		[ C(RESULT_MISS)   ] = 0x0,
+	},
+ },
+};
+
+static __initconst const u64 hsw_hw_cache_extra_regs
+				[PERF_COUNT_HW_CACHE_MAX]
+				[PERF_COUNT_HW_CACHE_OP_MAX]
+				[PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(LL  ) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = HSW_DEMAND_READ|
+				       HSW_LLC_ACCESS,
+		[ C(RESULT_MISS)   ] = HSW_DEMAND_READ|
+				       HSW_L3_MISS|HSW_ANY_SNOOP,
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = HSW_DEMAND_WRITE|
+				       HSW_LLC_ACCESS,
+		[ C(RESULT_MISS)   ] = HSW_DEMAND_WRITE|
+				       HSW_L3_MISS|HSW_ANY_SNOOP,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0,
+		[ C(RESULT_MISS)   ] = 0x0,
+	},
+ },
+ [ C(NODE) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = HSW_DEMAND_READ|
+				       HSW_L3_MISS_LOCAL_DRAM|
+				       HSW_SNOOP_DRAM,
+		[ C(RESULT_MISS)   ] = HSW_DEMAND_READ|
+				       HSW_L3_MISS_REMOTE|
+				       HSW_SNOOP_DRAM,
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = HSW_DEMAND_WRITE|
+				       HSW_L3_MISS_LOCAL_DRAM|
+				       HSW_SNOOP_DRAM,
+		[ C(RESULT_MISS)   ] = HSW_DEMAND_WRITE|
+				       HSW_L3_MISS_REMOTE|
+				       HSW_SNOOP_DRAM,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0,
+		[ C(RESULT_MISS)   ] = 0x0,
+	},
+ },
+};
+
 static __initconst const u64 westmere_hw_cache_event_ids
 				[PERF_COUNT_HW_CACHE_MAX]
 				[PERF_COUNT_HW_CACHE_OP_MAX]
@@ -2520,8 +2710,8 @@ __init int intel_pmu_init(void)
 	case 69: /* 22nm Haswell ULT */
 	case 70: /* 22nm Haswell + GT3e (Intel Iris Pro graphics) */
 		x86_pmu.late_ack = true;
-		memcpy(hw_cache_event_ids, snb_hw_cache_event_ids, sizeof(hw_cache_event_ids));
-		memcpy(hw_cache_extra_regs, snb_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
+		memcpy(hw_cache_event_ids, hsw_hw_cache_event_ids, sizeof(hw_cache_event_ids));
+		memcpy(hw_cache_extra_regs, hsw_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
 
 		intel_pmu_lbr_init_hsw();
 
-- 
cgit v1.2.3


From 91f1b70582c62576f429cf78d53751c66677553d Mon Sep 17 00:00:00 2001
From: Andi Kleen
Date: Tue, 17 Feb 2015 18:18:05 -0800
Subject: perf/x86/intel: Add Broadwell core support

Add Broadwell support for Broadwell to perf.

The basic support is very similar to Haswell. We use the new cache
event list added for Haswell earlier. The only differences
are a few bits related to remote nodes. To avoid an extra,
mostly identical, table these are patched up in the initialization code.

The constraint list has one new event that needs to be handled over Haswell.

Includes code and testing from Kan Liang.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1424225886-18652-2-git-send-email-andi@firstfloor.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event_intel.c | 47 ++++++++++++++++++++++++++++++++++
 1 file changed, 47 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 5ef64bf88ecd..28838536a9f7 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -220,6 +220,15 @@ static struct event_constraint intel_hsw_event_constraints[] = {
 	EVENT_CONSTRAINT_END
 };
 
+struct event_constraint intel_bdw_event_constraints[] = {
+	FIXED_EVENT_CONSTRAINT(0x00c0, 0),	/* INST_RETIRED.ANY */
+	FIXED_EVENT_CONSTRAINT(0x003c, 1),	/* CPU_CLK_UNHALTED.CORE */
+	FIXED_EVENT_CONSTRAINT(0x0300, 2),	/* CPU_CLK_UNHALTED.REF */
+	INTEL_UEVENT_CONSTRAINT(0x148, 0x4),	/* L1D_PEND_MISS.PENDING */
+	INTEL_EVENT_CONSTRAINT(0xa3, 0x4),	/* CYCLE_ACTIVITY.* */
+	EVENT_CONSTRAINT_END
+};
+
 static u64 intel_pmu_event_map(int hw_event)
 {
 	return intel_perfmon_event_map[hw_event];
@@ -453,6 +462,12 @@ static __initconst const u64 snb_hw_cache_event_ids
 					 HSW_L3_MISS_REMOTE_HOP1|HSW_L3_MISS_REMOTE_HOP2P)
 #define HSW_LLC_ACCESS			HSW_ANY_RESPONSE
 
+#define BDW_L3_MISS_LOCAL		BIT(26)
+#define BDW_L3_MISS			(BDW_L3_MISS_LOCAL| \
+					 HSW_L3_MISS_REMOTE_HOP0|HSW_L3_MISS_REMOTE_HOP1| \
+					 HSW_L3_MISS_REMOTE_HOP2P)
+
+
 static __initconst const u64 hsw_hw_cache_event_ids
 				[PERF_COUNT_HW_CACHE_MAX]
 				[PERF_COUNT_HW_CACHE_OP_MAX]
@@ -2730,6 +2745,38 @@ __init int intel_pmu_init(void)
 		pr_cont("Haswell events, ");
 		break;
 
+	case 61: /* 14nm Broadwell Core-M */
+	case 86: /* 14nm Broadwell Xeon D */
+		x86_pmu.late_ack = true;
+		memcpy(hw_cache_event_ids, hsw_hw_cache_event_ids, sizeof(hw_cache_event_ids));
+		memcpy(hw_cache_extra_regs, hsw_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
+
+		/* L3_MISS_LOCAL_DRAM is BIT(26) in Broadwell */
+		hw_cache_extra_regs[C(LL)][C(OP_READ)][C(RESULT_MISS)] = HSW_DEMAND_READ |
+									 BDW_L3_MISS|HSW_SNOOP_DRAM;
+		hw_cache_extra_regs[C(LL)][C(OP_WRITE)][C(RESULT_MISS)] = HSW_DEMAND_WRITE|BDW_L3_MISS|
+									  HSW_SNOOP_DRAM;
+		hw_cache_extra_regs[C(NODE)][C(OP_READ)][C(RESULT_ACCESS)] = HSW_DEMAND_READ|
+									     BDW_L3_MISS_LOCAL|HSW_SNOOP_DRAM;
+		hw_cache_extra_regs[C(NODE)][C(OP_WRITE)][C(RESULT_ACCESS)] = HSW_DEMAND_WRITE|
+									      BDW_L3_MISS_LOCAL|HSW_SNOOP_DRAM;
+
+		intel_pmu_lbr_init_snb();
+
+		x86_pmu.event_constraints = intel_bdw_event_constraints;
+		x86_pmu.pebs_constraints = intel_hsw_pebs_event_constraints;
+		x86_pmu.extra_regs = intel_snbep_extra_regs;
+		x86_pmu.pebs_aliases = intel_pebs_aliases_snb;
+		/* all extra regs are per-cpu when HT is on */
+		x86_pmu.er_flags |= ERF_HAS_RSP_1;
+		x86_pmu.er_flags |= ERF_NO_HT_SHARING;
+
+		x86_pmu.hw_config = hsw_hw_config;
+		x86_pmu.get_event_constraints = hsw_get_event_constraints;
+		x86_pmu.cpu_events = hsw_events_attrs;
+		pr_cont("Broadwell events, ");
+		break;
+
 	default:
 		switch (x86_pmu.version) {
 		case 1:
-- 
cgit v1.2.3


From 294fe0f52a44c6f207211de0686c369a961b5533 Mon Sep 17 00:00:00 2001
From: Andi Kleen
Date: Tue, 17 Feb 2015 18:18:06 -0800
Subject: perf/x86/intel: Add INST_RETIRED.ALL workarounds

On Broadwell INST_RETIRED.ALL cannot be used with any period
that doesn't have the lowest 6 bits cleared. And the period
should not be smaller than 128.

This is erratum BDM11 and BDM55:

  http://www.intel.com/content/dam/www/public/us/en/documents/specification-updates/5th-gen-core-family-spec-update.pdf

BDM11: When using a period < 100; we may get incorrect PEBS/PMI
interrupts and/or an invalid counter state.
BDM55: When bit0-5 of the period are !0 we may get redundant PEBS
records on overflow.

Add a new callback to enforce this, and set it for Broadwell.

How does this handle the case when an app requests a specific
period with some of the bottom bits set?

Short answer:

Any useful instruction sampling period needs to be 4-6 orders
of magnitude larger than 128, as an PMI every 128 instructions
would instantly overwhelm the system and be throttled.
So the +-64 error from this is really small compared to the
period, much smaller than normal system jitter.

Long answer (by Peterz):

IFF we guarantee perf_event_attr::sample_period >= 128.

Suppose we start out with sample_period=192; then we'll set period_left
to 192, we'll end up with left = 128 (we truncate the lower bits). We
get an interrupt, find that period_left = 64 (>0 so we return 0 and
don't get an overflow handler), up that to 128. Then we trigger again,
at n=256. Then we find period_left = -64 (<=0 so we return 1 and do get
an overflow). We increment with sample_period so we get left = 128. We
fire again, at n=384, period_left = 0 (<=0 so we return 1 and get an
overflow). And on and on.

So while the individual interrupts are 'wrong' we get then with
interval=256,128 in exactly the right ratio to average out at 192. And
this works for everything >=128.

So the num_samples*fixed_period thing is still entirely correct +- 127,
which is good enough I'd say, as you already have that error anyhow.

So no need to 'fix' the tools, al we need to do is refuse to create
INST_RETIRED:ALL events with sample_period < 128.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
[ Updated comments and changelog a bit. ]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1424225886-18652-3-git-send-email-andi@firstfloor.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event.c       |  9 +++++++++
 arch/x86/kernel/cpu/perf_event.h       |  1 +
 arch/x86/kernel/cpu/perf_event_intel.c | 27 +++++++++++++++++++++++++++
 3 files changed, 37 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index e0dab5ce61e9..ec6e982fd464 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -451,6 +451,12 @@ int x86_pmu_hw_config(struct perf_event *event)
 	if (event->attr.type == PERF_TYPE_RAW)
 		event->hw.config |= event->attr.config & X86_RAW_EVENT_MASK;
 
+	if (event->attr.sample_period && x86_pmu.limit_period) {
+		if (x86_pmu.limit_period(event, event->attr.sample_period) >
+				event->attr.sample_period)
+			return -EINVAL;
+	}
+
 	return x86_setup_perfctr(event);
 }
 
@@ -988,6 +994,9 @@ int x86_perf_event_set_period(struct perf_event *event)
 	if (left > x86_pmu.max_period)
 		left = x86_pmu.max_period;
 
+	if (x86_pmu.limit_period)
+		left = x86_pmu.limit_period(event, left);
+
 	per_cpu(pmc_prev_left[idx], smp_processor_id()) = left;
 
 	/*
diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index a371d27d6795..87e5081f4cdc 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -451,6 +451,7 @@ struct x86_pmu {
 	struct x86_pmu_quirk *quirks;
 	int		perfctr_second_write;
 	bool		late_ack;
+	unsigned	(*limit_period)(struct perf_event *event, unsigned l);
 
 	/*
 	 * sysfs attrs
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 28838536a9f7..fc6dbc46af4a 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -2096,6 +2096,32 @@ hsw_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
 	return c;
 }
 
+/*
+ * Broadwell:
+ *
+ * The INST_RETIRED.ALL period always needs to have lowest 6 bits cleared
+ * (BDM55) and it must not use a period smaller than 100 (BDM11). We combine
+ * the two to enforce a minimum period of 128 (the smallest value that has bits
+ * 0-5 cleared and >= 100).
+ *
+ * Because of how the code in x86_perf_event_set_period() works, the truncation
+ * of the lower 6 bits is 'harmless' as we'll occasionally add a longer period
+ * to make up for the 'lost' events due to carrying the 'error' in period_left.
+ *
+ * Therefore the effective (average) period matches the requested period,
+ * despite coarser hardware granularity.
+ */
+static unsigned bdw_limit_period(struct perf_event *event, unsigned left)
+{
+	if ((event->hw.config & INTEL_ARCH_EVENT_MASK) ==
+			X86_CONFIG(.event=0xc0, .umask=0x01)) {
+		if (left < 128)
+			left = 128;
+		left &= ~0x3fu;
+	}
+	return left;
+}
+
 PMU_FORMAT_ATTR(event,	"config:0-7"	);
 PMU_FORMAT_ATTR(umask,	"config:8-15"	);
 PMU_FORMAT_ATTR(edge,	"config:18"	);
@@ -2774,6 +2800,7 @@ __init int intel_pmu_init(void)
 		x86_pmu.hw_config = hsw_hw_config;
 		x86_pmu.get_event_constraints = hsw_get_event_constraints;
 		x86_pmu.cpu_events = hsw_events_attrs;
+		x86_pmu.limit_period = bdw_limit_period;
 		pr_cont("Broadwell events, ");
 		break;
 
-- 
cgit v1.2.3


From 876e78818def2983be55878b21f7152fbaebbd36 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Thu, 19 Mar 2015 10:09:06 +0100
Subject: time: Rename timekeeper::tkr to timekeeper::tkr_mono

In preparation of adding another tkr field, rename this one to
tkr_mono. Also rename tk_read_base::base_mono to tk_read_base::base,
since the structure is not specific to CLOCK_MONOTONIC and the mono
name got added to the tk_read_base instance.

Lots of trivial churn.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: John Stultz <john.stultz@linaro.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20150319093400.344679419@infradead.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/arm64/kernel/vdso.c            |  10 +--
 arch/s390/kernel/time.c             |  18 ++---
 arch/tile/kernel/time.c             |  24 +++---
 arch/x86/kernel/vsyscall_gtod.c     |  24 +++---
 arch/x86/kvm/x86.c                  |  14 ++--
 include/linux/timekeeper_internal.h |  12 +--
 kernel/time/timekeeping.c           | 150 ++++++++++++++++++------------------
 7 files changed, 126 insertions(+), 126 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/arm64/kernel/vdso.c b/arch/arm64/kernel/vdso.c
index 32aeea083d93..ec37ab3f524f 100644
--- a/arch/arm64/kernel/vdso.c
+++ b/arch/arm64/kernel/vdso.c
@@ -200,7 +200,7 @@ up_fail:
 void update_vsyscall(struct timekeeper *tk)
 {
 	struct timespec xtime_coarse;
-	u32 use_syscall = strcmp(tk->tkr.clock->name, "arch_sys_counter");
+	u32 use_syscall = strcmp(tk->tkr_mono.clock->name, "arch_sys_counter");
 
 	++vdso_data->tb_seq_count;
 	smp_wmb();
@@ -213,11 +213,11 @@ void update_vsyscall(struct timekeeper *tk)
 	vdso_data->wtm_clock_nsec		= tk->wall_to_monotonic.tv_nsec;
 
 	if (!use_syscall) {
-		vdso_data->cs_cycle_last	= tk->tkr.cycle_last;
+		vdso_data->cs_cycle_last	= tk->tkr_mono.cycle_last;
 		vdso_data->xtime_clock_sec	= tk->xtime_sec;
-		vdso_data->xtime_clock_nsec	= tk->tkr.xtime_nsec;
-		vdso_data->cs_mult		= tk->tkr.mult;
-		vdso_data->cs_shift		= tk->tkr.shift;
+		vdso_data->xtime_clock_nsec	= tk->tkr_mono.xtime_nsec;
+		vdso_data->cs_mult		= tk->tkr_mono.mult;
+		vdso_data->cs_shift		= tk->tkr_mono.shift;
 	}
 
 	smp_wmb();
diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c
index 6c273cd815bb..170ddd2018b3 100644
--- a/arch/s390/kernel/time.c
+++ b/arch/s390/kernel/time.c
@@ -215,20 +215,20 @@ void update_vsyscall(struct timekeeper *tk)
 {
 	u64 nsecps;
 
-	if (tk->tkr.clock != &clocksource_tod)
+	if (tk->tkr_mono.clock != &clocksource_tod)
 		return;
 
 	/* Make userspace gettimeofday spin until we're done. */
 	++vdso_data->tb_update_count;
 	smp_wmb();
-	vdso_data->xtime_tod_stamp = tk->tkr.cycle_last;
+	vdso_data->xtime_tod_stamp = tk->tkr_mono.cycle_last;
 	vdso_data->xtime_clock_sec = tk->xtime_sec;
-	vdso_data->xtime_clock_nsec = tk->tkr.xtime_nsec;
+	vdso_data->xtime_clock_nsec = tk->tkr_mono.xtime_nsec;
 	vdso_data->wtom_clock_sec =
 		tk->xtime_sec + tk->wall_to_monotonic.tv_sec;
-	vdso_data->wtom_clock_nsec = tk->tkr.xtime_nsec +
-		+ ((u64) tk->wall_to_monotonic.tv_nsec << tk->tkr.shift);
-	nsecps = (u64) NSEC_PER_SEC << tk->tkr.shift;
+	vdso_data->wtom_clock_nsec = tk->tkr_mono.xtime_nsec +
+		+ ((u64) tk->wall_to_monotonic.tv_nsec << tk->tkr_mono.shift);
+	nsecps = (u64) NSEC_PER_SEC << tk->tkr_mono.shift;
 	while (vdso_data->wtom_clock_nsec >= nsecps) {
 		vdso_data->wtom_clock_nsec -= nsecps;
 		vdso_data->wtom_clock_sec++;
@@ -236,7 +236,7 @@ void update_vsyscall(struct timekeeper *tk)
 
 	vdso_data->xtime_coarse_sec = tk->xtime_sec;
 	vdso_data->xtime_coarse_nsec =
-		(long)(tk->tkr.xtime_nsec >> tk->tkr.shift);
+		(long)(tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift);
 	vdso_data->wtom_coarse_sec =
 		vdso_data->xtime_coarse_sec + tk->wall_to_monotonic.tv_sec;
 	vdso_data->wtom_coarse_nsec =
@@ -246,8 +246,8 @@ void update_vsyscall(struct timekeeper *tk)
 		vdso_data->wtom_coarse_sec++;
 	}
 
-	vdso_data->tk_mult = tk->tkr.mult;
-	vdso_data->tk_shift = tk->tkr.shift;
+	vdso_data->tk_mult = tk->tkr_mono.mult;
+	vdso_data->tk_shift = tk->tkr_mono.shift;
 	smp_wmb();
 	++vdso_data->tb_update_count;
 }
diff --git a/arch/tile/kernel/time.c b/arch/tile/kernel/time.c
index d412b0856c0a..00178ecf9aea 100644
--- a/arch/tile/kernel/time.c
+++ b/arch/tile/kernel/time.c
@@ -257,34 +257,34 @@ void update_vsyscall_tz(void)
 
 void update_vsyscall(struct timekeeper *tk)
 {
-	if (tk->tkr.clock != &cycle_counter_cs)
+	if (tk->tkr_mono.clock != &cycle_counter_cs)
 		return;
 
 	write_seqcount_begin(&vdso_data->tb_seq);
 
-	vdso_data->cycle_last		= tk->tkr.cycle_last;
-	vdso_data->mask			= tk->tkr.mask;
-	vdso_data->mult			= tk->tkr.mult;
-	vdso_data->shift		= tk->tkr.shift;
+	vdso_data->cycle_last		= tk->tkr_mono.cycle_last;
+	vdso_data->mask			= tk->tkr_mono.mask;
+	vdso_data->mult			= tk->tkr_mono.mult;
+	vdso_data->shift		= tk->tkr_mono.shift;
 
 	vdso_data->wall_time_sec	= tk->xtime_sec;
-	vdso_data->wall_time_snsec	= tk->tkr.xtime_nsec;
+	vdso_data->wall_time_snsec	= tk->tkr_mono.xtime_nsec;
 
 	vdso_data->monotonic_time_sec	= tk->xtime_sec
 					+ tk->wall_to_monotonic.tv_sec;
-	vdso_data->monotonic_time_snsec	= tk->tkr.xtime_nsec
+	vdso_data->monotonic_time_snsec	= tk->tkr_mono.xtime_nsec
 					+ ((u64)tk->wall_to_monotonic.tv_nsec
-						<< tk->tkr.shift);
+						<< tk->tkr_mono.shift);
 	while (vdso_data->monotonic_time_snsec >=
-					(((u64)NSEC_PER_SEC) << tk->tkr.shift)) {
+					(((u64)NSEC_PER_SEC) << tk->tkr_mono.shift)) {
 		vdso_data->monotonic_time_snsec -=
-					((u64)NSEC_PER_SEC) << tk->tkr.shift;
+					((u64)NSEC_PER_SEC) << tk->tkr_mono.shift;
 		vdso_data->monotonic_time_sec++;
 	}
 
 	vdso_data->wall_time_coarse_sec	= tk->xtime_sec;
-	vdso_data->wall_time_coarse_nsec = (long)(tk->tkr.xtime_nsec >>
-						 tk->tkr.shift);
+	vdso_data->wall_time_coarse_nsec = (long)(tk->tkr_mono.xtime_nsec >>
+						 tk->tkr_mono.shift);
 
 	vdso_data->monotonic_time_coarse_sec =
 		vdso_data->wall_time_coarse_sec + tk->wall_to_monotonic.tv_sec;
diff --git a/arch/x86/kernel/vsyscall_gtod.c b/arch/x86/kernel/vsyscall_gtod.c
index c7d791f32b98..51e330416995 100644
--- a/arch/x86/kernel/vsyscall_gtod.c
+++ b/arch/x86/kernel/vsyscall_gtod.c
@@ -31,30 +31,30 @@ void update_vsyscall(struct timekeeper *tk)
 	gtod_write_begin(vdata);
 
 	/* copy vsyscall data */
-	vdata->vclock_mode	= tk->tkr.clock->archdata.vclock_mode;
-	vdata->cycle_last	= tk->tkr.cycle_last;
-	vdata->mask		= tk->tkr.mask;
-	vdata->mult		= tk->tkr.mult;
-	vdata->shift		= tk->tkr.shift;
+	vdata->vclock_mode	= tk->tkr_mono.clock->archdata.vclock_mode;
+	vdata->cycle_last	= tk->tkr_mono.cycle_last;
+	vdata->mask		= tk->tkr_mono.mask;
+	vdata->mult		= tk->tkr_mono.mult;
+	vdata->shift		= tk->tkr_mono.shift;
 
 	vdata->wall_time_sec		= tk->xtime_sec;
-	vdata->wall_time_snsec		= tk->tkr.xtime_nsec;
+	vdata->wall_time_snsec		= tk->tkr_mono.xtime_nsec;
 
 	vdata->monotonic_time_sec	= tk->xtime_sec
 					+ tk->wall_to_monotonic.tv_sec;
-	vdata->monotonic_time_snsec	= tk->tkr.xtime_nsec
+	vdata->monotonic_time_snsec	= tk->tkr_mono.xtime_nsec
 					+ ((u64)tk->wall_to_monotonic.tv_nsec
-						<< tk->tkr.shift);
+						<< tk->tkr_mono.shift);
 	while (vdata->monotonic_time_snsec >=
-					(((u64)NSEC_PER_SEC) << tk->tkr.shift)) {
+					(((u64)NSEC_PER_SEC) << tk->tkr_mono.shift)) {
 		vdata->monotonic_time_snsec -=
-					((u64)NSEC_PER_SEC) << tk->tkr.shift;
+					((u64)NSEC_PER_SEC) << tk->tkr_mono.shift;
 		vdata->monotonic_time_sec++;
 	}
 
 	vdata->wall_time_coarse_sec	= tk->xtime_sec;
-	vdata->wall_time_coarse_nsec	= (long)(tk->tkr.xtime_nsec >>
-						 tk->tkr.shift);
+	vdata->wall_time_coarse_nsec	= (long)(tk->tkr_mono.xtime_nsec >>
+						 tk->tkr_mono.shift);
 
 	vdata->monotonic_time_coarse_sec =
 		vdata->wall_time_coarse_sec + tk->wall_to_monotonic.tv_sec;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index bd7a70be41b3..d7a300e0147f 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1070,19 +1070,19 @@ static void update_pvclock_gtod(struct timekeeper *tk)
 	struct pvclock_gtod_data *vdata = &pvclock_gtod_data;
 	u64 boot_ns;
 
-	boot_ns = ktime_to_ns(ktime_add(tk->tkr.base_mono, tk->offs_boot));
+	boot_ns = ktime_to_ns(ktime_add(tk->tkr_mono.base, tk->offs_boot));
 
 	write_seqcount_begin(&vdata->seq);
 
 	/* copy pvclock gtod data */
-	vdata->clock.vclock_mode	= tk->tkr.clock->archdata.vclock_mode;
-	vdata->clock.cycle_last		= tk->tkr.cycle_last;
-	vdata->clock.mask		= tk->tkr.mask;
-	vdata->clock.mult		= tk->tkr.mult;
-	vdata->clock.shift		= tk->tkr.shift;
+	vdata->clock.vclock_mode	= tk->tkr_mono.clock->archdata.vclock_mode;
+	vdata->clock.cycle_last		= tk->tkr_mono.cycle_last;
+	vdata->clock.mask		= tk->tkr_mono.mask;
+	vdata->clock.mult		= tk->tkr_mono.mult;
+	vdata->clock.shift		= tk->tkr_mono.shift;
 
 	vdata->boot_ns			= boot_ns;
-	vdata->nsec_base		= tk->tkr.xtime_nsec;
+	vdata->nsec_base		= tk->tkr_mono.xtime_nsec;
 
 	write_seqcount_end(&vdata->seq);
 }
diff --git a/include/linux/timekeeper_internal.h b/include/linux/timekeeper_internal.h
index 05af9a334893..73df17f1535f 100644
--- a/include/linux/timekeeper_internal.h
+++ b/include/linux/timekeeper_internal.h
@@ -16,16 +16,16 @@
  * @read:	Read function of @clock
  * @mask:	Bitmask for two's complement subtraction of non 64bit clocks
  * @cycle_last: @clock cycle value at last update
- * @mult:	NTP adjusted multiplier for scaled math conversion
+ * @mult:	(NTP adjusted) multiplier for scaled math conversion
  * @shift:	Shift value for scaled math conversion
  * @xtime_nsec: Shifted (fractional) nano seconds offset for readout
- * @base_mono:  ktime_t (nanoseconds) base time for readout
+ * @base:	ktime_t (nanoseconds) base time for readout
  *
  * This struct has size 56 byte on 64 bit. Together with a seqcount it
  * occupies a single 64byte cache line.
  *
  * The struct is separate from struct timekeeper as it is also used
- * for a fast NMI safe accessor to clock monotonic.
+ * for a fast NMI safe accessors.
  */
 struct tk_read_base {
 	struct clocksource	*clock;
@@ -35,12 +35,12 @@ struct tk_read_base {
 	u32			mult;
 	u32			shift;
 	u64			xtime_nsec;
-	ktime_t			base_mono;
+	ktime_t			base;
 };
 
 /**
  * struct timekeeper - Structure holding internal timekeeping values.
- * @tkr:		The readout base structure
+ * @tkr_mono:		The readout base structure for CLOCK_MONOTONIC
  * @xtime_sec:		Current CLOCK_REALTIME time in seconds
  * @ktime_sec:		Current CLOCK_MONOTONIC time in seconds
  * @wall_to_monotonic:	CLOCK_REALTIME to CLOCK_MONOTONIC offset
@@ -76,7 +76,7 @@ struct tk_read_base {
  * used instead.
  */
 struct timekeeper {
-	struct tk_read_base	tkr;
+	struct tk_read_base	tkr_mono;
 	u64			xtime_sec;
 	unsigned long		ktime_sec;
 	struct timespec64	wall_to_monotonic;
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 892f6cbf1e67..1405091f3acb 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -68,8 +68,8 @@ bool __read_mostly persistent_clock_exist = false;
 
 static inline void tk_normalize_xtime(struct timekeeper *tk)
 {
-	while (tk->tkr.xtime_nsec >= ((u64)NSEC_PER_SEC << tk->tkr.shift)) {
-		tk->tkr.xtime_nsec -= (u64)NSEC_PER_SEC << tk->tkr.shift;
+	while (tk->tkr_mono.xtime_nsec >= ((u64)NSEC_PER_SEC << tk->tkr_mono.shift)) {
+		tk->tkr_mono.xtime_nsec -= (u64)NSEC_PER_SEC << tk->tkr_mono.shift;
 		tk->xtime_sec++;
 	}
 }
@@ -79,20 +79,20 @@ static inline struct timespec64 tk_xtime(struct timekeeper *tk)
 	struct timespec64 ts;
 
 	ts.tv_sec = tk->xtime_sec;
-	ts.tv_nsec = (long)(tk->tkr.xtime_nsec >> tk->tkr.shift);
+	ts.tv_nsec = (long)(tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift);
 	return ts;
 }
 
 static void tk_set_xtime(struct timekeeper *tk, const struct timespec64 *ts)
 {
 	tk->xtime_sec = ts->tv_sec;
-	tk->tkr.xtime_nsec = (u64)ts->tv_nsec << tk->tkr.shift;
+	tk->tkr_mono.xtime_nsec = (u64)ts->tv_nsec << tk->tkr_mono.shift;
 }
 
 static void tk_xtime_add(struct timekeeper *tk, const struct timespec64 *ts)
 {
 	tk->xtime_sec += ts->tv_sec;
-	tk->tkr.xtime_nsec += (u64)ts->tv_nsec << tk->tkr.shift;
+	tk->tkr_mono.xtime_nsec += (u64)ts->tv_nsec << tk->tkr_mono.shift;
 	tk_normalize_xtime(tk);
 }
 
@@ -136,8 +136,8 @@ static long timekeeping_last_warning;
 static void timekeeping_check_update(struct timekeeper *tk, cycle_t offset)
 {
 
-	cycle_t max_cycles = tk->tkr.clock->max_cycles;
-	const char *name = tk->tkr.clock->name;
+	cycle_t max_cycles = tk->tkr_mono.clock->max_cycles;
+	const char *name = tk->tkr_mono.clock->name;
 
 	if (offset > max_cycles) {
 		printk_deferred("WARNING: timekeeping: Cycle offset (%lld) is larger than allowed by the '%s' clock's max_cycles value (%lld): time overflow danger\n",
@@ -246,11 +246,11 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock)
 	u64 tmp, ntpinterval;
 	struct clocksource *old_clock;
 
-	old_clock = tk->tkr.clock;
-	tk->tkr.clock = clock;
-	tk->tkr.read = clock->read;
-	tk->tkr.mask = clock->mask;
-	tk->tkr.cycle_last = tk->tkr.read(clock);
+	old_clock = tk->tkr_mono.clock;
+	tk->tkr_mono.clock = clock;
+	tk->tkr_mono.read = clock->read;
+	tk->tkr_mono.mask = clock->mask;
+	tk->tkr_mono.cycle_last = tk->tkr_mono.read(clock);
 
 	/* Do the ns -> cycle conversion first, using original mult */
 	tmp = NTP_INTERVAL_LENGTH;
@@ -274,11 +274,11 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock)
 	if (old_clock) {
 		int shift_change = clock->shift - old_clock->shift;
 		if (shift_change < 0)
-			tk->tkr.xtime_nsec >>= -shift_change;
+			tk->tkr_mono.xtime_nsec >>= -shift_change;
 		else
-			tk->tkr.xtime_nsec <<= shift_change;
+			tk->tkr_mono.xtime_nsec <<= shift_change;
 	}
-	tk->tkr.shift = clock->shift;
+	tk->tkr_mono.shift = clock->shift;
 
 	tk->ntp_error = 0;
 	tk->ntp_error_shift = NTP_SCALE_SHIFT - clock->shift;
@@ -289,7 +289,7 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock)
 	 * active clocksource. These value will be adjusted via NTP
 	 * to counteract clock drifting.
 	 */
-	tk->tkr.mult = clock->mult;
+	tk->tkr_mono.mult = clock->mult;
 	tk->ntp_err_mult = 0;
 }
 
@@ -318,11 +318,11 @@ static inline s64 timekeeping_get_ns(struct tk_read_base *tkr)
 
 static inline s64 timekeeping_get_ns_raw(struct timekeeper *tk)
 {
-	struct clocksource *clock = tk->tkr.clock;
+	struct clocksource *clock = tk->tkr_mono.clock;
 	cycle_t delta;
 	s64 nsec;
 
-	delta = timekeeping_get_delta(&tk->tkr);
+	delta = timekeeping_get_delta(&tk->tkr_mono);
 
 	/* convert delta to nanoseconds. */
 	nsec = clocksource_cyc2ns(delta, clock->mult, clock->shift);
@@ -428,7 +428,7 @@ u64 notrace ktime_get_mono_fast_ns(void)
 	do {
 		seq = raw_read_seqcount(&tk_fast_mono.seq);
 		tkr = tk_fast_mono.base + (seq & 0x01);
-		now = ktime_to_ns(tkr->base_mono) + timekeeping_get_ns(tkr);
+		now = ktime_to_ns(tkr->base) + timekeeping_get_ns(tkr);
 
 	} while (read_seqcount_retry(&tk_fast_mono.seq, seq));
 	return now;
@@ -456,7 +456,7 @@ static cycle_t dummy_clock_read(struct clocksource *cs)
 static void halt_fast_timekeeper(struct timekeeper *tk)
 {
 	static struct tk_read_base tkr_dummy;
-	struct tk_read_base *tkr = &tk->tkr;
+	struct tk_read_base *tkr = &tk->tkr_mono;
 
 	memcpy(&tkr_dummy, tkr, sizeof(tkr_dummy));
 	cycles_at_suspend = tkr->read(tkr->clock);
@@ -472,8 +472,8 @@ static inline void update_vsyscall(struct timekeeper *tk)
 
 	xt = timespec64_to_timespec(tk_xtime(tk));
 	wm = timespec64_to_timespec(tk->wall_to_monotonic);
-	update_vsyscall_old(&xt, &wm, tk->tkr.clock, tk->tkr.mult,
-			    tk->tkr.cycle_last);
+	update_vsyscall_old(&xt, &wm, tk->tkr_mono.clock, tk->tkr_mono.mult,
+			    tk->tkr_mono.cycle_last);
 }
 
 static inline void old_vsyscall_fixup(struct timekeeper *tk)
@@ -490,11 +490,11 @@ static inline void old_vsyscall_fixup(struct timekeeper *tk)
 	* (shifted nanoseconds), and CONFIG_GENERIC_TIME_VSYSCALL_OLD
 	* users are removed, this can be killed.
 	*/
-	remainder = tk->tkr.xtime_nsec & ((1ULL << tk->tkr.shift) - 1);
-	tk->tkr.xtime_nsec -= remainder;
-	tk->tkr.xtime_nsec += 1ULL << tk->tkr.shift;
+	remainder = tk->tkr_mono.xtime_nsec & ((1ULL << tk->tkr_mono.shift) - 1);
+	tk->tkr_mono.xtime_nsec -= remainder;
+	tk->tkr_mono.xtime_nsec += 1ULL << tk->tkr_mono.shift;
 	tk->ntp_error += remainder << tk->ntp_error_shift;
-	tk->ntp_error -= (1ULL << tk->tkr.shift) << tk->ntp_error_shift;
+	tk->ntp_error -= (1ULL << tk->tkr_mono.shift) << tk->ntp_error_shift;
 }
 #else
 #define old_vsyscall_fixup(tk)
@@ -559,7 +559,7 @@ static inline void tk_update_ktime_data(struct timekeeper *tk)
 	 */
 	seconds = (u64)(tk->xtime_sec + tk->wall_to_monotonic.tv_sec);
 	nsec = (u32) tk->wall_to_monotonic.tv_nsec;
-	tk->tkr.base_mono = ns_to_ktime(seconds * NSEC_PER_SEC + nsec);
+	tk->tkr_mono.base = ns_to_ktime(seconds * NSEC_PER_SEC + nsec);
 
 	/* Update the monotonic raw base */
 	tk->base_raw = timespec64_to_ktime(tk->raw_time);
@@ -569,7 +569,7 @@ static inline void tk_update_ktime_data(struct timekeeper *tk)
 	 * wall_to_monotonic can be greater/equal one second. Take
 	 * this into account before updating tk->ktime_sec.
 	 */
-	nsec += (u32)(tk->tkr.xtime_nsec >> tk->tkr.shift);
+	nsec += (u32)(tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift);
 	if (nsec >= NSEC_PER_SEC)
 		seconds++;
 	tk->ktime_sec = seconds;
@@ -592,7 +592,7 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action)
 		memcpy(&shadow_timekeeper, &tk_core.timekeeper,
 		       sizeof(tk_core.timekeeper));
 
-	update_fast_timekeeper(&tk->tkr);
+	update_fast_timekeeper(&tk->tkr_mono);
 }
 
 /**
@@ -604,18 +604,18 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action)
  */
 static void timekeeping_forward_now(struct timekeeper *tk)
 {
-	struct clocksource *clock = tk->tkr.clock;
+	struct clocksource *clock = tk->tkr_mono.clock;
 	cycle_t cycle_now, delta;
 	s64 nsec;
 
-	cycle_now = tk->tkr.read(clock);
-	delta = clocksource_delta(cycle_now, tk->tkr.cycle_last, tk->tkr.mask);
-	tk->tkr.cycle_last = cycle_now;
+	cycle_now = tk->tkr_mono.read(clock);
+	delta = clocksource_delta(cycle_now, tk->tkr_mono.cycle_last, tk->tkr_mono.mask);
+	tk->tkr_mono.cycle_last = cycle_now;
 
-	tk->tkr.xtime_nsec += delta * tk->tkr.mult;
+	tk->tkr_mono.xtime_nsec += delta * tk->tkr_mono.mult;
 
 	/* If arch requires, add in get_arch_timeoffset() */
-	tk->tkr.xtime_nsec += (u64)arch_gettimeoffset() << tk->tkr.shift;
+	tk->tkr_mono.xtime_nsec += (u64)arch_gettimeoffset() << tk->tkr_mono.shift;
 
 	tk_normalize_xtime(tk);
 
@@ -640,7 +640,7 @@ int __getnstimeofday64(struct timespec64 *ts)
 		seq = read_seqcount_begin(&tk_core.seq);
 
 		ts->tv_sec = tk->xtime_sec;
-		nsecs = timekeeping_get_ns(&tk->tkr);
+		nsecs = timekeeping_get_ns(&tk->tkr_mono);
 
 	} while (read_seqcount_retry(&tk_core.seq, seq));
 
@@ -680,8 +680,8 @@ ktime_t ktime_get(void)
 
 	do {
 		seq = read_seqcount_begin(&tk_core.seq);
-		base = tk->tkr.base_mono;
-		nsecs = timekeeping_get_ns(&tk->tkr);
+		base = tk->tkr_mono.base;
+		nsecs = timekeeping_get_ns(&tk->tkr_mono);
 
 	} while (read_seqcount_retry(&tk_core.seq, seq));
 
@@ -706,8 +706,8 @@ ktime_t ktime_get_with_offset(enum tk_offsets offs)
 
 	do {
 		seq = read_seqcount_begin(&tk_core.seq);
-		base = ktime_add(tk->tkr.base_mono, *offset);
-		nsecs = timekeeping_get_ns(&tk->tkr);
+		base = ktime_add(tk->tkr_mono.base, *offset);
+		nsecs = timekeeping_get_ns(&tk->tkr_mono);
 
 	} while (read_seqcount_retry(&tk_core.seq, seq));
 
@@ -777,7 +777,7 @@ void ktime_get_ts64(struct timespec64 *ts)
 	do {
 		seq = read_seqcount_begin(&tk_core.seq);
 		ts->tv_sec = tk->xtime_sec;
-		nsec = timekeeping_get_ns(&tk->tkr);
+		nsec = timekeeping_get_ns(&tk->tkr_mono);
 		tomono = tk->wall_to_monotonic;
 
 	} while (read_seqcount_retry(&tk_core.seq, seq));
@@ -863,7 +863,7 @@ void getnstime_raw_and_real(struct timespec *ts_raw, struct timespec *ts_real)
 		ts_real->tv_nsec = 0;
 
 		nsecs_raw = timekeeping_get_ns_raw(tk);
-		nsecs_real = timekeeping_get_ns(&tk->tkr);
+		nsecs_real = timekeeping_get_ns(&tk->tkr_mono);
 
 	} while (read_seqcount_retry(&tk_core.seq, seq));
 
@@ -1046,7 +1046,7 @@ static int change_clocksource(void *data)
 	 */
 	if (try_module_get(new->owner)) {
 		if (!new->enable || new->enable(new) == 0) {
-			old = tk->tkr.clock;
+			old = tk->tkr_mono.clock;
 			tk_setup_internals(tk, new);
 			if (old->disable)
 				old->disable(old);
@@ -1074,11 +1074,11 @@ int timekeeping_notify(struct clocksource *clock)
 {
 	struct timekeeper *tk = &tk_core.timekeeper;
 
-	if (tk->tkr.clock == clock)
+	if (tk->tkr_mono.clock == clock)
 		return 0;
 	stop_machine(change_clocksource, clock, NULL);
 	tick_clock_notify();
-	return tk->tkr.clock == clock ? 0 : -1;
+	return tk->tkr_mono.clock == clock ? 0 : -1;
 }
 
 /**
@@ -1119,7 +1119,7 @@ int timekeeping_valid_for_hres(void)
 	do {
 		seq = read_seqcount_begin(&tk_core.seq);
 
-		ret = tk->tkr.clock->flags & CLOCK_SOURCE_VALID_FOR_HRES;
+		ret = tk->tkr_mono.clock->flags & CLOCK_SOURCE_VALID_FOR_HRES;
 
 	} while (read_seqcount_retry(&tk_core.seq, seq));
 
@@ -1138,7 +1138,7 @@ u64 timekeeping_max_deferment(void)
 	do {
 		seq = read_seqcount_begin(&tk_core.seq);
 
-		ret = tk->tkr.clock->max_idle_ns;
+		ret = tk->tkr_mono.clock->max_idle_ns;
 
 	} while (read_seqcount_retry(&tk_core.seq, seq));
 
@@ -1303,7 +1303,7 @@ void timekeeping_inject_sleeptime64(struct timespec64 *delta)
 void timekeeping_resume(void)
 {
 	struct timekeeper *tk = &tk_core.timekeeper;
-	struct clocksource *clock = tk->tkr.clock;
+	struct clocksource *clock = tk->tkr_mono.clock;
 	unsigned long flags;
 	struct timespec64 ts_new, ts_delta;
 	struct timespec tmp;
@@ -1331,16 +1331,16 @@ void timekeeping_resume(void)
 	 * The less preferred source will only be tried if there is no better
 	 * usable source. The rtc part is handled separately in rtc core code.
 	 */
-	cycle_now = tk->tkr.read(clock);
+	cycle_now = tk->tkr_mono.read(clock);
 	if ((clock->flags & CLOCK_SOURCE_SUSPEND_NONSTOP) &&
-		cycle_now > tk->tkr.cycle_last) {
+		cycle_now > tk->tkr_mono.cycle_last) {
 		u64 num, max = ULLONG_MAX;
 		u32 mult = clock->mult;
 		u32 shift = clock->shift;
 		s64 nsec = 0;
 
-		cycle_delta = clocksource_delta(cycle_now, tk->tkr.cycle_last,
-						tk->tkr.mask);
+		cycle_delta = clocksource_delta(cycle_now, tk->tkr_mono.cycle_last,
+						tk->tkr_mono.mask);
 
 		/*
 		 * "cycle_delta * mutl" may cause 64 bits overflow, if the
@@ -1366,7 +1366,7 @@ void timekeeping_resume(void)
 		__timekeeping_inject_sleeptime(tk, &ts_delta);
 
 	/* Re-base the last cycle value */
-	tk->tkr.cycle_last = cycle_now;
+	tk->tkr_mono.cycle_last = cycle_now;
 	tk->ntp_error = 0;
 	timekeeping_suspended = 0;
 	timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET);
@@ -1519,15 +1519,15 @@ static __always_inline void timekeeping_apply_adjustment(struct timekeeper *tk,
 	 *
 	 * XXX - TODO: Doc ntp_error calculation.
 	 */
-	if ((mult_adj > 0) && (tk->tkr.mult + mult_adj < mult_adj)) {
+	if ((mult_adj > 0) && (tk->tkr_mono.mult + mult_adj < mult_adj)) {
 		/* NTP adjustment caused clocksource mult overflow */
 		WARN_ON_ONCE(1);
 		return;
 	}
 
-	tk->tkr.mult += mult_adj;
+	tk->tkr_mono.mult += mult_adj;
 	tk->xtime_interval += interval;
-	tk->tkr.xtime_nsec -= offset;
+	tk->tkr_mono.xtime_nsec -= offset;
 	tk->ntp_error -= (interval - offset) << tk->ntp_error_shift;
 }
 
@@ -1589,13 +1589,13 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset)
 		tk->ntp_err_mult = 0;
 	}
 
-	if (unlikely(tk->tkr.clock->maxadj &&
-		(abs(tk->tkr.mult - tk->tkr.clock->mult)
-			> tk->tkr.clock->maxadj))) {
+	if (unlikely(tk->tkr_mono.clock->maxadj &&
+		(abs(tk->tkr_mono.mult - tk->tkr_mono.clock->mult)
+			> tk->tkr_mono.clock->maxadj))) {
 		printk_once(KERN_WARNING
 			"Adjusting %s more than 11%% (%ld vs %ld)\n",
-			tk->tkr.clock->name, (long)tk->tkr.mult,
-			(long)tk->tkr.clock->mult + tk->tkr.clock->maxadj);
+			tk->tkr_mono.clock->name, (long)tk->tkr_mono.mult,
+			(long)tk->tkr_mono.clock->mult + tk->tkr_mono.clock->maxadj);
 	}
 
 	/*
@@ -1612,9 +1612,9 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset)
 	 * We'll correct this error next time through this function, when
 	 * xtime_nsec is not as small.
 	 */
-	if (unlikely((s64)tk->tkr.xtime_nsec < 0)) {
-		s64 neg = -(s64)tk->tkr.xtime_nsec;
-		tk->tkr.xtime_nsec = 0;
+	if (unlikely((s64)tk->tkr_mono.xtime_nsec < 0)) {
+		s64 neg = -(s64)tk->tkr_mono.xtime_nsec;
+		tk->tkr_mono.xtime_nsec = 0;
 		tk->ntp_error += neg << tk->ntp_error_shift;
 	}
 }
@@ -1629,13 +1629,13 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset)
  */
 static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk)
 {
-	u64 nsecps = (u64)NSEC_PER_SEC << tk->tkr.shift;
+	u64 nsecps = (u64)NSEC_PER_SEC << tk->tkr_mono.shift;
 	unsigned int clock_set = 0;
 
-	while (tk->tkr.xtime_nsec >= nsecps) {
+	while (tk->tkr_mono.xtime_nsec >= nsecps) {
 		int leap;
 
-		tk->tkr.xtime_nsec -= nsecps;
+		tk->tkr_mono.xtime_nsec -= nsecps;
 		tk->xtime_sec++;
 
 		/* Figure out if its a leap sec and apply if needed */
@@ -1680,9 +1680,9 @@ static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset,
 
 	/* Accumulate one shifted interval */
 	offset -= interval;
-	tk->tkr.cycle_last += interval;
+	tk->tkr_mono.cycle_last += interval;
 
-	tk->tkr.xtime_nsec += tk->xtime_interval << shift;
+	tk->tkr_mono.xtime_nsec += tk->xtime_interval << shift;
 	*clock_set |= accumulate_nsecs_to_secs(tk);
 
 	/* Accumulate raw time */
@@ -1725,8 +1725,8 @@ void update_wall_time(void)
 #ifdef CONFIG_ARCH_USES_GETTIMEOFFSET
 	offset = real_tk->cycle_interval;
 #else
-	offset = clocksource_delta(tk->tkr.read(tk->tkr.clock),
-				   tk->tkr.cycle_last, tk->tkr.mask);
+	offset = clocksource_delta(tk->tkr_mono.read(tk->tkr_mono.clock),
+				   tk->tkr_mono.cycle_last, tk->tkr_mono.mask);
 #endif
 
 	/* Check if there's really nothing to do */
@@ -1890,8 +1890,8 @@ ktime_t ktime_get_update_offsets_tick(ktime_t *offs_real, ktime_t *offs_boot,
 	do {
 		seq = read_seqcount_begin(&tk_core.seq);
 
-		base = tk->tkr.base_mono;
-		nsecs = tk->tkr.xtime_nsec >> tk->tkr.shift;
+		base = tk->tkr_mono.base;
+		nsecs = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift;
 
 		*offs_real = tk->offs_real;
 		*offs_boot = tk->offs_boot;
@@ -1922,8 +1922,8 @@ ktime_t ktime_get_update_offsets_now(ktime_t *offs_real, ktime_t *offs_boot,
 	do {
 		seq = read_seqcount_begin(&tk_core.seq);
 
-		base = tk->tkr.base_mono;
-		nsecs = timekeeping_get_ns(&tk->tkr);
+		base = tk->tkr_mono.base;
+		nsecs = timekeeping_get_ns(&tk->tkr_mono);
 
 		*offs_real = tk->offs_real;
 		*offs_boot = tk->offs_boot;
-- 
cgit v1.2.3


From 9332d250b4b4f67c633894b311e022e3cf943bd5 Mon Sep 17 00:00:00 2001
From: David Ahern
Date: Wed, 18 Feb 2015 10:45:43 -0700
Subject: perf/x86: Remove redundant calls to perf_pmu_{dis|en}able()

perf_pmu_disable() is called before pmu->add() and perf_pmu_enable() is called
afterwards. No need to call these inside of x86_pmu_add() as well.

Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1424281543-67335-1-git-send-email-dsahern@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index ec6e982fd464..ac41b3ad1fc9 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -1044,7 +1044,6 @@ static int x86_pmu_add(struct perf_event *event, int flags)
 
 	hwc = &event->hw;
 
-	perf_pmu_disable(event->pmu);
 	n0 = cpuc->n_events;
 	ret = n = collect_events(cpuc, event, false);
 	if (ret < 0)
@@ -1082,7 +1081,6 @@ done_collect:
 
 	ret = 0;
 out:
-	perf_pmu_enable(event->pmu);
 	return ret;
 }
 
-- 
cgit v1.2.3


From 146b2b097d7a322b64b88a927fc5d870fc79a60b Mon Sep 17 00:00:00 2001
From: Denys Vlasenko
Date: Wed, 25 Mar 2015 18:18:13 +0100
Subject: x86/asm/entry/64: Use better label name, fix comments

A named label "ret_from_sys_call" implies that there are jumps
to this location from elsewhere, as happens with many other
labels in this file.

But this label is used only by the JMP a few insns above.
To make that obvious, use local numeric label instead.

Improve comments:

"and return regs->ax" isn't too informative. We always return
regs->ax.

The comment suggesting that it'd be cool to use rip relative
addressing for CALL is deleted. It's unclear why that would be
an improvement - we aren't striving to use position-independent
code here. PIC code here would require something like LEA
sys_call_table(%rip),reg + CALL *(reg,%rax*8)...

"iret frame is also incomplete" is no longer true, fix that too.

Also fix typo in comment.

Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Alexei Starovoitov <ast@plumgrid.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Will Drewry <wad@chromium.org>
Link: http://lkml.kernel.org/r/1427303896-24023-1-git-send-email-dvlasenk@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/entry_64.S | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index bf9afadbb99e..9988c4b2de33 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -258,16 +258,15 @@ system_call_fastpath:
 	andl $__SYSCALL_MASK,%eax
 	cmpl $__NR_syscall_max,%eax
 #endif
-	ja ret_from_sys_call  /* and return regs->ax */
+	ja	1f	/* return -ENOSYS (already in pt_regs->ax) */
 	movq %r10,%rcx
-	call *sys_call_table(,%rax,8)  # XXX:	 rip relative
+	call *sys_call_table(,%rax,8)
 	movq %rax,RAX(%rsp)
+1:
 /*
- * Syscall return path ending with SYSRET (fast path)
- * Has incompletely filled pt_regs, iret frame is also incomplete.
+ * Syscall return path ending with SYSRET (fast path).
+ * Has incompletely filled pt_regs.
  */
-ret_from_sys_call:
-
 	LOCKDEP_SYS_EXIT
 	DISABLE_INTERRUPTS(CLBR_NONE)
 	TRACE_IRQS_OFF
@@ -1407,7 +1406,7 @@ ENTRY(nmi)
 	 * NMI.
 	 */
 
-	/* Use %rdx as out temp variable throughout */
+	/* Use %rdx as our temp variable throughout */
 	pushq_cfi %rdx
 	CFI_REL_OFFSET rdx, 0
 
-- 
cgit v1.2.3


From 47eb582e702880c302036d17341c7ea1a7dc2a53 Mon Sep 17 00:00:00 2001
From: Denys Vlasenko
Date: Wed, 25 Mar 2015 18:18:15 +0100
Subject: x86/asm/entry/64: Use smaller instructions

The $AUDIT_ARCH_X86_64 parameter to syscall_trace_enter_phase1/2
is a 32-bit constant, loading it with 32-bit MOV produces 5-byte
insn instead of 10-byte MOVABS one.

Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Alexei Starovoitov <ast@plumgrid.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Will Drewry <wad@chromium.org>
Link: http://lkml.kernel.org/r/1427303896-24023-3-git-send-email-dvlasenk@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/entry_64.S | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 9988c4b2de33..f85d2ccec6d2 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -305,7 +305,7 @@ system_call_fastpath:
 	/* Do syscall entry tracing */
 tracesys:
 	movq %rsp, %rdi
-	movq $AUDIT_ARCH_X86_64, %rsi
+	movl $AUDIT_ARCH_X86_64, %esi
 	call syscall_trace_enter_phase1
 	test %rax, %rax
 	jnz tracesys_phase2		/* if needed, run the slow path */
@@ -316,7 +316,7 @@ tracesys:
 tracesys_phase2:
 	SAVE_EXTRA_REGS
 	movq %rsp, %rdi
-	movq $AUDIT_ARCH_X86_64, %rsi
+	movl $AUDIT_ARCH_X86_64, %esi
 	movq %rax,%rdx
 	call syscall_trace_enter_phase2
 
-- 
cgit v1.2.3


From 40e2ec657dcb0ae328db1abc8e37df4caa893391 Mon Sep 17 00:00:00 2001
From: Denys Vlasenko
Date: Wed, 25 Mar 2015 21:14:26 +0100
Subject: x86/irq/tracing: Move ARCH_LOCKDEP_SYS_EXIT defines closer to their
 users

This change simply moves defines around (even if it's not
obvious in a patch form). Nothing is changed.

This is a preparation for folding ARCH_LOCKDEP_SYS_EXIT defines
into their users.

Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Alexei Starovoitov <ast@plumgrid.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Will Drewry <wad@chromium.org>
Link: http://lkml.kernel.org/r/1427314468-12763-2-git-send-email-dvlasenk@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/irqflags.h | 37 +++++++++++++++++--------------------
 1 file changed, 17 insertions(+), 20 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h
index 021bee9b86b6..55866c26d447 100644
--- a/arch/x86/include/asm/irqflags.h
+++ b/arch/x86/include/asm/irqflags.h
@@ -163,12 +163,20 @@ static inline int arch_irqs_disabled(void)
 
 	return arch_irqs_disabled_flags(flags);
 }
+#endif /* !__ASSEMBLY__ */
 
+#ifdef __ASSEMBLY__
+#ifdef CONFIG_TRACE_IRQFLAGS
+#  define TRACE_IRQS_ON		call trace_hardirqs_on_thunk;
+#  define TRACE_IRQS_OFF	call trace_hardirqs_off_thunk;
 #else
-
-#ifdef CONFIG_X86_64
-#define ARCH_LOCKDEP_SYS_EXIT		call lockdep_sys_exit_thunk
-#define ARCH_LOCKDEP_SYS_EXIT_IRQ	\
+#  define TRACE_IRQS_ON
+#  define TRACE_IRQS_OFF
+#endif
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+#  ifdef CONFIG_X86_64
+#    define ARCH_LOCKDEP_SYS_EXIT	call lockdep_sys_exit_thunk
+#    define ARCH_LOCKDEP_SYS_EXIT_IRQ \
 	TRACE_IRQS_ON; \
 	sti; \
 	SAVE_EXTRA_REGS; \
@@ -176,9 +184,8 @@ static inline int arch_irqs_disabled(void)
 	RESTORE_EXTRA_REGS; \
 	cli; \
 	TRACE_IRQS_OFF;
-
-#else
-#define ARCH_LOCKDEP_SYS_EXIT			\
+#  else
+#    define ARCH_LOCKDEP_SYS_EXIT \
 	pushl %eax;				\
 	pushl %ecx;				\
 	pushl %edx;				\
@@ -186,24 +193,14 @@ static inline int arch_irqs_disabled(void)
 	popl %edx;				\
 	popl %ecx;				\
 	popl %eax;
-
-#define ARCH_LOCKDEP_SYS_EXIT_IRQ
-#endif
-
-#ifdef CONFIG_TRACE_IRQFLAGS
-#  define TRACE_IRQS_ON		call trace_hardirqs_on_thunk;
-#  define TRACE_IRQS_OFF	call trace_hardirqs_off_thunk;
-#else
-#  define TRACE_IRQS_ON
-#  define TRACE_IRQS_OFF
-#endif
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
+#    define ARCH_LOCKDEP_SYS_EXIT_IRQ
+#  endif
 #  define LOCKDEP_SYS_EXIT	ARCH_LOCKDEP_SYS_EXIT
 #  define LOCKDEP_SYS_EXIT_IRQ	ARCH_LOCKDEP_SYS_EXIT_IRQ
 # else
 #  define LOCKDEP_SYS_EXIT
 #  define LOCKDEP_SYS_EXIT_IRQ
 # endif
-
 #endif /* __ASSEMBLY__ */
+
 #endif
-- 
cgit v1.2.3


From 7dc7cc0780b04935f1127fa22ee23e9d6daf166a Mon Sep 17 00:00:00 2001
From: Denys Vlasenko
Date: Wed, 25 Mar 2015 21:14:27 +0100
Subject: x86/irq/tracing: Fold ARCH_LOCKDEP_SYS_EXIT defines into their users

There is no need to have an extra level of macro indirection
here.

Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Alexei Starovoitov <ast@plumgrid.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Will Drewry <wad@chromium.org>
Link: http://lkml.kernel.org/r/1427314468-12763-3-git-send-email-dvlasenk@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/irqflags.h | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h
index 55866c26d447..19355f34c4a3 100644
--- a/arch/x86/include/asm/irqflags.h
+++ b/arch/x86/include/asm/irqflags.h
@@ -175,17 +175,17 @@ static inline int arch_irqs_disabled(void)
 #endif
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 #  ifdef CONFIG_X86_64
-#    define ARCH_LOCKDEP_SYS_EXIT	call lockdep_sys_exit_thunk
-#    define ARCH_LOCKDEP_SYS_EXIT_IRQ \
+#    define LOCKDEP_SYS_EXIT		call lockdep_sys_exit_thunk
+#    define LOCKDEP_SYS_EXIT_IRQ \
 	TRACE_IRQS_ON; \
 	sti; \
 	SAVE_EXTRA_REGS; \
-	LOCKDEP_SYS_EXIT; \
+	call lockdep_sys_exit_thunk; \
 	RESTORE_EXTRA_REGS; \
 	cli; \
 	TRACE_IRQS_OFF;
 #  else
-#    define ARCH_LOCKDEP_SYS_EXIT \
+#    define LOCKDEP_SYS_EXIT \
 	pushl %eax;				\
 	pushl %ecx;				\
 	pushl %edx;				\
@@ -193,14 +193,12 @@ static inline int arch_irqs_disabled(void)
 	popl %edx;				\
 	popl %ecx;				\
 	popl %eax;
-#    define ARCH_LOCKDEP_SYS_EXIT_IRQ
+#    define LOCKDEP_SYS_EXIT_IRQ
 #  endif
-#  define LOCKDEP_SYS_EXIT	ARCH_LOCKDEP_SYS_EXIT
-#  define LOCKDEP_SYS_EXIT_IRQ	ARCH_LOCKDEP_SYS_EXIT_IRQ
-# else
+#else
 #  define LOCKDEP_SYS_EXIT
 #  define LOCKDEP_SYS_EXIT_IRQ
-# endif
+#endif
 #endif /* __ASSEMBLY__ */
 
 #endif
-- 
cgit v1.2.3


From aa6d9a128b861fe7e9dc37bcc37179837674b739 Mon Sep 17 00:00:00 2001
From: Denys Vlasenko
Date: Wed, 25 Mar 2015 21:14:28 +0100
Subject: x86/irq/tracing: Do not save callee-preserved registers around
 lockdep_sys_exit_thunk

Internally, lockdep_sys_exit_thunk saves callee-clobbered
registers, and calls a C function, lockdep_sys_exit. Thus,
callee-preserved registers won't be mangled, there is no need to
save them.

Patch was run-tested.

Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Alexei Starovoitov <ast@plumgrid.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Will Drewry <wad@chromium.org>
Link: http://lkml.kernel.org/r/1427314468-12763-4-git-send-email-dvlasenk@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/irqflags.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h
index 19355f34c4a3..9a63eae04e4b 100644
--- a/arch/x86/include/asm/irqflags.h
+++ b/arch/x86/include/asm/irqflags.h
@@ -179,9 +179,7 @@ static inline int arch_irqs_disabled(void)
 #    define LOCKDEP_SYS_EXIT_IRQ \
 	TRACE_IRQS_ON; \
 	sti; \
-	SAVE_EXTRA_REGS; \
 	call lockdep_sys_exit_thunk; \
-	RESTORE_EXTRA_REGS; \
 	cli; \
 	TRACE_IRQS_OFF;
 #  else
-- 
cgit v1.2.3


From 34f439278cef7b1177f8ce24f9fc81dfc6221d3b Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Fri, 20 Feb 2015 14:05:38 +0100
Subject: perf: Add per event clockid support

While thinking on the whole clock discussion it occurred to me we have
two distinct uses of time:

 1) the tracking of event/ctx/cgroup enabled/running/stopped times
    which includes the self-monitoring support in struct
    perf_event_mmap_page.

 2) the actual timestamps visible in the data records.

And we've been conflating them.

The first is all about tracking time deltas, nobody should really care
in what time base that happens, its all relative information, as long
as its internally consistent it works.

The second however is what people are worried about when having to
merge their data with external sources. And here we have the
discussion on MONOTONIC vs MONOTONIC_RAW etc..

Where MONOTONIC is good for correlating between machines (static
offset), MONOTNIC_RAW is required for correlating against a fixed rate
hardware clock.

This means configurability; now 1) makes that hard because it needs to
be internally consistent across groups of unrelated events; which is
why we had to have a global perf_clock().

However, for 2) it doesn't really matter, perf itself doesn't care
what it writes into the buffer.

The below patch makes the distinction between these two cases by
adding perf_event_clock() which is used for the second case. It
further makes this configurable on a per-event basis, but adds a few
sanity checks such that we cannot combine events with different clocks
in confusing ways.

And since we then have per-event configurability we might as well
retain the 'legacy' behaviour as a default.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: David Ahern <dsahern@gmail.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: John Stultz <john.stultz@linaro.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event.c | 14 ++++++--
 include/linux/perf_event.h       |  2 ++
 include/uapi/linux/perf_event.h  |  6 ++--
 kernel/events/core.c             | 77 ++++++++++++++++++++++++++++++++++++++--
 4 files changed, 91 insertions(+), 8 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index ac41b3ad1fc9..0420ebcac116 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -1978,13 +1978,23 @@ void arch_perf_update_userpage(struct perf_event *event,
 
 	data = cyc2ns_read_begin();
 
+	/*
+	 * Internal timekeeping for enabled/running/stopped times
+	 * is always in the local_clock domain.
+	 */
 	userpg->cap_user_time = 1;
 	userpg->time_mult = data->cyc2ns_mul;
 	userpg->time_shift = data->cyc2ns_shift;
 	userpg->time_offset = data->cyc2ns_offset - now;
 
-	userpg->cap_user_time_zero = 1;
-	userpg->time_zero = data->cyc2ns_offset;
+	/*
+	 * cap_user_time_zero doesn't make sense when we're using a different
+	 * time base for the records.
+	 */
+	if (event->clock == &local_clock) {
+		userpg->cap_user_time_zero = 1;
+		userpg->time_zero = data->cyc2ns_offset;
+	}
 
 	cyc2ns_read_end(data);
 }
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index b16eac5f54ce..401554074de9 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -173,6 +173,7 @@ struct perf_event;
  * pmu::capabilities flags
  */
 #define PERF_PMU_CAP_NO_INTERRUPT		0x01
+#define PERF_PMU_CAP_NO_NMI			0x02
 
 /**
  * struct pmu - generic performance monitoring unit
@@ -457,6 +458,7 @@ struct perf_event {
 	struct pid_namespace		*ns;
 	u64				id;
 
+	u64				(*clock)(void);
 	perf_overflow_handler_t		overflow_handler;
 	void				*overflow_handler_context;
 
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 1e3cd07cf76e..3bb40ddadbe5 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -326,7 +326,8 @@ struct perf_event_attr {
 				exclude_callchain_user   : 1, /* exclude user callchains */
 				mmap2          :  1, /* include mmap with inode data     */
 				comm_exec      :  1, /* flag comm events that are due to an exec */
-				__reserved_1   : 39;
+				use_clockid    :  1, /* use @clockid for time fields */
+				__reserved_1   : 38;
 
 	union {
 		__u32		wakeup_events;	  /* wakeup every n events */
@@ -355,8 +356,7 @@ struct perf_event_attr {
 	 */
 	__u32	sample_stack_user;
 
-	/* Align to u64. */
-	__u32	__reserved_2;
+	__s32	clockid;
 	/*
 	 * Defines set of regs to dump for each sample
 	 * state captured on:
diff --git a/kernel/events/core.c b/kernel/events/core.c
index bb1a7c36e794..c40c2cac2d8e 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -327,6 +327,11 @@ static inline u64 perf_clock(void)
 	return local_clock();
 }
 
+static inline u64 perf_event_clock(struct perf_event *event)
+{
+	return event->clock();
+}
+
 static inline struct perf_cpu_context *
 __get_cpu_context(struct perf_event_context *ctx)
 {
@@ -4762,7 +4767,7 @@ static void __perf_event_header__init_id(struct perf_event_header *header,
 	}
 
 	if (sample_type & PERF_SAMPLE_TIME)
-		data->time = perf_clock();
+		data->time = perf_event_clock(event);
 
 	if (sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER))
 		data->id = primary_event_id(event);
@@ -5340,6 +5345,8 @@ static void perf_event_task_output(struct perf_event *event,
 	task_event->event_id.tid = perf_event_tid(event, task);
 	task_event->event_id.ptid = perf_event_tid(event, current);
 
+	task_event->event_id.time = perf_event_clock(event);
+
 	perf_output_put(&handle, task_event->event_id);
 
 	perf_event__output_id_sample(event, &handle, &sample);
@@ -5373,7 +5380,7 @@ static void perf_event_task(struct task_struct *task,
 			/* .ppid */
 			/* .tid  */
 			/* .ptid */
-			.time = perf_clock(),
+			/* .time */
 		},
 	};
 
@@ -5749,7 +5756,7 @@ static void perf_log_throttle(struct perf_event *event, int enable)
 			.misc = 0,
 			.size = sizeof(throttle_event),
 		},
-		.time		= perf_clock(),
+		.time		= perf_event_clock(event),
 		.id		= primary_event_id(event),
 		.stream_id	= event->id,
 	};
@@ -6293,6 +6300,8 @@ static int perf_swevent_init(struct perf_event *event)
 static struct pmu perf_swevent = {
 	.task_ctx_nr	= perf_sw_context,
 
+	.capabilities	= PERF_PMU_CAP_NO_NMI,
+
 	.event_init	= perf_swevent_init,
 	.add		= perf_swevent_add,
 	.del		= perf_swevent_del,
@@ -6636,6 +6645,8 @@ static int cpu_clock_event_init(struct perf_event *event)
 static struct pmu perf_cpu_clock = {
 	.task_ctx_nr	= perf_sw_context,
 
+	.capabilities	= PERF_PMU_CAP_NO_NMI,
+
 	.event_init	= cpu_clock_event_init,
 	.add		= cpu_clock_event_add,
 	.del		= cpu_clock_event_del,
@@ -6715,6 +6726,8 @@ static int task_clock_event_init(struct perf_event *event)
 static struct pmu perf_task_clock = {
 	.task_ctx_nr	= perf_sw_context,
 
+	.capabilities	= PERF_PMU_CAP_NO_NMI,
+
 	.event_init	= task_clock_event_init,
 	.add		= task_clock_event_add,
 	.del		= task_clock_event_del,
@@ -7200,6 +7213,10 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
 		event->hw.target = task;
 	}
 
+	event->clock = &local_clock;
+	if (parent_event)
+		event->clock = parent_event->clock;
+
 	if (!overflow_handler && parent_event) {
 		overflow_handler = parent_event->overflow_handler;
 		context = parent_event->overflow_handler_context;
@@ -7422,6 +7439,12 @@ perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
 	if (output_event->cpu == -1 && output_event->ctx != event->ctx)
 		goto out;
 
+	/*
+	 * Mixing clocks in the same buffer is trouble you don't need.
+	 */
+	if (output_event->clock != event->clock)
+		goto out;
+
 set:
 	mutex_lock(&event->mmap_mutex);
 	/* Can't redirect output if we've got an active mmap() */
@@ -7454,6 +7477,43 @@ static void mutex_lock_double(struct mutex *a, struct mutex *b)
 	mutex_lock_nested(b, SINGLE_DEPTH_NESTING);
 }
 
+static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id)
+{
+	bool nmi_safe = false;
+
+	switch (clk_id) {
+	case CLOCK_MONOTONIC:
+		event->clock = &ktime_get_mono_fast_ns;
+		nmi_safe = true;
+		break;
+
+	case CLOCK_MONOTONIC_RAW:
+		event->clock = &ktime_get_raw_fast_ns;
+		nmi_safe = true;
+		break;
+
+	case CLOCK_REALTIME:
+		event->clock = &ktime_get_real_ns;
+		break;
+
+	case CLOCK_BOOTTIME:
+		event->clock = &ktime_get_boot_ns;
+		break;
+
+	case CLOCK_TAI:
+		event->clock = &ktime_get_tai_ns;
+		break;
+
+	default:
+		return -EINVAL;
+	}
+
+	if (!nmi_safe && !(event->pmu->capabilities & PERF_PMU_CAP_NO_NMI))
+		return -EINVAL;
+
+	return 0;
+}
+
 /**
  * sys_perf_event_open - open a performance event, associate it to a task/cpu
  *
@@ -7569,6 +7629,12 @@ SYSCALL_DEFINE5(perf_event_open,
 	 */
 	pmu = event->pmu;
 
+	if (attr.use_clockid) {
+		err = perf_event_set_clock(event, attr.clockid);
+		if (err)
+			goto err_alloc;
+	}
+
 	if (group_leader &&
 	    (is_software_event(event) != is_software_event(group_leader))) {
 		if (is_software_event(event)) {
@@ -7618,6 +7684,11 @@ SYSCALL_DEFINE5(perf_event_open,
 		 */
 		if (group_leader->group_leader != group_leader)
 			goto err_context;
+
+		/* All events in a group should have the same clock */
+		if (group_leader->clock != event->clock)
+			goto err_context;
+
 		/*
 		 * Do not allow to attach to a group in a different
 		 * task or CPU context:
-- 
cgit v1.2.3


From 487d1edb9a6005cf790c7fe59f25ad1e5cb5817b Mon Sep 17 00:00:00 2001
From: Denys Vlasenko
Date: Fri, 27 Mar 2015 11:59:16 +0100
Subject: x86/asm/entry/64: Fix comment about SYSENTER MSRs

The comment is ancient, it dates to the time when only AMD's
x86_64 implementation existed. AMD wasn't (and still isn't)
supporting SYSENTER, so these writes were "just in case" back
then.

This has changed: Intel's x86_64 appeared, and Intel does
support SYSENTER in long mode. "Some future 64-bit CPU" is here
already.

The code may appear "buggy" for AMD as it stands, since
MSR_IA32_SYSENTER_EIP is only 32-bit for AMD CPUs. Writing a
kernel function's address to it would drop high bits. Subsequent
use of this MSR for branch via SYSENTER seem to allow user to
transition to CPL0 while executing his code. Scary, eh?

Explain why that is not a bug: because SYSENTER insn would not
work on AMD CPU.

Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Alexei Starovoitov <ast@plumgrid.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Will Drewry <wad@chromium.org>
Link: http://lkml.kernel.org/r/1427453956-21931-1-git-send-email-dvlasenk@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/common.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index c928a7ae1099..71e4adcb15f1 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1173,8 +1173,10 @@ void syscall_init(void)
 #ifdef CONFIG_IA32_EMULATION
 	wrmsrl(MSR_CSTAR, ia32_cstar_target);
 	/*
-	 * Always load these, in case some future 64-bit CPU supports
-	 * SYSENTER from compat mode too:
+	 * This only works on Intel CPUs.
+	 * On AMD CPUs these MSRs are 32-bit, CPU truncates MSR_IA32_SYSENTER_EIP.
+	 * This does not cause SYSENTER to jump to the wrong location, because
+	 * AMD doesn't allow SYSENTER in long mode (either 32- or 64-bit).
 	 */
 	wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS);
 	wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
-- 
cgit v1.2.3


From 27be87c5d53117f048d590d6fc6febb21176c3e9 Mon Sep 17 00:00:00 2001
From: Denys Vlasenko
Date: Fri, 27 Mar 2015 11:36:19 +0100
Subject: x86/asm/entry/64: Add missing CFI annotation

This is a missing bit of the recent MOV-to-PUSH conversion.

Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Alexei Starovoitov <ast@plumgrid.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Will Drewry <wad@chromium.org>
Link: http://lkml.kernel.org/r/1427452582-21624-1-git-send-email-dvlasenk@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/entry_64.S | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index f85d2ccec6d2..dbfc8875d735 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -248,6 +248,7 @@ GLOBAL(system_call_after_swapgs)
 	pushq_cfi_reg	r10			/* pt_regs->r10 */
 	pushq_cfi_reg	r11			/* pt_regs->r11 */
 	sub	$(6*8),%rsp /* pt_regs->bp,bx,r12-15 not saved */
+	CFI_ADJUST_CFA_OFFSET 6*8
 
 	testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
 	jnz tracesys
-- 
cgit v1.2.3


From a232e3d558eef421fbb539ede5483dfb668e38f2 Mon Sep 17 00:00:00 2001
From: Denys Vlasenko
Date: Fri, 27 Mar 2015 11:36:20 +0100
Subject: x86/asm/entry/32: Update "interrupt off" comments

The existing comment has proven to be not very clear.

Replace it with a comment similar to the one we now have in the 64-bit
syscall entry point. (Three instances, one per 32-bit syscall entry).

In the INT80 entry point's CFI annotations, replace mysterious
expressions with numric constants. In this case, raw numbers
look more understandable.

Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Alexei Starovoitov <ast@plumgrid.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Will Drewry <wad@chromium.org>
Link: http://lkml.kernel.org/r/1427452582-21624-2-git-send-email-dvlasenk@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/ia32/ia32entry.S | 45 +++++++++++++++++++++++++++------------------
 1 file changed, 27 insertions(+), 18 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index 5d2641ce9957..7502ff0b938e 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -112,13 +112,16 @@ ENTRY(ia32_sysenter_target)
 	CFI_SIGNAL_FRAME
 	CFI_DEF_CFA	rsp,0
 	CFI_REGISTER	rsp,rbp
-	SWAPGS_UNSAFE_STACK
-	movq	PER_CPU_VAR(cpu_tss + TSS_sp0), %rsp
+
 	/*
-	 * No need to follow this irqs on/off section: the syscall
-	 * disabled irqs, here we enable it straight after entry:
+	 * Interrupts are off on entry.
+	 * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
+	 * it is too small to ever cause noticeable irq latency.
 	 */
+	SWAPGS_UNSAFE_STACK
+	movq	PER_CPU_VAR(cpu_tss + TSS_sp0), %rsp
 	ENABLE_INTERRUPTS(CLBR_NONE)
+
 	/* Construct iret frame (ss,rsp,rflags,cs,rip) */
  	movl	%ebp,%ebp		/* zero extension */
 	pushq_cfi $__USER32_DS
@@ -314,15 +317,18 @@ ENTRY(ia32_cstar_target)
 	CFI_DEF_CFA	rsp,0
 	CFI_REGISTER	rip,rcx
 	/*CFI_REGISTER	rflags,r11*/
+
+	/*
+	 * Interrupts are off on entry.
+	 * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
+	 * it is too small to ever cause noticeable irq latency.
+	 */
 	SWAPGS_UNSAFE_STACK
 	movl	%esp,%r8d
 	CFI_REGISTER	rsp,r8
 	movq	PER_CPU_VAR(kernel_stack),%rsp
-	/*
-	 * No need to follow this irqs on/off section: the syscall
-	 * disabled irqs and here we enable it straight after entry:
-	 */
 	ENABLE_INTERRUPTS(CLBR_NONE)
+
 	ALLOC_PT_GPREGS_ON_STACK 6*8 /* 6*8: space for orig_ax and iret frame */
 	SAVE_C_REGS_EXCEPT_RCX_R891011
 	movl 	%eax,%eax	/* zero extension */
@@ -449,19 +455,22 @@ ia32_badarg:
 ENTRY(ia32_syscall)
 	CFI_STARTPROC32	simple
 	CFI_SIGNAL_FRAME
-	CFI_DEF_CFA	rsp,SS+8-RIP
-	/*CFI_REL_OFFSET	ss,SS-RIP*/
-	CFI_REL_OFFSET	rsp,RSP-RIP
-	/*CFI_REL_OFFSET	rflags,EFLAGS-RIP*/
-	/*CFI_REL_OFFSET	cs,CS-RIP*/
-	CFI_REL_OFFSET	rip,RIP-RIP
-	PARAVIRT_ADJUST_EXCEPTION_FRAME
-	SWAPGS
+	CFI_DEF_CFA	rsp,5*8
+	/*CFI_REL_OFFSET	ss,4*8 */
+	CFI_REL_OFFSET	rsp,3*8
+	/*CFI_REL_OFFSET	rflags,2*8 */
+	/*CFI_REL_OFFSET	cs,1*8 */
+	CFI_REL_OFFSET	rip,0*8
+
 	/*
-	 * No need to follow this irqs on/off section: the syscall
-	 * disabled irqs and here we enable it straight after entry:
+	 * Interrupts are off on entry.
+	 * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
+	 * it is too small to ever cause noticeable irq latency.
 	 */
+	PARAVIRT_ADJUST_EXCEPTION_FRAME
+	SWAPGS
 	ENABLE_INTERRUPTS(CLBR_NONE)
+
 	movl %eax,%eax
 	pushq_cfi %rax		/* store orig_ax */
 	cld
-- 
cgit v1.2.3


From 4ee8ec17ba00fce4af042543771f996fb9d98d34 Mon Sep 17 00:00:00 2001
From: Denys Vlasenko
Date: Fri, 27 Mar 2015 11:36:21 +0100
Subject: x86/asm/entry/32: Make register zero-extension more prominent

There are a couple of syscall argument zero-extension instructions in
the 32-bit compat entry code, and it was mentioned that people keep
trying to optimize them out, introducing bugs.

Make them more visible, and add a "do not remove" comment.

Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Alexei Starovoitov <ast@plumgrid.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Will Drewry <wad@chromium.org>
Link: http://lkml.kernel.org/r/1427452582-21624-3-git-send-email-dvlasenk@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/ia32/ia32entry.S | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index 7502ff0b938e..dec8c1de9c9e 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -122,8 +122,11 @@ ENTRY(ia32_sysenter_target)
 	movq	PER_CPU_VAR(cpu_tss + TSS_sp0), %rsp
 	ENABLE_INTERRUPTS(CLBR_NONE)
 
+	/* Zero-extending 32-bit regs, do not remove */
+	movl	%ebp, %ebp
+	movl	%eax, %eax
+
 	/* Construct iret frame (ss,rsp,rflags,cs,rip) */
- 	movl	%ebp,%ebp		/* zero extension */
 	pushq_cfi $__USER32_DS
 	/*CFI_REL_OFFSET ss,0*/
 	pushq_cfi %rbp
@@ -134,7 +137,6 @@ ENTRY(ia32_sysenter_target)
 	CFI_REGISTER rip,r10
 	pushq_cfi $__USER32_CS
 	/*CFI_REL_OFFSET cs,0*/
-	movl	%eax, %eax
 	/* Store thread_info->sysenter_return in rip stack slot */
 	pushq_cfi %r10
 	CFI_REL_OFFSET rip,0
@@ -329,9 +331,11 @@ ENTRY(ia32_cstar_target)
 	movq	PER_CPU_VAR(kernel_stack),%rsp
 	ENABLE_INTERRUPTS(CLBR_NONE)
 
+	/* Zero-extending 32-bit regs, do not remove */
+	movl	%eax,%eax
+
 	ALLOC_PT_GPREGS_ON_STACK 6*8 /* 6*8: space for orig_ax and iret frame */
 	SAVE_C_REGS_EXCEPT_RCX_R891011
-	movl 	%eax,%eax	/* zero extension */
 	movq	%rax,ORIG_RAX(%rsp)
 	movq	%rcx,RIP(%rsp)
 	CFI_REL_OFFSET rip,RIP
@@ -471,7 +475,9 @@ ENTRY(ia32_syscall)
 	SWAPGS
 	ENABLE_INTERRUPTS(CLBR_NONE)
 
-	movl %eax,%eax
+	/* Zero-extending 32-bit regs, do not remove */
+	movl	%eax,%eax
+
 	pushq_cfi %rax		/* store orig_ax */
 	cld
 	/* note the registers are not zero extended to the sf.
-- 
cgit v1.2.3


From 2dccb4cdbf8fd4cb1d779a6f7ddd66d193bb5805 Mon Sep 17 00:00:00 2001
From: Petr Matousek
Date: Wed, 11 Mar 2015 12:16:09 +0100
Subject: kvm: x86: i8259: return initialized data on invalid-size read

If data is read from PIC with invalid access size, the return data stays
uninitialized even though success is returned.

Fix this by always initializing the data.

Signed-off-by: Petr Matousek <pmatouse@redhat.com>
Reported-by: Nadav Amit <nadav.amit@gmail.com>
Message-Id: <20150311111609.GG8544@dhcp-25-225.brq.redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/i8259.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index cc31f7c06d3d..9541ba34126b 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -507,6 +507,7 @@ static int picdev_read(struct kvm_pic *s,
 		return -EOPNOTSUPP;
 
 	if (len != 1) {
+		memset(val, 0, len);
 		pr_pic_unimpl("non byte read\n");
 		return 0;
 	}
-- 
cgit v1.2.3


From b91aa14d95bf4cf8ed0426bd25c0af1548519696 Mon Sep 17 00:00:00 2001
From: Nadav Amit
Date: Mon, 30 Mar 2015 15:39:19 +0300
Subject: KVM: x86: CMOV emulation on legacy mode is wrong

On legacy mode CMOV emulation should still clear bits [63:32] even if the
assignment is not done. The previous fix 140bad89fd ("KVM: x86: emulation of
dword cmov on long-mode should clear [63:32]") was incomplete.

Signed-off-by: Nadav Amit <namit@cs.technion.ac.il>
Message-Id: <1427719163-5429-2-git-send-email-namit@cs.technion.ac.il>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/emulate.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index c941abe800ef..62f7a395717d 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -5126,8 +5126,7 @@ twobyte_insn:
 	case 0x40 ... 0x4f:	/* cmov */
 		if (test_cc(ctxt->b, ctxt->eflags))
 			ctxt->dst.val = ctxt->src.val;
-		else if (ctxt->mode != X86EMUL_MODE_PROT64 ||
-			 ctxt->op_bytes != 4)
+		else if (ctxt->op_bytes != 4)
 			ctxt->dst.type = OP_NONE; /* no writeback */
 		break;
 	case 0x80 ... 0x8f: /* jnz rel, etc*/
-- 
cgit v1.2.3


From 6fd8e1275709a5bb084847eda6730b983538a572 Mon Sep 17 00:00:00 2001
From: Nadav Amit
Date: Mon, 30 Mar 2015 15:39:20 +0300
Subject: KVM: x86: POPA emulation may not clear bits [63:32]

POPA should assign the values to the registers as usual registers are assigned.
In other words, 32-bits register assignments should clear bits [63:32] of the
register.

Split the code of register assignments that will be used by future changes as
well.

Signed-off-by: Nadav Amit <namit@cs.technion.ac.il>
Message-Id: <1427719163-5429-3-git-send-email-namit@cs.technion.ac.il>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/emulate.c | 39 +++++++++++++++++++++++----------------
 1 file changed, 23 insertions(+), 16 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 62f7a395717d..4961dc5eb303 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -478,6 +478,25 @@ static void assign_masked(ulong *dest, ulong src, ulong mask)
 	*dest = (*dest & ~mask) | (src & mask);
 }
 
+static void assign_register(unsigned long *reg, u64 val, int bytes)
+{
+	/* The 4-byte case *is* correct: in 64-bit mode we zero-extend. */
+	switch (bytes) {
+	case 1:
+		*(u8 *)reg = (u8)val;
+		break;
+	case 2:
+		*(u16 *)reg = (u16)val;
+		break;
+	case 4:
+		*reg = (u32)val;
+		break;	/* 64b: zero-extend */
+	case 8:
+		*reg = val;
+		break;
+	}
+}
+
 static inline unsigned long ad_mask(struct x86_emulate_ctxt *ctxt)
 {
 	return (1UL << (ctxt->ad_bytes << 3)) - 1;
@@ -1691,21 +1710,7 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
 
 static void write_register_operand(struct operand *op)
 {
-	/* The 4-byte case *is* correct: in 64-bit mode we zero-extend. */
-	switch (op->bytes) {
-	case 1:
-		*(u8 *)op->addr.reg = (u8)op->val;
-		break;
-	case 2:
-		*(u16 *)op->addr.reg = (u16)op->val;
-		break;
-	case 4:
-		*op->addr.reg = (u32)op->val;
-		break;	/* 64b: zero-extend */
-	case 8:
-		*op->addr.reg = op->val;
-		break;
-	}
+	return assign_register(op->addr.reg, op->val, op->bytes);
 }
 
 static int writeback(struct x86_emulate_ctxt *ctxt, struct operand *op)
@@ -1926,6 +1931,7 @@ static int em_popa(struct x86_emulate_ctxt *ctxt)
 {
 	int rc = X86EMUL_CONTINUE;
 	int reg = VCPU_REGS_RDI;
+	u32 val;
 
 	while (reg >= VCPU_REGS_RAX) {
 		if (reg == VCPU_REGS_RSP) {
@@ -1933,9 +1939,10 @@ static int em_popa(struct x86_emulate_ctxt *ctxt)
 			--reg;
 		}
 
-		rc = emulate_pop(ctxt, reg_rmw(ctxt, reg), ctxt->op_bytes);
+		rc = emulate_pop(ctxt, &val, ctxt->op_bytes);
 		if (rc != X86EMUL_CONTINUE)
 			break;
+		assign_register(reg_rmw(ctxt, reg), val, ctxt->op_bytes);
 		--reg;
 	}
 	return rc;
-- 
cgit v1.2.3


From 900efe200e317649aecbeaa55619a4fc3adb2251 Mon Sep 17 00:00:00 2001
From: Nadav Amit
Date: Mon, 30 Mar 2015 15:39:21 +0300
Subject: KVM: x86: BSF and BSR emulation change register unnecassarily

If the source of BSF and BSR is zero, the destination register should not
change. That is how real hardware behaves.  If we set the destination even with
the same value that we had before, we may clear bits [63:32] unnecassarily.

Signed-off-by: Nadav Amit <namit@cs.technion.ac.il>
Message-Id: <1427719163-5429-4-git-send-email-namit@cs.technion.ac.il>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/emulate.c | 19 ++++++++++++++++++-
 1 file changed, 18 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 4961dc5eb303..70045779c725 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -962,6 +962,22 @@ FASTOP2(xadd);
 
 FASTOP2R(cmp, cmp_r);
 
+static int em_bsf_c(struct x86_emulate_ctxt *ctxt)
+{
+	/* If src is zero, do not writeback, but update flags */
+	if (ctxt->src.val == 0)
+		ctxt->dst.type = OP_NONE;
+	return fastop(ctxt, em_bsf);
+}
+
+static int em_bsr_c(struct x86_emulate_ctxt *ctxt)
+{
+	/* If src is zero, do not writeback, but update flags */
+	if (ctxt->src.val == 0)
+		ctxt->dst.type = OP_NONE;
+	return fastop(ctxt, em_bsr);
+}
+
 static u8 test_cc(unsigned int condition, unsigned long flags)
 {
 	u8 rc;
@@ -4188,7 +4204,8 @@ static const struct opcode twobyte_table[256] = {
 	N, N,
 	G(BitOp, group8),
 	F(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_btc),
-	F(DstReg | SrcMem | ModRM, em_bsf), F(DstReg | SrcMem | ModRM, em_bsr),
+	I(DstReg | SrcMem | ModRM, em_bsf_c),
+	I(DstReg | SrcMem | ModRM, em_bsr_c),
 	D(DstReg | SrcMem8 | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov),
 	/* 0xC0 - 0xC7 */
 	F2bv(DstMem | SrcReg | ModRM | SrcWrite | Lock, em_xadd),
-- 
cgit v1.2.3


From 0efb04406de834d820f7ba150a00d1d3194aa8a6 Mon Sep 17 00:00:00 2001
From: Nadav Amit
Date: Sun, 29 Mar 2015 16:33:03 +0300
Subject: KVM: x86: removing redundant eflags bits definitions

The eflags are redefined (using other defines) in emulate.c.
Use the definition from processor-flags.h as some mess already started.
No functional change.

Signed-off-by: Nadav Amit <namit@cs.technion.ac.il>
Message-Id: <1427635984-8113-2-git-send-email-namit@cs.technion.ac.il>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |   2 -
 arch/x86/kvm/emulate.c          | 105 ++++++++++++++++++----------------------
 2 files changed, 46 insertions(+), 61 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index bf5a1606ccd5..7ba3d9dc7ca2 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -84,8 +84,6 @@ static inline gfn_t gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level)
 #define SELECTOR_TI_MASK (1 << 2)
 #define SELECTOR_RPL_MASK 0x03
 
-#define IOPL_SHIFT 12
-
 #define KVM_PERMILLE_MMU_PAGES 20
 #define KVM_MIN_ALLOC_MMU_PAGES 64
 #define KVM_MMU_HASH_SHIFT 10
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 70045779c725..e49cabae377d 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -248,27 +248,7 @@ struct mode_dual {
 	struct opcode mode64;
 };
 
-/* EFLAGS bit definitions. */
-#define EFLG_ID (1<<21)
-#define EFLG_VIP (1<<20)
-#define EFLG_VIF (1<<19)
-#define EFLG_AC (1<<18)
-#define EFLG_VM (1<<17)
-#define EFLG_RF (1<<16)
-#define EFLG_IOPL (3<<12)
-#define EFLG_NT (1<<14)
-#define EFLG_OF (1<<11)
-#define EFLG_DF (1<<10)
-#define EFLG_IF (1<<9)
-#define EFLG_TF (1<<8)
-#define EFLG_SF (1<<7)
-#define EFLG_ZF (1<<6)
-#define EFLG_AF (1<<4)
-#define EFLG_PF (1<<2)
-#define EFLG_CF (1<<0)
-
 #define EFLG_RESERVED_ZEROS_MASK 0xffc0802a
-#define EFLG_RESERVED_ONE_MASK 2
 
 enum x86_transfer_type {
 	X86_TRANSFER_NONE,
@@ -317,7 +297,8 @@ static void invalidate_registers(struct x86_emulate_ctxt *ctxt)
  * These EFLAGS bits are restored from saved value during emulation, and
  * any changes are written back to the saved value after emulation.
  */
-#define EFLAGS_MASK (EFLG_OF|EFLG_SF|EFLG_ZF|EFLG_AF|EFLG_PF|EFLG_CF)
+#define EFLAGS_MASK (X86_EFLAGS_OF|X86_EFLAGS_SF|X86_EFLAGS_ZF|X86_EFLAGS_AF|\
+		     X86_EFLAGS_PF|X86_EFLAGS_CF)
 
 #ifdef CONFIG_X86_64
 #define ON64(x) x
@@ -1434,7 +1415,7 @@ static int pio_in_emulated(struct x86_emulate_ctxt *ctxt,
 		unsigned int in_page, n;
 		unsigned int count = ctxt->rep_prefix ?
 			address_mask(ctxt, reg_read(ctxt, VCPU_REGS_RCX)) : 1;
-		in_page = (ctxt->eflags & EFLG_DF) ?
+		in_page = (ctxt->eflags & X86_EFLAGS_DF) ?
 			offset_in_page(reg_read(ctxt, VCPU_REGS_RDI)) :
 			PAGE_SIZE - offset_in_page(reg_read(ctxt, VCPU_REGS_RDI));
 		n = min3(in_page, (unsigned int)sizeof(rc->data) / size, count);
@@ -1447,7 +1428,7 @@ static int pio_in_emulated(struct x86_emulate_ctxt *ctxt,
 	}
 
 	if (ctxt->rep_prefix && (ctxt->d & String) &&
-	    !(ctxt->eflags & EFLG_DF)) {
+	    !(ctxt->eflags & X86_EFLAGS_DF)) {
 		ctxt->dst.data = rc->data + rc->pos;
 		ctxt->dst.type = OP_MEM_STR;
 		ctxt->dst.count = (rc->end - rc->pos) / size;
@@ -1813,32 +1794,34 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt,
 {
 	int rc;
 	unsigned long val, change_mask;
-	int iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT;
+	int iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> X86_EFLAGS_IOPL_BIT;
 	int cpl = ctxt->ops->cpl(ctxt);
 
 	rc = emulate_pop(ctxt, &val, len);
 	if (rc != X86EMUL_CONTINUE)
 		return rc;
 
-	change_mask = EFLG_CF | EFLG_PF | EFLG_AF | EFLG_ZF | EFLG_SF | EFLG_OF
-		| EFLG_TF | EFLG_DF | EFLG_NT | EFLG_AC | EFLG_ID;
+	change_mask = X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
+		      X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_OF |
+		      X86_EFLAGS_TF | X86_EFLAGS_DF | X86_EFLAGS_NT |
+		      X86_EFLAGS_AC | X86_EFLAGS_ID;
 
 	switch(ctxt->mode) {
 	case X86EMUL_MODE_PROT64:
 	case X86EMUL_MODE_PROT32:
 	case X86EMUL_MODE_PROT16:
 		if (cpl == 0)
-			change_mask |= EFLG_IOPL;
+			change_mask |= X86_EFLAGS_IOPL;
 		if (cpl <= iopl)
-			change_mask |= EFLG_IF;
+			change_mask |= X86_EFLAGS_IF;
 		break;
 	case X86EMUL_MODE_VM86:
 		if (iopl < 3)
 			return emulate_gp(ctxt, 0);
-		change_mask |= EFLG_IF;
+		change_mask |= X86_EFLAGS_IF;
 		break;
 	default: /* real mode */
-		change_mask |= (EFLG_IOPL | EFLG_IF);
+		change_mask |= (X86_EFLAGS_IOPL | X86_EFLAGS_IF);
 		break;
 	}
 
@@ -1939,7 +1922,7 @@ static int em_pusha(struct x86_emulate_ctxt *ctxt)
 
 static int em_pushf(struct x86_emulate_ctxt *ctxt)
 {
-	ctxt->src.val = (unsigned long)ctxt->eflags & ~EFLG_VM;
+	ctxt->src.val = (unsigned long)ctxt->eflags & ~X86_EFLAGS_VM;
 	return em_push(ctxt);
 }
 
@@ -1979,7 +1962,7 @@ static int __emulate_int_real(struct x86_emulate_ctxt *ctxt, int irq)
 	if (rc != X86EMUL_CONTINUE)
 		return rc;
 
-	ctxt->eflags &= ~(EFLG_IF | EFLG_TF | EFLG_AC);
+	ctxt->eflags &= ~(X86_EFLAGS_IF | X86_EFLAGS_TF | X86_EFLAGS_AC);
 
 	ctxt->src.val = get_segment_selector(ctxt, VCPU_SREG_CS);
 	rc = em_push(ctxt);
@@ -2045,10 +2028,14 @@ static int emulate_iret_real(struct x86_emulate_ctxt *ctxt)
 	unsigned long temp_eip = 0;
 	unsigned long temp_eflags = 0;
 	unsigned long cs = 0;
-	unsigned long mask = EFLG_CF | EFLG_PF | EFLG_AF | EFLG_ZF | EFLG_SF | EFLG_TF |
-			     EFLG_IF | EFLG_DF | EFLG_OF | EFLG_IOPL | EFLG_NT | EFLG_RF |
-			     EFLG_AC | EFLG_ID | (1 << 1); /* Last one is the reserved bit */
-	unsigned long vm86_mask = EFLG_VM | EFLG_VIF | EFLG_VIP;
+	unsigned long mask = X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
+			     X86_EFLAGS_ZF | X86_EFLAGS_SF | X86_EFLAGS_TF |
+			     X86_EFLAGS_IF | X86_EFLAGS_DF | X86_EFLAGS_OF |
+			     X86_EFLAGS_IOPL | X86_EFLAGS_NT | X86_EFLAGS_RF |
+			     X86_EFLAGS_AC | X86_EFLAGS_ID |
+			     X86_EFLAGS_FIXED_BIT;
+	unsigned long vm86_mask = X86_EFLAGS_VM | X86_EFLAGS_VIF |
+				  X86_EFLAGS_VIP;
 
 	/* TODO: Add stack limit check */
 
@@ -2077,7 +2064,6 @@ static int emulate_iret_real(struct x86_emulate_ctxt *ctxt)
 
 	ctxt->_eip = temp_eip;
 
-
 	if (ctxt->op_bytes == 4)
 		ctxt->eflags = ((temp_eflags & mask) | (ctxt->eflags & vm86_mask));
 	else if (ctxt->op_bytes == 2) {
@@ -2086,7 +2072,7 @@ static int emulate_iret_real(struct x86_emulate_ctxt *ctxt)
 	}
 
 	ctxt->eflags &= ~EFLG_RESERVED_ZEROS_MASK; /* Clear reserved zeros */
-	ctxt->eflags |= EFLG_RESERVED_ONE_MASK;
+	ctxt->eflags |= X86_EFLAGS_FIXED_BIT;
 	ctxt->ops->set_nmi_mask(ctxt, false);
 
 	return rc;
@@ -2168,12 +2154,12 @@ static int em_cmpxchg8b(struct x86_emulate_ctxt *ctxt)
 	    ((u32) (old >> 32) != (u32) reg_read(ctxt, VCPU_REGS_RDX))) {
 		*reg_write(ctxt, VCPU_REGS_RAX) = (u32) (old >> 0);
 		*reg_write(ctxt, VCPU_REGS_RDX) = (u32) (old >> 32);
-		ctxt->eflags &= ~EFLG_ZF;
+		ctxt->eflags &= ~X86_EFLAGS_ZF;
 	} else {
 		ctxt->dst.val64 = ((u64)reg_read(ctxt, VCPU_REGS_RCX) << 32) |
 			(u32) reg_read(ctxt, VCPU_REGS_RBX);
 
-		ctxt->eflags |= EFLG_ZF;
+		ctxt->eflags |= X86_EFLAGS_ZF;
 	}
 	return X86EMUL_CONTINUE;
 }
@@ -2245,7 +2231,7 @@ static int em_cmpxchg(struct x86_emulate_ctxt *ctxt)
 	ctxt->src.val = ctxt->dst.orig_val;
 	fastop(ctxt, em_cmp);
 
-	if (ctxt->eflags & EFLG_ZF) {
+	if (ctxt->eflags & X86_EFLAGS_ZF) {
 		/* Success: write back to memory; no update of EAX */
 		ctxt->src.type = OP_NONE;
 		ctxt->dst.val = ctxt->src.orig_val;
@@ -2404,14 +2390,14 @@ static int em_syscall(struct x86_emulate_ctxt *ctxt)
 
 		ops->get_msr(ctxt, MSR_SYSCALL_MASK, &msr_data);
 		ctxt->eflags &= ~msr_data;
-		ctxt->eflags |= EFLG_RESERVED_ONE_MASK;
+		ctxt->eflags |= X86_EFLAGS_FIXED_BIT;
 #endif
 	} else {
 		/* legacy mode */
 		ops->get_msr(ctxt, MSR_STAR, &msr_data);
 		ctxt->_eip = (u32)msr_data;
 
-		ctxt->eflags &= ~(EFLG_VM | EFLG_IF);
+		ctxt->eflags &= ~(X86_EFLAGS_VM | X86_EFLAGS_IF);
 	}
 
 	return X86EMUL_CONTINUE;
@@ -2448,7 +2434,7 @@ static int em_sysenter(struct x86_emulate_ctxt *ctxt)
 	if ((msr_data & 0xfffc) == 0x0)
 		return emulate_gp(ctxt, 0);
 
-	ctxt->eflags &= ~(EFLG_VM | EFLG_IF);
+	ctxt->eflags &= ~(X86_EFLAGS_VM | X86_EFLAGS_IF);
 	cs_sel = (u16)msr_data & ~SELECTOR_RPL_MASK;
 	ss_sel = cs_sel + 8;
 	if (efer & EFER_LMA) {
@@ -2535,7 +2521,7 @@ static bool emulator_bad_iopl(struct x86_emulate_ctxt *ctxt)
 		return false;
 	if (ctxt->mode == X86EMUL_MODE_VM86)
 		return true;
-	iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT;
+	iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> X86_EFLAGS_IOPL_BIT;
 	return ctxt->ops->cpl(ctxt) > iopl;
 }
 
@@ -2977,7 +2963,7 @@ int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
 static void string_addr_inc(struct x86_emulate_ctxt *ctxt, int reg,
 		struct operand *op)
 {
-	int df = (ctxt->eflags & EFLG_DF) ? -op->count : op->count;
+	int df = (ctxt->eflags & X86_EFLAGS_DF) ? -op->count : op->count;
 
 	register_address_increment(ctxt, reg, df * op->bytes);
 	op->addr.mem.ea = register_address(ctxt, reg);
@@ -3516,7 +3502,8 @@ static int em_sahf(struct x86_emulate_ctxt *ctxt)
 {
 	u32 flags;
 
-	flags = EFLG_CF | EFLG_PF | EFLG_AF | EFLG_ZF | EFLG_SF;
+	flags = X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF | X86_EFLAGS_ZF |
+		X86_EFLAGS_SF;
 	flags &= *reg_rmw(ctxt, VCPU_REGS_RAX) >> 8;
 
 	ctxt->eflags &= ~0xffUL;
@@ -4772,9 +4759,9 @@ static bool string_insn_completed(struct x86_emulate_ctxt *ctxt)
 	if (((ctxt->b == 0xa6) || (ctxt->b == 0xa7) ||
 	     (ctxt->b == 0xae) || (ctxt->b == 0xaf))
 	    && (((ctxt->rep_prefix == REPE_PREFIX) &&
-		 ((ctxt->eflags & EFLG_ZF) == 0))
+		 ((ctxt->eflags & X86_EFLAGS_ZF) == 0))
 		|| ((ctxt->rep_prefix == REPNE_PREFIX) &&
-		    ((ctxt->eflags & EFLG_ZF) == EFLG_ZF))))
+		    ((ctxt->eflags & X86_EFLAGS_ZF) == X86_EFLAGS_ZF))))
 		return true;
 
 	return false;
@@ -4926,7 +4913,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
 			/* All REP prefixes have the same first termination condition */
 			if (address_mask(ctxt, reg_read(ctxt, VCPU_REGS_RCX)) == 0) {
 				ctxt->eip = ctxt->_eip;
-				ctxt->eflags &= ~EFLG_RF;
+				ctxt->eflags &= ~X86_EFLAGS_RF;
 				goto done;
 			}
 		}
@@ -4976,9 +4963,9 @@ special_insn:
 	}
 
 	if (ctxt->rep_prefix && (ctxt->d & String))
-		ctxt->eflags |= EFLG_RF;
+		ctxt->eflags |= X86_EFLAGS_RF;
 	else
-		ctxt->eflags &= ~EFLG_RF;
+		ctxt->eflags &= ~X86_EFLAGS_RF;
 
 	if (ctxt->execute) {
 		if (ctxt->d & Fastop) {
@@ -5027,7 +5014,7 @@ special_insn:
 		rc = emulate_int(ctxt, ctxt->src.val);
 		break;
 	case 0xce:		/* into */
-		if (ctxt->eflags & EFLG_OF)
+		if (ctxt->eflags & X86_EFLAGS_OF)
 			rc = emulate_int(ctxt, 4);
 		break;
 	case 0xe9: /* jmp rel */
@@ -5040,19 +5027,19 @@ special_insn:
 		break;
 	case 0xf5:	/* cmc */
 		/* complement carry flag from eflags reg */
-		ctxt->eflags ^= EFLG_CF;
+		ctxt->eflags ^= X86_EFLAGS_CF;
 		break;
 	case 0xf8: /* clc */
-		ctxt->eflags &= ~EFLG_CF;
+		ctxt->eflags &= ~X86_EFLAGS_CF;
 		break;
 	case 0xf9: /* stc */
-		ctxt->eflags |= EFLG_CF;
+		ctxt->eflags |= X86_EFLAGS_CF;
 		break;
 	case 0xfc: /* cld */
-		ctxt->eflags &= ~EFLG_DF;
+		ctxt->eflags &= ~X86_EFLAGS_DF;
 		break;
 	case 0xfd: /* std */
-		ctxt->eflags |= EFLG_DF;
+		ctxt->eflags |= X86_EFLAGS_DF;
 		break;
 	default:
 		goto cannot_emulate;
@@ -5113,7 +5100,7 @@ writeback:
 			}
 			goto done; /* skip rip writeback */
 		}
-		ctxt->eflags &= ~EFLG_RF;
+		ctxt->eflags &= ~X86_EFLAGS_RF;
 	}
 
 	ctxt->eip = ctxt->_eip;
-- 
cgit v1.2.3


From b32a99180027ec980af971d548781eac1f6bb9b5 Mon Sep 17 00:00:00 2001
From: Nadav Amit
Date: Sun, 29 Mar 2015 16:33:04 +0300
Subject: KVM: x86: Remove redundant definitions

Some constants are redfined in emulate.c. Avoid it.

s/SELECTOR_RPL_MASK/SEGMENT_RPL_MASK
s/SELECTOR_TI_MASK/SEGMENT_TI_MASK

No functional change.

Signed-off-by: Nadav Amit <namit@cs.technion.ac.il>
Message-Id: <1427635984-8113-3-git-send-email-namit@cs.technion.ac.il>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  3 ---
 arch/x86/kvm/emulate.c          |  6 +++---
 arch/x86/kvm/vmx.c              | 18 +++++++++---------
 3 files changed, 12 insertions(+), 15 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 7ba3d9dc7ca2..30b28dc76411 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -81,9 +81,6 @@ static inline gfn_t gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level)
 		(base_gfn >> KVM_HPAGE_GFN_SHIFT(level));
 }
 
-#define SELECTOR_TI_MASK (1 << 2)
-#define SELECTOR_RPL_MASK 0x03
-
 #define KVM_PERMILLE_MMU_PAGES 20
 #define KVM_MIN_ALLOC_MMU_PAGES 64
 #define KVM_MMU_HASH_SHIFT 10
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index e49cabae377d..cf7d424b29d2 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2435,7 +2435,7 @@ static int em_sysenter(struct x86_emulate_ctxt *ctxt)
 		return emulate_gp(ctxt, 0);
 
 	ctxt->eflags &= ~(X86_EFLAGS_VM | X86_EFLAGS_IF);
-	cs_sel = (u16)msr_data & ~SELECTOR_RPL_MASK;
+	cs_sel = (u16)msr_data & ~SEGMENT_RPL_MASK;
 	ss_sel = cs_sel + 8;
 	if (efer & EFER_LMA) {
 		cs.d = 0;
@@ -2502,8 +2502,8 @@ static int em_sysexit(struct x86_emulate_ctxt *ctxt)
 			return emulate_gp(ctxt, 0);
 		break;
 	}
-	cs_sel |= SELECTOR_RPL_MASK;
-	ss_sel |= SELECTOR_RPL_MASK;
+	cs_sel |= SEGMENT_RPL_MASK;
+	ss_sel |= SEGMENT_RPL_MASK;
 
 	ops->set_segment(ctxt, cs_sel, &cs, 0, VCPU_SREG_CS);
 	ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS);
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index fdd9f8b88e10..63ca692fa673 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -3263,8 +3263,8 @@ static void fix_pmode_seg(struct kvm_vcpu *vcpu, int seg,
 		 * default value.
 		 */
 		if (seg == VCPU_SREG_CS || seg == VCPU_SREG_SS)
-			save->selector &= ~SELECTOR_RPL_MASK;
-		save->dpl = save->selector & SELECTOR_RPL_MASK;
+			save->selector &= ~SEGMENT_RPL_MASK;
+		save->dpl = save->selector & SEGMENT_RPL_MASK;
 		save->s = 1;
 	}
 	vmx_set_segment(vcpu, save, seg);
@@ -3837,7 +3837,7 @@ static bool code_segment_valid(struct kvm_vcpu *vcpu)
 	unsigned int cs_rpl;
 
 	vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
-	cs_rpl = cs.selector & SELECTOR_RPL_MASK;
+	cs_rpl = cs.selector & SEGMENT_RPL_MASK;
 
 	if (cs.unusable)
 		return false;
@@ -3865,7 +3865,7 @@ static bool stack_segment_valid(struct kvm_vcpu *vcpu)
 	unsigned int ss_rpl;
 
 	vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);
-	ss_rpl = ss.selector & SELECTOR_RPL_MASK;
+	ss_rpl = ss.selector & SEGMENT_RPL_MASK;
 
 	if (ss.unusable)
 		return true;
@@ -3887,7 +3887,7 @@ static bool data_segment_valid(struct kvm_vcpu *vcpu, int seg)
 	unsigned int rpl;
 
 	vmx_get_segment(vcpu, &var, seg);
-	rpl = var.selector & SELECTOR_RPL_MASK;
+	rpl = var.selector & SEGMENT_RPL_MASK;
 
 	if (var.unusable)
 		return true;
@@ -3914,7 +3914,7 @@ static bool tr_valid(struct kvm_vcpu *vcpu)
 
 	if (tr.unusable)
 		return false;
-	if (tr.selector & SELECTOR_TI_MASK)	/* TI = 1 */
+	if (tr.selector & SEGMENT_TI_MASK)	/* TI = 1 */
 		return false;
 	if (tr.type != 3 && tr.type != 11) /* TODO: Check if guest is in IA32e mode */
 		return false;
@@ -3932,7 +3932,7 @@ static bool ldtr_valid(struct kvm_vcpu *vcpu)
 
 	if (ldtr.unusable)
 		return true;
-	if (ldtr.selector & SELECTOR_TI_MASK)	/* TI = 1 */
+	if (ldtr.selector & SEGMENT_TI_MASK)	/* TI = 1 */
 		return false;
 	if (ldtr.type != 2)
 		return false;
@@ -3949,8 +3949,8 @@ static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu)
 	vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
 	vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);
 
-	return ((cs.selector & SELECTOR_RPL_MASK) ==
-		 (ss.selector & SELECTOR_RPL_MASK));
+	return ((cs.selector & SEGMENT_RPL_MASK) ==
+		 (ss.selector & SEGMENT_RPL_MASK));
 }
 
 /*
-- 
cgit v1.2.3


From 2f729b10bb74f97797beb310113f6182f262d36a Mon Sep 17 00:00:00 2001
From: Eugene Korenevsky
Date: Sun, 29 Mar 2015 01:27:17 +0300
Subject: KVM: remove useless check of "ret" variable prior to returning the
 same value

A trivial code cleanup. This `if` is redundant.

Signed-off-by: Eugene Korenevsky <ekorenevsky@gmail.com>
Message-Id: <20150328222717.GA6508@gnote>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/emulate.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index cf7d424b29d2..b304728aabe3 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2791,10 +2791,8 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt,
 		return ret;
 	ret = __load_segment_descriptor(ctxt, tss->gs, VCPU_SREG_GS, cpl,
 					X86_TRANSFER_TASK_SWITCH, NULL);
-	if (ret != X86EMUL_CONTINUE)
-		return ret;
 
-	return X86EMUL_CONTINUE;
+	return ret;
 }
 
 static int task_switch_32(struct x86_emulate_ctxt *ctxt,
-- 
cgit v1.2.3


From 627276cb55f33e2dc56eab5b973937c128b7704c Mon Sep 17 00:00:00 2001
From: Denys Vlasenko
Date: Mon, 30 Mar 2015 20:09:31 +0200
Subject: x86/asm/entry/64: Move retint_kernel code block closer to its user

The "retint_kernel" code block is misplaced. Since its logical
continuation is "retint_restore_args", it is more natural to
place it above that label. This also makes two jumps "short".

This change only moves code block around, without changing
logic.

This enables the next simplification: making
"retint_restore_args" label a local numeric one.

Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Alexei Starovoitov <ast@plumgrid.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Will Drewry <wad@chromium.org>
Link: http://lkml.kernel.org/r/1427738975-7391-2-git-send-email-dvlasenk@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/entry_64.S | 25 +++++++++++++------------
 1 file changed, 13 insertions(+), 12 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index dbfc8875d735..34d60c34fca8 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -740,7 +740,19 @@ opportunistic_sysret_failed:
 	SWAPGS
 	jmp restore_args
 
-retint_restore_args:	/* return to kernel space */
+/* Returning to kernel space */
+#ifdef CONFIG_PREEMPT
+	/* Interrupts are off */
+	/* Check if we need preemption */
+ENTRY(retint_kernel)
+	cmpl	$0,PER_CPU_VAR(__preempt_count)
+	jnz	retint_restore_args
+	bt	$9,EFLAGS(%rsp)	/* interrupts were off? */
+	jnc	retint_restore_args
+	call	preempt_schedule_irq
+	jmp	exit_intr
+#endif
+retint_restore_args:
 	DISABLE_INTERRUPTS(CLBR_ANY)
 	/*
 	 * The iretq could re-enable interrupts:
@@ -830,17 +842,6 @@ retint_signal:
 	GET_THREAD_INFO(%rcx)
 	jmp retint_with_reschedule
 
-#ifdef CONFIG_PREEMPT
-	/* Returning to kernel space. Check if we need preemption */
-	/* rcx:	 threadinfo. interrupts off. */
-ENTRY(retint_kernel)
-	cmpl $0,PER_CPU_VAR(__preempt_count)
-	jnz  retint_restore_args
-	bt   $9,EFLAGS(%rsp)	/* interrupts off? */
-	jnc  retint_restore_args
-	call preempt_schedule_irq
-	jmp exit_intr
-#endif
 	CFI_ENDPROC
 END(common_interrupt)
 
-- 
cgit v1.2.3


From a3675b32aac81c2c4733568844f8276527a37423 Mon Sep 17 00:00:00 2001
From: Denys Vlasenko
Date: Mon, 30 Mar 2015 20:09:34 +0200
Subject: x86/asm/entry/64: Do not GET_THREAD_INFO() too early

At exit_intr, we GET_THREAD_INFO(%rcx) and then jump to
retint_kernel if saved CS was from kernel. But the code at
retint_kernel doesn't need %rcx.

Move GET_THREAD_INFO(%rcx) down, after CS check and branch.

While at it, remove "has a correct top of stack" comment.
After recent changes which eliminated FIXUP_TOP_OF_STACK,
we always have a correct pt_regs layout.

Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Alexei Starovoitov <ast@plumgrid.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Will Drewry <wad@chromium.org>
Link: http://lkml.kernel.org/r/1427738975-7391-5-git-send-email-dvlasenk@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/entry_64.S | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 34d60c34fca8..6f251a5ee1dc 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -658,13 +658,12 @@ ret_from_intr:
 	CFI_ADJUST_CFA_OFFSET	RBP
 
 exit_intr:
-	GET_THREAD_INFO(%rcx)
 	testl $3,CS(%rsp)
 	je retint_kernel
-
 	/* Interrupt came from user space */
+
+	GET_THREAD_INFO(%rcx)
 	/*
-	 * Has a correct top of stack.
 	 * %rcx: thread info. Interrupts off.
 	 */
 retint_with_reschedule:
-- 
cgit v1.2.3


From 46423ffaf45c2494626841238d8bf75123a28caa Mon Sep 17 00:00:00 2001
From: Michael S. Tsirkin
Date: Sun, 29 Mar 2015 15:43:09 +0200
Subject: x86/microcode/amd: Drop the pci_ids.h dependency

This file doesn't use any macros from pci_ids.h anymore, drop the include.

Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: Andreas Herrmann <herrmann.der.user@googlemail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1427635734-24786-80-git-send-email-mst@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/microcode/amd.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c
index bfbbe6195e2d..12829c3ced3c 100644
--- a/arch/x86/kernel/cpu/microcode/amd.c
+++ b/arch/x86/kernel/cpu/microcode/amd.c
@@ -21,7 +21,6 @@
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
 #include <linux/firmware.h>
-#include <linux/pci_ids.h>
 #include <linux/uaccess.h>
 #include <linux/vmalloc.h>
 #include <linux/kernel.h>
-- 
cgit v1.2.3


From 4e26d11f52684dc8b1632a8cfe450cb5197a8464 Mon Sep 17 00:00:00 2001
From: Hector Marco-Gisbert
Date: Fri, 27 Mar 2015 12:38:21 +0100
Subject: x86/mm: Improve AMD Bulldozer ASLR workaround

The ASLR implementation needs to special-case AMD F15h processors by
clearing out bits [14:12] of the virtual address in order to avoid I$
cross invalidations and thus performance penalty for certain workloads.
For details, see:

  dfb09f9b7ab0 ("x86, amd: Avoid cache aliasing penalties on AMD family 15h")

This special case reduces the mmapped file's entropy by 3 bits.

The following output is the run on an AMD Opteron 62xx class CPU
processor under x86_64 Linux 4.0.0:

  $ for i in `seq 1 10`; do cat /proc/self/maps | grep "r-xp.*libc" ; done
  b7588000-b7736000 r-xp 00000000 00:01 4924       /lib/i386-linux-gnu/libc.so.6
  b7570000-b771e000 r-xp 00000000 00:01 4924       /lib/i386-linux-gnu/libc.so.6
  b75d0000-b777e000 r-xp 00000000 00:01 4924       /lib/i386-linux-gnu/libc.so.6
  b75b0000-b775e000 r-xp 00000000 00:01 4924       /lib/i386-linux-gnu/libc.so.6
  b7578000-b7726000 r-xp 00000000 00:01 4924       /lib/i386-linux-gnu/libc.so.6
  ...

Bits [12:14] are always 0, i.e. the address always ends in 0x8000 or
0x0000.

32-bit systems, as in the example above, are especially sensitive
to this issue because 32-bit randomness for VA space is 8 bits (see
mmap_rnd()). With the Bulldozer special case, this diminishes to only 32
different slots of mmap virtual addresses.

This patch randomizes per boot the three affected bits rather than
setting them to zero. Since all the shared pages have the same value
at bits [12..14], there is no cache aliasing problems. This value gets
generated during system boot and it is thus not known to a potential
remote attacker. Therefore, the impact from the Bulldozer workaround
gets diminished and ASLR randomness increased.

More details at:

  http://hmarco.org/bugs/AMD-Bulldozer-linux-ASLR-weakness-reducing-mmaped-files-by-eight.html

Original white paper by AMD dealing with the issue:

  http://developer.amd.com/wordpress/media/2012/10/SharedL1InstructionCacheonAMD15hCPU.pdf

Mentored-by: Ismael Ripoll <iripoll@disca.upv.es>
Signed-off-by: Hector Marco-Gisbert <hecmargi@upv.es>
Signed-off-by: Borislav Petkov <bp@suse.de>
Acked-by: Kees Cook <keescook@chromium.org>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Jan-Simon <dl9pf@gmx.de>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-fsdevel@vger.kernel.org
Link: http://lkml.kernel.org/r/1427456301-3764-1-git-send-email-hecmargi@upv.es
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/elf.h   |  1 +
 arch/x86/kernel/cpu/amd.c    |  4 ++++
 arch/x86/kernel/sys_x86_64.c | 30 +++++++++++++++++++++++++++---
 3 files changed, 32 insertions(+), 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
index ca3347a9dab5..bd292ce9be0a 100644
--- a/arch/x86/include/asm/elf.h
+++ b/arch/x86/include/asm/elf.h
@@ -365,6 +365,7 @@ enum align_flags {
 struct va_alignment {
 	int flags;
 	unsigned long mask;
+	unsigned long bits;
 } ____cacheline_aligned;
 
 extern struct va_alignment va_align;
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index a220239cea65..ec6a61b21b41 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -5,6 +5,7 @@
 
 #include <linux/io.h>
 #include <linux/sched.h>
+#include <linux/random.h>
 #include <asm/processor.h>
 #include <asm/apic.h>
 #include <asm/cpu.h>
@@ -488,6 +489,9 @@ static void bsp_init_amd(struct cpuinfo_x86 *c)
 
 		va_align.mask	  = (upperbit - 1) & PAGE_MASK;
 		va_align.flags    = ALIGN_VA_32 | ALIGN_VA_64;
+
+		/* A random value per boot for bit slice [12:upper_bit) */
+		va_align.bits = get_random_int() & va_align.mask;
 	}
 }
 
diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c
index 30277e27431a..10e0272d789a 100644
--- a/arch/x86/kernel/sys_x86_64.c
+++ b/arch/x86/kernel/sys_x86_64.c
@@ -34,10 +34,26 @@ static unsigned long get_align_mask(void)
 	return va_align.mask;
 }
 
+/*
+ * To avoid aliasing in the I$ on AMD F15h, the bits defined by the
+ * va_align.bits, [12:upper_bit), are set to a random value instead of
+ * zeroing them. This random value is computed once per boot. This form
+ * of ASLR is known as "per-boot ASLR".
+ *
+ * To achieve this, the random value is added to the info.align_offset
+ * value before calling vm_unmapped_area() or ORed directly to the
+ * address.
+ */
+static unsigned long get_align_bits(void)
+{
+	return va_align.bits & get_align_mask();
+}
+
 unsigned long align_vdso_addr(unsigned long addr)
 {
 	unsigned long align_mask = get_align_mask();
-	return (addr + align_mask) & ~align_mask;
+	addr = (addr + align_mask) & ~align_mask;
+	return addr | get_align_bits();
 }
 
 static int __init control_va_addr_alignment(char *str)
@@ -135,8 +151,12 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
 	info.length = len;
 	info.low_limit = begin;
 	info.high_limit = end;
-	info.align_mask = filp ? get_align_mask() : 0;
+	info.align_mask = 0;
 	info.align_offset = pgoff << PAGE_SHIFT;
+	if (filp) {
+		info.align_mask = get_align_mask();
+		info.align_offset += get_align_bits();
+	}
 	return vm_unmapped_area(&info);
 }
 
@@ -174,8 +194,12 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
 	info.length = len;
 	info.low_limit = PAGE_SIZE;
 	info.high_limit = mm->mmap_base;
-	info.align_mask = filp ? get_align_mask() : 0;
+	info.align_mask = 0;
 	info.align_offset = pgoff << PAGE_SHIFT;
+	if (filp) {
+		info.align_mask = get_align_mask();
+		info.align_offset += get_align_bits();
+	}
 	addr = vm_unmapped_area(&info);
 	if (!(addr & ~PAGE_MASK))
 		return addr;
-- 
cgit v1.2.3


From 0a4f59d6e09ef16fbb7d213cfa1bf472c7845fda Mon Sep 17 00:00:00 2001
From: Tommi Kyntola
Date: Fri, 27 Mar 2015 11:48:16 -0700
Subject: x86/vdso: Fix the x86 vdso2c tool includes

The build-time tool arch/x86/vdso/vdso2c.c includes <linux/elf.h>,
but cannot find it, unless the build host happens to provide it.

It should be reading the uapi linux/elf.h

This build regression came along with the vdso2c changes between
v3.15 and v3.16.

Signed-off-by: Tommi Kyntola <tommi.kyntola@gmail.com>
Signed-off-by: Andy Lutomirski <luto@amacapital.net>
Link: http://lkml.kernel.org/r/1525002.3cJ7BySVpA@musta
Link: http://lkml.kernel.org/r/efe1ec29eda830b1d0030882706f3dac99ce1f73.1427482099.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/vdso/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/vdso/Makefile b/arch/x86/vdso/Makefile
index 7b9be9822724..e07e52ade243 100644
--- a/arch/x86/vdso/Makefile
+++ b/arch/x86/vdso/Makefile
@@ -51,7 +51,7 @@ VDSO_LDFLAGS_vdso.lds = -m64 -Wl,-soname=linux-vdso.so.1 \
 $(obj)/vdso64.so.dbg: $(src)/vdso.lds $(vobjs) FORCE
 	$(call if_changed,vdso)
 
-HOST_EXTRACFLAGS += -I$(srctree)/tools/include
+HOST_EXTRACFLAGS += -I$(srctree)/tools/include -I$(srctree)/include/uapi
 hostprogs-y			+= vdso2c
 
 quiet_cmd_vdso2c = VDSO2C  $@
-- 
cgit v1.2.3


From e7d6eefaaa443130079d73cd05039d90b3db7a4a Mon Sep 17 00:00:00 2001
From: Denys Vlasenko
Date: Fri, 27 Mar 2015 11:48:17 -0700
Subject: x86/vdso32/syscall.S: Do not load __USER32_DS to %ss

This vDSO code only gets used by 64-bit kernels, not 32-bit ones.

On 64-bit kernels, the data segment is the same for 32-bit and
64-bit userspace, and the SYSRET instruction loads %ss with its
selector.

So there's no need to repeat it by hand. Segment loads are somewhat
expensive: tens of cycles.

Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
[ Removed unnecessary comment. ]
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Alexei Starovoitov <ast@plumgrid.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Will Drewry <wad@chromium.org>
Link: http://lkml.kernel.org/r/63da6d778f69fd0f1345d9287f6764d58be519fa.1427482099.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/vdso/vdso32/syscall.S | 2 --
 1 file changed, 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/vdso/vdso32/syscall.S b/arch/x86/vdso/vdso32/syscall.S
index 5415b5613d55..6b286bb5251c 100644
--- a/arch/x86/vdso/vdso32/syscall.S
+++ b/arch/x86/vdso/vdso32/syscall.S
@@ -19,8 +19,6 @@ __kernel_vsyscall:
 .Lpush_ebp:
 	movl	%ecx, %ebp
 	syscall
-	movl	$__USER32_DS, %ecx
-	movl	%ecx, %ss
 	movl	%ebp, %ecx
 	popl	%ebp
 .Lpop_ebp:
-- 
cgit v1.2.3


From ef37507d99ab29c253de3c55e7ee9fa4f6d50834 Mon Sep 17 00:00:00 2001
From: Andrey Skvortsov
Date: Fri, 27 Mar 2015 11:48:18 -0700
Subject: x86/vdso: Teach 'make clean' to remove generated vdso-image-*.c files

After 'make clean' the following files were left in arch/x86/vdso/:

  vdso-image-32-int80.c
  vdso-image-32-syscall.c
  vdso-image-32-sysenter.c

These file are generated during the build process and are present
in .gitignore, so remove them.

Signed-off-by: Andrey Skvortsov <andrej.skvortzov@gmail.com>
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Link: http://lkml.kernel.org/r/f85bb7ef6f8c6f6aa4bf422348018c84321454f8.1427482099.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/vdso/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/vdso/Makefile b/arch/x86/vdso/Makefile
index e07e52ade243..bea2f748ef86 100644
--- a/arch/x86/vdso/Makefile
+++ b/arch/x86/vdso/Makefile
@@ -206,4 +206,4 @@ $(vdso_img_insttargets): install_%: $(obj)/%.dbg $(MODLIB)/vdso FORCE
 PHONY += vdso_install $(vdso_img_insttargets)
 vdso_install: $(vdso_img_insttargets) FORCE
 
-clean-files := vdso32-syscall* vdso32-sysenter* vdso32-int80* vdso64*
+clean-files := vdso32-syscall* vdso32-sysenter* vdso32-int80* vdso64* vdso-image-*.c
-- 
cgit v1.2.3


From 115db5c68bd4ed7fbcb73f300e666ff127b359b6 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski
Date: Fri, 27 Mar 2015 11:48:19 -0700
Subject: x86/vdso: Remove x32 intermediates during 'make clean'

The existing clean-files rule was missing vdsox32.so and
vdsox32.so.dbg.  We should really rename the intermediates to
allow a single rule to get them all.

Also-reported-by: Magnus Damm <damm+renesas@opensource.se>
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Link: http://lkml.kernel.org/r/7fa2ad4a63bc6f52e214125900d54165ef06cc10.1427482099.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/vdso/Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/vdso/Makefile b/arch/x86/vdso/Makefile
index bea2f748ef86..275a3a8b78af 100644
--- a/arch/x86/vdso/Makefile
+++ b/arch/x86/vdso/Makefile
@@ -206,4 +206,4 @@ $(vdso_img_insttargets): install_%: $(obj)/%.dbg $(MODLIB)/vdso FORCE
 PHONY += vdso_install $(vdso_img_insttargets)
 vdso_install: $(vdso_img_insttargets) FORCE
 
-clean-files := vdso32-syscall* vdso32-sysenter* vdso32-int80* vdso64* vdso-image-*.c
+clean-files := vdso32-syscall* vdso32-sysenter* vdso32-int80* vdso64* vdso-image-*.c vdsox32.so*
-- 
cgit v1.2.3


From 55474c48b4726fd3914c1ec47fced0f931729979 Mon Sep 17 00:00:00 2001
From: Ingo Molnar
Date: Sun, 29 Mar 2015 11:02:34 +0200
Subject: x86/asm/entry: Remove user_mode_ignore_vm86()

user_mode_ignore_vm86() can be used instead of user_mode(), in
places where we have already done a v8086_mode() security
check of ptregs.

But doing this check in the wrong place would be a bug that
could result in security problems, and also the naming still
isn't very clear.

Furthermore, it only affects 32-bit kernels, while most
development happens on 64-bit kernels.

If we replace them with user_mode() checks then the cost is only
a very minor increase in various slowpaths:

   text             data   bss     dec              hex    filename
   10573391         703562 1753042 13029995         c6d26b vmlinux.o.before
   10573423         703562 1753042 13030027         c6d28b vmlinux.o.after

So lets get rid of this distinction once and for all.

Acked-by: Borislav Petkov <bp@suse.de>
Acked-by: Andy Lutomirski <luto@kernel.org>
Cc: Andrew Lutomirski <luto@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brad Spengler <spender@grsecurity.net>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20150329090233.GA1963@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/ptrace.h    | 17 -----------------
 arch/x86/kernel/cpu/perf_event.c |  2 +-
 arch/x86/kernel/traps.c          |  6 +++---
 3 files changed, 4 insertions(+), 21 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index d20bae298852..19507ffa5d28 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -113,23 +113,6 @@ static inline int user_mode(struct pt_regs *regs)
 #endif
 }
 
-/*
- * This is the fastest way to check whether regs come from user space.
- * It is unsafe if regs might come from vm86 mode, though -- in vm86
- * mode, all bits of CS and SS are completely under the user's control.
- * The CPU considers vm86 mode to be CPL 3 regardless of CS and SS.
- *
- * Do NOT use this function unless you have already ruled out the
- * possibility that regs came from vm86 mode.
- *
- * We check for RPL != 0 instead of RPL == 3 because we don't use rings
- * 1 or 2 and this is more efficient.
- */
-static inline int user_mode_ignore_vm86(struct pt_regs *regs)
-{
-	return (regs->cs & SEGMENT_RPL_MASK) != 0;
-}
-
 static inline int v8086_mode(struct pt_regs *regs)
 {
 #ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 56f7e60ad732..e2888a3ad1e3 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -2159,7 +2159,7 @@ static unsigned long code_segment_base(struct pt_regs *regs)
 	if (regs->flags & X86_VM_MASK)
 		return 0x10 * regs->cs;
 
-	if (user_mode_ignore_vm86(regs) && regs->cs != __USER_CS)
+	if (user_mode(regs) && regs->cs != __USER_CS)
 		return get_segment_base(regs->cs);
 #else
 	if (user_mode(regs) && !user_64bit_mode(regs) &&
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index c8eb469a94a4..6751c5c58eec 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -207,7 +207,7 @@ do_trap_no_signal(struct task_struct *tsk, int trapnr, char *str,
 		return -1;
 	}
 
-	if (!user_mode_ignore_vm86(regs)) {
+	if (!user_mode(regs)) {
 		if (!fixup_exception(regs)) {
 			tsk->thread.error_code = error_code;
 			tsk->thread.trap_nr = trapnr;
@@ -468,7 +468,7 @@ do_general_protection(struct pt_regs *regs, long error_code)
 	}
 
 	tsk = current;
-	if (!user_mode_ignore_vm86(regs)) {
+	if (!user_mode(regs)) {
 		if (fixup_exception(regs))
 			goto exit;
 
@@ -685,7 +685,7 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code)
 	 * We already checked v86 mode above, so we can check for kernel mode
 	 * by just checking the CPL of CS.
 	 */
-	if ((dr6 & DR_STEP) && !user_mode_ignore_vm86(regs)) {
+	if ((dr6 & DR_STEP) && !user_mode(regs)) {
 		tsk->thread.debugreg6 &= ~DR_STEP;
 		set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
 		regs->flags &= ~X86_EFLAGS_TF;
-- 
cgit v1.2.3


From eabdc320ece583e16e581306c720e5f1ff67c3bb Mon Sep 17 00:00:00 2001
From: Stephan Mueller
Date: Mon, 30 Mar 2015 21:58:17 +0200
Subject: crypto: aesni - mark AES-NI helper ciphers

Flag all AES-NI helper ciphers as internal ciphers to prevent them from
being called by normal users.

Signed-off-by: Stephan Mueller <smueller@chronox.de>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/aesni-intel_glue.c | 23 +++++++++++++++--------
 1 file changed, 15 insertions(+), 8 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index 6893f4947583..f9a78f32f494 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -797,7 +797,9 @@ static int rfc4106_init(struct crypto_tfm *tfm)
 		PTR_ALIGN((u8 *)crypto_tfm_ctx(tfm), AESNI_ALIGN);
 	struct crypto_aead *cryptd_child;
 	struct aesni_rfc4106_gcm_ctx *child_ctx;
-	cryptd_tfm = cryptd_alloc_aead("__driver-gcm-aes-aesni", 0, 0);
+	cryptd_tfm = cryptd_alloc_aead("__driver-gcm-aes-aesni",
+				       CRYPTO_ALG_INTERNAL,
+				       CRYPTO_ALG_INTERNAL);
 	if (IS_ERR(cryptd_tfm))
 		return PTR_ERR(cryptd_tfm);
 
@@ -1262,7 +1264,7 @@ static struct crypto_alg aesni_algs[] = { {
 	.cra_name		= "__aes-aesni",
 	.cra_driver_name	= "__driver-aes-aesni",
 	.cra_priority		= 0,
-	.cra_flags		= CRYPTO_ALG_TYPE_CIPHER,
+	.cra_flags		= CRYPTO_ALG_TYPE_CIPHER | CRYPTO_ALG_INTERNAL,
 	.cra_blocksize		= AES_BLOCK_SIZE,
 	.cra_ctxsize		= sizeof(struct crypto_aes_ctx) +
 				  AESNI_ALIGN - 1,
@@ -1281,7 +1283,8 @@ static struct crypto_alg aesni_algs[] = { {
 	.cra_name		= "__ecb-aes-aesni",
 	.cra_driver_name	= "__driver-ecb-aes-aesni",
 	.cra_priority		= 0,
-	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER |
+				  CRYPTO_ALG_INTERNAL,
 	.cra_blocksize		= AES_BLOCK_SIZE,
 	.cra_ctxsize		= sizeof(struct crypto_aes_ctx) +
 				  AESNI_ALIGN - 1,
@@ -1301,7 +1304,8 @@ static struct crypto_alg aesni_algs[] = { {
 	.cra_name		= "__cbc-aes-aesni",
 	.cra_driver_name	= "__driver-cbc-aes-aesni",
 	.cra_priority		= 0,
-	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER |
+				  CRYPTO_ALG_INTERNAL,
 	.cra_blocksize		= AES_BLOCK_SIZE,
 	.cra_ctxsize		= sizeof(struct crypto_aes_ctx) +
 				  AESNI_ALIGN - 1,
@@ -1365,7 +1369,8 @@ static struct crypto_alg aesni_algs[] = { {
 	.cra_name		= "__ctr-aes-aesni",
 	.cra_driver_name	= "__driver-ctr-aes-aesni",
 	.cra_priority		= 0,
-	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER |
+				  CRYPTO_ALG_INTERNAL,
 	.cra_blocksize		= 1,
 	.cra_ctxsize		= sizeof(struct crypto_aes_ctx) +
 				  AESNI_ALIGN - 1,
@@ -1409,7 +1414,7 @@ static struct crypto_alg aesni_algs[] = { {
 	.cra_name		= "__gcm-aes-aesni",
 	.cra_driver_name	= "__driver-gcm-aes-aesni",
 	.cra_priority		= 0,
-	.cra_flags		= CRYPTO_ALG_TYPE_AEAD,
+	.cra_flags		= CRYPTO_ALG_TYPE_AEAD | CRYPTO_ALG_INTERNAL,
 	.cra_blocksize		= 1,
 	.cra_ctxsize		= sizeof(struct aesni_rfc4106_gcm_ctx) +
 				  AESNI_ALIGN,
@@ -1479,7 +1484,8 @@ static struct crypto_alg aesni_algs[] = { {
 	.cra_name		= "__lrw-aes-aesni",
 	.cra_driver_name	= "__driver-lrw-aes-aesni",
 	.cra_priority		= 0,
-	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER |
+				  CRYPTO_ALG_INTERNAL,
 	.cra_blocksize		= AES_BLOCK_SIZE,
 	.cra_ctxsize		= sizeof(struct aesni_lrw_ctx),
 	.cra_alignmask		= 0,
@@ -1500,7 +1506,8 @@ static struct crypto_alg aesni_algs[] = { {
 	.cra_name		= "__xts-aes-aesni",
 	.cra_driver_name	= "__driver-xts-aes-aesni",
 	.cra_priority		= 0,
-	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER |
+				  CRYPTO_ALG_INTERNAL,
 	.cra_blocksize		= AES_BLOCK_SIZE,
 	.cra_ctxsize		= sizeof(struct aesni_xts_ctx),
 	.cra_alignmask		= 0,
-- 
cgit v1.2.3


From 6a9b52b7fa1a376c1749cf441d1aeb8572d3b691 Mon Sep 17 00:00:00 2001
From: Stephan Mueller
Date: Mon, 30 Mar 2015 22:01:49 +0200
Subject: crypto: clmulni - mark ghash clmulni helper ciphers

Flag all ash clmulni helper ciphers as internal ciphers to prevent them
from being called by normal users.

Signed-off-by: Stephan Mueller <smueller@chronox.de>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/ghash-clmulni-intel_glue.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/crypto/ghash-clmulni-intel_glue.c b/arch/x86/crypto/ghash-clmulni-intel_glue.c
index 8253d85aa165..2079baf06bdd 100644
--- a/arch/x86/crypto/ghash-clmulni-intel_glue.c
+++ b/arch/x86/crypto/ghash-clmulni-intel_glue.c
@@ -154,7 +154,8 @@ static struct shash_alg ghash_alg = {
 		.cra_name		= "__ghash",
 		.cra_driver_name	= "__ghash-pclmulqdqni",
 		.cra_priority		= 0,
-		.cra_flags		= CRYPTO_ALG_TYPE_SHASH,
+		.cra_flags		= CRYPTO_ALG_TYPE_SHASH |
+					  CRYPTO_ALG_INTERNAL,
 		.cra_blocksize		= GHASH_BLOCK_SIZE,
 		.cra_ctxsize		= sizeof(struct ghash_ctx),
 		.cra_module		= THIS_MODULE,
@@ -261,7 +262,9 @@ static int ghash_async_init_tfm(struct crypto_tfm *tfm)
 	struct cryptd_ahash *cryptd_tfm;
 	struct ghash_async_ctx *ctx = crypto_tfm_ctx(tfm);
 
-	cryptd_tfm = cryptd_alloc_ahash("__ghash-pclmulqdqni", 0, 0);
+	cryptd_tfm = cryptd_alloc_ahash("__ghash-pclmulqdqni",
+					CRYPTO_ALG_INTERNAL,
+					CRYPTO_ALG_INTERNAL);
 	if (IS_ERR(cryptd_tfm))
 		return PTR_ERR(cryptd_tfm);
 	ctx->cryptd_tfm = cryptd_tfm;
-- 
cgit v1.2.3


From a62356a978180eb3ccd2bcac49764dfd628c72f8 Mon Sep 17 00:00:00 2001
From: Stephan Mueller
Date: Mon, 30 Mar 2015 22:03:17 +0200
Subject: crypto: camellia_aesni_avx2 - mark AES-NI Camellia helper ciphers

Flag all AES-NI Camellia helper ciphers as internal ciphers to
prevent them from being called by normal users.

Signed-off-by: Stephan Mueller <smueller@chronox.de>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/camellia_aesni_avx2_glue.c | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/crypto/camellia_aesni_avx2_glue.c b/arch/x86/crypto/camellia_aesni_avx2_glue.c
index 9a07fafe3831..baf0ac21ace5 100644
--- a/arch/x86/crypto/camellia_aesni_avx2_glue.c
+++ b/arch/x86/crypto/camellia_aesni_avx2_glue.c
@@ -343,7 +343,8 @@ static struct crypto_alg cmll_algs[10] = { {
 	.cra_name		= "__ecb-camellia-aesni-avx2",
 	.cra_driver_name	= "__driver-ecb-camellia-aesni-avx2",
 	.cra_priority		= 0,
-	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER |
+				  CRYPTO_ALG_INTERNAL,
 	.cra_blocksize		= CAMELLIA_BLOCK_SIZE,
 	.cra_ctxsize		= sizeof(struct camellia_ctx),
 	.cra_alignmask		= 0,
@@ -362,7 +363,8 @@ static struct crypto_alg cmll_algs[10] = { {
 	.cra_name		= "__cbc-camellia-aesni-avx2",
 	.cra_driver_name	= "__driver-cbc-camellia-aesni-avx2",
 	.cra_priority		= 0,
-	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER |
+				  CRYPTO_ALG_INTERNAL,
 	.cra_blocksize		= CAMELLIA_BLOCK_SIZE,
 	.cra_ctxsize		= sizeof(struct camellia_ctx),
 	.cra_alignmask		= 0,
@@ -381,7 +383,8 @@ static struct crypto_alg cmll_algs[10] = { {
 	.cra_name		= "__ctr-camellia-aesni-avx2",
 	.cra_driver_name	= "__driver-ctr-camellia-aesni-avx2",
 	.cra_priority		= 0,
-	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER |
+				  CRYPTO_ALG_INTERNAL,
 	.cra_blocksize		= 1,
 	.cra_ctxsize		= sizeof(struct camellia_ctx),
 	.cra_alignmask		= 0,
@@ -401,7 +404,8 @@ static struct crypto_alg cmll_algs[10] = { {
 	.cra_name		= "__lrw-camellia-aesni-avx2",
 	.cra_driver_name	= "__driver-lrw-camellia-aesni-avx2",
 	.cra_priority		= 0,
-	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER |
+				  CRYPTO_ALG_INTERNAL,
 	.cra_blocksize		= CAMELLIA_BLOCK_SIZE,
 	.cra_ctxsize		= sizeof(struct camellia_lrw_ctx),
 	.cra_alignmask		= 0,
@@ -424,7 +428,8 @@ static struct crypto_alg cmll_algs[10] = { {
 	.cra_name		= "__xts-camellia-aesni-avx2",
 	.cra_driver_name	= "__driver-xts-camellia-aesni-avx2",
 	.cra_priority		= 0,
-	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER |
+				  CRYPTO_ALG_INTERNAL,
 	.cra_blocksize		= CAMELLIA_BLOCK_SIZE,
 	.cra_ctxsize		= sizeof(struct camellia_xts_ctx),
 	.cra_alignmask		= 0,
-- 
cgit v1.2.3


From 680574e8b3b2f62e507cdd9ca35c77cad0344bb2 Mon Sep 17 00:00:00 2001
From: Stephan Mueller
Date: Mon, 30 Mar 2015 22:03:57 +0200
Subject: crypto: cast5_avx - mark CAST5 helper ciphers

Flag all CAST5 helper ciphers as internal ciphers to prevent them
from being called by normal users.

Signed-off-by: Stephan Mueller <smueller@chronox.de>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/cast5_avx_glue.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/crypto/cast5_avx_glue.c b/arch/x86/crypto/cast5_avx_glue.c
index 60ada677a928..236c80974457 100644
--- a/arch/x86/crypto/cast5_avx_glue.c
+++ b/arch/x86/crypto/cast5_avx_glue.c
@@ -341,7 +341,8 @@ static struct crypto_alg cast5_algs[6] = { {
 	.cra_name		= "__ecb-cast5-avx",
 	.cra_driver_name	= "__driver-ecb-cast5-avx",
 	.cra_priority		= 0,
-	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER |
+				  CRYPTO_ALG_INTERNAL,
 	.cra_blocksize		= CAST5_BLOCK_SIZE,
 	.cra_ctxsize		= sizeof(struct cast5_ctx),
 	.cra_alignmask		= 0,
@@ -360,7 +361,8 @@ static struct crypto_alg cast5_algs[6] = { {
 	.cra_name		= "__cbc-cast5-avx",
 	.cra_driver_name	= "__driver-cbc-cast5-avx",
 	.cra_priority		= 0,
-	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER |
+				  CRYPTO_ALG_INTERNAL,
 	.cra_blocksize		= CAST5_BLOCK_SIZE,
 	.cra_ctxsize		= sizeof(struct cast5_ctx),
 	.cra_alignmask		= 0,
@@ -379,7 +381,8 @@ static struct crypto_alg cast5_algs[6] = { {
 	.cra_name		= "__ctr-cast5-avx",
 	.cra_driver_name	= "__driver-ctr-cast5-avx",
 	.cra_priority		= 0,
-	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER |
+				  CRYPTO_ALG_INTERNAL,
 	.cra_blocksize		= 1,
 	.cra_ctxsize		= sizeof(struct cast5_ctx),
 	.cra_alignmask		= 0,
-- 
cgit v1.2.3


From 7d2c31dd70ed49210847ae8cc00f14064292b349 Mon Sep 17 00:00:00 2001
From: Stephan Mueller
Date: Mon, 30 Mar 2015 22:04:49 +0200
Subject: crypto: camellia_aesni_avx - mark AVX Camellia helper ciphers

Flag all AVX Camellia helper ciphers as internal ciphers to prevent
them from being called by normal users.

Signed-off-by: Stephan Mueller <smueller@chronox.de>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/camellia_aesni_avx_glue.c | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/crypto/camellia_aesni_avx_glue.c b/arch/x86/crypto/camellia_aesni_avx_glue.c
index ed38d959add6..78818a1e73e3 100644
--- a/arch/x86/crypto/camellia_aesni_avx_glue.c
+++ b/arch/x86/crypto/camellia_aesni_avx_glue.c
@@ -335,7 +335,8 @@ static struct crypto_alg cmll_algs[10] = { {
 	.cra_name		= "__ecb-camellia-aesni",
 	.cra_driver_name	= "__driver-ecb-camellia-aesni",
 	.cra_priority		= 0,
-	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER |
+				  CRYPTO_ALG_INTERNAL,
 	.cra_blocksize		= CAMELLIA_BLOCK_SIZE,
 	.cra_ctxsize		= sizeof(struct camellia_ctx),
 	.cra_alignmask		= 0,
@@ -354,7 +355,8 @@ static struct crypto_alg cmll_algs[10] = { {
 	.cra_name		= "__cbc-camellia-aesni",
 	.cra_driver_name	= "__driver-cbc-camellia-aesni",
 	.cra_priority		= 0,
-	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER |
+				  CRYPTO_ALG_INTERNAL,
 	.cra_blocksize		= CAMELLIA_BLOCK_SIZE,
 	.cra_ctxsize		= sizeof(struct camellia_ctx),
 	.cra_alignmask		= 0,
@@ -373,7 +375,8 @@ static struct crypto_alg cmll_algs[10] = { {
 	.cra_name		= "__ctr-camellia-aesni",
 	.cra_driver_name	= "__driver-ctr-camellia-aesni",
 	.cra_priority		= 0,
-	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER |
+				  CRYPTO_ALG_INTERNAL,
 	.cra_blocksize		= 1,
 	.cra_ctxsize		= sizeof(struct camellia_ctx),
 	.cra_alignmask		= 0,
@@ -393,7 +396,8 @@ static struct crypto_alg cmll_algs[10] = { {
 	.cra_name		= "__lrw-camellia-aesni",
 	.cra_driver_name	= "__driver-lrw-camellia-aesni",
 	.cra_priority		= 0,
-	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER |
+				  CRYPTO_ALG_INTERNAL,
 	.cra_blocksize		= CAMELLIA_BLOCK_SIZE,
 	.cra_ctxsize		= sizeof(struct camellia_lrw_ctx),
 	.cra_alignmask		= 0,
@@ -416,7 +420,8 @@ static struct crypto_alg cmll_algs[10] = { {
 	.cra_name		= "__xts-camellia-aesni",
 	.cra_driver_name	= "__driver-xts-camellia-aesni",
 	.cra_priority		= 0,
-	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER |
+				  CRYPTO_ALG_INTERNAL,
 	.cra_blocksize		= CAMELLIA_BLOCK_SIZE,
 	.cra_ctxsize		= sizeof(struct camellia_xts_ctx),
 	.cra_alignmask		= 0,
-- 
cgit v1.2.3


From e69b8a46ca0ec38ef419071b00574bc053664e18 Mon Sep 17 00:00:00 2001
From: Stephan Mueller
Date: Mon, 30 Mar 2015 22:05:35 +0200
Subject: crypto: cast6_avx - mark CAST6 helper ciphers

Flag all CAST6 helper ciphers as internal ciphers to prevent them
from being called by normal users.

Signed-off-by: Stephan Mueller <smueller@chronox.de>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/cast6_avx_glue.c | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/crypto/cast6_avx_glue.c b/arch/x86/crypto/cast6_avx_glue.c
index 0160f68a57ff..f448810ca4ac 100644
--- a/arch/x86/crypto/cast6_avx_glue.c
+++ b/arch/x86/crypto/cast6_avx_glue.c
@@ -372,7 +372,8 @@ static struct crypto_alg cast6_algs[10] = { {
 	.cra_name		= "__ecb-cast6-avx",
 	.cra_driver_name	= "__driver-ecb-cast6-avx",
 	.cra_priority		= 0,
-	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER |
+				  CRYPTO_ALG_INTERNAL,
 	.cra_blocksize		= CAST6_BLOCK_SIZE,
 	.cra_ctxsize		= sizeof(struct cast6_ctx),
 	.cra_alignmask		= 0,
@@ -391,7 +392,8 @@ static struct crypto_alg cast6_algs[10] = { {
 	.cra_name		= "__cbc-cast6-avx",
 	.cra_driver_name	= "__driver-cbc-cast6-avx",
 	.cra_priority		= 0,
-	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER |
+				  CRYPTO_ALG_INTERNAL,
 	.cra_blocksize		= CAST6_BLOCK_SIZE,
 	.cra_ctxsize		= sizeof(struct cast6_ctx),
 	.cra_alignmask		= 0,
@@ -410,7 +412,8 @@ static struct crypto_alg cast6_algs[10] = { {
 	.cra_name		= "__ctr-cast6-avx",
 	.cra_driver_name	= "__driver-ctr-cast6-avx",
 	.cra_priority		= 0,
-	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER |
+				  CRYPTO_ALG_INTERNAL,
 	.cra_blocksize		= 1,
 	.cra_ctxsize		= sizeof(struct cast6_ctx),
 	.cra_alignmask		= 0,
@@ -430,7 +433,8 @@ static struct crypto_alg cast6_algs[10] = { {
 	.cra_name		= "__lrw-cast6-avx",
 	.cra_driver_name	= "__driver-lrw-cast6-avx",
 	.cra_priority		= 0,
-	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER |
+				  CRYPTO_ALG_INTERNAL,
 	.cra_blocksize		= CAST6_BLOCK_SIZE,
 	.cra_ctxsize		= sizeof(struct cast6_lrw_ctx),
 	.cra_alignmask		= 0,
@@ -453,7 +457,8 @@ static struct crypto_alg cast6_algs[10] = { {
 	.cra_name		= "__xts-cast6-avx",
 	.cra_driver_name	= "__driver-xts-cast6-avx",
 	.cra_priority		= 0,
-	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER |
+				  CRYPTO_ALG_INTERNAL,
 	.cra_blocksize		= CAST6_BLOCK_SIZE,
 	.cra_ctxsize		= sizeof(struct cast6_xts_ctx),
 	.cra_alignmask		= 0,
-- 
cgit v1.2.3


From f82419acd8ad6b045a040ba8f5f0289972189826 Mon Sep 17 00:00:00 2001
From: Stephan Mueller
Date: Mon, 30 Mar 2015 22:06:13 +0200
Subject: crypto: serpent_avx2 - mark Serpent AVX2 helper ciphers

Flag all Serpent AVX2 helper ciphers as internal ciphers to prevent
them from being called by normal users.

Signed-off-by: Stephan Mueller <smueller@chronox.de>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/serpent_avx2_glue.c | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/crypto/serpent_avx2_glue.c b/arch/x86/crypto/serpent_avx2_glue.c
index 437e47a4d302..2f63dc89e7a9 100644
--- a/arch/x86/crypto/serpent_avx2_glue.c
+++ b/arch/x86/crypto/serpent_avx2_glue.c
@@ -309,7 +309,8 @@ static struct crypto_alg srp_algs[10] = { {
 	.cra_name		= "__ecb-serpent-avx2",
 	.cra_driver_name	= "__driver-ecb-serpent-avx2",
 	.cra_priority		= 0,
-	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER |
+				  CRYPTO_ALG_INTERNAL,
 	.cra_blocksize		= SERPENT_BLOCK_SIZE,
 	.cra_ctxsize		= sizeof(struct serpent_ctx),
 	.cra_alignmask		= 0,
@@ -329,7 +330,8 @@ static struct crypto_alg srp_algs[10] = { {
 	.cra_name		= "__cbc-serpent-avx2",
 	.cra_driver_name	= "__driver-cbc-serpent-avx2",
 	.cra_priority		= 0,
-	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER |
+				  CRYPTO_ALG_INTERNAL,
 	.cra_blocksize		= SERPENT_BLOCK_SIZE,
 	.cra_ctxsize		= sizeof(struct serpent_ctx),
 	.cra_alignmask		= 0,
@@ -349,7 +351,8 @@ static struct crypto_alg srp_algs[10] = { {
 	.cra_name		= "__ctr-serpent-avx2",
 	.cra_driver_name	= "__driver-ctr-serpent-avx2",
 	.cra_priority		= 0,
-	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER |
+				  CRYPTO_ALG_INTERNAL,
 	.cra_blocksize		= 1,
 	.cra_ctxsize		= sizeof(struct serpent_ctx),
 	.cra_alignmask		= 0,
@@ -370,7 +373,8 @@ static struct crypto_alg srp_algs[10] = { {
 	.cra_name		= "__lrw-serpent-avx2",
 	.cra_driver_name	= "__driver-lrw-serpent-avx2",
 	.cra_priority		= 0,
-	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER |
+				  CRYPTO_ALG_INTERNAL,
 	.cra_blocksize		= SERPENT_BLOCK_SIZE,
 	.cra_ctxsize		= sizeof(struct serpent_lrw_ctx),
 	.cra_alignmask		= 0,
@@ -394,7 +398,8 @@ static struct crypto_alg srp_algs[10] = { {
 	.cra_name		= "__xts-serpent-avx2",
 	.cra_driver_name	= "__driver-xts-serpent-avx2",
 	.cra_priority		= 0,
-	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER |
+				  CRYPTO_ALG_INTERNAL,
 	.cra_blocksize		= SERPENT_BLOCK_SIZE,
 	.cra_ctxsize		= sizeof(struct serpent_xts_ctx),
 	.cra_alignmask		= 0,
-- 
cgit v1.2.3


From 65aed5394120953ccb6e307d75f9abd17c15740e Mon Sep 17 00:00:00 2001
From: Stephan Mueller
Date: Mon, 30 Mar 2015 22:07:05 +0200
Subject: crypto: serpent_avx - mark Serpent AVX helper ciphers

Flag all Serpent AVX helper ciphers as internal ciphers to prevent
them from being called by normal users.

Signed-off-by: Stephan Mueller <smueller@chronox.de>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/serpent_avx_glue.c | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/crypto/serpent_avx_glue.c b/arch/x86/crypto/serpent_avx_glue.c
index 7e217398b4eb..c8d478af8456 100644
--- a/arch/x86/crypto/serpent_avx_glue.c
+++ b/arch/x86/crypto/serpent_avx_glue.c
@@ -378,7 +378,8 @@ static struct crypto_alg serpent_algs[10] = { {
 	.cra_name		= "__ecb-serpent-avx",
 	.cra_driver_name	= "__driver-ecb-serpent-avx",
 	.cra_priority		= 0,
-	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER |
+				  CRYPTO_ALG_INTERNAL,
 	.cra_blocksize		= SERPENT_BLOCK_SIZE,
 	.cra_ctxsize		= sizeof(struct serpent_ctx),
 	.cra_alignmask		= 0,
@@ -397,7 +398,8 @@ static struct crypto_alg serpent_algs[10] = { {
 	.cra_name		= "__cbc-serpent-avx",
 	.cra_driver_name	= "__driver-cbc-serpent-avx",
 	.cra_priority		= 0,
-	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER |
+				  CRYPTO_ALG_INTERNAL,
 	.cra_blocksize		= SERPENT_BLOCK_SIZE,
 	.cra_ctxsize		= sizeof(struct serpent_ctx),
 	.cra_alignmask		= 0,
@@ -416,7 +418,8 @@ static struct crypto_alg serpent_algs[10] = { {
 	.cra_name		= "__ctr-serpent-avx",
 	.cra_driver_name	= "__driver-ctr-serpent-avx",
 	.cra_priority		= 0,
-	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER |
+				  CRYPTO_ALG_INTERNAL,
 	.cra_blocksize		= 1,
 	.cra_ctxsize		= sizeof(struct serpent_ctx),
 	.cra_alignmask		= 0,
@@ -436,7 +439,8 @@ static struct crypto_alg serpent_algs[10] = { {
 	.cra_name		= "__lrw-serpent-avx",
 	.cra_driver_name	= "__driver-lrw-serpent-avx",
 	.cra_priority		= 0,
-	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER |
+				  CRYPTO_ALG_INTERNAL,
 	.cra_blocksize		= SERPENT_BLOCK_SIZE,
 	.cra_ctxsize		= sizeof(struct serpent_lrw_ctx),
 	.cra_alignmask		= 0,
@@ -459,7 +463,8 @@ static struct crypto_alg serpent_algs[10] = { {
 	.cra_name		= "__xts-serpent-avx",
 	.cra_driver_name	= "__driver-xts-serpent-avx",
 	.cra_priority		= 0,
-	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER |
+				  CRYPTO_ALG_INTERNAL,
 	.cra_blocksize		= SERPENT_BLOCK_SIZE,
 	.cra_ctxsize		= sizeof(struct serpent_xts_ctx),
 	.cra_alignmask		= 0,
-- 
cgit v1.2.3


From 748be1f1bfddfe83d4b31c17191b082e96c86867 Mon Sep 17 00:00:00 2001
From: Stephan Mueller
Date: Mon, 30 Mar 2015 22:07:45 +0200
Subject: crypto: serpent_sse2 - mark Serpent SSE2 helper ciphers

Flag all Serpent SSE2 helper ciphers as internal ciphers to prevent
them from being called by normal users.

Signed-off-by: Stephan Mueller <smueller@chronox.de>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/serpent_sse2_glue.c | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/crypto/serpent_sse2_glue.c b/arch/x86/crypto/serpent_sse2_glue.c
index bf025adaea01..3643dd508f45 100644
--- a/arch/x86/crypto/serpent_sse2_glue.c
+++ b/arch/x86/crypto/serpent_sse2_glue.c
@@ -387,7 +387,8 @@ static struct crypto_alg serpent_algs[10] = { {
 	.cra_name		= "__ecb-serpent-sse2",
 	.cra_driver_name	= "__driver-ecb-serpent-sse2",
 	.cra_priority		= 0,
-	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER |
+				  CRYPTO_ALG_INTERNAL,
 	.cra_blocksize		= SERPENT_BLOCK_SIZE,
 	.cra_ctxsize		= sizeof(struct serpent_ctx),
 	.cra_alignmask		= 0,
@@ -406,7 +407,8 @@ static struct crypto_alg serpent_algs[10] = { {
 	.cra_name		= "__cbc-serpent-sse2",
 	.cra_driver_name	= "__driver-cbc-serpent-sse2",
 	.cra_priority		= 0,
-	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER |
+				  CRYPTO_ALG_INTERNAL,
 	.cra_blocksize		= SERPENT_BLOCK_SIZE,
 	.cra_ctxsize		= sizeof(struct serpent_ctx),
 	.cra_alignmask		= 0,
@@ -425,7 +427,8 @@ static struct crypto_alg serpent_algs[10] = { {
 	.cra_name		= "__ctr-serpent-sse2",
 	.cra_driver_name	= "__driver-ctr-serpent-sse2",
 	.cra_priority		= 0,
-	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER |
+				  CRYPTO_ALG_INTERNAL,
 	.cra_blocksize		= 1,
 	.cra_ctxsize		= sizeof(struct serpent_ctx),
 	.cra_alignmask		= 0,
@@ -445,7 +448,8 @@ static struct crypto_alg serpent_algs[10] = { {
 	.cra_name		= "__lrw-serpent-sse2",
 	.cra_driver_name	= "__driver-lrw-serpent-sse2",
 	.cra_priority		= 0,
-	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER |
+				  CRYPTO_ALG_INTERNAL,
 	.cra_blocksize		= SERPENT_BLOCK_SIZE,
 	.cra_ctxsize		= sizeof(struct serpent_lrw_ctx),
 	.cra_alignmask		= 0,
@@ -468,7 +472,8 @@ static struct crypto_alg serpent_algs[10] = { {
 	.cra_name		= "__xts-serpent-sse2",
 	.cra_driver_name	= "__driver-xts-serpent-sse2",
 	.cra_priority		= 0,
-	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER |
+				  CRYPTO_ALG_INTERNAL,
 	.cra_blocksize		= SERPENT_BLOCK_SIZE,
 	.cra_ctxsize		= sizeof(struct serpent_xts_ctx),
 	.cra_alignmask		= 0,
-- 
cgit v1.2.3


From 4dda66f62e01e832e09d6d1fbd9c9070d57b049e Mon Sep 17 00:00:00 2001
From: Stephan Mueller
Date: Mon, 30 Mar 2015 22:08:53 +0200
Subject: crypto: twofish_avx - mark Twofish AVX helper ciphers

Flag all Twofish AVX helper ciphers as internal ciphers to prevent
them from being called by normal users.

Signed-off-by: Stephan Mueller <smueller@chronox.de>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/twofish_avx_glue.c | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/crypto/twofish_avx_glue.c b/arch/x86/crypto/twofish_avx_glue.c
index 1ac531ea9bcc..b5e2d5651851 100644
--- a/arch/x86/crypto/twofish_avx_glue.c
+++ b/arch/x86/crypto/twofish_avx_glue.c
@@ -340,7 +340,8 @@ static struct crypto_alg twofish_algs[10] = { {
 	.cra_name		= "__ecb-twofish-avx",
 	.cra_driver_name	= "__driver-ecb-twofish-avx",
 	.cra_priority		= 0,
-	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER |
+				  CRYPTO_ALG_INTERNAL,
 	.cra_blocksize		= TF_BLOCK_SIZE,
 	.cra_ctxsize		= sizeof(struct twofish_ctx),
 	.cra_alignmask		= 0,
@@ -359,7 +360,8 @@ static struct crypto_alg twofish_algs[10] = { {
 	.cra_name		= "__cbc-twofish-avx",
 	.cra_driver_name	= "__driver-cbc-twofish-avx",
 	.cra_priority		= 0,
-	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER |
+				  CRYPTO_ALG_INTERNAL,
 	.cra_blocksize		= TF_BLOCK_SIZE,
 	.cra_ctxsize		= sizeof(struct twofish_ctx),
 	.cra_alignmask		= 0,
@@ -378,7 +380,8 @@ static struct crypto_alg twofish_algs[10] = { {
 	.cra_name		= "__ctr-twofish-avx",
 	.cra_driver_name	= "__driver-ctr-twofish-avx",
 	.cra_priority		= 0,
-	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER |
+				  CRYPTO_ALG_INTERNAL,
 	.cra_blocksize		= 1,
 	.cra_ctxsize		= sizeof(struct twofish_ctx),
 	.cra_alignmask		= 0,
@@ -398,7 +401,8 @@ static struct crypto_alg twofish_algs[10] = { {
 	.cra_name		= "__lrw-twofish-avx",
 	.cra_driver_name	= "__driver-lrw-twofish-avx",
 	.cra_priority		= 0,
-	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER |
+				  CRYPTO_ALG_INTERNAL,
 	.cra_blocksize		= TF_BLOCK_SIZE,
 	.cra_ctxsize		= sizeof(struct twofish_lrw_ctx),
 	.cra_alignmask		= 0,
@@ -421,7 +425,8 @@ static struct crypto_alg twofish_algs[10] = { {
 	.cra_name		= "__xts-twofish-avx",
 	.cra_driver_name	= "__driver-xts-twofish-avx",
 	.cra_priority		= 0,
-	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER,
+	.cra_flags		= CRYPTO_ALG_TYPE_BLKCIPHER |
+				  CRYPTO_ALG_INTERNAL,
 	.cra_blocksize		= TF_BLOCK_SIZE,
 	.cra_ctxsize		= sizeof(struct twofish_xts_ctx),
 	.cra_alignmask		= 0,
-- 
cgit v1.2.3


From 555fa17b2b8cc865c203de263443041eb75bc7f7 Mon Sep 17 00:00:00 2001
From: Stephan Mueller
Date: Mon, 30 Mar 2015 22:11:46 +0200
Subject: crypto: sha-mb - mark Multi buffer SHA1 helper cipher

Flag all Multi buffer SHA1 helper ciphers as internal ciphers
to prevent them from being called by normal users.

Signed-off-by: Stephan Mueller <smueller@chronox.de>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/sha-mb/sha1_mb.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/crypto/sha-mb/sha1_mb.c b/arch/x86/crypto/sha-mb/sha1_mb.c
index 9414fd964ecd..e510b1c5d690 100644
--- a/arch/x86/crypto/sha-mb/sha1_mb.c
+++ b/arch/x86/crypto/sha-mb/sha1_mb.c
@@ -694,7 +694,8 @@ static struct shash_alg sha1_mb_shash_alg = {
 		 * use ASYNC flag as some buffers in multi-buffer
 		 * algo may not have completed before hashing thread sleep
 		 */
-		.cra_flags	 = CRYPTO_ALG_TYPE_SHASH | CRYPTO_ALG_ASYNC,
+		.cra_flags	 = CRYPTO_ALG_TYPE_SHASH | CRYPTO_ALG_ASYNC |
+				   CRYPTO_ALG_INTERNAL,
 		.cra_blocksize	 = SHA1_BLOCK_SIZE,
 		.cra_module	 = THIS_MODULE,
 		.cra_list	 = LIST_HEAD_INIT(sha1_mb_shash_alg.base.cra_list),
@@ -770,7 +771,9 @@ static int sha1_mb_async_init_tfm(struct crypto_tfm *tfm)
 	struct sha1_mb_ctx *ctx = crypto_tfm_ctx(tfm);
 	struct mcryptd_hash_ctx *mctx;
 
-	mcryptd_tfm = mcryptd_alloc_ahash("__intel_sha1-mb", 0, 0);
+	mcryptd_tfm = mcryptd_alloc_ahash("__intel_sha1-mb",
+					  CRYPTO_ALG_INTERNAL,
+					  CRYPTO_ALG_INTERNAL);
 	if (IS_ERR(mcryptd_tfm))
 		return PTR_ERR(mcryptd_tfm);
 	mctx = crypto_ahash_ctx(&mcryptd_tfm->base);
-- 
cgit v1.2.3


From 1d804d079a92138d011900785193b6b00b44bc00 Mon Sep 17 00:00:00 2001
From: Joe Perches
Date: Mon, 30 Mar 2015 16:46:09 -0700
Subject: x86: Use bool function return values of true/false not 1/0

Use the normal return values for bool functions

Signed-off-by: Joe Perches <joe@perches.com>
Message-Id: <9f593eb2f43b456851cd73f7ed09654ca58fb570.1427759009.git.joe@perches.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/asm/kvm_para.h |  2 +-
 arch/x86/kvm/cpuid.h            |  2 +-
 arch/x86/kvm/vmx.c              | 72 ++++++++++++++++++++---------------------
 3 files changed, 38 insertions(+), 38 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index e62cf897f781..c1adf33fdd0d 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -115,7 +115,7 @@ static inline void kvm_spinlock_init(void)
 
 static inline bool kvm_para_available(void)
 {
-	return 0;
+	return false;
 }
 
 static inline unsigned int kvm_arch_para_features(void)
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index 4452eedfaedd..26228466f3f8 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -26,7 +26,7 @@ static inline bool guest_cpuid_has_xsave(struct kvm_vcpu *vcpu)
 	struct kvm_cpuid_entry2 *best;
 
 	if (!static_cpu_has(X86_FEATURE_XSAVE))
-		return 0;
+		return false;
 
 	best = kvm_find_cpuid_entry(vcpu, 1, 0);
 	return best && (best->ecx & bit(X86_FEATURE_XSAVE));
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 63ca692fa673..0caaf56eb459 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -7314,21 +7314,21 @@ static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu,
 		else if (port < 0x10000)
 			bitmap = vmcs12->io_bitmap_b;
 		else
-			return 1;
+			return true;
 		bitmap += (port & 0x7fff) / 8;
 
 		if (last_bitmap != bitmap)
 			if (kvm_read_guest(vcpu->kvm, bitmap, &b, 1))
-				return 1;
+				return true;
 		if (b & (1 << (port & 7)))
-			return 1;
+			return true;
 
 		port++;
 		size--;
 		last_bitmap = bitmap;
 	}
 
-	return 0;
+	return false;
 }
 
 /*
@@ -7344,7 +7344,7 @@ static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu,
 	gpa_t bitmap;
 
 	if (!nested_cpu_has(vmcs12, CPU_BASED_USE_MSR_BITMAPS))
-		return 1;
+		return true;
 
 	/*
 	 * The MSR_BITMAP page is divided into four 1024-byte bitmaps,
@@ -7363,10 +7363,10 @@ static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu,
 	if (msr_index < 1024*8) {
 		unsigned char b;
 		if (kvm_read_guest(vcpu->kvm, bitmap + msr_index/8, &b, 1))
-			return 1;
+			return true;
 		return 1 & (b >> (msr_index & 7));
 	} else
-		return 1; /* let L1 handle the wrong parameter */
+		return true; /* let L1 handle the wrong parameter */
 }
 
 /*
@@ -7388,7 +7388,7 @@ static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu,
 		case 0:
 			if (vmcs12->cr0_guest_host_mask &
 			    (val ^ vmcs12->cr0_read_shadow))
-				return 1;
+				return true;
 			break;
 		case 3:
 			if ((vmcs12->cr3_target_count >= 1 &&
@@ -7399,37 +7399,37 @@ static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu,
 					vmcs12->cr3_target_value2 == val) ||
 				(vmcs12->cr3_target_count >= 4 &&
 					vmcs12->cr3_target_value3 == val))
-				return 0;
+				return false;
 			if (nested_cpu_has(vmcs12, CPU_BASED_CR3_LOAD_EXITING))
-				return 1;
+				return true;
 			break;
 		case 4:
 			if (vmcs12->cr4_guest_host_mask &
 			    (vmcs12->cr4_read_shadow ^ val))
-				return 1;
+				return true;
 			break;
 		case 8:
 			if (nested_cpu_has(vmcs12, CPU_BASED_CR8_LOAD_EXITING))
-				return 1;
+				return true;
 			break;
 		}
 		break;
 	case 2: /* clts */
 		if ((vmcs12->cr0_guest_host_mask & X86_CR0_TS) &&
 		    (vmcs12->cr0_read_shadow & X86_CR0_TS))
-			return 1;
+			return true;
 		break;
 	case 1: /* mov from cr */
 		switch (cr) {
 		case 3:
 			if (vmcs12->cpu_based_vm_exec_control &
 			    CPU_BASED_CR3_STORE_EXITING)
-				return 1;
+				return true;
 			break;
 		case 8:
 			if (vmcs12->cpu_based_vm_exec_control &
 			    CPU_BASED_CR8_STORE_EXITING)
-				return 1;
+				return true;
 			break;
 		}
 		break;
@@ -7440,14 +7440,14 @@ static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu,
 		 */
 		if (vmcs12->cr0_guest_host_mask & 0xe &
 		    (val ^ vmcs12->cr0_read_shadow))
-			return 1;
+			return true;
 		if ((vmcs12->cr0_guest_host_mask & 0x1) &&
 		    !(vmcs12->cr0_read_shadow & 0x1) &&
 		    (val & 0x1))
-			return 1;
+			return true;
 		break;
 	}
-	return 0;
+	return false;
 }
 
 /*
@@ -7470,43 +7470,43 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
 				KVM_ISA_VMX);
 
 	if (vmx->nested.nested_run_pending)
-		return 0;
+		return false;
 
 	if (unlikely(vmx->fail)) {
 		pr_info_ratelimited("%s failed vm entry %x\n", __func__,
 				    vmcs_read32(VM_INSTRUCTION_ERROR));
-		return 1;
+		return true;
 	}
 
 	switch (exit_reason) {
 	case EXIT_REASON_EXCEPTION_NMI:
 		if (!is_exception(intr_info))
-			return 0;
+			return false;
 		else if (is_page_fault(intr_info))
 			return enable_ept;
 		else if (is_no_device(intr_info) &&
 			 !(vmcs12->guest_cr0 & X86_CR0_TS))
-			return 0;
+			return false;
 		return vmcs12->exception_bitmap &
 				(1u << (intr_info & INTR_INFO_VECTOR_MASK));
 	case EXIT_REASON_EXTERNAL_INTERRUPT:
-		return 0;
+		return false;
 	case EXIT_REASON_TRIPLE_FAULT:
-		return 1;
+		return true;
 	case EXIT_REASON_PENDING_INTERRUPT:
 		return nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_INTR_PENDING);
 	case EXIT_REASON_NMI_WINDOW:
 		return nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_NMI_PENDING);
 	case EXIT_REASON_TASK_SWITCH:
-		return 1;
+		return true;
 	case EXIT_REASON_CPUID:
 		if (kvm_register_read(vcpu, VCPU_REGS_RAX) == 0xa)
-			return 0;
-		return 1;
+			return false;
+		return true;
 	case EXIT_REASON_HLT:
 		return nested_cpu_has(vmcs12, CPU_BASED_HLT_EXITING);
 	case EXIT_REASON_INVD:
-		return 1;
+		return true;
 	case EXIT_REASON_INVLPG:
 		return nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING);
 	case EXIT_REASON_RDPMC:
@@ -7523,7 +7523,7 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
 		 * VMX instructions trap unconditionally. This allows L1 to
 		 * emulate them for its L2 guest, i.e., allows 3-level nesting!
 		 */
-		return 1;
+		return true;
 	case EXIT_REASON_CR_ACCESS:
 		return nested_vmx_exit_handled_cr(vcpu, vmcs12);
 	case EXIT_REASON_DR_ACCESS:
@@ -7534,7 +7534,7 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
 	case EXIT_REASON_MSR_WRITE:
 		return nested_vmx_exit_handled_msr(vcpu, vmcs12, exit_reason);
 	case EXIT_REASON_INVALID_STATE:
-		return 1;
+		return true;
 	case EXIT_REASON_MWAIT_INSTRUCTION:
 		return nested_cpu_has(vmcs12, CPU_BASED_MWAIT_EXITING);
 	case EXIT_REASON_MONITOR_INSTRUCTION:
@@ -7544,7 +7544,7 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
 			nested_cpu_has2(vmcs12,
 				SECONDARY_EXEC_PAUSE_LOOP_EXITING);
 	case EXIT_REASON_MCE_DURING_VMENTRY:
-		return 0;
+		return false;
 	case EXIT_REASON_TPR_BELOW_THRESHOLD:
 		return nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW);
 	case EXIT_REASON_APIC_ACCESS:
@@ -7553,7 +7553,7 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
 	case EXIT_REASON_APIC_WRITE:
 	case EXIT_REASON_EOI_INDUCED:
 		/* apic_write and eoi_induced should exit unconditionally. */
-		return 1;
+		return true;
 	case EXIT_REASON_EPT_VIOLATION:
 		/*
 		 * L0 always deals with the EPT violation. If nested EPT is
@@ -7561,7 +7561,7 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
 		 * missing in the guest EPT table (EPT12), the EPT violation
 		 * will be injected with nested_ept_inject_page_fault()
 		 */
-		return 0;
+		return false;
 	case EXIT_REASON_EPT_MISCONFIG:
 		/*
 		 * L2 never uses directly L1's EPT, but rather L0's own EPT
@@ -7569,11 +7569,11 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
 		 * (EPT on EPT). So any problems with the structure of the
 		 * table is L0's fault.
 		 */
-		return 0;
+		return false;
 	case EXIT_REASON_WBINVD:
 		return nested_cpu_has2(vmcs12, SECONDARY_EXEC_WBINVD_EXITING);
 	case EXIT_REASON_XSETBV:
-		return 1;
+		return true;
 	case EXIT_REASON_XSAVES: case EXIT_REASON_XRSTORS:
 		/*
 		 * This should never happen, since it is not possible to
@@ -7583,7 +7583,7 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
 		 */
 		return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES);
 	default:
-		return 1;
+		return true;
 	}
 }
 
-- 
cgit v1.2.3


From 4399c03c6780ed75fa26e09a7b3a175b3aac5760 Mon Sep 17 00:00:00 2001
From: Bandan Das
Date: Tue, 31 Mar 2015 16:43:17 -0400
Subject: x86/apic: Remove verify_local_APIC()

__verify_local_APIC() is detritus from the early APIC days.
Its return value isn't used anywhere and the information it
prints when debug is enabled is already part of APIC
initialization messages printed to syslog. Off with it!

Signed-off-by: Bandan Das <bsd@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/jpgy4mcsxsq.fsf@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/apic.h |  1 -
 arch/x86/kernel/apic/apic.c | 62 ---------------------------------------------
 arch/x86/kernel/smpboot.c   |  2 --
 3 files changed, 65 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index efc3b22d896e..08f217354442 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -204,7 +204,6 @@ extern void clear_local_APIC(void);
 extern void disconnect_bsp_APIC(int virt_wire_setup);
 extern void disable_local_APIC(void);
 extern void lapic_shutdown(void);
-extern int verify_local_APIC(void);
 extern void sync_Arb_IDs(void);
 extern void init_bsp_APIC(void);
 extern void setup_local_APIC(void);
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index ad3639ae1b9b..dcb52850a28f 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -1084,67 +1084,6 @@ void lapic_shutdown(void)
 	local_irq_restore(flags);
 }
 
-/*
- * This is to verify that we're looking at a real local APIC.
- * Check these against your board if the CPUs aren't getting
- * started for no apparent reason.
- */
-int __init verify_local_APIC(void)
-{
-	unsigned int reg0, reg1;
-
-	/*
-	 * The version register is read-only in a real APIC.
-	 */
-	reg0 = apic_read(APIC_LVR);
-	apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg0);
-	apic_write(APIC_LVR, reg0 ^ APIC_LVR_MASK);
-	reg1 = apic_read(APIC_LVR);
-	apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg1);
-
-	/*
-	 * The two version reads above should print the same
-	 * numbers.  If the second one is different, then we
-	 * poke at a non-APIC.
-	 */
-	if (reg1 != reg0)
-		return 0;
-
-	/*
-	 * Check if the version looks reasonably.
-	 */
-	reg1 = GET_APIC_VERSION(reg0);
-	if (reg1 == 0x00 || reg1 == 0xff)
-		return 0;
-	reg1 = lapic_get_maxlvt();
-	if (reg1 < 0x02 || reg1 == 0xff)
-		return 0;
-
-	/*
-	 * The ID register is read/write in a real APIC.
-	 */
-	reg0 = apic_read(APIC_ID);
-	apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg0);
-	apic_write(APIC_ID, reg0 ^ apic->apic_id_mask);
-	reg1 = apic_read(APIC_ID);
-	apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg1);
-	apic_write(APIC_ID, reg0);
-	if (reg1 != (reg0 ^ apic->apic_id_mask))
-		return 0;
-
-	/*
-	 * The next two are just to see if we have sane values.
-	 * They're only really relevant if we're in Virtual Wire
-	 * compatibility mode, but most boxes are anymore.
-	 */
-	reg0 = apic_read(APIC_LVT0);
-	apic_printk(APIC_DEBUG, "Getting LVT0: %x\n", reg0);
-	reg1 = apic_read(APIC_LVT1);
-	apic_printk(APIC_DEBUG, "Getting LVT1: %x\n", reg1);
-
-	return 1;
-}
-
 /**
  * sync_Arb_IDs - synchronize APIC bus arbitration IDs
  */
@@ -2283,7 +2222,6 @@ int __init APIC_init_uniprocessor(void)
 		disable_ioapic_support();
 
 	default_setup_apic_routing();
-	verify_local_APIC();
 	apic_bsp_setup(true);
 	return 0;
 }
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index febc6aabc72e..ddd2c0674cda 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -1086,8 +1086,6 @@ static int __init smp_sanity_check(unsigned max_cpus)
 		return SMP_NO_APIC;
 	}
 
-	verify_local_APIC();
-
 	/*
 	 * If SMP should be disabled, then really disable it!
 	 */
-- 
cgit v1.2.3


From 4416c5a6dacdddd55378e7011f9c8720d2a7470f Mon Sep 17 00:00:00 2001
From: Denys Vlasenko
Date: Tue, 31 Mar 2015 19:00:03 +0200
Subject: x86/asm/entry/64: Do not TRACE_IRQS fast SYSRET64 path

SYSRET code path has a small irq-off block.
On this code path, TRACE_IRQS_ON can't be called right before
interrupts are enabled for real, we can't clobber registers
there. So current code does it earlier, in a safe place.

But with this, TRACE_IRQS_OFF/ON frames just two fast
instructions, which is ridiculous: now most of irq-off block is
_outside_ of the framing.

Do the same thing that we do on SYSCALL entry: do not track this
irq-off block, it is very small to ever cause noticeable irq
latency.

Be careful: make sure that "jnz int_ret_from_sys_call_irqs_off"
now does invoke TRACE_IRQS_OFF - move
int_ret_from_sys_call_irqs_off label before TRACE_IRQS_OFF.

Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Alexei Starovoitov <ast@plumgrid.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Will Drewry <wad@chromium.org>
Link: http://lkml.kernel.org/r/1427821211-25099-1-git-send-email-dvlasenk@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/entry_64.S | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 6f251a5ee1dc..f6e37de02ba7 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -269,8 +269,11 @@ system_call_fastpath:
  * Has incompletely filled pt_regs.
  */
 	LOCKDEP_SYS_EXIT
+	/*
+	 * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
+	 * it is too small to ever cause noticeable irq latency.
+	 */
 	DISABLE_INTERRUPTS(CLBR_NONE)
-	TRACE_IRQS_OFF
 
 	/*
 	 * We must check ti flags with interrupts (or at least preemption)
@@ -284,10 +287,7 @@ system_call_fastpath:
 	jnz int_ret_from_sys_call_irqs_off	/* Go to the slow path */
 
 	CFI_REMEMBER_STATE
-	/*
-	 * sysretq will re-enable interrupts:
-	 */
-	TRACE_IRQS_ON
+
 	RESTORE_C_REGS_EXCEPT_RCX_R11
 	movq	RIP(%rsp),%rcx
 	CFI_REGISTER	rip,rcx
@@ -298,6 +298,7 @@ system_call_fastpath:
 	 * 64bit SYSRET restores rip from rcx,
 	 * rflags from r11 (but RF and VM bits are forced to 0),
 	 * cs and ss are loaded from MSRs.
+	 * Restoration of rflags re-enables interrupts.
 	 */
 	USERGS_SYSRET64
 
@@ -346,8 +347,8 @@ tracesys_phase2:
  */
 GLOBAL(int_ret_from_sys_call)
 	DISABLE_INTERRUPTS(CLBR_NONE)
+int_ret_from_sys_call_irqs_off: /* jumps come here from the irqs-off SYSRET path */
 	TRACE_IRQS_OFF
-int_ret_from_sys_call_irqs_off:
 	movl $_TIF_ALLWORK_MASK,%edi
 	/* edi:	mask to check */
 GLOBAL(int_with_check)
-- 
cgit v1.2.3


From 4c9c0e919fef05b3bc6a8aff1db7a31b2ba4f4b6 Mon Sep 17 00:00:00 2001
From: Denys Vlasenko
Date: Tue, 31 Mar 2015 19:00:04 +0200
Subject: x86/asm/entry/32: Use smaller PUSH instructions instead of MOV, to
 build 'pt_regs' on stack

This mimics the recent similar 64-bit change.
Saves ~110 bytes of code.

Patch was run-tested on 32 and 64 bits, Intel and AMD CPU.
I also looked at the diff of entry_64.o disassembly, to have
a different view of the changes.

Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Alexei Starovoitov <ast@plumgrid.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Will Drewry <wad@chromium.org>
Link: http://lkml.kernel.org/r/1427821211-25099-2-git-send-email-dvlasenk@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/ia32/ia32entry.S | 82 ++++++++++++++++++++++++++---------------------
 1 file changed, 46 insertions(+), 36 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index dec8c1de9c9e..8d01cce7b6b8 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -126,26 +126,27 @@ ENTRY(ia32_sysenter_target)
 	movl	%ebp, %ebp
 	movl	%eax, %eax
 
-	/* Construct iret frame (ss,rsp,rflags,cs,rip) */
-	pushq_cfi $__USER32_DS
-	/*CFI_REL_OFFSET ss,0*/
-	pushq_cfi %rbp
-	CFI_REL_OFFSET rsp,0
-	pushfq_cfi
-	/*CFI_REL_OFFSET rflags,0*/
-	movl	ASM_THREAD_INFO(TI_sysenter_return, %rsp, 3*8), %r10d
+	movl	ASM_THREAD_INFO(TI_sysenter_return, %rsp, 0), %r10d
 	CFI_REGISTER rip,r10
-	pushq_cfi $__USER32_CS
-	/*CFI_REL_OFFSET cs,0*/
-	/* Store thread_info->sysenter_return in rip stack slot */
-	pushq_cfi %r10
-	CFI_REL_OFFSET rip,0
-	/* Store orig_ax */
-	pushq_cfi %rax
-	/* Construct the rest of "struct pt_regs" */
+
+	/* Construct struct pt_regs on stack */
+	pushq_cfi	$__USER32_DS		/* pt_regs->ss */
+	pushq_cfi	%rbp			/* pt_regs->sp */
+	CFI_REL_OFFSET	rsp,0
+	pushfq_cfi				/* pt_regs->flags */
+	pushq_cfi	$__USER32_CS		/* pt_regs->cs */
+	pushq_cfi	%r10 /* pt_regs->ip = thread_info->sysenter_return */
+	CFI_REL_OFFSET	rip,0
+	pushq_cfi_reg	rax			/* pt_regs->orig_ax */
+	pushq_cfi_reg	rdi			/* pt_regs->di */
+	pushq_cfi_reg	rsi			/* pt_regs->si */
+	pushq_cfi_reg	rdx			/* pt_regs->dx */
+	pushq_cfi_reg	rcx			/* pt_regs->cx */
+	pushq_cfi_reg	rax			/* pt_regs->ax */
 	cld
-	ALLOC_PT_GPREGS_ON_STACK
-	SAVE_C_REGS_EXCEPT_R891011
+	sub	$(10*8),%rsp /* pt_regs->r8-11,bp,bx,r12-15 not saved */
+	CFI_ADJUST_CFA_OFFSET 10*8
+
 	/*
 	 * no need to do an access_ok check here because rbp has been
 	 * 32bit zero extended
@@ -334,20 +335,24 @@ ENTRY(ia32_cstar_target)
 	/* Zero-extending 32-bit regs, do not remove */
 	movl	%eax,%eax
 
-	ALLOC_PT_GPREGS_ON_STACK 6*8 /* 6*8: space for orig_ax and iret frame */
-	SAVE_C_REGS_EXCEPT_RCX_R891011
-	movq	%rax,ORIG_RAX(%rsp)
-	movq	%rcx,RIP(%rsp)
-	CFI_REL_OFFSET rip,RIP
-	movq	%rbp,RCX(%rsp) /* this lies slightly to ptrace */
+	/* Construct struct pt_regs on stack */
+	pushq_cfi	$__USER32_DS		/* pt_regs->ss */
+	pushq_cfi	%r8			/* pt_regs->sp */
+	CFI_REL_OFFSET rsp,0
+	pushq_cfi	%r11			/* pt_regs->flags */
+	pushq_cfi	$__USER32_CS		/* pt_regs->cs */
+	pushq_cfi	%rcx			/* pt_regs->ip */
+	CFI_REL_OFFSET rip,0
+	pushq_cfi_reg	rax			/* pt_regs->orig_ax */
+	pushq_cfi_reg	rdi			/* pt_regs->di */
+	pushq_cfi_reg	rsi			/* pt_regs->si */
+	pushq_cfi_reg	rdx			/* pt_regs->dx */
+	pushq_cfi_reg	rbp			/* pt_regs->cx */
 	movl	%ebp,%ecx
-	movq	$__USER32_CS,CS(%rsp)
-	movq	$__USER32_DS,SS(%rsp)
-	movq	%r11,EFLAGS(%rsp)
-	/*CFI_REL_OFFSET rflags,EFLAGS*/
-	movq	%r8,RSP(%rsp)
-	CFI_REL_OFFSET rsp,RSP
-	/* iret stack frame is complete now */
+	pushq_cfi_reg	rax			/* pt_regs->ax */
+	sub	$(10*8),%rsp /* pt_regs->r8-11,bp,bx,r12-15 not saved */
+	CFI_ADJUST_CFA_OFFSET 10*8
+
 	/*
 	 * no need to do an access_ok check here because r8 has been
 	 * 32bit zero extended
@@ -478,12 +483,17 @@ ENTRY(ia32_syscall)
 	/* Zero-extending 32-bit regs, do not remove */
 	movl	%eax,%eax
 
-	pushq_cfi %rax		/* store orig_ax */
+	/* Construct struct pt_regs on stack (iret frame is already on stack) */
+	pushq_cfi_reg	rax			/* pt_regs->orig_ax */
+	pushq_cfi_reg	rdi			/* pt_regs->di */
+	pushq_cfi_reg	rsi			/* pt_regs->si */
+	pushq_cfi_reg	rdx			/* pt_regs->dx */
+	pushq_cfi_reg	rcx			/* pt_regs->cx */
+	pushq_cfi_reg	rax			/* pt_regs->ax */
 	cld
-	/* note the registers are not zero extended to the sf.
-	   this could be a problem. */
-	ALLOC_PT_GPREGS_ON_STACK
-	SAVE_C_REGS_EXCEPT_R891011
+	sub	$(10*8),%rsp /* pt_regs->r8-11,bp,bx,r12-15 not saved */
+	CFI_ADJUST_CFA_OFFSET 10*8
+
 	orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
 	testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
 	jnz ia32_tracesys
-- 
cgit v1.2.3


From 6ba71b7617f1fa65f19bd34f4484a0694ef9a520 Mon Sep 17 00:00:00 2001
From: Denys Vlasenko
Date: Tue, 31 Mar 2015 19:00:05 +0200
Subject: x86/asm/entry/64: Simplify retint_kernel label usage, make
 retint_restore_args label local

Get rid of #define obfuscation of retint_kernel in
CONFIG_PREEMPT case by defining retint_kernel label always, not
only for CONFIG_PREEMPT.

Strip retint_kernel of .global-ness (ENTRY macro) - it has no
users outside of this file.

This looks like cosmetics, but it is not:
"je LABEL" can be optimized to short jump by assember
only if LABEL is not global, for global labels jump is always
a near one with relocation.

Convert retint_restore_args to a local numeric label, making it
clearer that it is not used elsewhere in the file.

Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Alexei Starovoitov <ast@plumgrid.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Will Drewry <wad@chromium.org>
Link: http://lkml.kernel.org/r/1427821211-25099-3-git-send-email-dvlasenk@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/entry_64.S | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index f6e37de02ba7..1879c55bf0f7 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -57,10 +57,6 @@
 	.section .entry.text, "ax"
 
 
-#ifndef CONFIG_PREEMPT
-#define retint_kernel retint_restore_args
-#endif
-
 #ifdef CONFIG_PARAVIRT
 ENTRY(native_usergs_sysret64)
 	swapgs
@@ -741,18 +737,18 @@ opportunistic_sysret_failed:
 	jmp restore_args
 
 /* Returning to kernel space */
+retint_kernel:
 #ifdef CONFIG_PREEMPT
 	/* Interrupts are off */
 	/* Check if we need preemption */
-ENTRY(retint_kernel)
 	cmpl	$0,PER_CPU_VAR(__preempt_count)
-	jnz	retint_restore_args
+	jnz	1f
 	bt	$9,EFLAGS(%rsp)	/* interrupts were off? */
-	jnc	retint_restore_args
+	jnc	1f
 	call	preempt_schedule_irq
 	jmp	exit_intr
+1:
 #endif
-retint_restore_args:
 	DISABLE_INTERRUPTS(CLBR_ANY)
 	/*
 	 * The iretq could re-enable interrupts:
-- 
cgit v1.2.3


From 32a04077fe401842424a4b555572fa459c01e0a3 Mon Sep 17 00:00:00 2001
From: Denys Vlasenko
Date: Tue, 31 Mar 2015 19:00:06 +0200
Subject: x86/asm/entry/64: Remove redundant DISABLE_INTERRUPTS()

At this location, we already have interrupts off, always.
To be more specific, we already disabled them here:

    ret_from_intr:
	    DISABLE_INTERRUPTS(CLBR_NONE)

Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Alexei Starovoitov <ast@plumgrid.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Will Drewry <wad@chromium.org>
Link: http://lkml.kernel.org/r/1427821211-25099-4-git-send-email-dvlasenk@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/entry_64.S | 1 -
 1 file changed, 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 1879c55bf0f7..9f8d01f1f1b2 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -749,7 +749,6 @@ retint_kernel:
 	jmp	exit_intr
 1:
 #endif
-	DISABLE_INTERRUPTS(CLBR_ANY)
 	/*
 	 * The iretq could re-enable interrupts:
 	 */
-- 
cgit v1.2.3


From 36acef2510853e2831047ca9e22d333ba7a1047b Mon Sep 17 00:00:00 2001
From: Denys Vlasenko
Date: Tue, 31 Mar 2015 19:00:07 +0200
Subject: x86/asm/entry/64: Simplify looping around preempt_schedule_irq()

At the 'exit_intr' label we test whether interrupt/exception was in
kernel. If it did, we jump to the preemption check. If preemption
does happen (IOW if we call preempt_schedule_irq()), we go back to
'exit_intr'.

But it's pointless, we already know that the test succeeded last
time, preemption doesn't change the fact that interrupt/exception
was in the kernel.

We can go back directly to checking PER_CPU_VAR(__preempt_count) instead.

This makes the 'exit_intr' label unused, drop it.

Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Alexei Starovoitov <ast@plumgrid.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Will Drewry <wad@chromium.org>
Link: http://lkml.kernel.org/r/1427821211-25099-5-git-send-email-dvlasenk@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/entry_64.S | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 9f8d01f1f1b2..bad285d84a9f 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -654,7 +654,6 @@ ret_from_intr:
 	CFI_DEF_CFA_REGISTER	rsp
 	CFI_ADJUST_CFA_OFFSET	RBP
 
-exit_intr:
 	testl $3,CS(%rsp)
 	je retint_kernel
 	/* Interrupt came from user space */
@@ -741,12 +740,12 @@ retint_kernel:
 #ifdef CONFIG_PREEMPT
 	/* Interrupts are off */
 	/* Check if we need preemption */
-	cmpl	$0,PER_CPU_VAR(__preempt_count)
-	jnz	1f
 	bt	$9,EFLAGS(%rsp)	/* interrupts were off? */
 	jnc	1f
+0:	cmpl	$0,PER_CPU_VAR(__preempt_count)
+	jnz	1f
 	call	preempt_schedule_irq
-	jmp	exit_intr
+	jmp	0b
 1:
 #endif
 	/*
-- 
cgit v1.2.3


From a734b4a23e4b5a5bba577d11b6e2ff21f6ca4fce Mon Sep 17 00:00:00 2001
From: Denys Vlasenko
Date: Tue, 31 Mar 2015 19:00:10 +0200
Subject: x86/asm: Replace "MOVQ $imm, %reg" with MOVL

There is no reason to use MOVQ to load a non-negative immediate
constant value into a 64-bit register. MOVL does the same, since
the upper 32 bits are zero-extended by the CPU.

This makes the code a bit smaller, while leaving functionality
unchanged.

Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Alexei Starovoitov <ast@plumgrid.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Will Drewry <wad@chromium.org>
Link: http://lkml.kernel.org/r/1427821211-25099-8-git-send-email-dvlasenk@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/crypto/crc32c-pcl-intel-asm_64.S | 2 +-
 arch/x86/crypto/twofish-x86_64-asm_64.S   | 4 ++--
 arch/x86/kernel/relocate_kernel_64.S      | 8 ++++----
 3 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
index 26d49ebae040..225be06edc80 100644
--- a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
+++ b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S
@@ -178,7 +178,7 @@ continue_block:
 	## 2a) PROCESS FULL BLOCKS:
 	################################################################
 full_block:
-	movq    $128,%rax
+	movl    $128,%eax
 	lea     128*8*2(block_0), block_1
 	lea     128*8*3(block_0), block_2
 	add     $128*8*1, block_0
diff --git a/arch/x86/crypto/twofish-x86_64-asm_64.S b/arch/x86/crypto/twofish-x86_64-asm_64.S
index a039d21986a2..a350c990dc86 100644
--- a/arch/x86/crypto/twofish-x86_64-asm_64.S
+++ b/arch/x86/crypto/twofish-x86_64-asm_64.S
@@ -264,7 +264,7 @@ ENTRY(twofish_enc_blk)
 	movq	R1,	8(%rsi)
 
 	popq	R1
-	movq	$1,%rax
+	movl	$1,%eax
 	ret
 ENDPROC(twofish_enc_blk)
 
@@ -316,6 +316,6 @@ ENTRY(twofish_dec_blk)
 	movq	R1,	8(%rsi)
 
 	popq	R1
-	movq	$1,%rax
+	movl	$1,%eax
 	ret
 ENDPROC(twofish_dec_blk)
diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S
index 04cb1790a596..98111b38ebfd 100644
--- a/arch/x86/kernel/relocate_kernel_64.S
+++ b/arch/x86/kernel/relocate_kernel_64.S
@@ -123,7 +123,7 @@ identity_mapped:
 	 * Set cr4 to a known state:
 	 *  - physical address extension enabled
 	 */
-	movq	$X86_CR4_PAE, %rax
+	movl	$X86_CR4_PAE, %eax
 	movq	%rax, %cr4
 
 	jmp 1f
@@ -246,17 +246,17 @@ swap_pages:
 	movq	%rsi, %rax
 
 	movq	%r10, %rdi
-	movq	$512,   %rcx
+	movl	$512, %ecx
 	rep ; movsq
 
 	movq	%rax, %rdi
 	movq	%rdx, %rsi
-	movq	$512,   %rcx
+	movl	$512, %ecx
 	rep ; movsq
 
 	movq	%rdx, %rdi
 	movq	%r10, %rsi
-	movq	$512,   %rcx
+	movl	$512, %ecx
 	rep ; movsq
 
 	lea	PAGE_SIZE(%rax), %rsi
-- 
cgit v1.2.3


From a6de5a21fb25cdbbdf3c3e9afd8481581c4f2464 Mon Sep 17 00:00:00 2001
From: Denys Vlasenko
Date: Tue, 31 Mar 2015 19:00:11 +0200
Subject: x86/asm/entry/64: Use local label to skip around sycall dispatch

Logically, we just want to jump around the following instruction
and its prologue/epilogue:

  call *sys_call_table(,%rax,8)

if the syscall number is too big - we do not specifically target
the "int_ret_from_sys_call" label.

Use a local, numerical label for this jump, for more clarity.

This also makes the code smaller:

 -ffffffff8187756b:      0f 87 0f 00 00 00       ja     ffffffff81877580 <int_ret_from_sys_call>
 +ffffffff8187756b:      77 0f                   ja     ffffffff8187757c <int_ret_from_sys_call>

because jumps to global labels are never translated to short jump
instructions by GAS.

Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Alexei Starovoitov <ast@plumgrid.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Will Drewry <wad@chromium.org>
Link: http://lkml.kernel.org/r/1427821211-25099-9-git-send-email-dvlasenk@redhat.com
[ Improved the changelog. ]
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/entry_64.S | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index bad285d84a9f..03c52e217680 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -331,10 +331,11 @@ tracesys_phase2:
 	andl $__SYSCALL_MASK,%eax
 	cmpl $__NR_syscall_max,%eax
 #endif
-	ja   int_ret_from_sys_call	/* RAX(%rsp) is already set */
+	ja	1f	/* return -ENOSYS (already in pt_regs->ax) */
 	movq %r10,%rcx	/* fixup for C */
 	call *sys_call_table(,%rax,8)
 	movq %rax,RAX(%rsp)
+1:
 	/* Use IRET because user could have changed pt_regs->foo */
 
 /*
-- 
cgit v1.2.3


From fed6cefe3b6e862dcc74d07324478caa07e84eaf Mon Sep 17 00:00:00 2001
From: Borislav Petkov
Date: Thu, 5 Feb 2015 11:44:41 +0100
Subject: x86/efi: Add a "debug" option to the efi= cmdline

... and hide the memory regions dump behind it. Make it default-off.

Signed-off-by: Borislav Petkov <bp@suse.de>
Link: http://lkml.kernel.org/r/20141209095843.GA3990@pd.tnic
Acked-by: Laszlo Ersek <lersek@redhat.com>
Acked-by: Dave Young <dyoung@redhat.com>
Signed-off-by: Matt Fleming <matt.fleming@intel.com>
---
 Documentation/kernel-parameters.txt | 3 ++-
 arch/x86/platform/efi/efi.c         | 5 ++++-
 include/linux/efi.h                 | 1 +
 3 files changed, 7 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index bfcb1a62a7b4..01aa47d3b6ab 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1036,7 +1036,7 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
 			Format: {"off" | "on" | "skip[mbr]"}
 
 	efi=		[EFI]
-			Format: { "old_map", "nochunk", "noruntime" }
+			Format: { "old_map", "nochunk", "noruntime", "debug" }
 			old_map [X86-64]: switch to the old ioremap-based EFI
 			runtime services mapping. 32-bit still uses this one by
 			default.
@@ -1044,6 +1044,7 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
 			boot stub, as chunking can cause problems with some
 			firmware implementations.
 			noruntime : disable EFI runtime services support
+			debug: enable misc debug output
 
 	efi_no_storage_paranoia [EFI; X86]
 			Using this parameter you can use more than 50% of
diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
index dbc8627a5cdf..e859d56ce9f8 100644
--- a/arch/x86/platform/efi/efi.c
+++ b/arch/x86/platform/efi/efi.c
@@ -491,7 +491,8 @@ void __init efi_init(void)
 	if (efi_memmap_init())
 		return;
 
-	print_efi_memmap();
+	if (efi_enabled(EFI_DBG))
+		print_efi_memmap();
 }
 
 void __init efi_late_init(void)
@@ -939,6 +940,8 @@ static int __init arch_parse_efi_cmdline(char *str)
 {
 	if (parse_option_str(str, "old_map"))
 		set_bit(EFI_OLD_MEMMAP, &efi.flags);
+	if (parse_option_str(str, "debug"))
+		set_bit(EFI_DBG, &efi.flags);
 
 	return 0;
 }
diff --git a/include/linux/efi.h b/include/linux/efi.h
index cf7e431cbc73..af5be0368dec 100644
--- a/include/linux/efi.h
+++ b/include/linux/efi.h
@@ -942,6 +942,7 @@ extern int __init efi_setup_pcdp_console(char *);
 #define EFI_64BIT		5	/* Is the firmware 64-bit? */
 #define EFI_PARAVIRT		6	/* Access is via a paravirt interface */
 #define EFI_ARCH_1		7	/* First arch-specific bit */
+#define EFI_DBG			8	/* Print additional debug info at runtime */
 
 #ifdef CONFIG_EFI
 /*
-- 
cgit v1.2.3


From 23a0d4e8fa6d3a1d7fb819f79bcc0a3739c30ba9 Mon Sep 17 00:00:00 2001
From: Ingo Molnar
Date: Tue, 3 Mar 2015 07:34:33 +0100
Subject: efi: Disable interrupts around EFI calls, not in the epilog/prolog
 calls

Tapasweni Pathak reported that we do a kmalloc() in efi_call_phys_prolog()
on x86-64 while having interrupts disabled, which is a big no-no, as
kmalloc() can sleep.

Solve this by removing the irq disabling from the prolog/epilog calls
around EFI calls: it's unnecessary, as in this stage we are single
threaded in the boot thread, and we don't ever execute this from
interrupt contexts.

Reported-by: Tapasweni Pathak <tapaswenipathak@gmail.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Matt Fleming <matt.fleming@intel.com>
---
 arch/x86/platform/efi/efi.c    |  7 +++++++
 arch/x86/platform/efi/efi_32.c | 11 +++--------
 arch/x86/platform/efi/efi_64.c |  3 ---
 3 files changed, 10 insertions(+), 11 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
index e859d56ce9f8..e7a01e32db95 100644
--- a/arch/x86/platform/efi/efi.c
+++ b/arch/x86/platform/efi/efi.c
@@ -85,12 +85,19 @@ static efi_status_t __init phys_efi_set_virtual_address_map(
 	efi_memory_desc_t *virtual_map)
 {
 	efi_status_t status;
+	unsigned long flags;
 
 	efi_call_phys_prolog();
+
+	/* Disable interrupts around EFI calls: */
+	local_irq_save(flags);
 	status = efi_call_phys(efi_phys.set_virtual_address_map,
 			       memory_map_size, descriptor_size,
 			       descriptor_version, virtual_map);
+	local_irq_restore(flags);
+
 	efi_call_phys_epilog();
+
 	return status;
 }
 
diff --git a/arch/x86/platform/efi/efi_32.c b/arch/x86/platform/efi/efi_32.c
index 40e7cda52936..abecc6e1dc90 100644
--- a/arch/x86/platform/efi/efi_32.c
+++ b/arch/x86/platform/efi/efi_32.c
@@ -33,11 +33,10 @@
 
 /*
  * To make EFI call EFI runtime service in physical addressing mode we need
- * prolog/epilog before/after the invocation to disable interrupt, to
- * claim EFI runtime service handler exclusively and to duplicate a memory in
- * low memory space say 0 - 3G.
+ * prolog/epilog before/after the invocation to claim the EFI runtime service
+ * handler exclusively and to duplicate a memory mapping in low memory space,
+ * say 0 - 3G.
  */
-static unsigned long efi_rt_eflags;
 
 void efi_sync_low_kernel_mappings(void) {}
 void __init efi_dump_pagetable(void) {}
@@ -61,8 +60,6 @@ void __init efi_call_phys_prolog(void)
 {
 	struct desc_ptr gdt_descr;
 
-	local_irq_save(efi_rt_eflags);
-
 	load_cr3(initial_page_table);
 	__flush_tlb_all();
 
@@ -81,8 +78,6 @@ void __init efi_call_phys_epilog(void)
 
 	load_cr3(swapper_pg_dir);
 	__flush_tlb_all();
-
-	local_irq_restore(efi_rt_eflags);
 }
 
 void __init efi_runtime_mkexec(void)
diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c
index 17e80d829df0..427eb3540e5f 100644
--- a/arch/x86/platform/efi/efi_64.c
+++ b/arch/x86/platform/efi/efi_64.c
@@ -42,7 +42,6 @@
 #include <asm/time.h>
 
 static pgd_t *save_pgd __initdata;
-static unsigned long efi_flags __initdata;
 
 /*
  * We allocate runtime services regions bottom-up, starting from -4G, i.e.
@@ -88,7 +87,6 @@ void __init efi_call_phys_prolog(void)
 		return;
 
 	early_code_mapping_set_exec(1);
-	local_irq_save(efi_flags);
 
 	n_pgds = DIV_ROUND_UP((max_pfn << PAGE_SHIFT), PGDIR_SIZE);
 	save_pgd = kmalloc(n_pgds * sizeof(pgd_t), GFP_KERNEL);
@@ -116,7 +114,6 @@ void __init efi_call_phys_epilog(void)
 		set_pgd(pgd_offset_k(pgd * PGDIR_SIZE), save_pgd[pgd]);
 	kfree(save_pgd);
 	__flush_tlb_all();
-	local_irq_restore(efi_flags);
 	early_code_mapping_set_exec(0);
 }
 
-- 
cgit v1.2.3


From 744937b0b12a669f298949c4a810794c59fead98 Mon Sep 17 00:00:00 2001
From: Ingo Molnar
Date: Tue, 3 Mar 2015 07:48:50 +0100
Subject: efi: Clean up the efi_call_phys_[prolog|epilog]() save/restore
 interaction

Currently x86-64 efi_call_phys_prolog() saves into a global variable (save_pgd),
and efi_call_phys_epilog() restores the kernel pagetables from that global
variable.

Change this to a cleaner save/restore pattern where the saving function returns
the saved object and the restore function restores that.

Apply the same concept to the 32-bit code as well.

Plus this approach, as an added bonus, allows us to express the
!efi_enabled(EFI_OLD_MEMMAP) situation in a clean fashion as well,
via a 'NULL' return value.

Cc: Tapasweni Pathak <tapaswenipathak@gmail.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Matt Fleming <matt.fleming@intel.com>
---
 arch/x86/include/asm/efi.h     |  6 ++++--
 arch/x86/platform/efi/efi.c    |  5 +++--
 arch/x86/platform/efi/efi_32.c | 11 ++++++++---
 arch/x86/platform/efi/efi_64.c | 26 ++++++++++++++++----------
 4 files changed, 31 insertions(+), 17 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h
index 25bce45c6fc4..3738b138b843 100644
--- a/arch/x86/include/asm/efi.h
+++ b/arch/x86/include/asm/efi.h
@@ -2,6 +2,8 @@
 #define _ASM_X86_EFI_H
 
 #include <asm/i387.h>
+#include <asm/pgtable.h>
+
 /*
  * We map the EFI regions needed for runtime services non-contiguously,
  * with preserved alignment on virtual addresses starting from -4G down
@@ -89,8 +91,8 @@ extern void __iomem *__init efi_ioremap(unsigned long addr, unsigned long size,
 extern struct efi_scratch efi_scratch;
 extern void __init efi_set_executable(efi_memory_desc_t *md, bool executable);
 extern int __init efi_memblock_x86_reserve_range(void);
-extern void __init efi_call_phys_prolog(void);
-extern void __init efi_call_phys_epilog(void);
+extern pgd_t * __init efi_call_phys_prolog(void);
+extern void __init efi_call_phys_epilog(pgd_t *save_pgd);
 extern void __init efi_unmap_memmap(void);
 extern void __init efi_memory_uc(u64 addr, unsigned long size);
 extern void __init efi_map_region(efi_memory_desc_t *md);
diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
index e7a01e32db95..02744df576d5 100644
--- a/arch/x86/platform/efi/efi.c
+++ b/arch/x86/platform/efi/efi.c
@@ -86,8 +86,9 @@ static efi_status_t __init phys_efi_set_virtual_address_map(
 {
 	efi_status_t status;
 	unsigned long flags;
+	pgd_t *save_pgd;
 
-	efi_call_phys_prolog();
+	save_pgd = efi_call_phys_prolog();
 
 	/* Disable interrupts around EFI calls: */
 	local_irq_save(flags);
@@ -96,7 +97,7 @@ static efi_status_t __init phys_efi_set_virtual_address_map(
 			       descriptor_version, virtual_map);
 	local_irq_restore(flags);
 
-	efi_call_phys_epilog();
+	efi_call_phys_epilog(save_pgd);
 
 	return status;
 }
diff --git a/arch/x86/platform/efi/efi_32.c b/arch/x86/platform/efi/efi_32.c
index abecc6e1dc90..ed5b67338294 100644
--- a/arch/x86/platform/efi/efi_32.c
+++ b/arch/x86/platform/efi/efi_32.c
@@ -56,19 +56,24 @@ void __init efi_map_region(efi_memory_desc_t *md)
 void __init efi_map_region_fixed(efi_memory_desc_t *md) {}
 void __init parse_efi_setup(u64 phys_addr, u32 data_len) {}
 
-void __init efi_call_phys_prolog(void)
+pgd_t * __init efi_call_phys_prolog(void)
 {
 	struct desc_ptr gdt_descr;
+	pgd_t *save_pgd;
 
+	/* Current pgd is swapper_pg_dir, we'll restore it later: */
+	save_pgd = swapper_pg_dir;
 	load_cr3(initial_page_table);
 	__flush_tlb_all();
 
 	gdt_descr.address = __pa(get_cpu_gdt_table(0));
 	gdt_descr.size = GDT_SIZE - 1;
 	load_gdt(&gdt_descr);
+
+	return save_pgd;
 }
 
-void __init efi_call_phys_epilog(void)
+void __init efi_call_phys_epilog(pgd_t *save_pgd)
 {
 	struct desc_ptr gdt_descr;
 
@@ -76,7 +81,7 @@ void __init efi_call_phys_epilog(void)
 	gdt_descr.size = GDT_SIZE - 1;
 	load_gdt(&gdt_descr);
 
-	load_cr3(swapper_pg_dir);
+	load_cr3(save_pgd);
 	__flush_tlb_all();
 }
 
diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c
index 427eb3540e5f..a0ac0f9c307f 100644
--- a/arch/x86/platform/efi/efi_64.c
+++ b/arch/x86/platform/efi/efi_64.c
@@ -41,8 +41,6 @@
 #include <asm/realmode.h>
 #include <asm/time.h>
 
-static pgd_t *save_pgd __initdata;
-
 /*
  * We allocate runtime services regions bottom-up, starting from -4G, i.e.
  * 0xffff_ffff_0000_0000 and limit EFI VA mapping space to 64G.
@@ -77,14 +75,16 @@ static void __init early_code_mapping_set_exec(int executable)
 	}
 }
 
-void __init efi_call_phys_prolog(void)
+pgd_t * __init efi_call_phys_prolog(void)
 {
 	unsigned long vaddress;
+	pgd_t *save_pgd;
+
 	int pgd;
 	int n_pgds;
 
 	if (!efi_enabled(EFI_OLD_MEMMAP))
-		return;
+		return NULL;
 
 	early_code_mapping_set_exec(1);
 
@@ -97,22 +97,28 @@ void __init efi_call_phys_prolog(void)
 		set_pgd(pgd_offset_k(pgd * PGDIR_SIZE), *pgd_offset_k(vaddress));
 	}
 	__flush_tlb_all();
+
+	return save_pgd;
 }
 
-void __init efi_call_phys_epilog(void)
+void __init efi_call_phys_epilog(pgd_t *save_pgd)
 {
 	/*
 	 * After the lock is released, the original page table is restored.
 	 */
-	int pgd;
-	int n_pgds = DIV_ROUND_UP((max_pfn << PAGE_SHIFT) , PGDIR_SIZE);
+	int pgd_idx;
+	int nr_pgds;
 
-	if (!efi_enabled(EFI_OLD_MEMMAP))
+	if (!save_pgd)
 		return;
 
-	for (pgd = 0; pgd < n_pgds; pgd++)
-		set_pgd(pgd_offset_k(pgd * PGDIR_SIZE), save_pgd[pgd]);
+	nr_pgds = DIV_ROUND_UP((max_pfn << PAGE_SHIFT) , PGDIR_SIZE);
+
+	for (pgd_idx = 0; pgd_idx < nr_pgds; pgd_idx++)
+		set_pgd(pgd_offset_k(pgd_idx * PGDIR_SIZE), save_pgd[pgd_idx]);
+
 	kfree(save_pgd);
+
 	__flush_tlb_all();
 	early_code_mapping_set_exec(0);
 }
-- 
cgit v1.2.3


From 80313b3078fcd2ca51970880d90757f05879a193 Mon Sep 17 00:00:00 2001
From: Stefan Lippers-Hollmann
Date: Mon, 30 Mar 2015 22:44:27 +0200
Subject: x86/reboot: Add ASRock Q1900DC-ITX mainboard reboot quirk

The ASRock Q1900DC-ITX mainboard (Baytrail-D) hangs randomly in
both BIOS and UEFI mode while rebooting unless reboot=pci is
used. Add a quirk to reboot via the pci method.

The problem is very intermittent and hard to debug, it might succeed
rebooting just fine 40 times in a row - but fails half a dozen times
the next day. It seems to be slightly less common in BIOS CSM mode
than native UEFI (with the CSM disabled), but it does happen in either
mode. Since I've started testing this patch in late january, rebooting
has been 100% reliable.

Most of the time it already hangs during POST, but occasionally it
might even make it through the bootloader and the kernel might even
start booting, but then hangs before the mode switch. The same symptoms
occur with grub-efi, gummiboot and grub-pc, just as well as (at least)
kernel 3.16-3.19 and 4.0-rc6 (I haven't tried older kernels than 3.16).
Upgrading to the most current mainboard firmware of the ASRock
Q1900DC-ITX, version 1.20, does not improve the situation.

( Searching the web seems to suggest that other Bay Trail-D mainboards
  might be affected as well. )
--
Signed-off-by: Stefan Lippers-Hollmann <s.l-h@gmx.de>
Cc: <stable@vger.kernel.org>
Cc: Matt Fleming <matt.fleming@intel.com>
Link: http://lkml.kernel.org/r/20150330224427.0fb58e42@mir
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/reboot.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index bae6c609888e..86db4bcd7ce5 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -183,6 +183,16 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
 		},
 	},
 
+	/* ASRock */
+	{	/* Handle problems with rebooting on ASRock Q1900DC-ITX */
+		.callback = set_pci_reboot,
+		.ident = "ASRock Q1900DC-ITX",
+		.matches = {
+			DMI_MATCH(DMI_BOARD_VENDOR, "ASRock"),
+			DMI_MATCH(DMI_BOARD_NAME, "Q1900DC-ITX"),
+		},
+	},
+
 	/* ASUS */
 	{	/* Handle problems with rebooting on ASUS P4S800 */
 		.callback = set_bios_reboot,
-- 
cgit v1.2.3


From 4ffee521f36390c7720d493591b764ca35c8030b Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Wed, 25 Mar 2015 13:09:16 +0100
Subject: clockevents: Make suspend/resume calls explicit

clockevents_notify() is a leftover from the early design of the
clockevents facility. It's really not a notification mechanism,
it's a multiplex call.

We are way better off to have explicit calls instead of this
monstrosity. Split out the suspend/resume() calls and invoke
them directly from the call sites.

No locking required at this point because these calls happen
with interrupts disabled and a single cpu online.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
[ Rebased on top of 4.0-rc5. ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/713674030.jVm1qaHuPf@vostro.rjw.lan
[ Rebased on top of latest timers/core. ]
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/xen/suspend.c      | 11 ++++-------
 include/linux/clockchips.h  |  2 --
 include/linux/tick.h        |  3 +++
 kernel/time/clockevents.c   |  9 ---------
 kernel/time/tick-common.c   | 26 +++++++++++++++++++++++---
 kernel/time/tick-internal.h |  3 ++-
 kernel/time/timekeeping.c   |  6 ++----
 7 files changed, 34 insertions(+), 26 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c
index c4df9dbd63b7..033e428581f4 100644
--- a/arch/x86/xen/suspend.c
+++ b/arch/x86/xen/suspend.c
@@ -1,5 +1,5 @@
 #include <linux/types.h>
-#include <linux/clockchips.h>
+#include <linux/tick.h>
 
 #include <xen/interface/xen.h>
 #include <xen/grant_table.h>
@@ -81,17 +81,14 @@ void xen_arch_post_suspend(int cancelled)
 
 static void xen_vcpu_notify_restore(void *data)
 {
-	unsigned long reason = (unsigned long)data;
-
 	/* Boot processor notified via generic timekeeping_resume() */
-	if ( smp_processor_id() == 0)
+	if (smp_processor_id() == 0)
 		return;
 
-	clockevents_notify(reason, NULL);
+	tick_resume();
 }
 
 void xen_arch_resume(void)
 {
-	on_each_cpu(xen_vcpu_notify_restore,
-		    (void *)CLOCK_EVT_NOTIFY_RESUME, 1);
+	on_each_cpu(xen_vcpu_notify_restore, NULL, 1);
 }
diff --git a/include/linux/clockchips.h b/include/linux/clockchips.h
index bc3af821350b..50ce9750754f 100644
--- a/include/linux/clockchips.h
+++ b/include/linux/clockchips.h
@@ -16,8 +16,6 @@ enum clock_event_nofitiers {
 	CLOCK_EVT_NOTIFY_BROADCAST_FORCE,
 	CLOCK_EVT_NOTIFY_BROADCAST_ENTER,
 	CLOCK_EVT_NOTIFY_BROADCAST_EXIT,
-	CLOCK_EVT_NOTIFY_SUSPEND,
-	CLOCK_EVT_NOTIFY_RESUME,
 	CLOCK_EVT_NOTIFY_CPU_DYING,
 	CLOCK_EVT_NOTIFY_CPU_DEAD,
 };
diff --git a/include/linux/tick.h b/include/linux/tick.h
index f9a2d2687a46..7e07e0e3d898 100644
--- a/include/linux/tick.h
+++ b/include/linux/tick.h
@@ -29,10 +29,13 @@ extern struct tick_device *tick_get_device(int cpu);
 extern void __init tick_init(void);
 extern void tick_freeze(void);
 extern void tick_unfreeze(void);
+/* Should be core only, but XEN resume magic abuses this interface */
+extern void tick_resume(void);
 #else /* CONFIG_GENERIC_CLOCKEVENTS */
 static inline void tick_init(void) { }
 static inline void tick_freeze(void) { }
 static inline void tick_unfreeze(void) { }
+static inline void tick_resume(void) { }
 #endif /* !CONFIG_GENERIC_CLOCKEVENTS */
 
 #ifdef CONFIG_TICK_ONESHOT
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index b73002718536..7af614829da1 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -670,15 +670,6 @@ int clockevents_notify(unsigned long reason, void *arg)
 		tick_handover_do_timer(arg);
 		break;
 
-	case CLOCK_EVT_NOTIFY_SUSPEND:
-		tick_suspend();
-		tick_suspend_broadcast();
-		break;
-
-	case CLOCK_EVT_NOTIFY_RESUME:
-		tick_resume();
-		break;
-
 	case CLOCK_EVT_NOTIFY_CPU_DEAD:
 		tick_shutdown_broadcast_oneshot(arg);
 		tick_shutdown_broadcast(arg);
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index a5b877130ae9..1a60c2ae96a8 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -373,18 +373,39 @@ void tick_shutdown(unsigned int *cpup)
 	}
 }
 
+/**
+ * tick_suspend - Suspend the tick and the broadcast device
+ *
+ * Called from syscore_suspend() via timekeeping_suspend with only one
+ * CPU online and interrupts disabled or from tick_unfreeze() under
+ * tick_freeze_lock.
+ *
+ * No locks required. Nothing can change the per cpu device.
+ */
 void tick_suspend(void)
 {
 	struct tick_device *td = this_cpu_ptr(&tick_cpu_device);
 
 	clockevents_shutdown(td->evtdev);
+	tick_suspend_broadcast();
 }
 
+/**
+ * tick_resume - Resume the tick and the broadcast device
+ *
+ * Called from syscore_resume() via timekeeping_resume with only one
+ * CPU online and interrupts disabled or from tick_unfreeze() under
+ * tick_freeze_lock.
+ *
+ * No locks required. Nothing can change the per cpu device.
+ */
 void tick_resume(void)
 {
-	struct tick_device *td = this_cpu_ptr(&tick_cpu_device);
-	int broadcast = tick_resume_broadcast();
+	struct tick_device *td;
+	int broadcast;
 
+	broadcast = tick_resume_broadcast();
+	td = this_cpu_ptr(&tick_cpu_device);
 	clockevents_tick_resume(td->evtdev);
 
 	if (!broadcast) {
@@ -416,7 +437,6 @@ void tick_freeze(void)
 		timekeeping_suspend();
 	} else {
 		tick_suspend();
-		tick_suspend_broadcast();
 	}
 
 	raw_spin_unlock(&tick_freeze_lock);
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index dd2c45d057b9..85a957195bf6 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -23,7 +23,6 @@ extern void tick_check_new_device(struct clock_event_device *dev);
 extern void tick_handover_do_timer(int *cpup);
 extern void tick_shutdown(unsigned int *cpup);
 extern void tick_suspend(void);
-extern void tick_resume(void);
 extern bool tick_check_replacement(struct clock_event_device *curdev,
 				   struct clock_event_device *newdev);
 extern void tick_install_replacement(struct clock_event_device *dev);
@@ -42,6 +41,8 @@ extern void clockevents_exchange_device(struct clock_event_device *old,
 extern void clockevents_handle_noop(struct clock_event_device *dev);
 extern int __clockevents_update_freq(struct clock_event_device *dev, u32 freq);
 extern ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt);
+#else
+static inline void tick_suspend(void) { }
 #endif /* GENERIC_CLOCKEVENTS */
 
 /* Oneshot related functions */
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index c3fcff06d30a..5b12292b343a 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -1389,9 +1389,7 @@ void timekeeping_resume(void)
 
 	touch_softlockup_watchdog();
 
-	clockevents_notify(CLOCK_EVT_NOTIFY_RESUME, NULL);
-
-	/* Resume hrtimers */
+	tick_resume();
 	hrtimers_resume();
 }
 
@@ -1444,7 +1442,7 @@ int timekeeping_suspend(void)
 	write_seqcount_end(&tk_core.seq);
 	raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
 
-	clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL);
+	tick_suspend();
 	clocksource_suspend();
 	clockevents_suspend();
 
-- 
cgit v1.2.3


From f46481d0a7cb942b84145acb80ad43bdb1ff8eb4 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Wed, 25 Mar 2015 13:11:04 +0100
Subject: tick/xen: Provide and use tick_suspend_local() and
 tick_resume_local()

Xen calls on every cpu into tick_resume() which is just wrong.
tick_resume() is for the syscore global suspend/resume
invocation. What XEN really wants is a per cpu local resume
function.

Provide a tick_resume_local() function and use it in XEN.

Also provide a complementary tick_suspend_local() and modify
tick_unfreeze() and tick_freeze(), respectively, to use the
new local tick resume/suspend functions.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
[ Combined two patches, rebased, modified subject/changelog. ]
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: David Vrabel <david.vrabel@citrix.com>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1698741.eezk9tnXtG@vostro.rjw.lan
[ Merged to latest timers/core. ]
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/xen/suspend.c       |  2 +-
 include/linux/tick.h         |  6 ++---
 kernel/time/tick-broadcast.c | 24 +++++++++++++------
 kernel/time/tick-common.c    | 55 ++++++++++++++++++++++++++++++--------------
 kernel/time/tick-internal.h  |  8 +++++--
 5 files changed, 65 insertions(+), 30 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c
index 033e428581f4..d9497698645a 100644
--- a/arch/x86/xen/suspend.c
+++ b/arch/x86/xen/suspend.c
@@ -85,7 +85,7 @@ static void xen_vcpu_notify_restore(void *data)
 	if (smp_processor_id() == 0)
 		return;
 
-	tick_resume();
+	tick_resume_local();
 }
 
 void xen_arch_resume(void)
diff --git a/include/linux/tick.h b/include/linux/tick.h
index 7e07e0e3d898..a3d4d2840e7f 100644
--- a/include/linux/tick.h
+++ b/include/linux/tick.h
@@ -29,13 +29,13 @@ extern struct tick_device *tick_get_device(int cpu);
 extern void __init tick_init(void);
 extern void tick_freeze(void);
 extern void tick_unfreeze(void);
-/* Should be core only, but XEN resume magic abuses this interface */
-extern void tick_resume(void);
+/* Should be core only, but XEN resume magic requires this */
+extern void tick_resume_local(void);
 #else /* CONFIG_GENERIC_CLOCKEVENTS */
 static inline void tick_init(void) { }
 static inline void tick_freeze(void) { }
 static inline void tick_unfreeze(void) { }
-static inline void tick_resume(void) { }
+static inline void tick_resume_local(void) { }
 #endif /* !CONFIG_GENERIC_CLOCKEVENTS */
 
 #ifdef CONFIG_TICK_ONESHOT
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 60e6c23ce1c7..19cfb381faa9 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -455,11 +455,26 @@ void tick_suspend_broadcast(void)
 	raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
 }
 
-int tick_resume_broadcast(void)
+/*
+ * This is called from tick_resume_local() on a resuming CPU. That's
+ * called from the core resume function, tick_unfreeze() and the magic XEN
+ * resume hackery.
+ *
+ * In none of these cases the broadcast device mode can change and the
+ * bit of the resuming CPU in the broadcast mask is safe as well.
+ */
+bool tick_resume_check_broadcast(void)
+{
+	if (tick_broadcast_device.mode == TICKDEV_MODE_ONESHOT)
+		return false;
+	else
+		return cpumask_test_cpu(smp_processor_id(), tick_broadcast_mask);
+}
+
+void tick_resume_broadcast(void)
 {
 	struct clock_event_device *bc;
 	unsigned long flags;
-	int broadcast = 0;
 
 	raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
 
@@ -472,8 +487,6 @@ int tick_resume_broadcast(void)
 		case TICKDEV_MODE_PERIODIC:
 			if (!cpumask_empty(tick_broadcast_mask))
 				tick_broadcast_start_periodic(bc);
-			broadcast = cpumask_test_cpu(smp_processor_id(),
-						     tick_broadcast_mask);
 			break;
 		case TICKDEV_MODE_ONESHOT:
 			if (!cpumask_empty(tick_broadcast_mask))
@@ -482,11 +495,8 @@ int tick_resume_broadcast(void)
 		}
 	}
 	raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
-
-	return broadcast;
 }
 
-
 #ifdef CONFIG_TICK_ONESHOT
 
 static cpumask_var_t tick_broadcast_oneshot_mask;
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 1a60c2ae96a8..da796d65d1fb 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -374,40 +374,32 @@ void tick_shutdown(unsigned int *cpup)
 }
 
 /**
- * tick_suspend - Suspend the tick and the broadcast device
+ * tick_suspend_local - Suspend the local tick device
  *
- * Called from syscore_suspend() via timekeeping_suspend with only one
- * CPU online and interrupts disabled or from tick_unfreeze() under
- * tick_freeze_lock.
+ * Called from the local cpu for freeze with interrupts disabled.
  *
  * No locks required. Nothing can change the per cpu device.
  */
-void tick_suspend(void)
+static void tick_suspend_local(void)
 {
 	struct tick_device *td = this_cpu_ptr(&tick_cpu_device);
 
 	clockevents_shutdown(td->evtdev);
-	tick_suspend_broadcast();
 }
 
 /**
- * tick_resume - Resume the tick and the broadcast device
+ * tick_resume_local - Resume the local tick device
  *
- * Called from syscore_resume() via timekeeping_resume with only one
- * CPU online and interrupts disabled or from tick_unfreeze() under
- * tick_freeze_lock.
+ * Called from the local CPU for unfreeze or XEN resume magic.
  *
  * No locks required. Nothing can change the per cpu device.
  */
-void tick_resume(void)
+void tick_resume_local(void)
 {
-	struct tick_device *td;
-	int broadcast;
+	struct tick_device *td = this_cpu_ptr(&tick_cpu_device);
+	bool broadcast = tick_resume_check_broadcast();
 
-	broadcast = tick_resume_broadcast();
-	td = this_cpu_ptr(&tick_cpu_device);
 	clockevents_tick_resume(td->evtdev);
-
 	if (!broadcast) {
 		if (td->mode == TICKDEV_MODE_PERIODIC)
 			tick_setup_periodic(td->evtdev, 0);
@@ -416,6 +408,35 @@ void tick_resume(void)
 	}
 }
 
+/**
+ * tick_suspend - Suspend the tick and the broadcast device
+ *
+ * Called from syscore_suspend() via timekeeping_suspend with only one
+ * CPU online and interrupts disabled or from tick_unfreeze() under
+ * tick_freeze_lock.
+ *
+ * No locks required. Nothing can change the per cpu device.
+ */
+void tick_suspend(void)
+{
+	tick_suspend_local();
+	tick_suspend_broadcast();
+}
+
+/**
+ * tick_resume - Resume the tick and the broadcast device
+ *
+ * Called from syscore_resume() via timekeeping_resume with only one
+ * CPU online and interrupts disabled.
+ *
+ * No locks required. Nothing can change the per cpu device.
+ */
+void tick_resume(void)
+{
+	tick_resume_broadcast();
+	tick_resume_local();
+}
+
 static DEFINE_RAW_SPINLOCK(tick_freeze_lock);
 static unsigned int tick_freeze_depth;
 
@@ -436,7 +457,7 @@ void tick_freeze(void)
 	if (tick_freeze_depth == num_online_cpus()) {
 		timekeeping_suspend();
 	} else {
-		tick_suspend();
+		tick_suspend_local();
 	}
 
 	raw_spin_unlock(&tick_freeze_lock);
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index 5c9f0eec56b2..6ba7bce732f2 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -23,6 +23,7 @@ extern void tick_check_new_device(struct clock_event_device *dev);
 extern void tick_handover_do_timer(int *cpup);
 extern void tick_shutdown(unsigned int *cpup);
 extern void tick_suspend(void);
+extern void tick_resume(void);
 extern bool tick_check_replacement(struct clock_event_device *curdev,
 				   struct clock_event_device *newdev);
 extern void tick_install_replacement(struct clock_event_device *dev);
@@ -43,6 +44,7 @@ extern int __clockevents_update_freq(struct clock_event_device *dev, u32 freq);
 extern ssize_t sysfs_get_uname(const char *buf, char *dst, size_t cnt);
 #else
 static inline void tick_suspend(void) { }
+static inline void tick_resume(void) { }
 #endif /* GENERIC_CLOCKEVENTS */
 
 /* Oneshot related functions */
@@ -81,7 +83,8 @@ extern int tick_is_broadcast_device(struct clock_event_device *dev);
 extern void tick_broadcast_on_off(unsigned long reason, int *oncpu);
 extern void tick_shutdown_broadcast(unsigned int *cpup);
 extern void tick_suspend_broadcast(void);
-extern int tick_resume_broadcast(void);
+extern void tick_resume_broadcast(void);
+extern bool tick_resume_check_broadcast(void);
 extern void tick_broadcast_init(void);
 extern void tick_set_periodic_handler(struct clock_event_device *dev, int broadcast);
 extern int tick_broadcast_update_freq(struct clock_event_device *dev, u32 freq);
@@ -95,7 +98,8 @@ static inline void tick_do_periodic_broadcast(struct clock_event_device *d) { }
 static inline void tick_broadcast_on_off(unsigned long reason, int *oncpu) { }
 static inline void tick_shutdown_broadcast(unsigned int *cpup) { }
 static inline void tick_suspend_broadcast(void) { }
-static inline int tick_resume_broadcast(void) { return 0; }
+static inline void tick_resume_broadcast(void) { }
+static inline bool tick_resume_check_broadcast(void) { return false; }
 static inline void tick_broadcast_init(void) { }
 static inline int tick_broadcast_update_freq(struct clock_event_device *dev, u32 freq) { return -ENODEV; }
 
-- 
cgit v1.2.3


From ec776ef6bbe1734c29cd6bd05219cd93b2731bd4 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig
Date: Wed, 1 Apr 2015 09:12:18 +0200
Subject: x86/mm: Add support for the non-standard protected e820 type

Various recent BIOSes support NVDIMMs or ADR using a
non-standard e820 memory type, and Intel supplied reference
Linux code using this type to various vendors.

Wire this e820 table type up to export platform devices for the
pmem driver so that we can use it in Linux.

Based on earlier work from:

   Dave Jiang <dave.jiang@intel.com>
   Dan Williams <dan.j.williams@intel.com>

Includes fixes for NUMA regions from Boaz Harrosh.

Tested-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Acked-by: Dan Williams <dan.j.williams@intel.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Boaz Harrosh <boaz@plexistor.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Jens Axboe <axboe@fb.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Keith Busch <keith.busch@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Matthew Wilcox <willy@linux.intel.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-nvdimm@ml01.01.org
Link: http://lkml.kernel.org/r/1427872339-6688-2-git-send-email-hch@lst.de
[ Minor cleanups. ]
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 Documentation/kernel-parameters.txt |  6 +++++
 arch/x86/Kconfig                    | 10 +++++++
 arch/x86/include/uapi/asm/e820.h    | 10 +++++++
 arch/x86/kernel/Makefile            |  1 +
 arch/x86/kernel/e820.c              | 26 +++++++++++++-----
 arch/x86/kernel/pmem.c              | 53 +++++++++++++++++++++++++++++++++++++
 6 files changed, 100 insertions(+), 6 deletions(-)
 create mode 100644 arch/x86/kernel/pmem.c

(limited to 'arch/x86')

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index bfcb1a62a7b4..c87122dd790f 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1965,6 +1965,12 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
 			         or
 			         memmap=0x10000$0x18690000
 
+	memmap=nn[KMG]!ss[KMG]
+			[KNL,X86] Mark specific memory as protected.
+			Region of memory to be used, from ss to ss+nn.
+			The memory region may be marked as e820 type 12 (0xc)
+			and is NVDIMM or ADR memory.
+
 	memory_corruption_check=0/1 [X86]
 			Some BIOSes seem to corrupt the first 64k of
 			memory when doing things like suspend/resume.
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index b7d31ca55187..9e3bcd6f4a48 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1430,6 +1430,16 @@ config ILLEGAL_POINTER_VALUE
 
 source "mm/Kconfig"
 
+config X86_PMEM_LEGACY
+	bool "Support non-standard NVDIMMs and ADR protected memory"
+	help
+	  Treat memory marked using the non-standard e820 type of 12 as used
+	  by the Intel Sandy Bridge-EP reference BIOS as protected memory.
+	  The kernel will offer these regions to the 'pmem' driver so
+	  they can be used for persistent storage.
+
+	  Say Y if unsure.
+
 config HIGHPTE
 	bool "Allocate 3rd-level pagetables from highmem"
 	depends on HIGHMEM
diff --git a/arch/x86/include/uapi/asm/e820.h b/arch/x86/include/uapi/asm/e820.h
index d993e33f5236..960a8a9dc4ab 100644
--- a/arch/x86/include/uapi/asm/e820.h
+++ b/arch/x86/include/uapi/asm/e820.h
@@ -33,6 +33,16 @@
 #define E820_NVS	4
 #define E820_UNUSABLE	5
 
+/*
+ * This is a non-standardized way to represent ADR or NVDIMM regions that
+ * persist over a reboot.  The kernel will ignore their special capabilities
+ * unless the CONFIG_X86_PMEM_LEGACY=y option is set.
+ *
+ * ( Note that older platforms also used 6 for the same type of memory,
+ *   but newer versions switched to 12 as 6 was assigned differently.  Some
+ *   time they will learn... )
+ */
+#define E820_PRAM	12
 
 /*
  * reserved RAM used by kernel itself
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index cdb1b70ddad0..971f18cd9ca0 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -94,6 +94,7 @@ obj-$(CONFIG_KVM_GUEST)		+= kvm.o kvmclock.o
 obj-$(CONFIG_PARAVIRT)		+= paravirt.o paravirt_patch_$(BITS).o
 obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= paravirt-spinlocks.o
 obj-$(CONFIG_PARAVIRT_CLOCK)	+= pvclock.o
+obj-$(CONFIG_X86_PMEM_LEGACY)	+= pmem.o
 
 obj-$(CONFIG_PCSPKR_PLATFORM)	+= pcspeaker.o
 
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 46201deee923..11cc7d54ec3f 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -149,6 +149,9 @@ static void __init e820_print_type(u32 type)
 	case E820_UNUSABLE:
 		printk(KERN_CONT "unusable");
 		break;
+	case E820_PRAM:
+		printk(KERN_CONT "persistent (type %u)", type);
+		break;
 	default:
 		printk(KERN_CONT "type %u", type);
 		break;
@@ -343,7 +346,7 @@ int __init sanitize_e820_map(struct e820entry *biosmap, int max_nr_map,
 		 * continue building up new bios map based on this
 		 * information
 		 */
-		if (current_type != last_type)	{
+		if (current_type != last_type || current_type == E820_PRAM) {
 			if (last_type != 0)	 {
 				new_bios[new_bios_entry].size =
 					change_point[chgidx]->addr - last_addr;
@@ -688,6 +691,7 @@ void __init e820_mark_nosave_regions(unsigned long limit_pfn)
 			register_nosave_region(pfn, PFN_UP(ei->addr));
 
 		pfn = PFN_DOWN(ei->addr + ei->size);
+
 		if (ei->type != E820_RAM && ei->type != E820_RESERVED_KERN)
 			register_nosave_region(PFN_UP(ei->addr), pfn);
 
@@ -748,7 +752,7 @@ u64 __init early_reserve_e820(u64 size, u64 align)
 /*
  * Find the highest page frame number we have available
  */
-static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type)
+static unsigned long __init e820_end_pfn(unsigned long limit_pfn)
 {
 	int i;
 	unsigned long last_pfn = 0;
@@ -759,7 +763,11 @@ static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type)
 		unsigned long start_pfn;
 		unsigned long end_pfn;
 
-		if (ei->type != type)
+		/*
+		 * Persistent memory is accounted as ram for purposes of
+		 * establishing max_pfn and mem_map.
+		 */
+		if (ei->type != E820_RAM && ei->type != E820_PRAM)
 			continue;
 
 		start_pfn = ei->addr >> PAGE_SHIFT;
@@ -784,12 +792,12 @@ static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type)
 }
 unsigned long __init e820_end_of_ram_pfn(void)
 {
-	return e820_end_pfn(MAX_ARCH_PFN, E820_RAM);
+	return e820_end_pfn(MAX_ARCH_PFN);
 }
 
 unsigned long __init e820_end_of_low_ram_pfn(void)
 {
-	return e820_end_pfn(1UL<<(32 - PAGE_SHIFT), E820_RAM);
+	return e820_end_pfn(1UL << (32-PAGE_SHIFT));
 }
 
 static void early_panic(char *msg)
@@ -866,6 +874,9 @@ static int __init parse_memmap_one(char *p)
 	} else if (*p == '$') {
 		start_at = memparse(p+1, &p);
 		e820_add_region(start_at, mem_size, E820_RESERVED);
+	} else if (*p == '!') {
+		start_at = memparse(p+1, &p);
+		e820_add_region(start_at, mem_size, E820_PRAM);
 	} else
 		e820_remove_range(mem_size, ULLONG_MAX - mem_size, E820_RAM, 1);
 
@@ -907,6 +918,7 @@ static inline const char *e820_type_to_string(int e820_type)
 	case E820_ACPI:	return "ACPI Tables";
 	case E820_NVS:	return "ACPI Non-volatile Storage";
 	case E820_UNUSABLE:	return "Unusable memory";
+	case E820_PRAM: return "Persistent RAM";
 	default:	return "reserved";
 	}
 }
@@ -940,7 +952,9 @@ void __init e820_reserve_resources(void)
 		 * pci device BAR resource and insert them later in
 		 * pcibios_resource_survey()
 		 */
-		if (e820.map[i].type != E820_RESERVED || res->start < (1ULL<<20)) {
+		if (((e820.map[i].type != E820_RESERVED) &&
+		     (e820.map[i].type != E820_PRAM)) ||
+		     res->start < (1ULL<<20)) {
 			res->flags |= IORESOURCE_BUSY;
 			insert_resource(&iomem_resource, res);
 		}
diff --git a/arch/x86/kernel/pmem.c b/arch/x86/kernel/pmem.c
new file mode 100644
index 000000000000..3420c874ddc5
--- /dev/null
+++ b/arch/x86/kernel/pmem.c
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2015, Christoph Hellwig.
+ */
+#include <linux/memblock.h>
+#include <linux/platform_device.h>
+#include <linux/slab.h>
+#include <asm/e820.h>
+#include <asm/page_types.h>
+#include <asm/setup.h>
+
+static __init void register_pmem_device(struct resource *res)
+{
+	struct platform_device *pdev;
+	int error;
+
+	pdev = platform_device_alloc("pmem", PLATFORM_DEVID_AUTO);
+	if (!pdev)
+		return;
+
+	error = platform_device_add_resources(pdev, res, 1);
+	if (error)
+		goto out_put_pdev;
+
+	error = platform_device_add(pdev);
+	if (error)
+		goto out_put_pdev;
+	return;
+
+out_put_pdev:
+	dev_warn(&pdev->dev, "failed to add 'pmem' (persistent memory) device!\n");
+	platform_device_put(pdev);
+}
+
+static __init int register_pmem_devices(void)
+{
+	int i;
+
+	for (i = 0; i < e820.nr_map; i++) {
+		struct e820entry *ei = &e820.map[i];
+
+		if (ei->type == E820_PRAM) {
+			struct resource res = {
+				.flags	= IORESOURCE_MEM,
+				.start	= ei->addr,
+				.end	= ei->addr + ei->size - 1,
+			};
+			register_pmem_device(&res);
+		}
+	}
+
+	return 0;
+}
+device_initcall(register_pmem_devices);
-- 
cgit v1.2.3


From 7ea24169097d3d3a3eab2dcc5773bc43fd5593e7 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski
Date: Wed, 1 Apr 2015 14:26:34 -0700
Subject: x86/asm/entry/64: Disable opportunistic SYSRET if regs->flags has TF
 set

When I wrote the opportunistic SYSRET code, I missed an important difference
between SYSRET and IRET.

Both instructions are capable of setting EFLAGS.TF, but they behave differently
when doing so:

 - IRET will not issue a #DB trap after execution when it sets TF.
   This is critical -- otherwise you'd never be able to make forward progress when
   returning to userspace.

 - SYSRET, on the other hand, will trap with #DB immediately after
   returning to CPL3, and the next instruction will never execute.

This breaks anything that opportunistically SYSRETs to a user
context with TF set.  For example, running this code with TF set
and a SIGTRAP handler loaded never gets past 'post_nop':

	extern unsigned char post_nop[];
	asm volatile ("pushfq\n\t"
		      "popq %%r11\n\t"
		      "nop\n\t"
		      "post_nop:"
		      : : "c" (post_nop) : "r11");

In my defense, I can't find this documented in the AMD or Intel manual.

Fix it by using IRET to restore TF.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Borislav Petkov <bp@suse.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Fixes: 2a23c6b8a9c4 ("x86_64, entry: Use sysret to return to userspace when possible")
Link: http://lkml.kernel.org/r/9472f1ca4c19a38ecda45bba9c91b7168135fcfa.1427923514.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/entry_64.S | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 2babb393915e..f0095a76c182 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -799,7 +799,21 @@ retint_swapgs:		/* return to user-space */
 	cmpq %r11,(EFLAGS-ARGOFFSET)(%rsp)	/* R11 == RFLAGS */
 	jne opportunistic_sysret_failed
 
-	testq $X86_EFLAGS_RF,%r11		/* sysret can't restore RF */
+	/*
+	 * SYSRET can't restore RF.  SYSRET can restore TF, but unlike IRET,
+	 * restoring TF results in a trap from userspace immediately after
+	 * SYSRET.  This would cause an infinite loop whenever #DB happens
+	 * with register state that satisfies the opportunistic SYSRET
+	 * conditions.  For example, single-stepping this user code:
+	 *
+	 *           movq $stuck_here,%rcx
+	 *           pushfq
+	 *           popq %r11
+	 *   stuck_here:
+	 *
+	 * would never get past 'stuck_here'.
+	 */
+	testq $(X86_EFLAGS_RF|X86_EFLAGS_TF), %r11
 	jnz opportunistic_sysret_failed
 
 	/* nothing to check for RSP */
-- 
cgit v1.2.3


From f59df35fc28167886a0caf9f15db2f4a1f5932da Mon Sep 17 00:00:00 2001
From: Steffen Liebergeld
Date: Thu, 2 Apr 2015 11:01:59 +0200
Subject: kgdb/x86: Fix reporting of 'si' in kgdb on x86_64

This patch fixes an error in kgdb for x86_64 which would report
the value of dx when asked to give the value of si.

Signed-off-by: Steffen Liebergeld <steffen.liebergeld@kernkonzept.com>
Cc: Jason Wessel <jason.wessel@windriver.com>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/kgdb.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index 7ec1d5f8d283..25ecd56cefa8 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -72,7 +72,7 @@ struct dbg_reg_def_t dbg_reg_def[DBG_MAX_REG_NUM] =
 	{ "bx", 8, offsetof(struct pt_regs, bx) },
 	{ "cx", 8, offsetof(struct pt_regs, cx) },
 	{ "dx", 8, offsetof(struct pt_regs, dx) },
-	{ "si", 8, offsetof(struct pt_regs, dx) },
+	{ "si", 8, offsetof(struct pt_regs, si) },
 	{ "di", 8, offsetof(struct pt_regs, di) },
 	{ "bp", 8, offsetof(struct pt_regs, bp) },
 	{ "sp", 8, offsetof(struct pt_regs, sp) },
-- 
cgit v1.2.3


From 0784b36448a2a85b95b6eb21a69b9045c896c065 Mon Sep 17 00:00:00 2001
From: Denys Vlasenko
Date: Wed, 1 Apr 2015 16:50:57 +0200
Subject: x86/asm/entry/64: Fold the 'test_in_nmi' macro into its only user

No code changes.

Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
Acked-by: Borislav Petkov <bp@suse.de>
Cc: Alexei Starovoitov <ast@plumgrid.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Will Drewry <wad@chromium.org>
Link: http://lkml.kernel.org/r/1427899858-7165-1-git-send-email-dvlasenk@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/entry_64.S | 28 +++++++++++++---------------
 1 file changed, 13 insertions(+), 15 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 03c52e217680..386375d43d14 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1355,19 +1355,7 @@ ENTRY(error_exit)
 	CFI_ENDPROC
 END(error_exit)
 
-/*
- * Test if a given stack is an NMI stack or not.
- */
-	.macro test_in_nmi reg stack nmi_ret normal_ret
-	cmpq %\reg, \stack
-	ja \normal_ret
-	subq $EXCEPTION_STKSZ, %\reg
-	cmpq %\reg, \stack
-	jb \normal_ret
-	jmp \nmi_ret
-	.endm
-
-	/* runs on exception stack */
+/* Runs on exception stack */
 ENTRY(nmi)
 	INTR_FRAME
 	PARAVIRT_ADJUST_EXCEPTION_FRAME
@@ -1428,8 +1416,18 @@ ENTRY(nmi)
 	 * We check the variable because the first NMI could be in a
 	 * breakpoint routine using a breakpoint stack.
 	 */
-	lea 6*8(%rsp), %rdx
-	test_in_nmi rdx, 4*8(%rsp), nested_nmi, first_nmi
+	lea	6*8(%rsp), %rdx
+	/* Compare the NMI stack (rdx) with the stack we came from (4*8(%rsp)) */
+	cmpq	%rdx, 4*8(%rsp)
+	/* If the stack pointer is above the NMI stack, this is a normal NMI */
+	ja	first_nmi
+	subq	$EXCEPTION_STKSZ, %rdx
+	cmpq	%rdx, 4*8(%rsp)
+	/* If it is below the NMI stack, it is a normal NMI */
+	jb	first_nmi
+	/* Ah, it is within the NMI stack, treat it as nested */
+	jmp	nested_nmi
+
 	CFI_REMEMBER_STATE
 
 nested_nmi:
-- 
cgit v1.2.3


From 40e4f2d177f748a83e7639554ea7d11568a9fa1f Mon Sep 17 00:00:00 2001
From: Denys Vlasenko
Date: Wed, 1 Apr 2015 16:50:58 +0200
Subject: x86/asm/boot/64: Use __BOOT_TSS instead of literal $0x20

__BOOT_TSS = (GDT_ENTRY_BOOT_TSS * 8)
GDT_ENTRY_BOOT_TSS = (GDT_ENTRY_BOOT_CS + 2)
GDT_ENTRY_BOOT_CS = 2

(2 + 2) * 8 = 4 * 8 = 32 = 0x20

No code changes.

Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
Reviewed-by: Steven Rostedt <rostedt@goodmis.org>
Acked-by: Borislav Petkov <bp@suse.de>
Cc: Alexei Starovoitov <ast@plumgrid.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Will Drewry <wad@chromium.org>
Link: http://lkml.kernel.org/r/1427899858-7165-2-git-send-email-dvlasenk@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/boot/compressed/head_64.S | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index 90c152135e92..b0c0d16ef58d 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -165,7 +165,7 @@ ENTRY(startup_32)
 	/* After gdt is loaded */
 	xorl	%eax, %eax
 	lldt	%ax
-	movl    $0x20, %eax
+	movl    $__BOOT_TSS, %eax
 	ltr	%ax
 
 	/*
-- 
cgit v1.2.3


From 3f85483bd80ef1de8cbbf0361be59f6a069b59d4 Mon Sep 17 00:00:00 2001
From: Boris Ostrovsky
Date: Wed, 1 Apr 2015 10:12:14 -0400
Subject: x86/cpu: Factor out common CPU initialization code, fix 32-bit Xen PV
 guests

Some of x86 bare-metal and Xen CPU initialization code is common
between the two and therefore can be factored out to avoid code
duplication.

As a side effect, doing so will also extend the fix provided by
commit a7fcf28d431e ("x86/asm/entry: Replace this_cpu_sp0() with
current_top_of_stack() to x86_32") to 32-bit Xen PV guests.

Signed-off-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Reviewed-by: David Vrabel <david.vrabel@citrix.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: konrad.wilk@oracle.com
Cc: xen-devel@lists.xenproject.org
Link: http://lkml.kernel.org/r/1427897534-5086-1-git-send-email-boris.ostrovsky@oracle.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/smp.h |  1 +
 arch/x86/kernel/smpboot.c  | 37 ++++++++++++++++++++++---------------
 arch/x86/xen/smp.c         | 13 +------------
 3 files changed, 24 insertions(+), 27 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index 8cd1cc3bc835..81d02fc7dafa 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -154,6 +154,7 @@ void cpu_die_common(unsigned int cpu);
 void native_smp_prepare_boot_cpu(void);
 void native_smp_prepare_cpus(unsigned int max_cpus);
 void native_smp_cpus_done(unsigned int max_cpus);
+void common_cpu_up(unsigned int cpunum, struct task_struct *tidle);
 int native_cpu_up(unsigned int cpunum, struct task_struct *tidle);
 int native_cpu_disable(void);
 void native_cpu_die(unsigned int cpu);
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 7b20ffd2fffc..5b298a95d567 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -779,6 +779,26 @@ out:
 	return boot_error;
 }
 
+void common_cpu_up(unsigned int cpu, struct task_struct *idle)
+{
+	/* Just in case we booted with a single CPU. */
+	alternatives_enable_smp();
+
+	per_cpu(current_task, cpu) = idle;
+
+#ifdef CONFIG_X86_32
+	/* Stack for startup_32 can be just as for start_secondary onwards */
+	irq_ctx_init(cpu);
+	per_cpu(cpu_current_top_of_stack, cpu) =
+		(unsigned long)task_stack_page(idle) + THREAD_SIZE;
+#else
+	clear_tsk_thread_flag(idle, TIF_FORK);
+	initial_gs = per_cpu_offset(cpu);
+#endif
+	per_cpu(kernel_stack, cpu) =
+		(unsigned long)task_stack_page(idle) + THREAD_SIZE;
+}
+
 /*
  * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad
  * (ie clustered apic addressing mode), this is a LOGICAL apic ID.
@@ -796,24 +816,9 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
 	int cpu0_nmi_registered = 0;
 	unsigned long timeout;
 
-	/* Just in case we booted with a single CPU. */
-	alternatives_enable_smp();
-
 	idle->thread.sp = (unsigned long) (((struct pt_regs *)
 			  (THREAD_SIZE +  task_stack_page(idle))) - 1);
-	per_cpu(current_task, cpu) = idle;
 
-#ifdef CONFIG_X86_32
-	/* Stack for startup_32 can be just as for start_secondary onwards */
-	irq_ctx_init(cpu);
-	per_cpu(cpu_current_top_of_stack, cpu) =
-		(unsigned long)task_stack_page(idle) + THREAD_SIZE;
-#else
-	clear_tsk_thread_flag(idle, TIF_FORK);
-	initial_gs = per_cpu_offset(cpu);
-#endif
-	per_cpu(kernel_stack, cpu) =
-		(unsigned long)task_stack_page(idle) + THREAD_SIZE;
 	early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu);
 	initial_code = (unsigned long)start_secondary;
 	stack_start  = idle->thread.sp;
@@ -954,6 +959,8 @@ int native_cpu_up(unsigned int cpu, struct task_struct *tidle)
 	/* the FPU context is blank, nobody can own it */
 	__cpu_disable_lazy_restore(cpu);
 
+	common_cpu_up(cpu, tidle);
+
 	err = do_boot_cpu(apicid, cpu, tidle);
 	if (err) {
 		pr_err("do_boot_cpu failed(%d) to wakeup CPU#%u\n", err, cpu);
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index 765b7684f858..7413ee3706d0 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -445,14 +445,7 @@ static int xen_cpu_up(unsigned int cpu, struct task_struct *idle)
 {
 	int rc;
 
-	per_cpu(current_task, cpu) = idle;
-#ifdef CONFIG_X86_32
-	irq_ctx_init(cpu);
-#else
-	clear_tsk_thread_flag(idle, TIF_FORK);
-#endif
-	per_cpu(kernel_stack, cpu) =
-		(unsigned long)task_stack_page(idle) + THREAD_SIZE;
+	common_cpu_up(cpu, idle);
 
 	xen_setup_runstate_info(cpu);
 	xen_setup_timer(cpu);
@@ -467,10 +460,6 @@ static int xen_cpu_up(unsigned int cpu, struct task_struct *idle)
 	if (rc)
 		return rc;
 
-	if (num_online_cpus() == 1)
-		/* Just in case we booted with a single CPU. */
-		alternatives_enable_smp();
-
 	rc = xen_smp_intr_init(cpu);
 	if (rc)
 		return rc;
-- 
cgit v1.2.3


From a6fcb6d4804b51ffcae7881c7f99483f4981ddf1 Mon Sep 17 00:00:00 2001
From: Bryan O'Donoghue
Date: Tue, 31 Mar 2015 12:15:36 +0100
Subject: x86/intel/quark: Run IMR self-test on IMR capble hw only

Automated testing with LKP shows IMR self test code running and
printing error messages on QEMU hardware lacking IMR support.

Update IMR self-test code to run only when IMR hardware should
be present. Tested on Quark X1000 and QEMU.

Signed-off-by: Bryan O'Donoghue <pure.logic@nexus-software.ie>
Acked-by: Ong Boon Leong <boon.leong.ong@intel.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: andriy.shevchenko@intel.com
Cc: dvhart@linux.intel.com
Cc: huang.ying.caritas@gmail.com
Cc: ying.huang@intel.com
Link: http://lkml.kernel.org/r/1427800536-32339-1-git-send-email-pure.logic@nexus-software.ie
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/platform/intel-quark/imr_selftest.c | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/platform/intel-quark/imr_selftest.c b/arch/x86/platform/intel-quark/imr_selftest.c
index c9a0838890e2..278e4da4222f 100644
--- a/arch/x86/platform/intel-quark/imr_selftest.c
+++ b/arch/x86/platform/intel-quark/imr_selftest.c
@@ -11,6 +11,7 @@
  */
 
 #include <asm-generic/sections.h>
+#include <asm/cpu_device_id.h>
 #include <asm/imr.h>
 #include <linux/init.h>
 #include <linux/mm.h>
@@ -101,6 +102,12 @@ static void __init imr_self_test(void)
 	}
 }
 
+static const struct x86_cpu_id imr_ids[] __initconst = {
+	{ X86_VENDOR_INTEL, 5, 9 },	/* Intel Quark SoC X1000. */
+	{}
+};
+MODULE_DEVICE_TABLE(x86cpu, imr_ids);
+
 /**
  * imr_self_test_init - entry point for IMR driver.
  *
@@ -108,7 +115,8 @@ static void __init imr_self_test(void)
  */
 static int __init imr_self_test_init(void)
 {
-	imr_self_test();
+	if (x86_match_cpu(imr_ids))
+		imr_self_test();
 	return 0;
 }
 
-- 
cgit v1.2.3


From a436bb7b806383ae0593cab53d17fc9676270cd3 Mon Sep 17 00:00:00 2001
From: Masahiro Yamada
Date: Fri, 27 Mar 2015 20:43:36 +0900
Subject: kbuild: use relative path more to include Makefile

Prior to this commit, it was impossible to use relative path to
include Makefiles from the top level Makefile because the option
"--include-dir=$(srctree)" becomes effective when Make enters into
sub Makefiles.

To use relative path in any places, this commit moves the option
above the "sub-make" target.

Signed-off-by: Masahiro Yamada <yamada.masahiro@socionext.com>
Signed-off-by: Michal Marek <mmarek@suse.cz>
---
 Makefile             | 22 ++++++++++------------
 arch/mips/Makefile   |  2 +-
 arch/um/Makefile     |  6 +++---
 arch/x86/Makefile    |  2 +-
 arch/x86/Makefile.um |  2 +-
 5 files changed, 16 insertions(+), 18 deletions(-)

(limited to 'arch/x86')

diff --git a/Makefile b/Makefile
index d3726e5f93ef..929d805b4f62 100644
--- a/Makefile
+++ b/Makefile
@@ -10,9 +10,10 @@ NAME = Hurr durr I'ma sheep
 # Comments in this file are targeted only to the developer, do not
 # expect to learn how to build the kernel reading this file.
 
-# Do not use make's built-in rules and variables
-# (this increases performance and avoids hard-to-debug behaviour);
-MAKEFLAGS += -rR
+# o Do not use make's built-in rules and variables
+#   (this increases performance and avoids hard-to-debug behaviour);
+# o Look for make include files relative to root of kernel src
+MAKEFLAGS += -rR --include-dir=$(CURDIR)
 
 # Avoid funny character set dependencies
 unexport LC_ALL
@@ -344,12 +345,9 @@ endif
 export COMPILER
 endif
 
-# Look for make include files relative to root of kernel src
-MAKEFLAGS += --include-dir=$(srctree)
-
 # We need some generic definitions (do not try to remake the file).
-$(srctree)/scripts/Kbuild.include: ;
-include $(srctree)/scripts/Kbuild.include
+scripts/Kbuild.include: ;
+include scripts/Kbuild.include
 
 # Make variables (CC, etc...)
 AS		= $(CROSS_COMPILE)as
@@ -533,7 +531,7 @@ ifeq ($(config-targets),1)
 # Read arch specific Makefile to set KBUILD_DEFCONFIG as needed.
 # KBUILD_DEFCONFIG may point out an alternative default configuration
 # used for 'make defconfig'
-include $(srctree)/arch/$(SRCARCH)/Makefile
+include arch/$(SRCARCH)/Makefile
 export KBUILD_DEFCONFIG KBUILD_KCONFIG
 
 config: scripts_basic outputmakefile FORCE
@@ -609,7 +607,7 @@ endif # $(dot-config)
 # Defaults to vmlinux, but the arch makefile usually adds further targets
 all: vmlinux
 
-include $(srctree)/arch/$(SRCARCH)/Makefile
+include arch/$(SRCARCH)/Makefile
 
 KBUILD_CFLAGS	+= $(call cc-option,-fno-delete-null-pointer-checks,)
 
@@ -781,8 +779,8 @@ ifeq ($(shell $(CONFIG_SHELL) $(srctree)/scripts/gcc-goto.sh $(CC)), y)
 	KBUILD_CFLAGS += -DCC_HAVE_ASM_GOTO
 endif
 
-include $(srctree)/scripts/Makefile.kasan
-include $(srctree)/scripts/Makefile.extrawarn
+include scripts/Makefile.kasan
+include scripts/Makefile.extrawarn
 
 # Add user supplied CPPFLAGS, AFLAGS and CFLAGS as the last assignments
 KBUILD_CPPFLAGS += $(KCPPFLAGS)
diff --git a/arch/mips/Makefile b/arch/mips/Makefile
index 8f57fc72d62c..d152dfbc360d 100644
--- a/arch/mips/Makefile
+++ b/arch/mips/Makefile
@@ -225,7 +225,7 @@ endif
 #
 # Board-dependent options and extra files
 #
-include $(srctree)/arch/mips/Kbuild.platforms
+include arch/mips/Kbuild.platforms
 
 ifdef CONFIG_PHYSICAL_START
 load-y					= $(CONFIG_PHYSICAL_START)
diff --git a/arch/um/Makefile b/arch/um/Makefile
index e4b1a9639c4d..17d4460b1af3 100644
--- a/arch/um/Makefile
+++ b/arch/um/Makefile
@@ -43,8 +43,8 @@ endif
 
 HOST_DIR := arch/$(HEADER_ARCH)
 
-include $(srctree)/$(ARCH_DIR)/Makefile-skas
-include $(srctree)/$(HOST_DIR)/Makefile.um
+include $(ARCH_DIR)/Makefile-skas
+include $(HOST_DIR)/Makefile.um
 
 core-y += $(HOST_DIR)/um/
 
@@ -73,7 +73,7 @@ USER_CFLAGS = $(patsubst $(KERNEL_DEFINES),,$(patsubst -D__KERNEL__,,\
 	$(filter -I%,$(CFLAGS)) -D_FILE_OFFSET_BITS=64 -idirafter include
 
 #This will adjust *FLAGS accordingly to the platform.
-include $(srctree)/$(ARCH_DIR)/Makefile-os-$(OS)
+include $(ARCH_DIR)/Makefile-os-$(OS)
 
 KBUILD_CPPFLAGS += -I$(srctree)/$(HOST_DIR)/include \
 		   -I$(srctree)/$(HOST_DIR)/include/uapi \
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 5ba2d9ce82dc..2fda005bb334 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -63,7 +63,7 @@ ifeq ($(CONFIG_X86_32),y)
 				$(call cc-option,-fno-unit-at-a-time))
 
         # CPU-specific tuning. Anything which can be shared with UML should go here.
-        include $(srctree)/arch/x86/Makefile_32.cpu
+        include arch/x86/Makefile_32.cpu
         KBUILD_CFLAGS += $(cflags-y)
 
         # temporary until string.h is fixed
diff --git a/arch/x86/Makefile.um b/arch/x86/Makefile.um
index 95eba554baf9..5b7e898ffd9a 100644
--- a/arch/x86/Makefile.um
+++ b/arch/x86/Makefile.um
@@ -18,7 +18,7 @@ LDS_EXTRA		:= -Ui386
 export LDS_EXTRA
 
 # First of all, tune CFLAGS for the specific CPU. This actually sets cflags-y.
-include $(srctree)/arch/x86/Makefile_32.cpu
+include arch/x86/Makefile_32.cpu
 
 # prevent gcc from keeping the stack 16 byte aligned. Taken from i386.
 cflags-y += $(call cc-option,-mpreferred-stack-boundary=2)
-- 
cgit v1.2.3


From 687805e4a60fe83a11556c041840161f8016a367 Mon Sep 17 00:00:00 2001
From: Kan Liang
Date: Fri, 27 Mar 2015 10:38:25 -0400
Subject: perf/x86/intel: Filter branches for PEBS event

For supporting Intel LBR branches filtering, Intel LBR sharing logic
mechanism is introduced from commit b36817e88630 ("perf/x86: Add Intel
LBR sharing logic"). It modifies __intel_shared_reg_get_constraints() to
config lbr_sel, which is finally used to set LBR_SELECT.

However, the intel_shared_regs_constraints() function is called after
intel_pebs_constraints(). The PEBS event will return immediately after
intel_pebs_constraints(). So it's impossible to filter branches for PEBS
events.

This patch moves intel_shared_regs_constraints() ahead of
intel_pebs_constraints().

We can safely do that because the intel_shared_regs_constraints() function
only returns empty constraint if its rejecting the event, otherwise it
returns NULL such that we continue calling intel_pebs_constraints() and
x86_get_event_constraint().

Signed-off-by: Kan Liang <kan.liang@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: eranian@google.com
Link: http://lkml.kernel.org/r/1427467105-9260-1-git-send-email-kan.liang@intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event_intel.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 498b6d967138..40898abdff20 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -1649,11 +1649,11 @@ intel_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event
 	if (c)
 		return c;
 
-	c = intel_pebs_constraints(event);
+	c = intel_shared_regs_constraints(cpuc, event);
 	if (c)
 		return c;
 
-	c = intel_shared_regs_constraints(cpuc, event);
+	c = intel_pebs_constraints(event);
 	if (c)
 		return c;
 
-- 
cgit v1.2.3


From c420f19b9cdc59662dbb56677417487efc1729ec Mon Sep 17 00:00:00 2001
From: Andi Kleen
Date: Mon, 9 Mar 2015 11:20:22 -0700
Subject: perf/x86/intel: Fix Haswell CYCLE_ACTIVITY.* counter constraints

Some of the CYCLE_ACTIVITY.* events can only be scheduled on
counter 2.  Due to a typo Haswell matched those with
INTEL_EVENT_CONSTRAINT, which lead to the events never
matching as the comparison does not expect anything
in the umask too. Fix the typo.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1425925222-32361-1-git-send-email-andi@firstfloor.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event_intel.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 40898abdff20..258990688a5e 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -212,11 +212,11 @@ static struct event_constraint intel_hsw_event_constraints[] = {
 	INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */
 	INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.LOAD_LATENCY */
 	/* CYCLE_ACTIVITY.CYCLES_L1D_PENDING */
-	INTEL_EVENT_CONSTRAINT(0x08a3, 0x4),
+	INTEL_UEVENT_CONSTRAINT(0x08a3, 0x4),
 	/* CYCLE_ACTIVITY.STALLS_L1D_PENDING */
-	INTEL_EVENT_CONSTRAINT(0x0ca3, 0x4),
+	INTEL_UEVENT_CONSTRAINT(0x0ca3, 0x4),
 	/* CYCLE_ACTIVITY.CYCLES_NO_EXECUTE */
-	INTEL_EVENT_CONSTRAINT(0x04a3, 0xf),
+	INTEL_UEVENT_CONSTRAINT(0x04a3, 0xf),
 	EVENT_CONSTRAINT_END
 };
 
-- 
cgit v1.2.3


From ed69628b3b04578179334393d7f5fe60a2681f1c Mon Sep 17 00:00:00 2001
From: Alexander Shishkin
Date: Wed, 14 Jan 2015 14:18:19 +0200
Subject: x86: Add Intel Processor Trace (INTEL_PT) cpu feature detection

Intel Processor Trace is an architecture extension that allows for program
flow tracing.

Signed-off-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kaixu Xia <kaixu.xia@linaro.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Robert Richter <rric@kernel.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: acme@infradead.org
Cc: adrian.hunter@intel.com
Cc: kan.liang@intel.com
Cc: markus.t.metzger@intel.com
Cc: mathieu.poirier@linaro.org
Link: http://lkml.kernel.org/r/1421237903-181015-11-git-send-email-alexander.shishkin@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/cpufeature.h | 1 +
 arch/x86/kernel/cpu/scattered.c   | 1 +
 2 files changed, 2 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 361922dcc9b1..c1553b70fed4 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -195,6 +195,7 @@
 #define X86_FEATURE_HWP_ACT_WINDOW ( 7*32+ 12) /* Intel HWP_ACT_WINDOW */
 #define X86_FEATURE_HWP_EPP	( 7*32+13) /* Intel HWP_EPP */
 #define X86_FEATURE_HWP_PKG_REQ ( 7*32+14) /* Intel HWP_PKG_REQ */
+#define X86_FEATURE_INTEL_PT	( 7*32+15) /* Intel Processor Trace */
 
 /* Virtualization flags: Linux defined, word 8 */
 #define X86_FEATURE_TPR_SHADOW  ( 8*32+ 0) /* Intel TPR Shadow */
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
index 60639093d536..3d423a101fae 100644
--- a/arch/x86/kernel/cpu/scattered.c
+++ b/arch/x86/kernel/cpu/scattered.c
@@ -41,6 +41,7 @@ void init_scattered_cpuid_features(struct cpuinfo_x86 *c)
 		{ X86_FEATURE_HWP_ACT_WINDOW,	CR_EAX, 9, 0x00000006, 0 },
 		{ X86_FEATURE_HWP_EPP,		CR_EAX,10, 0x00000006, 0 },
 		{ X86_FEATURE_HWP_PKG_REQ,	CR_EAX,11, 0x00000006, 0 },
+		{ X86_FEATURE_INTEL_PT,		CR_EBX,25, 0x00000007, 0 },
 		{ X86_FEATURE_APERFMPERF,	CR_ECX, 0, 0x00000006, 0 },
 		{ X86_FEATURE_EPB,		CR_ECX, 3, 0x00000006, 0 },
 		{ X86_FEATURE_HW_PSTATE,	CR_EDX, 7, 0x80000007, 0 },
-- 
cgit v1.2.3


From 4807034248bed58d49a4f9f450c024e3b0f58577 Mon Sep 17 00:00:00 2001
From: Alexander Shishkin
Date: Wed, 14 Jan 2015 14:18:20 +0200
Subject: perf/x86: Mark Intel PT and LBR/BTS as mutually exclusive

Intel PT cannot be used at the same time as LBR or BTS and will cause a
general protection fault if they are used together. In order to avoid
fixing up GPs in the fast path, instead we disallow creating LBR/BTS
events when PT events are present and vice versa.

Signed-off-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kaixu Xia <kaixu.xia@linaro.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Robert Richter <rric@kernel.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: acme@infradead.org
Cc: adrian.hunter@intel.com
Cc: kan.liang@intel.com
Cc: markus.t.metzger@intel.com
Cc: mathieu.poirier@linaro.org
Link: http://lkml.kernel.org/r/1421237903-181015-12-git-send-email-alexander.shishkin@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event.c       | 43 ++++++++++++++++++++++++++++++++++
 arch/x86/kernel/cpu/perf_event.h       | 40 +++++++++++++++++++++++++++++++
 arch/x86/kernel/cpu/perf_event_intel.c | 11 +++++++++
 3 files changed, 94 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 0420ebcac116..549d01d6d996 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -263,6 +263,14 @@ static void hw_perf_event_destroy(struct perf_event *event)
 	}
 }
 
+void hw_perf_lbr_event_destroy(struct perf_event *event)
+{
+	hw_perf_event_destroy(event);
+
+	/* undo the lbr/bts event accounting */
+	x86_del_exclusive(x86_lbr_exclusive_lbr);
+}
+
 static inline int x86_pmu_initialized(void)
 {
 	return x86_pmu.handle_irq != NULL;
@@ -302,6 +310,35 @@ set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event *event)
 	return x86_pmu_extra_regs(val, event);
 }
 
+/*
+ * Check if we can create event of a certain type (that no conflicting events
+ * are present).
+ */
+int x86_add_exclusive(unsigned int what)
+{
+	int ret = -EBUSY, i;
+
+	if (atomic_inc_not_zero(&x86_pmu.lbr_exclusive[what]))
+		return 0;
+
+	mutex_lock(&pmc_reserve_mutex);
+	for (i = 0; i < ARRAY_SIZE(x86_pmu.lbr_exclusive); i++)
+		if (i != what && atomic_read(&x86_pmu.lbr_exclusive[i]))
+			goto out;
+
+	atomic_inc(&x86_pmu.lbr_exclusive[what]);
+	ret = 0;
+
+out:
+	mutex_unlock(&pmc_reserve_mutex);
+	return ret;
+}
+
+void x86_del_exclusive(unsigned int what)
+{
+	atomic_dec(&x86_pmu.lbr_exclusive[what]);
+}
+
 int x86_setup_perfctr(struct perf_event *event)
 {
 	struct perf_event_attr *attr = &event->attr;
@@ -346,6 +383,12 @@ int x86_setup_perfctr(struct perf_event *event)
 		/* BTS is currently only allowed for user-mode. */
 		if (!attr->exclude_kernel)
 			return -EOPNOTSUPP;
+
+		/* disallow bts if conflicting events are present */
+		if (x86_add_exclusive(x86_lbr_exclusive_lbr))
+			return -EBUSY;
+
+		event->destroy = hw_perf_lbr_event_destroy;
 	}
 
 	hwc->config |= config;
diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index 87e5081f4cdc..47499661e8d4 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -408,6 +408,12 @@ union x86_pmu_config {
 
 #define X86_CONFIG(args...) ((union x86_pmu_config){.bits = {args}}).value
 
+enum {
+	x86_lbr_exclusive_lbr,
+	x86_lbr_exclusive_pt,
+	x86_lbr_exclusive_max,
+};
+
 /*
  * struct x86_pmu - generic x86 pmu
  */
@@ -505,6 +511,11 @@ struct x86_pmu {
 	const int	*lbr_sel_map;		   /* lbr_select mappings */
 	bool		lbr_double_abort;	   /* duplicated lbr aborts */
 
+	/*
+	 * Intel PT/LBR/BTS are exclusive
+	 */
+	atomic_t	lbr_exclusive[x86_lbr_exclusive_max];
+
 	/*
 	 * Extra registers for events
 	 */
@@ -603,6 +614,12 @@ static inline int x86_pmu_rdpmc_index(int index)
 	return x86_pmu.rdpmc_index ? x86_pmu.rdpmc_index(index) : index;
 }
 
+int x86_add_exclusive(unsigned int what);
+
+void x86_del_exclusive(unsigned int what);
+
+void hw_perf_lbr_event_destroy(struct perf_event *event);
+
 int x86_setup_perfctr(struct perf_event *event);
 
 int x86_pmu_hw_config(struct perf_event *event);
@@ -689,6 +706,29 @@ static inline int amd_pmu_init(void)
 
 #ifdef CONFIG_CPU_SUP_INTEL
 
+static inline bool intel_pmu_needs_lbr_smpl(struct perf_event *event)
+{
+	/* user explicitly requested branch sampling */
+	if (has_branch_stack(event))
+		return true;
+
+	/* implicit branch sampling to correct PEBS skid */
+	if (x86_pmu.intel_cap.pebs_trap && event->attr.precise_ip > 1 &&
+	    x86_pmu.intel_cap.pebs_format < 2)
+		return true;
+
+	return false;
+}
+
+static inline bool intel_pmu_has_bts(struct perf_event *event)
+{
+	if (event->attr.config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS &&
+	    !event->attr.freq && event->hw.sample_period == 1)
+		return true;
+
+	return false;
+}
+
 int intel_pmu_save_and_restart(struct perf_event *event);
 
 struct event_constraint *
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index fc6dbc46af4a..b7b3ff21c832 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -1942,6 +1942,17 @@ static int intel_pmu_hw_config(struct perf_event *event)
 		ret = intel_pmu_setup_lbr_filter(event);
 		if (ret)
 			return ret;
+
+		/*
+		 * BTS is set up earlier in this path, so don't account twice
+		 */
+		if (!intel_pmu_has_bts(event)) {
+			/* disallow lbr if conflicting events are present */
+			if (x86_add_exclusive(x86_lbr_exclusive_lbr))
+				return -EBUSY;
+
+			event->destroy = hw_perf_lbr_event_destroy;
+		}
 	}
 
 	if (event->attr.type != PERF_TYPE_RAW)
-- 
cgit v1.2.3


From 52ca9ced3f70779589e6ecc329baffe69d8f5f7a Mon Sep 17 00:00:00 2001
From: Alexander Shishkin
Date: Fri, 30 Jan 2015 12:39:52 +0200
Subject: perf/x86/intel/pt: Add Intel PT PMU driver

Add support for Intel Processor Trace (PT) to kernel's perf events.
PT is an extension of Intel Architecture that collects information about
software execuction such as control flow, execution modes and timings and
formats it into highly compressed binary packets. Even being compressed,
these packets are generated at hundreds of megabytes per second per core,
which makes it impractical to decode them on the fly in the kernel.

This driver exports trace data by through AUX space in the perf ring
buffer, which is zero-copy mapped into userspace for faster data retrieval.

Signed-off-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kaixu Xia <kaixu.xia@linaro.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Robert Richter <rric@kernel.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: acme@infradead.org
Cc: adrian.hunter@intel.com
Cc: kan.liang@intel.com
Cc: markus.t.metzger@intel.com
Cc: mathieu.poirier@linaro.org
Link: http://lkml.kernel.org/r/1422614392-114498-1-git-send-email-alexander.shishkin@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/uapi/asm/msr-index.h     |   18 +
 arch/x86/kernel/cpu/Makefile              |    1 +
 arch/x86/kernel/cpu/intel_pt.h            |  131 ++++
 arch/x86/kernel/cpu/perf_event.h          |    2 +
 arch/x86/kernel/cpu/perf_event_intel.c    |    8 +
 arch/x86/kernel/cpu/perf_event_intel_pt.c | 1096 +++++++++++++++++++++++++++++
 6 files changed, 1256 insertions(+)
 create mode 100644 arch/x86/kernel/cpu/intel_pt.h
 create mode 100644 arch/x86/kernel/cpu/perf_event_intel_pt.c

(limited to 'arch/x86')

diff --git a/arch/x86/include/uapi/asm/msr-index.h b/arch/x86/include/uapi/asm/msr-index.h
index 3ce079136c11..1a4eae695ca8 100644
--- a/arch/x86/include/uapi/asm/msr-index.h
+++ b/arch/x86/include/uapi/asm/msr-index.h
@@ -74,6 +74,24 @@
 #define MSR_IA32_PERF_CAPABILITIES	0x00000345
 #define MSR_PEBS_LD_LAT_THRESHOLD	0x000003f6
 
+#define MSR_IA32_RTIT_CTL		0x00000570
+#define RTIT_CTL_TRACEEN		BIT(0)
+#define RTIT_CTL_OS			BIT(2)
+#define RTIT_CTL_USR			BIT(3)
+#define RTIT_CTL_CR3EN			BIT(7)
+#define RTIT_CTL_TOPA			BIT(8)
+#define RTIT_CTL_TSC_EN			BIT(10)
+#define RTIT_CTL_DISRETC		BIT(11)
+#define RTIT_CTL_BRANCH_EN		BIT(13)
+#define MSR_IA32_RTIT_STATUS		0x00000571
+#define RTIT_STATUS_CONTEXTEN		BIT(1)
+#define RTIT_STATUS_TRIGGEREN		BIT(2)
+#define RTIT_STATUS_ERROR		BIT(4)
+#define RTIT_STATUS_STOPPED		BIT(5)
+#define MSR_IA32_RTIT_CR3_MATCH		0x00000572
+#define MSR_IA32_RTIT_OUTPUT_BASE	0x00000560
+#define MSR_IA32_RTIT_OUTPUT_MASK	0x00000561
+
 #define MSR_MTRRfix64K_00000		0x00000250
 #define MSR_MTRRfix16K_80000		0x00000258
 #define MSR_MTRRfix16K_A0000		0x00000259
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 6c1ca139f736..e6b353f10e09 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -40,6 +40,7 @@ endif
 obj-$(CONFIG_CPU_SUP_INTEL)		+= perf_event_p6.o perf_event_knc.o perf_event_p4.o
 obj-$(CONFIG_CPU_SUP_INTEL)		+= perf_event_intel_lbr.o perf_event_intel_ds.o perf_event_intel.o
 obj-$(CONFIG_CPU_SUP_INTEL)		+= perf_event_intel_rapl.o perf_event_intel_cqm.o
+obj-$(CONFIG_CPU_SUP_INTEL)		+= perf_event_intel_pt.o
 
 obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE)	+= perf_event_intel_uncore.o \
 					   perf_event_intel_uncore_snb.o \
diff --git a/arch/x86/kernel/cpu/intel_pt.h b/arch/x86/kernel/cpu/intel_pt.h
new file mode 100644
index 000000000000..1c338b0eba05
--- /dev/null
+++ b/arch/x86/kernel/cpu/intel_pt.h
@@ -0,0 +1,131 @@
+/*
+ * Intel(R) Processor Trace PMU driver for perf
+ * Copyright (c) 2013-2014, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * Intel PT is specified in the Intel Architecture Instruction Set Extensions
+ * Programming Reference:
+ * http://software.intel.com/en-us/intel-isa-extensions
+ */
+
+#ifndef __INTEL_PT_H__
+#define __INTEL_PT_H__
+
+/*
+ * Single-entry ToPA: when this close to region boundary, switch
+ * buffers to avoid losing data.
+ */
+#define TOPA_PMI_MARGIN 512
+
+/*
+ * Table of Physical Addresses bits
+ */
+enum topa_sz {
+	TOPA_4K	= 0,
+	TOPA_8K,
+	TOPA_16K,
+	TOPA_32K,
+	TOPA_64K,
+	TOPA_128K,
+	TOPA_256K,
+	TOPA_512K,
+	TOPA_1MB,
+	TOPA_2MB,
+	TOPA_4MB,
+	TOPA_8MB,
+	TOPA_16MB,
+	TOPA_32MB,
+	TOPA_64MB,
+	TOPA_128MB,
+	TOPA_SZ_END,
+};
+
+static inline unsigned int sizes(enum topa_sz tsz)
+{
+	return 1 << (tsz + 12);
+};
+
+struct topa_entry {
+	u64	end	: 1;
+	u64	rsvd0	: 1;
+	u64	intr	: 1;
+	u64	rsvd1	: 1;
+	u64	stop	: 1;
+	u64	rsvd2	: 1;
+	u64	size	: 4;
+	u64	rsvd3	: 2;
+	u64	base	: 36;
+	u64	rsvd4	: 16;
+};
+
+#define TOPA_SHIFT 12
+#define PT_CPUID_LEAVES 2
+
+enum pt_capabilities {
+	PT_CAP_max_subleaf = 0,
+	PT_CAP_cr3_filtering,
+	PT_CAP_topa_output,
+	PT_CAP_topa_multiple_entries,
+	PT_CAP_payloads_lip,
+};
+
+struct pt_pmu {
+	struct pmu		pmu;
+	u32			caps[4 * PT_CPUID_LEAVES];
+};
+
+/**
+ * struct pt_buffer - buffer configuration; one buffer per task_struct or
+ *		cpu, depending on perf event configuration
+ * @cpu:	cpu for per-cpu allocation
+ * @tables:	list of ToPA tables in this buffer
+ * @first:	shorthand for first topa table
+ * @last:	shorthand for last topa table
+ * @cur:	current topa table
+ * @nr_pages:	buffer size in pages
+ * @cur_idx:	current output region's index within @cur table
+ * @output_off:	offset within the current output region
+ * @data_size:	running total of the amount of data in this buffer
+ * @lost:	if data was lost/truncated
+ * @head:	logical write offset inside the buffer
+ * @snapshot:	if this is for a snapshot/overwrite counter
+ * @stop_pos:	STOP topa entry in the buffer
+ * @intr_pos:	INT topa entry in the buffer
+ * @data_pages:	array of pages from perf
+ * @topa_index:	table of topa entries indexed by page offset
+ */
+struct pt_buffer {
+	int			cpu;
+	struct list_head	tables;
+	struct topa		*first, *last, *cur;
+	unsigned int		cur_idx;
+	size_t			output_off;
+	unsigned long		nr_pages;
+	local_t			data_size;
+	local_t			lost;
+	local64_t		head;
+	bool			snapshot;
+	unsigned long		stop_pos, intr_pos;
+	void			**data_pages;
+	struct topa_entry	*topa_index[0];
+};
+
+/**
+ * struct pt - per-cpu pt context
+ * @handle:	perf output handle
+ * @handle_nmi:	do handle PT PMI on this cpu, there's an active event
+ */
+struct pt {
+	struct perf_output_handle handle;
+	int			handle_nmi;
+};
+
+#endif /* __INTEL_PT_H__ */
diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index 47499661e8d4..f04729ac3290 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -808,6 +808,8 @@ void intel_pmu_lbr_init_hsw(void);
 
 int intel_pmu_setup_lbr_filter(struct perf_event *event);
 
+void intel_pt_interrupt(void);
+
 int p4_pmu_init(void);
 
 int p6_pmu_init(void);
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index b7b3ff21c832..8eb22ce26303 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -1589,6 +1589,14 @@ again:
 		x86_pmu.drain_pebs(regs);
 	}
 
+	/*
+	 * Intel PT
+	 */
+	if (__test_and_clear_bit(55, (unsigned long *)&status)) {
+		handled++;
+		intel_pt_interrupt();
+	}
+
 	/*
 	 * Checkpointed counters can lead to 'spurious' PMIs because the
 	 * rollback caused by the PMI will have cleared the overflow status
diff --git a/arch/x86/kernel/cpu/perf_event_intel_pt.c b/arch/x86/kernel/cpu/perf_event_intel_pt.c
new file mode 100644
index 000000000000..a9a1092cf836
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event_intel_pt.c
@@ -0,0 +1,1096 @@
+/*
+ * Intel(R) Processor Trace PMU driver for perf
+ * Copyright (c) 2013-2014, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * Intel PT is specified in the Intel Architecture Instruction Set Extensions
+ * Programming Reference:
+ * http://software.intel.com/en-us/intel-isa-extensions
+ */
+
+#undef DEBUG
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/device.h>
+
+#include <asm/perf_event.h>
+#include <asm/insn.h>
+#include <asm/io.h>
+
+#include "perf_event.h"
+#include "intel_pt.h"
+
+static DEFINE_PER_CPU(struct pt, pt_ctx);
+
+static struct pt_pmu pt_pmu;
+
+enum cpuid_regs {
+	CR_EAX = 0,
+	CR_ECX,
+	CR_EDX,
+	CR_EBX
+};
+
+/*
+ * Capabilities of Intel PT hardware, such as number of address bits or
+ * supported output schemes, are cached and exported to userspace as "caps"
+ * attribute group of pt pmu device
+ * (/sys/bus/event_source/devices/intel_pt/caps/) so that userspace can store
+ * relevant bits together with intel_pt traces.
+ *
+ * These are necessary for both trace decoding (payloads_lip, contains address
+ * width encoded in IP-related packets), and event configuration (bitmasks with
+ * permitted values for certain bit fields).
+ */
+#define PT_CAP(_n, _l, _r, _m)						\
+	[PT_CAP_ ## _n] = { .name = __stringify(_n), .leaf = _l,	\
+			    .reg = _r, .mask = _m }
+
+static struct pt_cap_desc {
+	const char	*name;
+	u32		leaf;
+	u8		reg;
+	u32		mask;
+} pt_caps[] = {
+	PT_CAP(max_subleaf,		0, CR_EAX, 0xffffffff),
+	PT_CAP(cr3_filtering,		0, CR_EBX, BIT(0)),
+	PT_CAP(topa_output,		0, CR_ECX, BIT(0)),
+	PT_CAP(topa_multiple_entries,	0, CR_ECX, BIT(1)),
+	PT_CAP(payloads_lip,		0, CR_ECX, BIT(31)),
+};
+
+static u32 pt_cap_get(enum pt_capabilities cap)
+{
+	struct pt_cap_desc *cd = &pt_caps[cap];
+	u32 c = pt_pmu.caps[cd->leaf * 4 + cd->reg];
+	unsigned int shift = __ffs(cd->mask);
+
+	return (c & cd->mask) >> shift;
+}
+
+static ssize_t pt_cap_show(struct device *cdev,
+			   struct device_attribute *attr,
+			   char *buf)
+{
+	struct dev_ext_attribute *ea =
+		container_of(attr, struct dev_ext_attribute, attr);
+	enum pt_capabilities cap = (long)ea->var;
+
+	return snprintf(buf, PAGE_SIZE, "%x\n", pt_cap_get(cap));
+}
+
+static struct attribute_group pt_cap_group = {
+	.name	= "caps",
+};
+
+PMU_FORMAT_ATTR(tsc,		"config:10"	);
+PMU_FORMAT_ATTR(noretcomp,	"config:11"	);
+
+static struct attribute *pt_formats_attr[] = {
+	&format_attr_tsc.attr,
+	&format_attr_noretcomp.attr,
+	NULL,
+};
+
+static struct attribute_group pt_format_group = {
+	.name	= "format",
+	.attrs	= pt_formats_attr,
+};
+
+static const struct attribute_group *pt_attr_groups[] = {
+	&pt_cap_group,
+	&pt_format_group,
+	NULL,
+};
+
+static int __init pt_pmu_hw_init(void)
+{
+	struct dev_ext_attribute *de_attrs;
+	struct attribute **attrs;
+	size_t size;
+	long i;
+
+	if (test_cpu_cap(&boot_cpu_data, X86_FEATURE_INTEL_PT)) {
+		for (i = 0; i < PT_CPUID_LEAVES; i++)
+			cpuid_count(20, i,
+				    &pt_pmu.caps[CR_EAX + i * 4],
+				    &pt_pmu.caps[CR_EBX + i * 4],
+				    &pt_pmu.caps[CR_ECX + i * 4],
+				    &pt_pmu.caps[CR_EDX + i * 4]);
+	} else {
+		return -ENODEV;
+	}
+
+	size = sizeof(struct attribute *) * (ARRAY_SIZE(pt_caps) + 1);
+	attrs = kzalloc(size, GFP_KERNEL);
+	if (!attrs)
+		goto err_attrs;
+
+	size = sizeof(struct dev_ext_attribute) * (ARRAY_SIZE(pt_caps) + 1);
+	de_attrs = kzalloc(size, GFP_KERNEL);
+	if (!de_attrs)
+		goto err_de_attrs;
+
+	for (i = 0; i < ARRAY_SIZE(pt_caps); i++) {
+		de_attrs[i].attr.attr.name = pt_caps[i].name;
+
+		sysfs_attr_init(&de_attrs[i].attr.attr);
+		de_attrs[i].attr.attr.mode = S_IRUGO;
+		de_attrs[i].attr.show = pt_cap_show;
+		de_attrs[i].var = (void *)i;
+		attrs[i] = &de_attrs[i].attr.attr;
+	}
+
+	pt_cap_group.attrs = attrs;
+	return 0;
+
+err_de_attrs:
+	kfree(de_attrs);
+err_attrs:
+	kfree(attrs);
+
+	return -ENOMEM;
+}
+
+#define PT_CONFIG_MASK (RTIT_CTL_TSC_EN | RTIT_CTL_DISRETC)
+
+static bool pt_event_valid(struct perf_event *event)
+{
+	u64 config = event->attr.config;
+
+	if ((config & PT_CONFIG_MASK) != config)
+		return false;
+
+	return true;
+}
+
+/*
+ * PT configuration helpers
+ * These all are cpu affine and operate on a local PT
+ */
+
+static bool pt_is_running(void)
+{
+	u64 ctl;
+
+	rdmsrl(MSR_IA32_RTIT_CTL, ctl);
+
+	return !!(ctl & RTIT_CTL_TRACEEN);
+}
+
+static void pt_config(struct perf_event *event)
+{
+	u64 reg;
+
+	reg = RTIT_CTL_TOPA | RTIT_CTL_BRANCH_EN | RTIT_CTL_TRACEEN;
+
+	if (!event->attr.exclude_kernel)
+		reg |= RTIT_CTL_OS;
+	if (!event->attr.exclude_user)
+		reg |= RTIT_CTL_USR;
+
+	reg |= (event->attr.config & PT_CONFIG_MASK);
+
+	wrmsrl(MSR_IA32_RTIT_CTL, reg);
+}
+
+static void pt_config_start(bool start)
+{
+	u64 ctl;
+
+	rdmsrl(MSR_IA32_RTIT_CTL, ctl);
+	if (start)
+		ctl |= RTIT_CTL_TRACEEN;
+	else
+		ctl &= ~RTIT_CTL_TRACEEN;
+	wrmsrl(MSR_IA32_RTIT_CTL, ctl);
+
+	/*
+	 * A wrmsr that disables trace generation serializes other PT
+	 * registers and causes all data packets to be written to memory,
+	 * but a fence is required for the data to become globally visible.
+	 *
+	 * The below WMB, separating data store and aux_head store matches
+	 * the consumer's RMB that separates aux_head load and data load.
+	 */
+	if (!start)
+		wmb();
+}
+
+static void pt_config_buffer(void *buf, unsigned int topa_idx,
+			     unsigned int output_off)
+{
+	u64 reg;
+
+	wrmsrl(MSR_IA32_RTIT_OUTPUT_BASE, virt_to_phys(buf));
+
+	reg = 0x7f | ((u64)topa_idx << 7) | ((u64)output_off << 32);
+
+	wrmsrl(MSR_IA32_RTIT_OUTPUT_MASK, reg);
+}
+
+/*
+ * Keep ToPA table-related metadata on the same page as the actual table,
+ * taking up a few words from the top
+ */
+
+#define TENTS_PER_PAGE (((PAGE_SIZE - 40) / sizeof(struct topa_entry)) - 1)
+
+/**
+ * struct topa - page-sized ToPA table with metadata at the top
+ * @table:	actual ToPA table entries, as understood by PT hardware
+ * @list:	linkage to struct pt_buffer's list of tables
+ * @phys:	physical address of this page
+ * @offset:	offset of the first entry in this table in the buffer
+ * @size:	total size of all entries in this table
+ * @last:	index of the last initialized entry in this table
+ */
+struct topa {
+	struct topa_entry	table[TENTS_PER_PAGE];
+	struct list_head	list;
+	u64			phys;
+	u64			offset;
+	size_t			size;
+	int			last;
+};
+
+/* make -1 stand for the last table entry */
+#define TOPA_ENTRY(t, i) ((i) == -1 ? &(t)->table[(t)->last] : &(t)->table[(i)])
+
+/**
+ * topa_alloc() - allocate page-sized ToPA table
+ * @cpu:	CPU on which to allocate.
+ * @gfp:	Allocation flags.
+ *
+ * Return:	On success, return the pointer to ToPA table page.
+ */
+static struct topa *topa_alloc(int cpu, gfp_t gfp)
+{
+	int node = cpu_to_node(cpu);
+	struct topa *topa;
+	struct page *p;
+
+	p = alloc_pages_node(node, gfp | __GFP_ZERO, 0);
+	if (!p)
+		return NULL;
+
+	topa = page_address(p);
+	topa->last = 0;
+	topa->phys = page_to_phys(p);
+
+	/*
+	 * In case of singe-entry ToPA, always put the self-referencing END
+	 * link as the 2nd entry in the table
+	 */
+	if (!pt_cap_get(PT_CAP_topa_multiple_entries)) {
+		TOPA_ENTRY(topa, 1)->base = topa->phys >> TOPA_SHIFT;
+		TOPA_ENTRY(topa, 1)->end = 1;
+	}
+
+	return topa;
+}
+
+/**
+ * topa_free() - free a page-sized ToPA table
+ * @topa:	Table to deallocate.
+ */
+static void topa_free(struct topa *topa)
+{
+	free_page((unsigned long)topa);
+}
+
+/**
+ * topa_insert_table() - insert a ToPA table into a buffer
+ * @buf:	 PT buffer that's being extended.
+ * @topa:	 New topa table to be inserted.
+ *
+ * If it's the first table in this buffer, set up buffer's pointers
+ * accordingly; otherwise, add a END=1 link entry to @topa to the current
+ * "last" table and adjust the last table pointer to @topa.
+ */
+static void topa_insert_table(struct pt_buffer *buf, struct topa *topa)
+{
+	struct topa *last = buf->last;
+
+	list_add_tail(&topa->list, &buf->tables);
+
+	if (!buf->first) {
+		buf->first = buf->last = buf->cur = topa;
+		return;
+	}
+
+	topa->offset = last->offset + last->size;
+	buf->last = topa;
+
+	if (!pt_cap_get(PT_CAP_topa_multiple_entries))
+		return;
+
+	BUG_ON(last->last != TENTS_PER_PAGE - 1);
+
+	TOPA_ENTRY(last, -1)->base = topa->phys >> TOPA_SHIFT;
+	TOPA_ENTRY(last, -1)->end = 1;
+}
+
+/**
+ * topa_table_full() - check if a ToPA table is filled up
+ * @topa:	ToPA table.
+ */
+static bool topa_table_full(struct topa *topa)
+{
+	/* single-entry ToPA is a special case */
+	if (!pt_cap_get(PT_CAP_topa_multiple_entries))
+		return !!topa->last;
+
+	return topa->last == TENTS_PER_PAGE - 1;
+}
+
+/**
+ * topa_insert_pages() - create a list of ToPA tables
+ * @buf:	PT buffer being initialized.
+ * @gfp:	Allocation flags.
+ *
+ * This initializes a list of ToPA tables with entries from
+ * the data_pages provided by rb_alloc_aux().
+ *
+ * Return:	0 on success or error code.
+ */
+static int topa_insert_pages(struct pt_buffer *buf, gfp_t gfp)
+{
+	struct topa *topa = buf->last;
+	int order = 0;
+	struct page *p;
+
+	p = virt_to_page(buf->data_pages[buf->nr_pages]);
+	if (PagePrivate(p))
+		order = page_private(p);
+
+	if (topa_table_full(topa)) {
+		topa = topa_alloc(buf->cpu, gfp);
+		if (!topa)
+			return -ENOMEM;
+
+		topa_insert_table(buf, topa);
+	}
+
+	TOPA_ENTRY(topa, -1)->base = page_to_phys(p) >> TOPA_SHIFT;
+	TOPA_ENTRY(topa, -1)->size = order;
+	if (!buf->snapshot && !pt_cap_get(PT_CAP_topa_multiple_entries)) {
+		TOPA_ENTRY(topa, -1)->intr = 1;
+		TOPA_ENTRY(topa, -1)->stop = 1;
+	}
+
+	topa->last++;
+	topa->size += sizes(order);
+
+	buf->nr_pages += 1ul << order;
+
+	return 0;
+}
+
+/**
+ * pt_topa_dump() - print ToPA tables and their entries
+ * @buf:	PT buffer.
+ */
+static void pt_topa_dump(struct pt_buffer *buf)
+{
+	struct topa *topa;
+
+	list_for_each_entry(topa, &buf->tables, list) {
+		int i;
+
+		pr_debug("# table @%p (%p), off %llx size %zx\n", topa->table,
+			 (void *)topa->phys, topa->offset, topa->size);
+		for (i = 0; i < TENTS_PER_PAGE; i++) {
+			pr_debug("# entry @%p (%lx sz %u %c%c%c) raw=%16llx\n",
+				 &topa->table[i],
+				 (unsigned long)topa->table[i].base << TOPA_SHIFT,
+				 sizes(topa->table[i].size),
+				 topa->table[i].end ?  'E' : ' ',
+				 topa->table[i].intr ? 'I' : ' ',
+				 topa->table[i].stop ? 'S' : ' ',
+				 *(u64 *)&topa->table[i]);
+			if ((pt_cap_get(PT_CAP_topa_multiple_entries) &&
+			     topa->table[i].stop) ||
+			    topa->table[i].end)
+				break;
+		}
+	}
+}
+
+/**
+ * pt_buffer_advance() - advance to the next output region
+ * @buf:	PT buffer.
+ *
+ * Advance the current pointers in the buffer to the next ToPA entry.
+ */
+static void pt_buffer_advance(struct pt_buffer *buf)
+{
+	buf->output_off = 0;
+	buf->cur_idx++;
+
+	if (buf->cur_idx == buf->cur->last) {
+		if (buf->cur == buf->last)
+			buf->cur = buf->first;
+		else
+			buf->cur = list_entry(buf->cur->list.next, struct topa,
+					      list);
+		buf->cur_idx = 0;
+	}
+}
+
+/**
+ * pt_update_head() - calculate current offsets and sizes
+ * @pt:		Per-cpu pt context.
+ *
+ * Update buffer's current write pointer position and data size.
+ */
+static void pt_update_head(struct pt *pt)
+{
+	struct pt_buffer *buf = perf_get_aux(&pt->handle);
+	u64 topa_idx, base, old;
+
+	/* offset of the first region in this table from the beginning of buf */
+	base = buf->cur->offset + buf->output_off;
+
+	/* offset of the current output region within this table */
+	for (topa_idx = 0; topa_idx < buf->cur_idx; topa_idx++)
+		base += sizes(buf->cur->table[topa_idx].size);
+
+	if (buf->snapshot) {
+		local_set(&buf->data_size, base);
+	} else {
+		old = (local64_xchg(&buf->head, base) &
+		       ((buf->nr_pages << PAGE_SHIFT) - 1));
+		if (base < old)
+			base += buf->nr_pages << PAGE_SHIFT;
+
+		local_add(base - old, &buf->data_size);
+	}
+}
+
+/**
+ * pt_buffer_region() - obtain current output region's address
+ * @buf:	PT buffer.
+ */
+static void *pt_buffer_region(struct pt_buffer *buf)
+{
+	return phys_to_virt(buf->cur->table[buf->cur_idx].base << TOPA_SHIFT);
+}
+
+/**
+ * pt_buffer_region_size() - obtain current output region's size
+ * @buf:	PT buffer.
+ */
+static size_t pt_buffer_region_size(struct pt_buffer *buf)
+{
+	return sizes(buf->cur->table[buf->cur_idx].size);
+}
+
+/**
+ * pt_handle_status() - take care of possible status conditions
+ * @pt:		Per-cpu pt context.
+ */
+static void pt_handle_status(struct pt *pt)
+{
+	struct pt_buffer *buf = perf_get_aux(&pt->handle);
+	int advance = 0;
+	u64 status;
+
+	rdmsrl(MSR_IA32_RTIT_STATUS, status);
+
+	if (status & RTIT_STATUS_ERROR) {
+		pr_err_ratelimited("ToPA ERROR encountered, trying to recover\n");
+		pt_topa_dump(buf);
+		status &= ~RTIT_STATUS_ERROR;
+	}
+
+	if (status & RTIT_STATUS_STOPPED) {
+		status &= ~RTIT_STATUS_STOPPED;
+
+		/*
+		 * On systems that only do single-entry ToPA, hitting STOP
+		 * means we are already losing data; need to let the decoder
+		 * know.
+		 */
+		if (!pt_cap_get(PT_CAP_topa_multiple_entries) ||
+		    buf->output_off == sizes(TOPA_ENTRY(buf->cur, buf->cur_idx)->size)) {
+			local_inc(&buf->lost);
+			advance++;
+		}
+	}
+
+	/*
+	 * Also on single-entry ToPA implementations, interrupt will come
+	 * before the output reaches its output region's boundary.
+	 */
+	if (!pt_cap_get(PT_CAP_topa_multiple_entries) && !buf->snapshot &&
+	    pt_buffer_region_size(buf) - buf->output_off <= TOPA_PMI_MARGIN) {
+		void *head = pt_buffer_region(buf);
+
+		/* everything within this margin needs to be zeroed out */
+		memset(head + buf->output_off, 0,
+		       pt_buffer_region_size(buf) -
+		       buf->output_off);
+		advance++;
+	}
+
+	if (advance)
+		pt_buffer_advance(buf);
+
+	wrmsrl(MSR_IA32_RTIT_STATUS, status);
+}
+
+/**
+ * pt_read_offset() - translate registers into buffer pointers
+ * @buf:	PT buffer.
+ *
+ * Set buffer's output pointers from MSR values.
+ */
+static void pt_read_offset(struct pt_buffer *buf)
+{
+	u64 offset, base_topa;
+
+	rdmsrl(MSR_IA32_RTIT_OUTPUT_BASE, base_topa);
+	buf->cur = phys_to_virt(base_topa);
+
+	rdmsrl(MSR_IA32_RTIT_OUTPUT_MASK, offset);
+	/* offset within current output region */
+	buf->output_off = offset >> 32;
+	/* index of current output region within this table */
+	buf->cur_idx = (offset & 0xffffff80) >> 7;
+}
+
+/**
+ * pt_topa_next_entry() - obtain index of the first page in the next ToPA entry
+ * @buf:	PT buffer.
+ * @pg:		Page offset in the buffer.
+ *
+ * When advancing to the next output region (ToPA entry), given a page offset
+ * into the buffer, we need to find the offset of the first page in the next
+ * region.
+ */
+static unsigned int pt_topa_next_entry(struct pt_buffer *buf, unsigned int pg)
+{
+	struct topa_entry *te = buf->topa_index[pg];
+
+	/* one region */
+	if (buf->first == buf->last && buf->first->last == 1)
+		return pg;
+
+	do {
+		pg++;
+		pg &= buf->nr_pages - 1;
+	} while (buf->topa_index[pg] == te);
+
+	return pg;
+}
+
+/**
+ * pt_buffer_reset_markers() - place interrupt and stop bits in the buffer
+ * @buf:	PT buffer.
+ * @handle:	Current output handle.
+ *
+ * Place INT and STOP marks to prevent overwriting old data that the consumer
+ * hasn't yet collected.
+ */
+static int pt_buffer_reset_markers(struct pt_buffer *buf,
+				   struct perf_output_handle *handle)
+
+{
+	unsigned long idx, npages, end;
+
+	if (buf->snapshot)
+		return 0;
+
+	/* can't stop in the middle of an output region */
+	if (buf->output_off + handle->size + 1 <
+	    sizes(TOPA_ENTRY(buf->cur, buf->cur_idx)->size))
+		return -EINVAL;
+
+
+	/* single entry ToPA is handled by marking all regions STOP=1 INT=1 */
+	if (!pt_cap_get(PT_CAP_topa_multiple_entries))
+		return 0;
+
+	/* clear STOP and INT from current entry */
+	buf->topa_index[buf->stop_pos]->stop = 0;
+	buf->topa_index[buf->intr_pos]->intr = 0;
+
+	if (pt_cap_get(PT_CAP_topa_multiple_entries)) {
+		npages = (handle->size + 1) >> PAGE_SHIFT;
+		end = (local64_read(&buf->head) >> PAGE_SHIFT) + npages;
+		/*if (end > handle->wakeup >> PAGE_SHIFT)
+		  end = handle->wakeup >> PAGE_SHIFT;*/
+		idx = end & (buf->nr_pages - 1);
+		buf->stop_pos = idx;
+		idx = (local64_read(&buf->head) >> PAGE_SHIFT) + npages - 1;
+		idx &= buf->nr_pages - 1;
+		buf->intr_pos = idx;
+	}
+
+	buf->topa_index[buf->stop_pos]->stop = 1;
+	buf->topa_index[buf->intr_pos]->intr = 1;
+
+	return 0;
+}
+
+/**
+ * pt_buffer_setup_topa_index() - build topa_index[] table of regions
+ * @buf:	PT buffer.
+ *
+ * topa_index[] references output regions indexed by offset into the
+ * buffer for purposes of quick reverse lookup.
+ */
+static void pt_buffer_setup_topa_index(struct pt_buffer *buf)
+{
+	struct topa *cur = buf->first, *prev = buf->last;
+	struct topa_entry *te_cur = TOPA_ENTRY(cur, 0),
+		*te_prev = TOPA_ENTRY(prev, prev->last - 1);
+	int pg = 0, idx = 0, ntopa = 0;
+
+	while (pg < buf->nr_pages) {
+		int tidx;
+
+		/* pages within one topa entry */
+		for (tidx = 0; tidx < 1 << te_cur->size; tidx++, pg++)
+			buf->topa_index[pg] = te_prev;
+
+		te_prev = te_cur;
+
+		if (idx == cur->last - 1) {
+			/* advance to next topa table */
+			idx = 0;
+			cur = list_entry(cur->list.next, struct topa, list);
+			ntopa++;
+		} else
+			idx++;
+		te_cur = TOPA_ENTRY(cur, idx);
+	}
+
+}
+
+/**
+ * pt_buffer_reset_offsets() - adjust buffer's write pointers from aux_head
+ * @buf:	PT buffer.
+ * @head:	Write pointer (aux_head) from AUX buffer.
+ *
+ * Find the ToPA table and entry corresponding to given @head and set buffer's
+ * "current" pointers accordingly.
+ */
+static void pt_buffer_reset_offsets(struct pt_buffer *buf, unsigned long head)
+{
+	int pg;
+
+	if (buf->snapshot)
+		head &= (buf->nr_pages << PAGE_SHIFT) - 1;
+
+	pg = (head >> PAGE_SHIFT) & (buf->nr_pages - 1);
+	pg = pt_topa_next_entry(buf, pg);
+
+	buf->cur = (struct topa *)((unsigned long)buf->topa_index[pg] & PAGE_MASK);
+	buf->cur_idx = ((unsigned long)buf->topa_index[pg] -
+			(unsigned long)buf->cur) / sizeof(struct topa_entry);
+	buf->output_off = head & (sizes(buf->cur->table[buf->cur_idx].size) - 1);
+
+	local64_set(&buf->head, head);
+	local_set(&buf->data_size, 0);
+}
+
+/**
+ * pt_buffer_fini_topa() - deallocate ToPA structure of a buffer
+ * @buf:	PT buffer.
+ */
+static void pt_buffer_fini_topa(struct pt_buffer *buf)
+{
+	struct topa *topa, *iter;
+
+	list_for_each_entry_safe(topa, iter, &buf->tables, list) {
+		/*
+		 * right now, this is in free_aux() path only, so
+		 * no need to unlink this table from the list
+		 */
+		topa_free(topa);
+	}
+}
+
+/**
+ * pt_buffer_init_topa() - initialize ToPA table for pt buffer
+ * @buf:	PT buffer.
+ * @size:	Total size of all regions within this ToPA.
+ * @gfp:	Allocation flags.
+ */
+static int pt_buffer_init_topa(struct pt_buffer *buf, unsigned long nr_pages,
+			       gfp_t gfp)
+{
+	struct topa *topa;
+	int err;
+
+	topa = topa_alloc(buf->cpu, gfp);
+	if (!topa)
+		return -ENOMEM;
+
+	topa_insert_table(buf, topa);
+
+	while (buf->nr_pages < nr_pages) {
+		err = topa_insert_pages(buf, gfp);
+		if (err) {
+			pt_buffer_fini_topa(buf);
+			return -ENOMEM;
+		}
+	}
+
+	pt_buffer_setup_topa_index(buf);
+
+	/* link last table to the first one, unless we're double buffering */
+	if (pt_cap_get(PT_CAP_topa_multiple_entries)) {
+		TOPA_ENTRY(buf->last, -1)->base = buf->first->phys >> TOPA_SHIFT;
+		TOPA_ENTRY(buf->last, -1)->end = 1;
+	}
+
+	pt_topa_dump(buf);
+	return 0;
+}
+
+/**
+ * pt_buffer_setup_aux() - set up topa tables for a PT buffer
+ * @cpu:	Cpu on which to allocate, -1 means current.
+ * @pages:	Array of pointers to buffer pages passed from perf core.
+ * @nr_pages:	Number of pages in the buffer.
+ * @snapshot:	If this is a snapshot/overwrite counter.
+ *
+ * This is a pmu::setup_aux callback that sets up ToPA tables and all the
+ * bookkeeping for an AUX buffer.
+ *
+ * Return:	Our private PT buffer structure.
+ */
+static void *
+pt_buffer_setup_aux(int cpu, void **pages, int nr_pages, bool snapshot)
+{
+	struct pt_buffer *buf;
+	int node, ret;
+
+	if (!nr_pages)
+		return NULL;
+
+	if (cpu == -1)
+		cpu = raw_smp_processor_id();
+	node = cpu_to_node(cpu);
+
+	buf = kzalloc_node(offsetof(struct pt_buffer, topa_index[nr_pages]),
+			   GFP_KERNEL, node);
+	if (!buf)
+		return NULL;
+
+	buf->cpu = cpu;
+	buf->snapshot = snapshot;
+	buf->data_pages = pages;
+
+	INIT_LIST_HEAD(&buf->tables);
+
+	ret = pt_buffer_init_topa(buf, nr_pages, GFP_KERNEL);
+	if (ret) {
+		kfree(buf);
+		return NULL;
+	}
+
+	return buf;
+}
+
+/**
+ * pt_buffer_free_aux() - perf AUX deallocation path callback
+ * @data:	PT buffer.
+ */
+static void pt_buffer_free_aux(void *data)
+{
+	struct pt_buffer *buf = data;
+
+	pt_buffer_fini_topa(buf);
+	kfree(buf);
+}
+
+/**
+ * pt_buffer_is_full() - check if the buffer is full
+ * @buf:	PT buffer.
+ * @pt:		Per-cpu pt handle.
+ *
+ * If the user hasn't read data from the output region that aux_head
+ * points to, the buffer is considered full: the user needs to read at
+ * least this region and update aux_tail to point past it.
+ */
+static bool pt_buffer_is_full(struct pt_buffer *buf, struct pt *pt)
+{
+	if (buf->snapshot)
+		return false;
+
+	if (local_read(&buf->data_size) >= pt->handle.size)
+		return true;
+
+	return false;
+}
+
+/**
+ * intel_pt_interrupt() - PT PMI handler
+ */
+void intel_pt_interrupt(void)
+{
+	struct pt *pt = this_cpu_ptr(&pt_ctx);
+	struct pt_buffer *buf;
+	struct perf_event *event = pt->handle.event;
+
+	/*
+	 * There may be a dangling PT bit in the interrupt status register
+	 * after PT has been disabled by pt_event_stop(). Make sure we don't
+	 * do anything (particularly, re-enable) for this event here.
+	 */
+	if (!ACCESS_ONCE(pt->handle_nmi))
+		return;
+
+	pt_config_start(false);
+
+	if (!event)
+		return;
+
+	buf = perf_get_aux(&pt->handle);
+	if (!buf)
+		return;
+
+	pt_read_offset(buf);
+
+	pt_handle_status(pt);
+
+	pt_update_head(pt);
+
+	perf_aux_output_end(&pt->handle, local_xchg(&buf->data_size, 0),
+			    local_xchg(&buf->lost, 0));
+
+	if (!event->hw.state) {
+		int ret;
+
+		buf = perf_aux_output_begin(&pt->handle, event);
+		if (!buf) {
+			event->hw.state = PERF_HES_STOPPED;
+			return;
+		}
+
+		pt_buffer_reset_offsets(buf, pt->handle.head);
+		ret = pt_buffer_reset_markers(buf, &pt->handle);
+		if (ret) {
+			perf_aux_output_end(&pt->handle, 0, true);
+			return;
+		}
+
+		pt_config_buffer(buf->cur->table, buf->cur_idx,
+				 buf->output_off);
+		wrmsrl(MSR_IA32_RTIT_STATUS, 0);
+		pt_config(event);
+	}
+}
+
+/*
+ * PMU callbacks
+ */
+
+static void pt_event_start(struct perf_event *event, int mode)
+{
+	struct pt *pt = this_cpu_ptr(&pt_ctx);
+	struct pt_buffer *buf = perf_get_aux(&pt->handle);
+
+	if (pt_is_running() || !buf || pt_buffer_is_full(buf, pt)) {
+		event->hw.state = PERF_HES_STOPPED;
+		return;
+	}
+
+	ACCESS_ONCE(pt->handle_nmi) = 1;
+	event->hw.state = 0;
+
+	pt_config_buffer(buf->cur->table, buf->cur_idx,
+			 buf->output_off);
+	wrmsrl(MSR_IA32_RTIT_STATUS, 0);
+	pt_config(event);
+}
+
+static void pt_event_stop(struct perf_event *event, int mode)
+{
+	struct pt *pt = this_cpu_ptr(&pt_ctx);
+
+	/*
+	 * Protect against the PMI racing with disabling wrmsr,
+	 * see comment in intel_pt_interrupt().
+	 */
+	ACCESS_ONCE(pt->handle_nmi) = 0;
+	pt_config_start(false);
+
+	if (event->hw.state == PERF_HES_STOPPED)
+		return;
+
+	event->hw.state = PERF_HES_STOPPED;
+
+	if (mode & PERF_EF_UPDATE) {
+		struct pt *pt = this_cpu_ptr(&pt_ctx);
+		struct pt_buffer *buf = perf_get_aux(&pt->handle);
+
+		if (!buf)
+			return;
+
+		if (WARN_ON_ONCE(pt->handle.event != event))
+			return;
+
+		pt_read_offset(buf);
+
+		pt_handle_status(pt);
+
+		pt_update_head(pt);
+	}
+}
+
+static void pt_event_del(struct perf_event *event, int mode)
+{
+	struct pt *pt = this_cpu_ptr(&pt_ctx);
+	struct pt_buffer *buf;
+
+	pt_event_stop(event, PERF_EF_UPDATE);
+
+	buf = perf_get_aux(&pt->handle);
+
+	if (buf) {
+		if (buf->snapshot)
+			pt->handle.head =
+				local_xchg(&buf->data_size,
+					   buf->nr_pages << PAGE_SHIFT);
+		perf_aux_output_end(&pt->handle, local_xchg(&buf->data_size, 0),
+				    local_xchg(&buf->lost, 0));
+	}
+}
+
+static int pt_event_add(struct perf_event *event, int mode)
+{
+	struct pt_buffer *buf;
+	struct pt *pt = this_cpu_ptr(&pt_ctx);
+	struct hw_perf_event *hwc = &event->hw;
+	int ret = -EBUSY;
+
+	if (pt->handle.event)
+		goto out;
+
+	buf = perf_aux_output_begin(&pt->handle, event);
+	if (!buf) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	pt_buffer_reset_offsets(buf, pt->handle.head);
+	if (!buf->snapshot) {
+		ret = pt_buffer_reset_markers(buf, &pt->handle);
+		if (ret) {
+			perf_aux_output_end(&pt->handle, 0, true);
+			goto out;
+		}
+	}
+
+	if (mode & PERF_EF_START) {
+		pt_event_start(event, 0);
+		if (hwc->state == PERF_HES_STOPPED) {
+			pt_event_del(event, 0);
+			ret = -EBUSY;
+		}
+	} else {
+		hwc->state = PERF_HES_STOPPED;
+	}
+
+	ret = 0;
+out:
+
+	if (ret)
+		hwc->state = PERF_HES_STOPPED;
+
+	return ret;
+}
+
+static void pt_event_read(struct perf_event *event)
+{
+}
+
+static void pt_event_destroy(struct perf_event *event)
+{
+	x86_del_exclusive(x86_lbr_exclusive_pt);
+}
+
+static int pt_event_init(struct perf_event *event)
+{
+	if (event->attr.type != pt_pmu.pmu.type)
+		return -ENOENT;
+
+	if (!pt_event_valid(event))
+		return -EINVAL;
+
+	if (x86_add_exclusive(x86_lbr_exclusive_pt))
+		return -EBUSY;
+
+	event->destroy = pt_event_destroy;
+
+	return 0;
+}
+
+static __init int pt_init(void)
+{
+	int ret, cpu, prior_warn = 0;
+
+	BUILD_BUG_ON(sizeof(struct topa) > PAGE_SIZE);
+	get_online_cpus();
+	for_each_online_cpu(cpu) {
+		u64 ctl;
+
+		ret = rdmsrl_safe_on_cpu(cpu, MSR_IA32_RTIT_CTL, &ctl);
+		if (!ret && (ctl & RTIT_CTL_TRACEEN))
+			prior_warn++;
+	}
+	put_online_cpus();
+
+	if (prior_warn) {
+		x86_add_exclusive(x86_lbr_exclusive_pt);
+		pr_warn("PT is enabled at boot time, doing nothing\n");
+
+		return -EBUSY;
+	}
+
+	ret = pt_pmu_hw_init();
+	if (ret)
+		return ret;
+
+	if (!pt_cap_get(PT_CAP_topa_output)) {
+		pr_warn("ToPA output is not supported on this CPU\n");
+		return -ENODEV;
+	}
+
+	if (!pt_cap_get(PT_CAP_topa_multiple_entries))
+		pt_pmu.pmu.capabilities =
+			PERF_PMU_CAP_AUX_NO_SG | PERF_PMU_CAP_AUX_SW_DOUBLEBUF;
+
+	pt_pmu.pmu.capabilities	|= PERF_PMU_CAP_EXCLUSIVE | PERF_PMU_CAP_ITRACE;
+	pt_pmu.pmu.attr_groups	= pt_attr_groups;
+	pt_pmu.pmu.task_ctx_nr	= perf_sw_context;
+	pt_pmu.pmu.event_init	= pt_event_init;
+	pt_pmu.pmu.add		= pt_event_add;
+	pt_pmu.pmu.del		= pt_event_del;
+	pt_pmu.pmu.start	= pt_event_start;
+	pt_pmu.pmu.stop		= pt_event_stop;
+	pt_pmu.pmu.read		= pt_event_read;
+	pt_pmu.pmu.setup_aux	= pt_buffer_setup_aux;
+	pt_pmu.pmu.free_aux	= pt_buffer_free_aux;
+	ret = perf_pmu_register(&pt_pmu.pmu, "intel_pt", -1);
+
+	return ret;
+}
+
+module_init(pt_init);
-- 
cgit v1.2.3


From 8062382c8dbe2dc11d37e7f0b139508cf10de9d4 Mon Sep 17 00:00:00 2001
From: Alexander Shishkin
Date: Fri, 30 Jan 2015 12:40:35 +0200
Subject: perf/x86/intel/bts: Add BTS PMU driver

Add support for Branch Trace Store (BTS) via kernel perf event infrastructure.
The difference with the existing implementation of BTS support is that this
one is a separate PMU that exports events' trace buffers to userspace by means
of AUX area of the perf buffer, which is zero-copy mapped into userspace.

The immediate benefit is that the buffer size can be much bigger, resulting in
fewer interrupts and no kernel side copying is involved and little to no trace
data loss. Also, kernel code can be traced with this driver.

The old way of collecting BTS traces still works.

Signed-off-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kaixu Xia <kaixu.xia@linaro.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Robert Richter <rric@kernel.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: acme@infradead.org
Cc: adrian.hunter@intel.com
Cc: kan.liang@intel.com
Cc: markus.t.metzger@intel.com
Cc: mathieu.poirier@linaro.org
Link: http://lkml.kernel.org/r/1422614435-114702-1-git-send-email-alexander.shishkin@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/Makefile               |   2 +-
 arch/x86/kernel/cpu/perf_event.h           |   7 +
 arch/x86/kernel/cpu/perf_event_intel.c     |   6 +-
 arch/x86/kernel/cpu/perf_event_intel_bts.c | 525 +++++++++++++++++++++++++++++
 arch/x86/kernel/cpu/perf_event_intel_ds.c  |   3 +-
 5 files changed, 540 insertions(+), 3 deletions(-)
 create mode 100644 arch/x86/kernel/cpu/perf_event_intel_bts.c

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index e6b353f10e09..9bff68798836 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -40,7 +40,7 @@ endif
 obj-$(CONFIG_CPU_SUP_INTEL)		+= perf_event_p6.o perf_event_knc.o perf_event_p4.o
 obj-$(CONFIG_CPU_SUP_INTEL)		+= perf_event_intel_lbr.o perf_event_intel_ds.o perf_event_intel.o
 obj-$(CONFIG_CPU_SUP_INTEL)		+= perf_event_intel_rapl.o perf_event_intel_cqm.o
-obj-$(CONFIG_CPU_SUP_INTEL)		+= perf_event_intel_pt.o
+obj-$(CONFIG_CPU_SUP_INTEL)		+= perf_event_intel_pt.o perf_event_intel_bts.o
 
 obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE)	+= perf_event_intel_uncore.o \
 					   perf_event_intel_uncore_snb.o \
diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index f04729ac3290..eaebfd707016 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -410,6 +410,7 @@ union x86_pmu_config {
 
 enum {
 	x86_lbr_exclusive_lbr,
+	x86_lbr_exclusive_bts,
 	x86_lbr_exclusive_pt,
 	x86_lbr_exclusive_max,
 };
@@ -810,6 +811,12 @@ int intel_pmu_setup_lbr_filter(struct perf_event *event);
 
 void intel_pt_interrupt(void);
 
+int intel_bts_interrupt(void);
+
+void intel_bts_enable_local(void);
+
+void intel_bts_disable_local(void);
+
 int p4_pmu_init(void);
 
 int p6_pmu_init(void);
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 8eb22ce26303..b9861e19cd3d 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -1242,6 +1242,8 @@ static void intel_pmu_disable_all(void)
 
 	if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask))
 		intel_pmu_disable_bts();
+	else
+		intel_bts_disable_local();
 
 	intel_pmu_pebs_disable_all();
 	intel_pmu_lbr_disable_all();
@@ -1264,7 +1266,8 @@ static void intel_pmu_enable_all(int added)
 			return;
 
 		intel_pmu_enable_bts(event->hw.config);
-	}
+	} else
+		intel_bts_enable_local();
 }
 
 /*
@@ -1550,6 +1553,7 @@ static int intel_pmu_handle_irq(struct pt_regs *regs)
 		apic_write(APIC_LVTPC, APIC_DM_NMI);
 	intel_pmu_disable_all();
 	handled = intel_pmu_drain_bts_buffer();
+	handled += intel_bts_interrupt();
 	status = intel_pmu_get_status();
 	if (!status)
 		goto done;
diff --git a/arch/x86/kernel/cpu/perf_event_intel_bts.c b/arch/x86/kernel/cpu/perf_event_intel_bts.c
new file mode 100644
index 000000000000..fb1a4c28f3e1
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event_intel_bts.c
@@ -0,0 +1,525 @@
+/*
+ * BTS PMU driver for perf
+ * Copyright (c) 2013-2014, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ */
+
+#undef DEBUG
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/bitops.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/debugfs.h>
+#include <linux/device.h>
+#include <linux/coredump.h>
+
+#include <asm-generic/sizes.h>
+#include <asm/perf_event.h>
+
+#include "perf_event.h"
+
+struct bts_ctx {
+	struct perf_output_handle	handle;
+	struct debug_store		ds_back;
+	int				started;
+};
+
+static DEFINE_PER_CPU(struct bts_ctx, bts_ctx);
+
+#define BTS_RECORD_SIZE		24
+#define BTS_SAFETY_MARGIN	4080
+
+struct bts_phys {
+	struct page	*page;
+	unsigned long	size;
+	unsigned long	offset;
+	unsigned long	displacement;
+};
+
+struct bts_buffer {
+	size_t		real_size;	/* multiple of BTS_RECORD_SIZE */
+	unsigned int	nr_pages;
+	unsigned int	nr_bufs;
+	unsigned int	cur_buf;
+	bool		snapshot;
+	local_t		data_size;
+	local_t		lost;
+	local_t		head;
+	unsigned long	end;
+	void		**data_pages;
+	struct bts_phys	buf[0];
+};
+
+struct pmu bts_pmu;
+
+void intel_pmu_enable_bts(u64 config);
+void intel_pmu_disable_bts(void);
+
+static size_t buf_size(struct page *page)
+{
+	return 1 << (PAGE_SHIFT + page_private(page));
+}
+
+static void *
+bts_buffer_setup_aux(int cpu, void **pages, int nr_pages, bool overwrite)
+{
+	struct bts_buffer *buf;
+	struct page *page;
+	int node = (cpu == -1) ? cpu : cpu_to_node(cpu);
+	unsigned long offset;
+	size_t size = nr_pages << PAGE_SHIFT;
+	int pg, nbuf, pad;
+
+	/* count all the high order buffers */
+	for (pg = 0, nbuf = 0; pg < nr_pages;) {
+		page = virt_to_page(pages[pg]);
+		if (WARN_ON_ONCE(!PagePrivate(page) && nr_pages > 1))
+			return NULL;
+		pg += 1 << page_private(page);
+		nbuf++;
+	}
+
+	/*
+	 * to avoid interrupts in overwrite mode, only allow one physical
+	 */
+	if (overwrite && nbuf > 1)
+		return NULL;
+
+	buf = kzalloc_node(offsetof(struct bts_buffer, buf[nbuf]), GFP_KERNEL, node);
+	if (!buf)
+		return NULL;
+
+	buf->nr_pages = nr_pages;
+	buf->nr_bufs = nbuf;
+	buf->snapshot = overwrite;
+	buf->data_pages = pages;
+	buf->real_size = size - size % BTS_RECORD_SIZE;
+
+	for (pg = 0, nbuf = 0, offset = 0, pad = 0; nbuf < buf->nr_bufs; nbuf++) {
+		unsigned int __nr_pages;
+
+		page = virt_to_page(pages[pg]);
+		__nr_pages = PagePrivate(page) ? 1 << page_private(page) : 1;
+		buf->buf[nbuf].page = page;
+		buf->buf[nbuf].offset = offset;
+		buf->buf[nbuf].displacement = (pad ? BTS_RECORD_SIZE - pad : 0);
+		buf->buf[nbuf].size = buf_size(page) - buf->buf[nbuf].displacement;
+		pad = buf->buf[nbuf].size % BTS_RECORD_SIZE;
+		buf->buf[nbuf].size -= pad;
+
+		pg += __nr_pages;
+		offset += __nr_pages << PAGE_SHIFT;
+	}
+
+	return buf;
+}
+
+static void bts_buffer_free_aux(void *data)
+{
+	kfree(data);
+}
+
+static unsigned long bts_buffer_offset(struct bts_buffer *buf, unsigned int idx)
+{
+	return buf->buf[idx].offset + buf->buf[idx].displacement;
+}
+
+static void
+bts_config_buffer(struct bts_buffer *buf)
+{
+	int cpu = raw_smp_processor_id();
+	struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
+	struct bts_phys *phys = &buf->buf[buf->cur_buf];
+	unsigned long index, thresh = 0, end = phys->size;
+	struct page *page = phys->page;
+
+	index = local_read(&buf->head);
+
+	if (!buf->snapshot) {
+		if (buf->end < phys->offset + buf_size(page))
+			end = buf->end - phys->offset - phys->displacement;
+
+		index -= phys->offset + phys->displacement;
+
+		if (end - index > BTS_SAFETY_MARGIN)
+			thresh = end - BTS_SAFETY_MARGIN;
+		else if (end - index > BTS_RECORD_SIZE)
+			thresh = end - BTS_RECORD_SIZE;
+		else
+			thresh = end;
+	}
+
+	ds->bts_buffer_base = (u64)page_address(page) + phys->displacement;
+	ds->bts_index = ds->bts_buffer_base + index;
+	ds->bts_absolute_maximum = ds->bts_buffer_base + end;
+	ds->bts_interrupt_threshold = !buf->snapshot
+		? ds->bts_buffer_base + thresh
+		: ds->bts_absolute_maximum + BTS_RECORD_SIZE;
+}
+
+static void bts_buffer_pad_out(struct bts_phys *phys, unsigned long head)
+{
+	unsigned long index = head - phys->offset;
+
+	memset(page_address(phys->page) + index, 0, phys->size - index);
+}
+
+static bool bts_buffer_is_full(struct bts_buffer *buf, struct bts_ctx *bts)
+{
+	if (buf->snapshot)
+		return false;
+
+	if (local_read(&buf->data_size) >= bts->handle.size ||
+	    bts->handle.size - local_read(&buf->data_size) < BTS_RECORD_SIZE)
+		return true;
+
+	return false;
+}
+
+static void bts_update(struct bts_ctx *bts)
+{
+	int cpu = raw_smp_processor_id();
+	struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
+	struct bts_buffer *buf = perf_get_aux(&bts->handle);
+	unsigned long index = ds->bts_index - ds->bts_buffer_base, old, head;
+
+	if (!buf)
+		return;
+
+	head = index + bts_buffer_offset(buf, buf->cur_buf);
+	old = local_xchg(&buf->head, head);
+
+	if (!buf->snapshot) {
+		if (old == head)
+			return;
+
+		if (ds->bts_index >= ds->bts_absolute_maximum)
+			local_inc(&buf->lost);
+
+		/*
+		 * old and head are always in the same physical buffer, so we
+		 * can subtract them to get the data size.
+		 */
+		local_add(head - old, &buf->data_size);
+	} else {
+		local_set(&buf->data_size, head);
+	}
+}
+
+static void __bts_event_start(struct perf_event *event)
+{
+	struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
+	struct bts_buffer *buf = perf_get_aux(&bts->handle);
+	u64 config = 0;
+
+	if (!buf || bts_buffer_is_full(buf, bts))
+		return;
+
+	event->hw.state = 0;
+
+	if (!buf->snapshot)
+		config |= ARCH_PERFMON_EVENTSEL_INT;
+	if (!event->attr.exclude_kernel)
+		config |= ARCH_PERFMON_EVENTSEL_OS;
+	if (!event->attr.exclude_user)
+		config |= ARCH_PERFMON_EVENTSEL_USR;
+
+	bts_config_buffer(buf);
+
+	/*
+	 * local barrier to make sure that ds configuration made it
+	 * before we enable BTS
+	 */
+	wmb();
+
+	intel_pmu_enable_bts(config);
+}
+
+static void bts_event_start(struct perf_event *event, int flags)
+{
+	struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
+
+	__bts_event_start(event);
+
+	/* PMI handler: this counter is running and likely generating PMIs */
+	ACCESS_ONCE(bts->started) = 1;
+}
+
+static void __bts_event_stop(struct perf_event *event)
+{
+	/*
+	 * No extra synchronization is mandated by the documentation to have
+	 * BTS data stores globally visible.
+	 */
+	intel_pmu_disable_bts();
+
+	if (event->hw.state & PERF_HES_STOPPED)
+		return;
+
+	ACCESS_ONCE(event->hw.state) |= PERF_HES_STOPPED;
+}
+
+static void bts_event_stop(struct perf_event *event, int flags)
+{
+	struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
+
+	/* PMI handler: don't restart this counter */
+	ACCESS_ONCE(bts->started) = 0;
+
+	__bts_event_stop(event);
+
+	if (flags & PERF_EF_UPDATE)
+		bts_update(bts);
+}
+
+void intel_bts_enable_local(void)
+{
+	struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
+
+	if (bts->handle.event && bts->started)
+		__bts_event_start(bts->handle.event);
+}
+
+void intel_bts_disable_local(void)
+{
+	struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
+
+	if (bts->handle.event)
+		__bts_event_stop(bts->handle.event);
+}
+
+static int
+bts_buffer_reset(struct bts_buffer *buf, struct perf_output_handle *handle)
+{
+	unsigned long head, space, next_space, pad, gap, skip, wakeup;
+	unsigned int next_buf;
+	struct bts_phys *phys, *next_phys;
+	int ret;
+
+	if (buf->snapshot)
+		return 0;
+
+	head = handle->head & ((buf->nr_pages << PAGE_SHIFT) - 1);
+	if (WARN_ON_ONCE(head != local_read(&buf->head)))
+		return -EINVAL;
+
+	phys = &buf->buf[buf->cur_buf];
+	space = phys->offset + phys->displacement + phys->size - head;
+	pad = space;
+	if (space > handle->size) {
+		space = handle->size;
+		space -= space % BTS_RECORD_SIZE;
+	}
+	if (space <= BTS_SAFETY_MARGIN) {
+		/* See if next phys buffer has more space */
+		next_buf = buf->cur_buf + 1;
+		if (next_buf >= buf->nr_bufs)
+			next_buf = 0;
+		next_phys = &buf->buf[next_buf];
+		gap = buf_size(phys->page) - phys->displacement - phys->size +
+		      next_phys->displacement;
+		skip = pad + gap;
+		if (handle->size >= skip) {
+			next_space = next_phys->size;
+			if (next_space + skip > handle->size) {
+				next_space = handle->size - skip;
+				next_space -= next_space % BTS_RECORD_SIZE;
+			}
+			if (next_space > space || !space) {
+				if (pad)
+					bts_buffer_pad_out(phys, head);
+				ret = perf_aux_output_skip(handle, skip);
+				if (ret)
+					return ret;
+				/* Advance to next phys buffer */
+				phys = next_phys;
+				space = next_space;
+				head = phys->offset + phys->displacement;
+				/*
+				 * After this, cur_buf and head won't match ds
+				 * anymore, so we must not be racing with
+				 * bts_update().
+				 */
+				buf->cur_buf = next_buf;
+				local_set(&buf->head, head);
+			}
+		}
+	}
+
+	/* Don't go far beyond wakeup watermark */
+	wakeup = BTS_SAFETY_MARGIN + BTS_RECORD_SIZE + handle->wakeup -
+		 handle->head;
+	if (space > wakeup) {
+		space = wakeup;
+		space -= space % BTS_RECORD_SIZE;
+	}
+
+	buf->end = head + space;
+
+	/*
+	 * If we have no space, the lost notification would have been sent when
+	 * we hit absolute_maximum - see bts_update()
+	 */
+	if (!space)
+		return -ENOSPC;
+
+	return 0;
+}
+
+int intel_bts_interrupt(void)
+{
+	struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
+	struct perf_event *event = bts->handle.event;
+	struct bts_buffer *buf;
+	s64 old_head;
+	int err;
+
+	if (!event || !bts->started)
+		return 0;
+
+	buf = perf_get_aux(&bts->handle);
+	/*
+	 * Skip snapshot counters: they don't use the interrupt, but
+	 * there's no other way of telling, because the pointer will
+	 * keep moving
+	 */
+	if (!buf || buf->snapshot)
+		return 0;
+
+	old_head = local_read(&buf->head);
+	bts_update(bts);
+
+	/* no new data */
+	if (old_head == local_read(&buf->head))
+		return 0;
+
+	perf_aux_output_end(&bts->handle, local_xchg(&buf->data_size, 0),
+			    !!local_xchg(&buf->lost, 0));
+
+	buf = perf_aux_output_begin(&bts->handle, event);
+	if (!buf)
+		return 1;
+
+	err = bts_buffer_reset(buf, &bts->handle);
+	if (err)
+		perf_aux_output_end(&bts->handle, 0, false);
+
+	return 1;
+}
+
+static void bts_event_del(struct perf_event *event, int mode)
+{
+	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+	struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
+	struct bts_buffer *buf = perf_get_aux(&bts->handle);
+
+	bts_event_stop(event, PERF_EF_UPDATE);
+
+	if (buf) {
+		if (buf->snapshot)
+			bts->handle.head =
+				local_xchg(&buf->data_size,
+					   buf->nr_pages << PAGE_SHIFT);
+		perf_aux_output_end(&bts->handle, local_xchg(&buf->data_size, 0),
+				    !!local_xchg(&buf->lost, 0));
+	}
+
+	cpuc->ds->bts_index = bts->ds_back.bts_buffer_base;
+	cpuc->ds->bts_buffer_base = bts->ds_back.bts_buffer_base;
+	cpuc->ds->bts_absolute_maximum = bts->ds_back.bts_absolute_maximum;
+	cpuc->ds->bts_interrupt_threshold = bts->ds_back.bts_interrupt_threshold;
+}
+
+static int bts_event_add(struct perf_event *event, int mode)
+{
+	struct bts_buffer *buf;
+	struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
+	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+	struct hw_perf_event *hwc = &event->hw;
+	int ret = -EBUSY;
+
+	event->hw.state = PERF_HES_STOPPED;
+
+	if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask))
+		return -EBUSY;
+
+	if (bts->handle.event)
+		return -EBUSY;
+
+	buf = perf_aux_output_begin(&bts->handle, event);
+	if (!buf)
+		return -EINVAL;
+
+	ret = bts_buffer_reset(buf, &bts->handle);
+	if (ret) {
+		perf_aux_output_end(&bts->handle, 0, false);
+		return ret;
+	}
+
+	bts->ds_back.bts_buffer_base = cpuc->ds->bts_buffer_base;
+	bts->ds_back.bts_absolute_maximum = cpuc->ds->bts_absolute_maximum;
+	bts->ds_back.bts_interrupt_threshold = cpuc->ds->bts_interrupt_threshold;
+
+	if (mode & PERF_EF_START) {
+		bts_event_start(event, 0);
+		if (hwc->state & PERF_HES_STOPPED) {
+			bts_event_del(event, 0);
+			return -EBUSY;
+		}
+	}
+
+	return 0;
+}
+
+static void bts_event_destroy(struct perf_event *event)
+{
+	x86_del_exclusive(x86_lbr_exclusive_bts);
+}
+
+static int bts_event_init(struct perf_event *event)
+{
+	if (event->attr.type != bts_pmu.type)
+		return -ENOENT;
+
+	if (x86_add_exclusive(x86_lbr_exclusive_bts))
+		return -EBUSY;
+
+	event->destroy = bts_event_destroy;
+
+	return 0;
+}
+
+static void bts_event_read(struct perf_event *event)
+{
+}
+
+static __init int bts_init(void)
+{
+	if (!boot_cpu_has(X86_FEATURE_DTES64) || !x86_pmu.bts)
+		return -ENODEV;
+
+	bts_pmu.capabilities	= PERF_PMU_CAP_AUX_NO_SG | PERF_PMU_CAP_ITRACE;
+	bts_pmu.task_ctx_nr	= perf_sw_context;
+	bts_pmu.event_init	= bts_event_init;
+	bts_pmu.add		= bts_event_add;
+	bts_pmu.del		= bts_event_del;
+	bts_pmu.start		= bts_event_start;
+	bts_pmu.stop		= bts_event_stop;
+	bts_pmu.read		= bts_event_read;
+	bts_pmu.setup_aux	= bts_buffer_setup_aux;
+	bts_pmu.free_aux	= bts_buffer_free_aux;
+
+	return perf_pmu_register(&bts_pmu, "intel_bts", -1);
+}
+
+module_init(bts_init);
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index 073983398364..a5149c7abe73 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -461,7 +461,8 @@ void intel_pmu_enable_bts(u64 config)
 
 	debugctlmsr |= DEBUGCTLMSR_TR;
 	debugctlmsr |= DEBUGCTLMSR_BTS;
-	debugctlmsr |= DEBUGCTLMSR_BTINT;
+	if (config & ARCH_PERFMON_EVENTSEL_INT)
+		debugctlmsr |= DEBUGCTLMSR_BTINT;
 
 	if (!(config & ARCH_PERFMON_EVENTSEL_OS))
 		debugctlmsr |= DEBUGCTLMSR_BTS_OFF_OS;
-- 
cgit v1.2.3


From 9a5e3fb52ae5458c8bf1a67129b96c39b541a582 Mon Sep 17 00:00:00 2001
From: Stephane Eranian
Date: Mon, 17 Nov 2014 20:06:53 +0100
Subject: perf/x86: Rename x86_pmu::er_flags to 'flags'

Because it will be used for more than just tracking the
presence of extra registers.

Signed-off-by: Stephane Eranian <eranian@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: bp@alien8.de
Cc: jolsa@redhat.com
Cc: kan.liang@intel.com
Cc: maria.n.dimakopoulou@gmail.com
Link: http://lkml.kernel.org/r/1416251225-17721-2-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event.h       |  9 ++++++---
 arch/x86/kernel/cpu/perf_event_intel.c | 24 ++++++++++++------------
 2 files changed, 18 insertions(+), 15 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index eaebfd707016..5264010c9a08 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -521,7 +521,7 @@ struct x86_pmu {
 	 * Extra registers for events
 	 */
 	struct extra_reg *extra_regs;
-	unsigned int er_flags;
+	unsigned int flags;
 
 	/*
 	 * Intel host/guest support (KVM)
@@ -545,8 +545,11 @@ do {									\
 	x86_pmu.quirks = &__quirk;					\
 } while (0)
 
-#define ERF_NO_HT_SHARING	1
-#define ERF_HAS_RSP_1		2
+/*
+ * x86_pmu flags
+ */
+#define PMU_FL_NO_HT_SHARING	0x1 /* no hyper-threading resource sharing */
+#define PMU_FL_HAS_RSP_1	0x2 /* has 2 equivalent offcore_rsp regs   */
 
 #define EVENT_VAR(_id)  event_attr_##_id
 #define EVENT_PTR(_id) &event_attr_##_id.attr.attr
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 1c78f44f4f93..e85988e2ecc7 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -1667,7 +1667,7 @@ intel_bts_constraints(struct perf_event *event)
 
 static int intel_alt_er(int idx)
 {
-	if (!(x86_pmu.er_flags & ERF_HAS_RSP_1))
+	if (!(x86_pmu.flags & PMU_FL_HAS_RSP_1))
 		return idx;
 
 	if (idx == EXTRA_REG_RSP_0)
@@ -2250,7 +2250,7 @@ static void intel_pmu_cpu_starting(int cpu)
 	if (!cpuc->shared_regs)
 		return;
 
-	if (!(x86_pmu.er_flags & ERF_NO_HT_SHARING)) {
+	if (!(x86_pmu.flags & PMU_FL_NO_HT_SHARING)) {
 		for_each_cpu(i, topology_thread_cpumask(cpu)) {
 			struct intel_shared_regs *pc;
 
@@ -2671,7 +2671,7 @@ __init int intel_pmu_init(void)
 		x86_pmu.event_constraints = intel_slm_event_constraints;
 		x86_pmu.pebs_constraints = intel_slm_pebs_event_constraints;
 		x86_pmu.extra_regs = intel_slm_extra_regs;
-		x86_pmu.er_flags |= ERF_HAS_RSP_1;
+		x86_pmu.flags |= PMU_FL_HAS_RSP_1;
 		pr_cont("Silvermont events, ");
 		break;
 
@@ -2689,7 +2689,7 @@ __init int intel_pmu_init(void)
 		x86_pmu.enable_all = intel_pmu_nhm_enable_all;
 		x86_pmu.pebs_constraints = intel_westmere_pebs_event_constraints;
 		x86_pmu.extra_regs = intel_westmere_extra_regs;
-		x86_pmu.er_flags |= ERF_HAS_RSP_1;
+		x86_pmu.flags |= PMU_FL_HAS_RSP_1;
 
 		x86_pmu.cpu_events = nhm_events_attrs;
 
@@ -2721,8 +2721,8 @@ __init int intel_pmu_init(void)
 		else
 			x86_pmu.extra_regs = intel_snb_extra_regs;
 		/* all extra regs are per-cpu when HT is on */
-		x86_pmu.er_flags |= ERF_HAS_RSP_1;
-		x86_pmu.er_flags |= ERF_NO_HT_SHARING;
+		x86_pmu.flags |= PMU_FL_HAS_RSP_1;
+		x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
 
 		x86_pmu.cpu_events = snb_events_attrs;
 
@@ -2756,8 +2756,8 @@ __init int intel_pmu_init(void)
 		else
 			x86_pmu.extra_regs = intel_snb_extra_regs;
 		/* all extra regs are per-cpu when HT is on */
-		x86_pmu.er_flags |= ERF_HAS_RSP_1;
-		x86_pmu.er_flags |= ERF_NO_HT_SHARING;
+		x86_pmu.flags |= PMU_FL_HAS_RSP_1;
+		x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
 
 		x86_pmu.cpu_events = snb_events_attrs;
 
@@ -2784,8 +2784,8 @@ __init int intel_pmu_init(void)
 		x86_pmu.extra_regs = intel_snbep_extra_regs;
 		x86_pmu.pebs_aliases = intel_pebs_aliases_snb;
 		/* all extra regs are per-cpu when HT is on */
-		x86_pmu.er_flags |= ERF_HAS_RSP_1;
-		x86_pmu.er_flags |= ERF_NO_HT_SHARING;
+		x86_pmu.flags |= PMU_FL_HAS_RSP_1;
+		x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
 
 		x86_pmu.hw_config = hsw_hw_config;
 		x86_pmu.get_event_constraints = hsw_get_event_constraints;
@@ -2817,8 +2817,8 @@ __init int intel_pmu_init(void)
 		x86_pmu.extra_regs = intel_snbep_extra_regs;
 		x86_pmu.pebs_aliases = intel_pebs_aliases_snb;
 		/* all extra regs are per-cpu when HT is on */
-		x86_pmu.er_flags |= ERF_HAS_RSP_1;
-		x86_pmu.er_flags |= ERF_NO_HT_SHARING;
+		x86_pmu.flags |= PMU_FL_HAS_RSP_1;
+		x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
 
 		x86_pmu.hw_config = hsw_hw_config;
 		x86_pmu.get_event_constraints = hsw_get_event_constraints;
-- 
cgit v1.2.3


From 90413464313e00fe4975f4a0ebf25fe31d01f793 Mon Sep 17 00:00:00 2001
From: Stephane Eranian
Date: Mon, 17 Nov 2014 20:06:54 +0100
Subject: perf/x86: Vectorize cpuc->kfree_on_online

Make the cpuc->kfree_on_online a vector to accommodate
more than one entry and add the second entry to be
used by a later patch.

Signed-off-by: Stephane Eranian <eranian@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Maria Dimakopoulou <maria.n.dimakopoulou@gmail.com>
Cc: bp@alien8.de
Cc: jolsa@redhat.com
Cc: kan.liang@intel.com
Link: http://lkml.kernel.org/r/1416251225-17721-3-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event.c       | 10 +++++++---
 arch/x86/kernel/cpu/perf_event.h       |  8 +++++++-
 arch/x86/kernel/cpu/perf_event_amd.c   |  3 ++-
 arch/x86/kernel/cpu/perf_event_intel.c |  4 +++-
 4 files changed, 19 insertions(+), 6 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 549d01d6d996..682ef00727e7 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -1373,11 +1373,12 @@ x86_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu)
 {
 	unsigned int cpu = (long)hcpu;
 	struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
-	int ret = NOTIFY_OK;
+	int i, ret = NOTIFY_OK;
 
 	switch (action & ~CPU_TASKS_FROZEN) {
 	case CPU_UP_PREPARE:
-		cpuc->kfree_on_online = NULL;
+		for (i = 0 ; i < X86_PERF_KFREE_MAX; i++)
+			cpuc->kfree_on_online[i] = NULL;
 		if (x86_pmu.cpu_prepare)
 			ret = x86_pmu.cpu_prepare(cpu);
 		break;
@@ -1388,7 +1389,10 @@ x86_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu)
 		break;
 
 	case CPU_ONLINE:
-		kfree(cpuc->kfree_on_online);
+		for (i = 0 ; i < X86_PERF_KFREE_MAX; i++) {
+			kfree(cpuc->kfree_on_online[i]);
+			cpuc->kfree_on_online[i] = NULL;
+		}
 		break;
 
 	case CPU_DYING:
diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index 5264010c9a08..55b915511e53 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -125,6 +125,12 @@ struct intel_shared_regs {
 
 #define MAX_LBR_ENTRIES		16
 
+enum {
+	X86_PERF_KFREE_SHARED = 0,
+	X86_PERF_KFREE_EXCL   = 1,
+	X86_PERF_KFREE_MAX
+};
+
 struct cpu_hw_events {
 	/*
 	 * Generic x86 PMC bits
@@ -187,7 +193,7 @@ struct cpu_hw_events {
 	/* Inverted mask of bits to clear in the perf_ctr ctrl registers */
 	u64				perf_ctr_virt_mask;
 
-	void				*kfree_on_online;
+	void				*kfree_on_online[X86_PERF_KFREE_MAX];
 };
 
 #define __EVENT_CONSTRAINT(c, n, m, w, o, f) {\
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c
index 28926311aac1..e4302b8fed2a 100644
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -382,6 +382,7 @@ static int amd_pmu_cpu_prepare(int cpu)
 static void amd_pmu_cpu_starting(int cpu)
 {
 	struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
+	void **onln = &cpuc->kfree_on_online[X86_PERF_KFREE_SHARED];
 	struct amd_nb *nb;
 	int i, nb_id;
 
@@ -399,7 +400,7 @@ static void amd_pmu_cpu_starting(int cpu)
 			continue;
 
 		if (nb->nb_id == nb_id) {
-			cpuc->kfree_on_online = cpuc->amd_nb;
+			*onln = cpuc->amd_nb;
 			cpuc->amd_nb = nb;
 			break;
 		}
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index e85988e2ecc7..c0ed5a4b9537 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -2251,12 +2251,14 @@ static void intel_pmu_cpu_starting(int cpu)
 		return;
 
 	if (!(x86_pmu.flags & PMU_FL_NO_HT_SHARING)) {
+		void **onln = &cpuc->kfree_on_online[X86_PERF_KFREE_SHARED];
+
 		for_each_cpu(i, topology_thread_cpumask(cpu)) {
 			struct intel_shared_regs *pc;
 
 			pc = per_cpu(cpu_hw_events, i).shared_regs;
 			if (pc && pc->core_id == core_id) {
-				cpuc->kfree_on_online = cpuc->shared_regs;
+				*onln = cpuc->shared_regs;
 				cpuc->shared_regs = pc;
 				break;
 			}
-- 
cgit v1.2.3


From c5362c0c376486afcf3c91d3c2691d348ac1e2fd Mon Sep 17 00:00:00 2001
From: Maria Dimakopoulou
Date: Mon, 17 Nov 2014 20:06:55 +0100
Subject: perf/x86: Add 3 new scheduling callbacks

This patch adds 3 new PMU model specific callbacks
during the event scheduling done by x86_schedule_events().

  ->start_scheduling():  invoked when entering the schedule routine.
  ->stop_scheduling():   invoked at the end of the schedule routine
  ->commit_scheduling(): invoked for each committed event

To be used optionally by model-specific code.

Signed-off-by: Maria Dimakopoulou <maria.n.dimakopoulou@gmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Stephane Eranian <eranian@google.com>
Cc: bp@alien8.de
Cc: jolsa@redhat.com
Cc: kan.liang@intel.com
Link: http://lkml.kernel.org/r/1416251225-17721-4-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event.c | 9 +++++++++
 arch/x86/kernel/cpu/perf_event.h | 9 +++++++++
 2 files changed, 18 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 682ef00727e7..cd6115867fb8 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -784,6 +784,9 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
 
 	bitmap_zero(used_mask, X86_PMC_IDX_MAX);
 
+	if (x86_pmu.start_scheduling)
+		x86_pmu.start_scheduling(cpuc);
+
 	for (i = 0, wmin = X86_PMC_IDX_MAX, wmax = 0; i < n; i++) {
 		hwc = &cpuc->event_list[i]->hw;
 		c = x86_pmu.get_event_constraints(cpuc, cpuc->event_list[i]);
@@ -830,6 +833,8 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
 		for (i = 0; i < n; i++) {
 			e = cpuc->event_list[i];
 			e->hw.flags |= PERF_X86_EVENT_COMMITTED;
+			if (x86_pmu.commit_scheduling)
+				x86_pmu.commit_scheduling(cpuc, e, assign[i]);
 		}
 	}
 	/*
@@ -850,6 +855,10 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
 				x86_pmu.put_event_constraints(cpuc, e);
 		}
 	}
+
+	if (x86_pmu.stop_scheduling)
+		x86_pmu.stop_scheduling(cpuc);
+
 	return num ? -EINVAL : 0;
 }
 
diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index 55b915511e53..ea27e63fc945 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -460,6 +460,15 @@ struct x86_pmu {
 
 	void		(*put_event_constraints)(struct cpu_hw_events *cpuc,
 						 struct perf_event *event);
+
+	void		(*commit_scheduling)(struct cpu_hw_events *cpuc,
+					     struct perf_event *event,
+					     int cntr);
+
+	void		(*start_scheduling)(struct cpu_hw_events *cpuc);
+
+	void		(*stop_scheduling)(struct cpu_hw_events *cpuc);
+
 	struct event_constraint *event_constraints;
 	struct x86_pmu_quirk *quirks;
 	int		perfctr_second_write;
-- 
cgit v1.2.3


From 79cba822443a168c8f7f5b853d9c7225a6d5415e Mon Sep 17 00:00:00 2001
From: Stephane Eranian
Date: Mon, 17 Nov 2014 20:06:56 +0100
Subject: perf/x86: Add 'index' param to get_event_constraint() callback

This patch adds an index parameter to the get_event_constraint()
x86_pmu callback. It is expected to represent the index of the
event in the cpuc->event_list[] array. When the callback is used
for fake_cpuc (evnet validation), then the index must be -1. The
motivation for passing the index is to use it to index into another
cpuc array.

Signed-off-by: Stephane Eranian <eranian@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: bp@alien8.de
Cc: jolsa@redhat.com
Cc: kan.liang@intel.com
Cc: maria.n.dimakopoulou@gmail.com
Link: http://lkml.kernel.org/r/1416251225-17721-5-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event.c       |  4 ++--
 arch/x86/kernel/cpu/perf_event.h       |  4 +++-
 arch/x86/kernel/cpu/perf_event_amd.c   |  6 ++++--
 arch/x86/kernel/cpu/perf_event_intel.c | 15 ++++++++++-----
 4 files changed, 19 insertions(+), 10 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index cd6115867fb8..71755401476c 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -789,7 +789,7 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
 
 	for (i = 0, wmin = X86_PMC_IDX_MAX, wmax = 0; i < n; i++) {
 		hwc = &cpuc->event_list[i]->hw;
-		c = x86_pmu.get_event_constraints(cpuc, cpuc->event_list[i]);
+		c = x86_pmu.get_event_constraints(cpuc, i, cpuc->event_list[i]);
 		hwc->constraint = c;
 
 		wmin = min(wmin, c->weight);
@@ -1777,7 +1777,7 @@ static int validate_event(struct perf_event *event)
 	if (IS_ERR(fake_cpuc))
 		return PTR_ERR(fake_cpuc);
 
-	c = x86_pmu.get_event_constraints(fake_cpuc, event);
+	c = x86_pmu.get_event_constraints(fake_cpuc, -1, event);
 
 	if (!c || !c->weight)
 		ret = -EINVAL;
diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index ea27e63fc945..24a65057c1c0 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -456,6 +456,7 @@ struct x86_pmu {
 	u64		max_period;
 	struct event_constraint *
 			(*get_event_constraints)(struct cpu_hw_events *cpuc,
+						 int idx,
 						 struct perf_event *event);
 
 	void		(*put_event_constraints)(struct cpu_hw_events *cpuc,
@@ -751,7 +752,8 @@ static inline bool intel_pmu_has_bts(struct perf_event *event)
 int intel_pmu_save_and_restart(struct perf_event *event);
 
 struct event_constraint *
-x86_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event);
+x86_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
+			  struct perf_event *event);
 
 struct intel_shared_regs *allocate_shared_regs(int cpu);
 
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c
index e4302b8fed2a..1cee5d2d7ece 100644
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -430,7 +430,8 @@ static void amd_pmu_cpu_dead(int cpu)
 }
 
 static struct event_constraint *
-amd_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
+amd_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
+			  struct perf_event *event)
 {
 	/*
 	 * if not NB event or no NB, then no constraints
@@ -538,7 +539,8 @@ static struct event_constraint amd_f15_PMC50 = EVENT_CONSTRAINT(0, 0x3F, 0);
 static struct event_constraint amd_f15_PMC53 = EVENT_CONSTRAINT(0, 0x38, 0);
 
 static struct event_constraint *
-amd_get_event_constraints_f15h(struct cpu_hw_events *cpuc, struct perf_event *event)
+amd_get_event_constraints_f15h(struct cpu_hw_events *cpuc, int idx,
+			       struct perf_event *event)
 {
 	struct hw_perf_event *hwc = &event->hw;
 	unsigned int event_code = amd_get_event_code(hwc);
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index c0ed5a4b9537..2dd34b57d3ff 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -1827,7 +1827,8 @@ intel_shared_regs_constraints(struct cpu_hw_events *cpuc,
 }
 
 struct event_constraint *
-x86_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
+x86_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
+			  struct perf_event *event)
 {
 	struct event_constraint *c;
 
@@ -1844,7 +1845,8 @@ x86_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
 }
 
 static struct event_constraint *
-intel_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
+intel_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
+			    struct perf_event *event)
 {
 	struct event_constraint *c;
 
@@ -1860,7 +1862,7 @@ intel_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event
 	if (c)
 		return c;
 
-	return x86_get_event_constraints(cpuc, event);
+	return x86_get_event_constraints(cpuc, idx, event);
 }
 
 static void
@@ -2105,9 +2107,12 @@ static struct event_constraint counter2_constraint =
 			EVENT_CONSTRAINT(0, 0x4, 0);
 
 static struct event_constraint *
-hsw_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
+hsw_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
+			  struct perf_event *event)
 {
-	struct event_constraint *c = intel_get_event_constraints(cpuc, event);
+	struct event_constraint *c;
+
+	c = intel_get_event_constraints(cpuc, idx, event);
 
 	/* Handle special quirk on in_tx_checkpointed only in counter 2 */
 	if (event->hw.config & HSW_IN_TX_CHECKPOINTED) {
-- 
cgit v1.2.3


From 6f6539cad926f55d5eb6e79d05bbe99f0d54d56d Mon Sep 17 00:00:00 2001
From: Maria Dimakopoulou
Date: Mon, 17 Nov 2014 20:06:57 +0100
Subject: perf/x86/intel: Add cross-HT counter exclusion infrastructure

This patch adds a new shared_regs style structure to the
per-cpu x86 state (cpuc). It is used to coordinate access
between counters which must be used with exclusion across
HyperThreads on Intel processors. This new struct is not
needed on each PMU, thus is is allocated on demand.

Signed-off-by: Maria Dimakopoulou <maria.n.dimakopoulou@gmail.com>
[peterz: spinlock_t -> raw_spinlock_t]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Stephane Eranian <eranian@google.com>
Cc: bp@alien8.de
Cc: jolsa@redhat.com
Cc: kan.liang@intel.com
Link: http://lkml.kernel.org/r/1416251225-17721-6-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event.h       | 32 +++++++++++++++
 arch/x86/kernel/cpu/perf_event_intel.c | 71 +++++++++++++++++++++++++++++++---
 2 files changed, 98 insertions(+), 5 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index 24a65057c1c0..f31f90e2d859 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -71,6 +71,7 @@ struct event_constraint {
 #define PERF_X86_EVENT_COMMITTED	0x8 /* event passed commit_txn */
 #define PERF_X86_EVENT_PEBS_LD_HSW	0x10 /* haswell style datala, load */
 #define PERF_X86_EVENT_PEBS_NA_HSW	0x20 /* haswell style datala, unknown */
+#define PERF_X86_EVENT_EXCL		0x40 /* HT exclusivity on counter */
 #define PERF_X86_EVENT_RDPMC_ALLOWED	0x40 /* grant rdpmc permission */
 
 
@@ -123,6 +124,26 @@ struct intel_shared_regs {
 	unsigned                core_id;	/* per-core: core id */
 };
 
+enum intel_excl_state_type {
+	INTEL_EXCL_UNUSED    = 0, /* counter is unused */
+	INTEL_EXCL_SHARED    = 1, /* counter can be used by both threads */
+	INTEL_EXCL_EXCLUSIVE = 2, /* counter can be used by one thread only */
+};
+
+struct intel_excl_states {
+	enum intel_excl_state_type init_state[X86_PMC_IDX_MAX];
+	enum intel_excl_state_type state[X86_PMC_IDX_MAX];
+};
+
+struct intel_excl_cntrs {
+	raw_spinlock_t	lock;
+
+	struct intel_excl_states states[2];
+
+	int		refcnt;		/* per-core: #HT threads */
+	unsigned	core_id;	/* per-core: core id */
+};
+
 #define MAX_LBR_ENTRIES		16
 
 enum {
@@ -185,6 +206,12 @@ struct cpu_hw_events {
 	 * used on Intel NHM/WSM/SNB
 	 */
 	struct intel_shared_regs	*shared_regs;
+	/*
+	 * manage exclusive counter access between hyperthread
+	 */
+	struct event_constraint *constraint_list; /* in enable order */
+	struct intel_excl_cntrs		*excl_cntrs;
+	int excl_thread_id; /* 0 or 1 */
 
 	/*
 	 * AMD specific bits
@@ -208,6 +235,10 @@ struct cpu_hw_events {
 #define EVENT_CONSTRAINT(c, n, m)	\
 	__EVENT_CONSTRAINT(c, n, m, HWEIGHT(n), 0, 0)
 
+#define INTEL_EXCLEVT_CONSTRAINT(c, n)	\
+	__EVENT_CONSTRAINT(c, n, ARCH_PERFMON_EVENTSEL_EVENT, HWEIGHT(n),\
+			   0, PERF_X86_EVENT_EXCL)
+
 /*
  * The overlap flag marks event constraints with overlapping counter
  * masks. This is the case if the counter mask of such an event is not
@@ -566,6 +597,7 @@ do {									\
  */
 #define PMU_FL_NO_HT_SHARING	0x1 /* no hyper-threading resource sharing */
 #define PMU_FL_HAS_RSP_1	0x2 /* has 2 equivalent offcore_rsp regs   */
+#define PMU_FL_EXCL_CNTRS	0x4 /* has exclusive counter requirements  */
 
 #define EVENT_VAR(_id)  event_attr_##_id
 #define EVENT_PTR(_id) &event_attr_##_id.attr.attr
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 2dd34b57d3ff..7f54000fd0f1 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -2224,16 +2224,52 @@ struct intel_shared_regs *allocate_shared_regs(int cpu)
 	return regs;
 }
 
+static struct intel_excl_cntrs *allocate_excl_cntrs(int cpu)
+{
+	struct intel_excl_cntrs *c;
+	int i;
+
+	c = kzalloc_node(sizeof(struct intel_excl_cntrs),
+			 GFP_KERNEL, cpu_to_node(cpu));
+	if (c) {
+		raw_spin_lock_init(&c->lock);
+		for (i = 0; i < X86_PMC_IDX_MAX; i++) {
+			c->states[0].state[i] = INTEL_EXCL_UNUSED;
+			c->states[0].init_state[i] = INTEL_EXCL_UNUSED;
+
+			c->states[1].state[i] = INTEL_EXCL_UNUSED;
+			c->states[1].init_state[i] = INTEL_EXCL_UNUSED;
+		}
+		c->core_id = -1;
+	}
+	return c;
+}
+
 static int intel_pmu_cpu_prepare(int cpu)
 {
 	struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
 
-	if (!(x86_pmu.extra_regs || x86_pmu.lbr_sel_map))
-		return NOTIFY_OK;
+	if (x86_pmu.extra_regs || x86_pmu.lbr_sel_map) {
+		cpuc->shared_regs = allocate_shared_regs(cpu);
+		if (!cpuc->shared_regs)
+			return NOTIFY_BAD;
+	}
 
-	cpuc->shared_regs = allocate_shared_regs(cpu);
-	if (!cpuc->shared_regs)
-		return NOTIFY_BAD;
+	if (x86_pmu.flags & PMU_FL_EXCL_CNTRS) {
+		size_t sz = X86_PMC_IDX_MAX * sizeof(struct event_constraint);
+
+		cpuc->constraint_list = kzalloc(sz, GFP_KERNEL);
+		if (!cpuc->constraint_list)
+			return NOTIFY_BAD;
+
+		cpuc->excl_cntrs = allocate_excl_cntrs(cpu);
+		if (!cpuc->excl_cntrs) {
+			kfree(cpuc->constraint_list);
+			kfree(cpuc->shared_regs);
+			return NOTIFY_BAD;
+		}
+		cpuc->excl_thread_id = 0;
+	}
 
 	return NOTIFY_OK;
 }
@@ -2274,12 +2310,29 @@ static void intel_pmu_cpu_starting(int cpu)
 
 	if (x86_pmu.lbr_sel_map)
 		cpuc->lbr_sel = &cpuc->shared_regs->regs[EXTRA_REG_LBR];
+
+	if (x86_pmu.flags & PMU_FL_EXCL_CNTRS) {
+		for_each_cpu(i, topology_thread_cpumask(cpu)) {
+			struct intel_excl_cntrs *c;
+
+			c = per_cpu(cpu_hw_events, i).excl_cntrs;
+			if (c && c->core_id == core_id) {
+				cpuc->kfree_on_online[1] = cpuc->excl_cntrs;
+				cpuc->excl_cntrs = c;
+				cpuc->excl_thread_id = 1;
+				break;
+			}
+		}
+		cpuc->excl_cntrs->core_id = core_id;
+		cpuc->excl_cntrs->refcnt++;
+	}
 }
 
 static void intel_pmu_cpu_dying(int cpu)
 {
 	struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
 	struct intel_shared_regs *pc;
+	struct intel_excl_cntrs *c;
 
 	pc = cpuc->shared_regs;
 	if (pc) {
@@ -2287,6 +2340,14 @@ static void intel_pmu_cpu_dying(int cpu)
 			kfree(pc);
 		cpuc->shared_regs = NULL;
 	}
+	c = cpuc->excl_cntrs;
+	if (c) {
+		if (c->core_id == -1 || --c->refcnt == 0)
+			kfree(c);
+		cpuc->excl_cntrs = NULL;
+		kfree(cpuc->constraint_list);
+		cpuc->constraint_list = NULL;
+	}
 
 	fini_debug_store_on_cpu(cpu);
 }
-- 
cgit v1.2.3


From e979121b1b1556e184492e6fc149bbe188fc83e6 Mon Sep 17 00:00:00 2001
From: Maria Dimakopoulou
Date: Mon, 17 Nov 2014 20:06:58 +0100
Subject: perf/x86/intel: Implement cross-HT corruption bug workaround

This patch implements a software workaround for a HW erratum
on Intel SandyBridge, IvyBridge and Haswell processors
with Hyperthreading enabled. The errata are documented for
each processor in their respective specification update
documents:

  - SandyBridge: BJ122
  - IvyBridge: BV98
  - Haswell: HSD29

The bug causes silent counter corruption across hyperthreads only
when measuring certain memory events (0xd0, 0xd1, 0xd2, 0xd3).
Counters measuring those events may leak counts to the sibling
counter. For instance, counter 0, thread 0 measuring event 0xd0,
may leak to counter 0, thread 1, regardless of the event measured
there. The size of the leak is not predictible. It all depends on
the workload and the state of each sibling hyper-thread. The
corrupting events do undercount as a consequence of the leak. The
leak is compensated automatically only when the sibling counter measures
the exact same corrupting event AND the workload is on the two threads
is the same. Given, there is no way to guarantee this, a work-around
is necessary. Furthermore, there is a serious problem if the leaked count
is added to a low-occurrence event. In that case the corruption on
the low occurrence event can be very large, e.g., orders of magnitude.

There is no HW or FW workaround for this problem.

The bug is very easy to reproduce on a loaded system.
Here is an example on a Haswell client, where CPU0, CPU4
are siblings. We load the CPUs with a simple triad app
streaming large floating-point vector. We use 0x81d0
corrupting event (MEM_UOPS_RETIRED:ALL_LOADS) and
0x20cc (ROB_MISC_EVENTS:LBR_INSERTS). Given we are not
using the LBR, the 0x20cc event should be zero.

  $ taskset -c 0 triad &
  $ taskset -c 4 triad &
  $ perf stat -a -C 0 -e r81d0 sleep 100 &
  $ perf stat -a -C 4 -r20cc sleep 10
  Performance counter stats for 'system wide':
        139 277 291      r20cc
       10,000969126 seconds time elapsed

In this example, 0x81d0 and r20cc ar eusing sinling counters
on CPU0 and CPU4. 0x81d0 leaks into 0x20cc and corrupts it
from 0 to 139 millions occurrences.

This patch provides a software workaround to this problem by modifying the
way events are scheduled onto counters by the kernel. The patch forces
cross-thread mutual exclusion between counters in case a corrupting event
is measured by one of the hyper-threads. If thread 0, counter 0 is measuring
event 0xd0, then nothing can be measured on counter 0, thread 1. If no corrupting
event is measured on any hyper-thread, event scheduling proceeds as before.

The same example run with the workaround enabled, yield the correct answer:

  $ taskset -c 0 triad &
  $ taskset -c 4 triad &
  $ perf stat -a -C 0 -e r81d0 sleep 100 &
  $ perf stat -a -C 4 -r20cc sleep 10
  Performance counter stats for 'system wide':
        0 r20cc
       10,000969126 seconds time elapsed

The patch does provide correctness for all non-corrupting events. It does not
"repatriate" the leaked counts back to the leaking counter. This is planned
for a second patch series. This patch series makes this repatriation more
easy by guaranteeing the sibling counter is not measuring any useful event.

The patch introduces dynamic constraints for events. That means that events which
did not have constraints, i.e., could be measured on any counters, may now be
constrained to a subset of the counters depending on what is going on the sibling
thread. The algorithm is similar to a cache coherency protocol. We call it XSU
in reference to Exclusive, Shared, Unused, the 3 possible states of a PMU
counter.

As a consequence of the workaround, users may see an increased amount of event
multiplexing, even in situtations where there are fewer events than counters
measured on a CPU.

Patch has been tested on all three impacted processors. Note that when
HT is off, there is no corruption. However, the workaround is still enabled,
yet not costing too much. Adding a dynamic detection of HT on turned out to
be complex are requiring too much to code to be justified.

This patch addresses the issue when PEBS is not used. A subsequent patch
fixes the problem when PEBS is used.

Signed-off-by: Maria Dimakopoulou <maria.n.dimakopoulou@gmail.com>
[spinlock_t -> raw_spinlock_t]
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Stephane Eranian <eranian@google.com>
Cc: bp@alien8.de
Cc: jolsa@redhat.com
Cc: kan.liang@intel.com
Link: http://lkml.kernel.org/r/1416251225-17721-7-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event.c       |  31 ++--
 arch/x86/kernel/cpu/perf_event.h       |   6 +
 arch/x86/kernel/cpu/perf_event_intel.c | 307 ++++++++++++++++++++++++++++++++-
 3 files changed, 331 insertions(+), 13 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 71755401476c..b8b7a1277d8d 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -779,7 +779,7 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
 	struct event_constraint *c;
 	unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
 	struct perf_event *e;
-	int i, wmin, wmax, num = 0;
+	int i, wmin, wmax, unsched = 0;
 	struct hw_perf_event *hwc;
 
 	bitmap_zero(used_mask, X86_PMC_IDX_MAX);
@@ -822,14 +822,20 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
 
 	/* slow path */
 	if (i != n)
-		num = perf_assign_events(cpuc->event_list, n, wmin,
-					 wmax, assign);
+		unsched = perf_assign_events(cpuc->event_list, n, wmin,
+					     wmax, assign);
 
 	/*
-	 * Mark the event as committed, so we do not put_constraint()
-	 * in case new events are added and fail scheduling.
+	 * In case of success (unsched = 0), mark events as committed,
+	 * so we do not put_constraint() in case new events are added
+	 * and fail to be scheduled
+	 *
+	 * We invoke the lower level commit callback to lock the resource
+	 *
+	 * We do not need to do all of this in case we are called to
+	 * validate an event group (assign == NULL)
 	 */
-	if (!num && assign) {
+	if (!unsched && assign) {
 		for (i = 0; i < n; i++) {
 			e = cpuc->event_list[i];
 			e->hw.flags |= PERF_X86_EVENT_COMMITTED;
@@ -837,11 +843,9 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
 				x86_pmu.commit_scheduling(cpuc, e, assign[i]);
 		}
 	}
-	/*
-	 * scheduling failed or is just a simulation,
-	 * free resources if necessary
-	 */
-	if (!assign || num) {
+
+	if (!assign || unsched) {
+
 		for (i = 0; i < n; i++) {
 			e = cpuc->event_list[i];
 			/*
@@ -851,6 +855,9 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
 			if ((e->hw.flags & PERF_X86_EVENT_COMMITTED))
 				continue;
 
+			/*
+			 * release events that failed scheduling
+			 */
 			if (x86_pmu.put_event_constraints)
 				x86_pmu.put_event_constraints(cpuc, e);
 		}
@@ -859,7 +866,7 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
 	if (x86_pmu.stop_scheduling)
 		x86_pmu.stop_scheduling(cpuc);
 
-	return num ? -EINVAL : 0;
+	return unsched ? -EINVAL : 0;
 }
 
 /*
diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index f31f90e2d859..236afee35587 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -72,6 +72,7 @@ struct event_constraint {
 #define PERF_X86_EVENT_PEBS_LD_HSW	0x10 /* haswell style datala, load */
 #define PERF_X86_EVENT_PEBS_NA_HSW	0x20 /* haswell style datala, unknown */
 #define PERF_X86_EVENT_EXCL		0x40 /* HT exclusivity on counter */
+#define PERF_X86_EVENT_DYNAMIC		0x80 /* dynamic alloc'd constraint */
 #define PERF_X86_EVENT_RDPMC_ALLOWED	0x40 /* grant rdpmc permission */
 
 
@@ -133,6 +134,7 @@ enum intel_excl_state_type {
 struct intel_excl_states {
 	enum intel_excl_state_type init_state[X86_PMC_IDX_MAX];
 	enum intel_excl_state_type state[X86_PMC_IDX_MAX];
+	bool sched_started; /* true if scheduling has started */
 };
 
 struct intel_excl_cntrs {
@@ -296,6 +298,10 @@ struct cpu_hw_events {
 #define INTEL_FLAGS_UEVENT_CONSTRAINT(c, n)	\
 	EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS)
 
+#define INTEL_EXCLUEVT_CONSTRAINT(c, n)	\
+	__EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK, \
+			   HWEIGHT(n), 0, PERF_X86_EVENT_EXCL)
+
 #define INTEL_PLD_CONSTRAINT(c, n)	\
 	__EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
 			   HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LDLAT)
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 7f54000fd0f1..91cc7749d7ce 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -1845,7 +1845,7 @@ x86_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
 }
 
 static struct event_constraint *
-intel_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
+__intel_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
 			    struct perf_event *event)
 {
 	struct event_constraint *c;
@@ -1865,6 +1865,254 @@ intel_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
 	return x86_get_event_constraints(cpuc, idx, event);
 }
 
+static void
+intel_start_scheduling(struct cpu_hw_events *cpuc)
+{
+	struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
+	struct intel_excl_states *xl, *xlo;
+	int tid = cpuc->excl_thread_id;
+	int o_tid = 1 - tid; /* sibling thread */
+
+	/*
+	 * nothing needed if in group validation mode
+	 */
+	if (cpuc->is_fake)
+		return;
+	/*
+	 * no exclusion needed
+	 */
+	if (!excl_cntrs)
+		return;
+
+	xlo = &excl_cntrs->states[o_tid];
+	xl = &excl_cntrs->states[tid];
+
+	xl->sched_started = true;
+
+	/*
+	 * lock shared state until we are done scheduling
+	 * in stop_event_scheduling()
+	 * makes scheduling appear as a transaction
+	 */
+	WARN_ON_ONCE(!irqs_disabled());
+	raw_spin_lock(&excl_cntrs->lock);
+
+	/*
+	 * save initial state of sibling thread
+	 */
+	memcpy(xlo->init_state, xlo->state, sizeof(xlo->init_state));
+}
+
+static void
+intel_stop_scheduling(struct cpu_hw_events *cpuc)
+{
+	struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
+	struct intel_excl_states *xl, *xlo;
+	int tid = cpuc->excl_thread_id;
+	int o_tid = 1 - tid; /* sibling thread */
+
+	/*
+	 * nothing needed if in group validation mode
+	 */
+	if (cpuc->is_fake)
+		return;
+	/*
+	 * no exclusion needed
+	 */
+	if (!excl_cntrs)
+		return;
+
+	xlo = &excl_cntrs->states[o_tid];
+	xl = &excl_cntrs->states[tid];
+
+	/*
+	 * make new sibling thread state visible
+	 */
+	memcpy(xlo->state, xlo->init_state, sizeof(xlo->state));
+
+	xl->sched_started = false;
+	/*
+	 * release shared state lock (acquired in intel_start_scheduling())
+	 */
+	raw_spin_unlock(&excl_cntrs->lock);
+}
+
+static struct event_constraint *
+intel_get_excl_constraints(struct cpu_hw_events *cpuc, struct perf_event *event,
+			   int idx, struct event_constraint *c)
+{
+	struct event_constraint *cx;
+	struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
+	struct intel_excl_states *xl, *xlo;
+	int is_excl, i;
+	int tid = cpuc->excl_thread_id;
+	int o_tid = 1 - tid; /* alternate */
+
+	/*
+	 * validating a group does not require
+	 * enforcing cross-thread  exclusion
+	 */
+	if (cpuc->is_fake)
+		return c;
+
+	/*
+	 * event requires exclusive counter access
+	 * across HT threads
+	 */
+	is_excl = c->flags & PERF_X86_EVENT_EXCL;
+
+	/*
+	 * xl = state of current HT
+	 * xlo = state of sibling HT
+	 */
+	xl = &excl_cntrs->states[tid];
+	xlo = &excl_cntrs->states[o_tid];
+
+	cx = c;
+
+	/*
+	 * because we modify the constraint, we need
+	 * to make a copy. Static constraints come
+	 * from static const tables.
+	 *
+	 * only needed when constraint has not yet
+	 * been cloned (marked dynamic)
+	 */
+	if (!(c->flags & PERF_X86_EVENT_DYNAMIC)) {
+
+		/* sanity check */
+		if (idx < 0)
+			return &emptyconstraint;
+
+		/*
+		 * grab pre-allocated constraint entry
+		 */
+		cx = &cpuc->constraint_list[idx];
+
+		/*
+		 * initialize dynamic constraint
+		 * with static constraint
+		 */
+		memcpy(cx, c, sizeof(*cx));
+
+		/*
+		 * mark constraint as dynamic, so we
+		 * can free it later on
+		 */
+		cx->flags |= PERF_X86_EVENT_DYNAMIC;
+	}
+
+	/*
+	 * From here on, the constraint is dynamic.
+	 * Either it was just allocated above, or it
+	 * was allocated during a earlier invocation
+	 * of this function
+	 */
+
+	/*
+	 * Modify static constraint with current dynamic
+	 * state of thread
+	 *
+	 * EXCLUSIVE: sibling counter measuring exclusive event
+	 * SHARED   : sibling counter measuring non-exclusive event
+	 * UNUSED   : sibling counter unused
+	 */
+	for_each_set_bit(i, cx->idxmsk, X86_PMC_IDX_MAX) {
+		/*
+		 * exclusive event in sibling counter
+		 * our corresponding counter cannot be used
+		 * regardless of our event
+		 */
+		if (xl->state[i] == INTEL_EXCL_EXCLUSIVE)
+			__clear_bit(i, cx->idxmsk);
+		/*
+		 * if measuring an exclusive event, sibling
+		 * measuring non-exclusive, then counter cannot
+		 * be used
+		 */
+		if (is_excl && xl->state[i] == INTEL_EXCL_SHARED)
+			__clear_bit(i, cx->idxmsk);
+	}
+
+	/*
+	 * recompute actual bit weight for scheduling algorithm
+	 */
+	cx->weight = hweight64(cx->idxmsk64);
+
+	/*
+	 * if we return an empty mask, then switch
+	 * back to static empty constraint to avoid
+	 * the cost of freeing later on
+	 */
+	if (cx->weight == 0)
+		cx = &emptyconstraint;
+
+	return cx;
+}
+
+static struct event_constraint *
+intel_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
+			    struct perf_event *event)
+{
+	struct event_constraint *c = event->hw.constraint;
+
+	/*
+	 * first time only
+	 * - static constraint: no change across incremental scheduling calls
+	 * - dynamic constraint: handled by intel_get_excl_constraints()
+	 */
+	if (!c)
+		c = __intel_get_event_constraints(cpuc, idx, event);
+
+	if (cpuc->excl_cntrs)
+		return intel_get_excl_constraints(cpuc, event, idx, c);
+
+	return c;
+}
+
+static void intel_put_excl_constraints(struct cpu_hw_events *cpuc,
+		struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+	struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
+	struct intel_excl_states *xlo, *xl;
+	unsigned long flags = 0; /* keep compiler happy */
+	int tid = cpuc->excl_thread_id;
+	int o_tid = 1 - tid;
+
+	/*
+	 * nothing needed if in group validation mode
+	 */
+	if (cpuc->is_fake)
+		return;
+
+	WARN_ON_ONCE(!excl_cntrs);
+
+	if (!excl_cntrs)
+		return;
+
+	xl = &excl_cntrs->states[tid];
+	xlo = &excl_cntrs->states[o_tid];
+
+	/*
+	 * put_constraint may be called from x86_schedule_events()
+	 * which already has the lock held so here make locking
+	 * conditional
+	 */
+	if (!xl->sched_started)
+		raw_spin_lock_irqsave(&excl_cntrs->lock, flags);
+
+	/*
+	 * if event was actually assigned, then mark the
+	 * counter state as unused now
+	 */
+	if (hwc->idx >= 0)
+		xlo->state[hwc->idx] = INTEL_EXCL_UNUSED;
+
+	if (!xl->sched_started)
+		raw_spin_unlock_irqrestore(&excl_cntrs->lock, flags);
+}
+
 static void
 intel_put_shared_regs_event_constraints(struct cpu_hw_events *cpuc,
 					struct perf_event *event)
@@ -1883,7 +2131,57 @@ intel_put_shared_regs_event_constraints(struct cpu_hw_events *cpuc,
 static void intel_put_event_constraints(struct cpu_hw_events *cpuc,
 					struct perf_event *event)
 {
+	struct event_constraint *c = event->hw.constraint;
+
 	intel_put_shared_regs_event_constraints(cpuc, event);
+
+	/*
+	 * is PMU has exclusive counter restrictions, then
+	 * all events are subject to and must call the
+	 * put_excl_constraints() routine
+	 */
+	if (c && cpuc->excl_cntrs)
+		intel_put_excl_constraints(cpuc, event);
+
+	/* cleanup dynamic constraint */
+	if (c && (c->flags & PERF_X86_EVENT_DYNAMIC))
+		event->hw.constraint = NULL;
+}
+
+static void intel_commit_scheduling(struct cpu_hw_events *cpuc,
+				    struct perf_event *event, int cntr)
+{
+	struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs;
+	struct event_constraint *c = event->hw.constraint;
+	struct intel_excl_states *xlo, *xl;
+	int tid = cpuc->excl_thread_id;
+	int o_tid = 1 - tid;
+	int is_excl;
+
+	if (cpuc->is_fake || !c)
+		return;
+
+	is_excl = c->flags & PERF_X86_EVENT_EXCL;
+
+	if (!(c->flags & PERF_X86_EVENT_DYNAMIC))
+		return;
+
+	WARN_ON_ONCE(!excl_cntrs);
+
+	if (!excl_cntrs)
+		return;
+
+	xl = &excl_cntrs->states[tid];
+	xlo = &excl_cntrs->states[o_tid];
+
+	WARN_ON_ONCE(!raw_spin_is_locked(&excl_cntrs->lock));
+
+	if (cntr >= 0) {
+		if (is_excl)
+			xlo->init_state[cntr] = INTEL_EXCL_EXCLUSIVE;
+		else
+			xlo->init_state[cntr] = INTEL_EXCL_SHARED;
+	}
 }
 
 static void intel_pebs_aliases_core2(struct perf_event *event)
@@ -2349,6 +2647,13 @@ static void intel_pmu_cpu_dying(int cpu)
 		cpuc->constraint_list = NULL;
 	}
 
+	c = cpuc->excl_cntrs;
+	if (c) {
+		if (c->core_id == -1 || --c->refcnt == 0)
+			kfree(c);
+		cpuc->excl_cntrs = NULL;
+	}
+
 	fini_debug_store_on_cpu(cpu);
 }
 
-- 
cgit v1.2.3


From 93fcf72cc0fa286aa8c3e11a1a8fd4659f0e27c0 Mon Sep 17 00:00:00 2001
From: Maria Dimakopoulou
Date: Mon, 17 Nov 2014 20:06:59 +0100
Subject: perf/x86/intel: Enforce HT bug workaround for SNB/IVB/HSW

This patches activates the HT bug workaround for the
SNB/IVB/HSW processors. This covers non-PEBS mode.
Activation is done thru the constraint tables.

Both client and server processors needs this workaround.

Signed-off-by: Maria Dimakopoulou <maria.n.dimakopoulou@gmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Stephane Eranian <eranian@google.com>
Cc: bp@alien8.de
Cc: jolsa@redhat.com
Cc: kan.liang@intel.com
Link: http://lkml.kernel.org/r/1416251225-17721-8-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event_intel.c | 53 ++++++++++++++++++++++++++++------
 1 file changed, 44 insertions(+), 9 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 91cc7749d7ce..9350de0f5407 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -113,6 +113,12 @@ static struct event_constraint intel_snb_event_constraints[] __read_mostly =
 	INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.LOAD_LATENCY */
 	INTEL_UEVENT_CONSTRAINT(0x04a3, 0xf), /* CYCLE_ACTIVITY.CYCLES_NO_DISPATCH */
 	INTEL_UEVENT_CONSTRAINT(0x02a3, 0x4), /* CYCLE_ACTIVITY.CYCLES_L1D_PENDING */
+
+	INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf), /* MEM_UOPS_RETIRED.* */
+	INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */
+	INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
+	INTEL_EXCLEVT_CONSTRAINT(0xd3, 0xf), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */
+
 	EVENT_CONSTRAINT_END
 };
 
@@ -131,15 +137,12 @@ static struct event_constraint intel_ivb_event_constraints[] __read_mostly =
 	INTEL_UEVENT_CONSTRAINT(0x08a3, 0x4), /* CYCLE_ACTIVITY.CYCLES_L1D_PENDING */
 	INTEL_UEVENT_CONSTRAINT(0x0ca3, 0x4), /* CYCLE_ACTIVITY.STALLS_L1D_PENDING */
 	INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */
-	/*
-	 * Errata BV98 -- MEM_*_RETIRED events can leak between counters of SMT
-	 * siblings; disable these events because they can corrupt unrelated
-	 * counters.
-	 */
-	INTEL_EVENT_CONSTRAINT(0xd0, 0x0), /* MEM_UOPS_RETIRED.* */
-	INTEL_EVENT_CONSTRAINT(0xd1, 0x0), /* MEM_LOAD_UOPS_RETIRED.* */
-	INTEL_EVENT_CONSTRAINT(0xd2, 0x0), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
-	INTEL_EVENT_CONSTRAINT(0xd3, 0x0), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */
+
+	INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf), /* MEM_UOPS_RETIRED.* */
+	INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */
+	INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
+	INTEL_EXCLEVT_CONSTRAINT(0xd3, 0xf), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */
+
 	EVENT_CONSTRAINT_END
 };
 
@@ -217,6 +220,12 @@ static struct event_constraint intel_hsw_event_constraints[] = {
 	INTEL_UEVENT_CONSTRAINT(0x0ca3, 0x4),
 	/* CYCLE_ACTIVITY.CYCLES_NO_EXECUTE */
 	INTEL_UEVENT_CONSTRAINT(0x04a3, 0xf),
+
+	INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf), /* MEM_UOPS_RETIRED.* */
+	INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */
+	INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
+	INTEL_EXCLEVT_CONSTRAINT(0xd3, 0xf), /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */
+
 	EVENT_CONSTRAINT_END
 };
 
@@ -2865,6 +2874,27 @@ static __init void intel_nehalem_quirk(void)
 	}
 }
 
+/*
+ * enable software workaround for errata:
+ * SNB: BJ122
+ * IVB: BV98
+ * HSW: HSD29
+ *
+ * Only needed when HT is enabled. However detecting
+ * this is too difficult and model specific so we enable
+ * it even with HT off for now.
+ */
+static __init void intel_ht_bug(void)
+{
+	x86_pmu.flags |= PMU_FL_EXCL_CNTRS;
+
+	x86_pmu.commit_scheduling = intel_commit_scheduling;
+	x86_pmu.start_scheduling = intel_start_scheduling;
+	x86_pmu.stop_scheduling = intel_stop_scheduling;
+
+	pr_info("CPU erratum BJ122, BV98, HSD29 worked around\n");
+}
+
 EVENT_ATTR_STR(mem-loads,	mem_ld_hsw,	"event=0xcd,umask=0x1,ldlat=3");
 EVENT_ATTR_STR(mem-stores,	mem_st_hsw,	"event=0xd0,umask=0x82")
 
@@ -3079,6 +3109,7 @@ __init int intel_pmu_init(void)
 	case 42: /* 32nm SandyBridge         */
 	case 45: /* 32nm SandyBridge-E/EN/EP */
 		x86_add_quirk(intel_sandybridge_quirk);
+		x86_add_quirk(intel_ht_bug);
 		memcpy(hw_cache_event_ids, snb_hw_cache_event_ids,
 		       sizeof(hw_cache_event_ids));
 		memcpy(hw_cache_extra_regs, snb_hw_cache_extra_regs,
@@ -3093,6 +3124,8 @@ __init int intel_pmu_init(void)
 			x86_pmu.extra_regs = intel_snbep_extra_regs;
 		else
 			x86_pmu.extra_regs = intel_snb_extra_regs;
+
+
 		/* all extra regs are per-cpu when HT is on */
 		x86_pmu.flags |= PMU_FL_HAS_RSP_1;
 		x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
@@ -3111,6 +3144,7 @@ __init int intel_pmu_init(void)
 
 	case 58: /* 22nm IvyBridge       */
 	case 62: /* 22nm IvyBridge-EP/EX */
+		x86_add_quirk(intel_ht_bug);
 		memcpy(hw_cache_event_ids, snb_hw_cache_event_ids,
 		       sizeof(hw_cache_event_ids));
 		/* dTLB-load-misses on IVB is different than SNB */
@@ -3146,6 +3180,7 @@ __init int intel_pmu_init(void)
 	case 63: /* 22nm Haswell Server */
 	case 69: /* 22nm Haswell ULT */
 	case 70: /* 22nm Haswell + GT3e (Intel Iris Pro graphics) */
+		x86_add_quirk(intel_ht_bug);
 		x86_pmu.late_ack = true;
 		memcpy(hw_cache_event_ids, hsw_hw_cache_event_ids, sizeof(hw_cache_event_ids));
 		memcpy(hw_cache_extra_regs, hsw_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
-- 
cgit v1.2.3


From b63b4b459a78a9a45ea47a4803b8d1868e9d17d5 Mon Sep 17 00:00:00 2001
From: Maria Dimakopoulou
Date: Mon, 17 Nov 2014 20:07:00 +0100
Subject: perf/x86/intel: Enforce HT bug workaround with PEBS for SNB/IVB/HSW

This patch modifies the PEBS constraint tables for SNB/IVB/HSW
such that corrupting events supporting PEBS activate the HT
workaround.

Signed-off-by: Maria Dimakopoulou <maria.n.dimakopoulou@gmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Stephane Eranian <eranian@google.com>
Cc: bp@alien8.de
Cc: jolsa@redhat.com
Cc: kan.liang@intel.com
Link: http://lkml.kernel.org/r/1416251225-17721-9-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event.h          | 20 +++++++++++++++++++-
 arch/x86/kernel/cpu/perf_event_intel_ds.c | 28 ++++++++++++++++++----------
 2 files changed, 37 insertions(+), 11 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index 236afee35587..2ba71ecc244f 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -326,22 +326,40 @@ struct cpu_hw_events {
 
 /* Check flags and event code, and set the HSW load flag */
 #define INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(code, n) \
-	__EVENT_CONSTRAINT(code, n, 			\
+	__EVENT_CONSTRAINT(code, n,			\
 			  ARCH_PERFMON_EVENTSEL_EVENT|X86_ALL_EVENT_FLAGS, \
 			  HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LD_HSW)
 
+#define INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_XLD(code, n) \
+	__EVENT_CONSTRAINT(code, n,			\
+			  ARCH_PERFMON_EVENTSEL_EVENT|X86_ALL_EVENT_FLAGS, \
+			  HWEIGHT(n), 0, \
+			  PERF_X86_EVENT_PEBS_LD_HSW|PERF_X86_EVENT_EXCL)
+
 /* Check flags and event code/umask, and set the HSW store flag */
 #define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(code, n) \
 	__EVENT_CONSTRAINT(code, n, 			\
 			  INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
 			  HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_ST_HSW)
 
+#define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XST(code, n) \
+	__EVENT_CONSTRAINT(code, n,			\
+			  INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
+			  HWEIGHT(n), 0, \
+			  PERF_X86_EVENT_PEBS_ST_HSW|PERF_X86_EVENT_EXCL)
+
 /* Check flags and event code/umask, and set the HSW load flag */
 #define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(code, n) \
 	__EVENT_CONSTRAINT(code, n, 			\
 			  INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
 			  HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LD_HSW)
 
+#define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(code, n) \
+	__EVENT_CONSTRAINT(code, n,			\
+			  INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \
+			  HWEIGHT(n), 0, \
+			  PERF_X86_EVENT_PEBS_LD_HSW|PERF_X86_EVENT_EXCL)
+
 /* Check flags and event code/umask, and set the HSW N/A flag */
 #define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_NA(code, n) \
 	__EVENT_CONSTRAINT(code, n, 			\
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index a5149c7abe73..ca69ea56c712 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -612,6 +612,10 @@ struct event_constraint intel_snb_pebs_event_constraints[] = {
 	INTEL_PST_CONSTRAINT(0x02cd, 0x8),    /* MEM_TRANS_RETIRED.PRECISE_STORES */
 	/* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */
 	INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf),
+        INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf),    /* MEM_UOP_RETIRED.* */
+        INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf),    /* MEM_LOAD_UOPS_RETIRED.* */
+        INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf),    /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
+        INTEL_EXCLEVT_CONSTRAINT(0xd3, 0xf),    /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */
 	/* Allow all events as PEBS with no flags */
 	INTEL_ALL_EVENT_CONSTRAINT(0, 0xf),
 	EVENT_CONSTRAINT_END
@@ -623,6 +627,10 @@ struct event_constraint intel_ivb_pebs_event_constraints[] = {
 	INTEL_PST_CONSTRAINT(0x02cd, 0x8),    /* MEM_TRANS_RETIRED.PRECISE_STORES */
 	/* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */
 	INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf),
+	INTEL_EXCLEVT_CONSTRAINT(0xd0, 0xf),    /* MEM_UOP_RETIRED.* */
+	INTEL_EXCLEVT_CONSTRAINT(0xd1, 0xf),    /* MEM_LOAD_UOPS_RETIRED.* */
+	INTEL_EXCLEVT_CONSTRAINT(0xd2, 0xf),    /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
+	INTEL_EXCLEVT_CONSTRAINT(0xd3, 0xf),    /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */
 	/* Allow all events as PEBS with no flags */
 	INTEL_ALL_EVENT_CONSTRAINT(0, 0xf),
         EVENT_CONSTRAINT_END
@@ -634,16 +642,16 @@ struct event_constraint intel_hsw_pebs_event_constraints[] = {
 	/* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */
 	INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf),
 	INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_NA(0x01c2, 0xf), /* UOPS_RETIRED.ALL */
-	INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x11d0, 0xf), /* MEM_UOPS_RETIRED.STLB_MISS_LOADS */
-	INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x21d0, 0xf), /* MEM_UOPS_RETIRED.LOCK_LOADS */
-	INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x41d0, 0xf), /* MEM_UOPS_RETIRED.SPLIT_LOADS */
-	INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x81d0, 0xf), /* MEM_UOPS_RETIRED.ALL_LOADS */
-	INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x12d0, 0xf), /* MEM_UOPS_RETIRED.STLB_MISS_STORES */
-	INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x42d0, 0xf), /* MEM_UOPS_RETIRED.SPLIT_STORES */
-	INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x82d0, 0xf), /* MEM_UOPS_RETIRED.ALL_STORES */
-	INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd1, 0xf),    /* MEM_LOAD_UOPS_RETIRED.* */
-	INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd2, 0xf),    /* MEM_LOAD_UOPS_L3_HIT_RETIRED.* */
-	INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd3, 0xf),    /* MEM_LOAD_UOPS_L3_MISS_RETIRED.* */
+	INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(0x11d0, 0xf), /* MEM_UOPS_RETIRED.STLB_MISS_LOADS */
+	INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(0x21d0, 0xf), /* MEM_UOPS_RETIRED.LOCK_LOADS */
+	INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(0x41d0, 0xf), /* MEM_UOPS_RETIRED.SPLIT_LOADS */
+	INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(0x81d0, 0xf), /* MEM_UOPS_RETIRED.ALL_LOADS */
+	INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XST(0x12d0, 0xf), /* MEM_UOPS_RETIRED.STLB_MISS_STORES */
+	INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XST(0x42d0, 0xf), /* MEM_UOPS_RETIRED.SPLIT_STORES */
+	INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XST(0x82d0, 0xf), /* MEM_UOPS_RETIRED.ALL_STORES */
+	INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_XLD(0xd1, 0xf),    /* MEM_LOAD_UOPS_RETIRED.* */
+	INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_XLD(0xd2, 0xf),    /* MEM_LOAD_UOPS_L3_HIT_RETIRED.* */
+	INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_XLD(0xd3, 0xf),    /* MEM_LOAD_UOPS_L3_MISS_RETIRED.* */
 	/* Allow all events as PEBS with no flags */
 	INTEL_ALL_EVENT_CONSTRAINT(0, 0xf),
 	EVENT_CONSTRAINT_END
-- 
cgit v1.2.3


From a90738c2cb0dceb882e0d7f7f9141e0062809b4d Mon Sep 17 00:00:00 2001
From: Stephane Eranian
Date: Mon, 17 Nov 2014 20:07:01 +0100
Subject: perf/x86/intel: Fix intel_get_event_constraints() for dynamic
 constraints

With dynamic constraint, we need to restart from the static
constraints each time the intel_get_event_constraints() is called.

Signed-off-by: Stephane Eranian <eranian@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Maria Dimakopoulou <maria.n.dimakopoulou@gmail.com>
Cc: bp@alien8.de
Cc: jolsa@redhat.com
Cc: kan.liang@intel.com
Link: http://lkml.kernel.org/r/1416251225-17721-10-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event_intel.c | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 9350de0f5407..4773ae0b52d0 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -2063,20 +2063,25 @@ static struct event_constraint *
 intel_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
 			    struct perf_event *event)
 {
-	struct event_constraint *c = event->hw.constraint;
+	struct event_constraint *c1 = event->hw.constraint;
+	struct event_constraint *c2;
 
 	/*
 	 * first time only
 	 * - static constraint: no change across incremental scheduling calls
 	 * - dynamic constraint: handled by intel_get_excl_constraints()
 	 */
-	if (!c)
-		c = __intel_get_event_constraints(cpuc, idx, event);
+	c2 = __intel_get_event_constraints(cpuc, idx, event);
+	if (c1 && (c1->flags & PERF_X86_EVENT_DYNAMIC)) {
+		bitmap_copy(c1->idxmsk, c2->idxmsk, X86_PMC_IDX_MAX);
+		c1->weight = c2->weight;
+		c2 = c1;
+	}
 
 	if (cpuc->excl_cntrs)
-		return intel_get_excl_constraints(cpuc, event, idx, c);
+		return intel_get_excl_constraints(cpuc, event, idx, c2);
 
-	return c;
+	return c2;
 }
 
 static void intel_put_excl_constraints(struct cpu_hw_events *cpuc,
-- 
cgit v1.2.3


From c02cdbf60b51b8d98a49185535f5d527a2965142 Mon Sep 17 00:00:00 2001
From: Stephane Eranian
Date: Mon, 17 Nov 2014 20:07:02 +0100
Subject: perf/x86/intel: Limit to half counters when the HT workaround is
 enabled, to avoid exclusive mode starvation

This patch limits the number of counters available to each CPU when
the HT bug workaround is enabled.

This is necessary to avoid situation of counter starvation. Such can
arise from configuration where one HT thread, HT0, is using all 4 counters
with corrupting events which require exclusion the the sibling HT, HT1.

In such case, HT1 would not be able to schedule any event until HT0
is done. To mitigate this problem, this patch artificially limits
the number of counters to 2.

That way, we can gurantee that at least 2 counters are not in exclusive
mode and therefore allow the sibling thread to schedule events of the
same type (system vs. per-thread). The 2 counters are not determined
in advance. We simply set the limit to two events per HT.

This helps mitigate starvation in case of events with specific counter
constraints such a PREC_DIST.

Note that this does not elimintate the starvation is all cases. But
it is better than not having it.

(Solution suggested by Peter Zjilstra.)

Signed-off-by: Stephane Eranian <eranian@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: bp@alien8.de
Cc: jolsa@redhat.com
Cc: kan.liang@intel.com
Cc: maria.n.dimakopoulou@gmail.com
Link: http://lkml.kernel.org/r/1416251225-17721-11-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event.h       |  2 ++
 arch/x86/kernel/cpu/perf_event_intel.c | 22 ++++++++++++++++++++--
 2 files changed, 22 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index 2ba71ecc244f..176749cca98f 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -134,6 +134,8 @@ enum intel_excl_state_type {
 struct intel_excl_states {
 	enum intel_excl_state_type init_state[X86_PMC_IDX_MAX];
 	enum intel_excl_state_type state[X86_PMC_IDX_MAX];
+	int  num_alloc_cntrs;/* #counters allocated */
+	int  max_alloc_cntrs;/* max #counters allowed */
 	bool sched_started; /* true if scheduling has started */
 };
 
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 4773ae0b52d0..4187d3f4ed12 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -1897,7 +1897,7 @@ intel_start_scheduling(struct cpu_hw_events *cpuc)
 	xl = &excl_cntrs->states[tid];
 
 	xl->sched_started = true;
-
+	xl->num_alloc_cntrs = 0;
 	/*
 	 * lock shared state until we are done scheduling
 	 * in stop_event_scheduling()
@@ -1963,7 +1963,6 @@ intel_get_excl_constraints(struct cpu_hw_events *cpuc, struct perf_event *event,
 	 */
 	if (cpuc->is_fake)
 		return c;
-
 	/*
 	 * event requires exclusive counter access
 	 * across HT threads
@@ -1977,6 +1976,18 @@ intel_get_excl_constraints(struct cpu_hw_events *cpuc, struct perf_event *event,
 	xl = &excl_cntrs->states[tid];
 	xlo = &excl_cntrs->states[o_tid];
 
+	/*
+	 * do not allow scheduling of more than max_alloc_cntrs
+	 * which is set to half the available generic counters.
+	 * this helps avoid counter starvation of sibling thread
+	 * by ensuring at most half the counters cannot be in
+	 * exclusive mode. There is not designated counters for the
+	 * limits. Any N/2 counters can be used. This helps with
+	 * events with specifix counter constraints
+	 */
+	if (xl->num_alloc_cntrs++ == xl->max_alloc_cntrs)
+		return &emptyconstraint;
+
 	cx = c;
 
 	/*
@@ -2624,6 +2635,8 @@ static void intel_pmu_cpu_starting(int cpu)
 		cpuc->lbr_sel = &cpuc->shared_regs->regs[EXTRA_REG_LBR];
 
 	if (x86_pmu.flags & PMU_FL_EXCL_CNTRS) {
+		int h = x86_pmu.num_counters >> 1;
+
 		for_each_cpu(i, topology_thread_cpumask(cpu)) {
 			struct intel_excl_cntrs *c;
 
@@ -2637,6 +2650,11 @@ static void intel_pmu_cpu_starting(int cpu)
 		}
 		cpuc->excl_cntrs->core_id = core_id;
 		cpuc->excl_cntrs->refcnt++;
+		/*
+		 * set hard limit to half the number of generic counters
+		 */
+		cpuc->excl_cntrs->states[0].max_alloc_cntrs = h;
+		cpuc->excl_cntrs->states[1].max_alloc_cntrs = h;
 	}
 }
 
-- 
cgit v1.2.3


From b37609c30e41264c4df4acff78abfc894499a49b Mon Sep 17 00:00:00 2001
From: Stephane Eranian
Date: Mon, 17 Nov 2014 20:07:04 +0100
Subject: perf/x86/intel: Make the HT bug workaround conditional on HT enabled

This patch disables the PMU HT bug when Hyperthreading (HT)
is disabled. We cannot do this test immediately when perf_events
is initialized. We need to wait until the topology information
is setup properly. As such, we register a later initcall, check
the topology and potentially disable the workaround. To do this,
we need to ensure there is no user of the PMU. At this point of
the boot, the only user is the NMI watchdog, thus we disable
it during the switch and re-enable it right after.

Having the workaround disabled when it is not needed provides
some benefits by limiting the overhead is time and space.
The workaround still ensures correct scheduling of the corrupting
memory events (0xd0, 0xd1, 0xd2) when HT is off. Those events
can only be measured on counters 0-3. Something else the current
kernel did not handle correctly.

Signed-off-by: Stephane Eranian <eranian@google.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: bp@alien8.de
Cc: jolsa@redhat.com
Cc: kan.liang@intel.com
Cc: maria.n.dimakopoulou@gmail.com
Link: http://lkml.kernel.org/r/1416251225-17721-13-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event.h       |  5 ++
 arch/x86/kernel/cpu/perf_event_intel.c | 95 ++++++++++++++++++++++++++--------
 2 files changed, 79 insertions(+), 21 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index 176749cca98f..7250c0281f9d 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -624,6 +624,7 @@ do {									\
 #define PMU_FL_NO_HT_SHARING	0x1 /* no hyper-threading resource sharing */
 #define PMU_FL_HAS_RSP_1	0x2 /* has 2 equivalent offcore_rsp regs   */
 #define PMU_FL_EXCL_CNTRS	0x4 /* has exclusive counter requirements  */
+#define PMU_FL_EXCL_ENABLED	0x8 /* exclusive counter active */
 
 #define EVENT_VAR(_id)  event_attr_##_id
 #define EVENT_PTR(_id) &event_attr_##_id.attr.attr
@@ -904,6 +905,10 @@ int knc_pmu_init(void);
 ssize_t events_sysfs_show(struct device *dev, struct device_attribute *attr,
 			  char *page);
 
+static inline int is_ht_workaround_enabled(void)
+{
+	return !!(x86_pmu.flags & PMU_FL_EXCL_ENABLED);
+}
 #else /* CONFIG_CPU_SUP_INTEL */
 
 static inline void reserve_ds_buffers(void)
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 4187d3f4ed12..6ea61a572fb0 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -12,6 +12,7 @@
 #include <linux/init.h>
 #include <linux/slab.h>
 #include <linux/export.h>
+#include <linux/watchdog.h>
 
 #include <asm/cpufeature.h>
 #include <asm/hardirq.h>
@@ -1885,8 +1886,9 @@ intel_start_scheduling(struct cpu_hw_events *cpuc)
 	/*
 	 * nothing needed if in group validation mode
 	 */
-	if (cpuc->is_fake)
+	if (cpuc->is_fake || !is_ht_workaround_enabled())
 		return;
+
 	/*
 	 * no exclusion needed
 	 */
@@ -1923,7 +1925,7 @@ intel_stop_scheduling(struct cpu_hw_events *cpuc)
 	/*
 	 * nothing needed if in group validation mode
 	 */
-	if (cpuc->is_fake)
+	if (cpuc->is_fake || !is_ht_workaround_enabled())
 		return;
 	/*
 	 * no exclusion needed
@@ -1961,7 +1963,13 @@ intel_get_excl_constraints(struct cpu_hw_events *cpuc, struct perf_event *event,
 	 * validating a group does not require
 	 * enforcing cross-thread  exclusion
 	 */
-	if (cpuc->is_fake)
+	if (cpuc->is_fake || !is_ht_workaround_enabled())
+		return c;
+
+	/*
+	 * no exclusion needed
+	 */
+	if (!excl_cntrs)
 		return c;
 	/*
 	 * event requires exclusive counter access
@@ -2658,18 +2666,11 @@ static void intel_pmu_cpu_starting(int cpu)
 	}
 }
 
-static void intel_pmu_cpu_dying(int cpu)
+static void free_excl_cntrs(int cpu)
 {
 	struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
-	struct intel_shared_regs *pc;
 	struct intel_excl_cntrs *c;
 
-	pc = cpuc->shared_regs;
-	if (pc) {
-		if (pc->core_id == -1 || --pc->refcnt == 0)
-			kfree(pc);
-		cpuc->shared_regs = NULL;
-	}
 	c = cpuc->excl_cntrs;
 	if (c) {
 		if (c->core_id == -1 || --c->refcnt == 0)
@@ -2678,14 +2679,22 @@ static void intel_pmu_cpu_dying(int cpu)
 		kfree(cpuc->constraint_list);
 		cpuc->constraint_list = NULL;
 	}
+}
 
-	c = cpuc->excl_cntrs;
-	if (c) {
-		if (c->core_id == -1 || --c->refcnt == 0)
-			kfree(c);
-		cpuc->excl_cntrs = NULL;
+static void intel_pmu_cpu_dying(int cpu)
+{
+	struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
+	struct intel_shared_regs *pc;
+
+	pc = cpuc->shared_regs;
+	if (pc) {
+		if (pc->core_id == -1 || --pc->refcnt == 0)
+			kfree(pc);
+		cpuc->shared_regs = NULL;
 	}
 
+	free_excl_cntrs(cpu);
+
 	fini_debug_store_on_cpu(cpu);
 }
 
@@ -2904,18 +2913,18 @@ static __init void intel_nehalem_quirk(void)
  * HSW: HSD29
  *
  * Only needed when HT is enabled. However detecting
- * this is too difficult and model specific so we enable
- * it even with HT off for now.
+ * if HT is enabled is difficult (model specific). So instead,
+ * we enable the workaround in the early boot, and verify if
+ * it is needed in a later initcall phase once we have valid
+ * topology information to check if HT is actually enabled
  */
 static __init void intel_ht_bug(void)
 {
-	x86_pmu.flags |= PMU_FL_EXCL_CNTRS;
+	x86_pmu.flags |= PMU_FL_EXCL_CNTRS | PMU_FL_EXCL_ENABLED;
 
 	x86_pmu.commit_scheduling = intel_commit_scheduling;
 	x86_pmu.start_scheduling = intel_start_scheduling;
 	x86_pmu.stop_scheduling = intel_stop_scheduling;
-
-	pr_info("CPU erratum BJ122, BV98, HSD29 worked around\n");
 }
 
 EVENT_ATTR_STR(mem-loads,	mem_ld_hsw,	"event=0xcd,umask=0x1,ldlat=3");
@@ -3343,3 +3352,47 @@ __init int intel_pmu_init(void)
 
 	return 0;
 }
+
+/*
+ * HT bug: phase 2 init
+ * Called once we have valid topology information to check
+ * whether or not HT is enabled
+ * If HT is off, then we disable the workaround
+ */
+static __init int fixup_ht_bug(void)
+{
+	int cpu = smp_processor_id();
+	int w, c;
+	/*
+	 * problem not present on this CPU model, nothing to do
+	 */
+	if (!(x86_pmu.flags & PMU_FL_EXCL_ENABLED))
+		return 0;
+
+	w = cpumask_weight(topology_thread_cpumask(cpu));
+	if (w > 1) {
+		pr_info("PMU erratum BJ122, BV98, HSD29 worked around, HT is on\n");
+		return 0;
+	}
+
+	watchdog_nmi_disable_all();
+
+	x86_pmu.flags &= ~(PMU_FL_EXCL_CNTRS | PMU_FL_EXCL_ENABLED);
+
+	x86_pmu.commit_scheduling = NULL;
+	x86_pmu.start_scheduling = NULL;
+	x86_pmu.stop_scheduling = NULL;
+
+	watchdog_nmi_enable_all();
+
+	get_online_cpus();
+
+	for_each_online_cpu(c) {
+		free_excl_cntrs(c);
+	}
+
+	put_online_cpus();
+	pr_info("PMU erratum BJ122, BV98, HSD29 workaround disabled, HT off\n");
+	return 0;
+}
+subsys_initcall(fixup_ht_bug)
-- 
cgit v1.2.3


From 8882edf735738c949aba4b65d3ec3453066bab12 Mon Sep 17 00:00:00 2001
From: Andi Kleen
Date: Fri, 27 Feb 2015 09:48:30 -0800
Subject: perf/x86/intel: Reset more state in PMU reset

The PMU reset code didn't quite keep up with newer PMU features.
Improve it a bit to really reset a modern PMU:

  - Clear all overflow status
  - Clear LBRs and freezing state
  - Disable fixed counters too

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: eranian@google.com
Link: http://lkml.kernel.org/r/1425059312-18217-2-git-send-email-andi@firstfloor.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event_intel.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 6ea61a572fb0..59994602bb94 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -1538,6 +1538,18 @@ static void intel_pmu_reset(void)
 	if (ds)
 		ds->bts_index = ds->bts_buffer_base;
 
+	/* Ack all overflows and disable fixed counters */
+	if (x86_pmu.version >= 2) {
+		intel_pmu_ack_status(intel_pmu_get_status());
+		wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
+	}
+
+	/* Reset LBRs and LBR freezing */
+	if (x86_pmu.lbr_nr) {
+		update_debugctlmsr(get_debugctlmsr() &
+			~(DEBUGCTLMSR_FREEZE_LBRS_ON_PMI|DEBUGCTLMSR_LBR));
+	}
+
 	local_irq_restore(flags);
 }
 
-- 
cgit v1.2.3


From da3e606d885a17525eb18afd423f5c438860b833 Mon Sep 17 00:00:00 2001
From: Andi Kleen
Date: Fri, 27 Feb 2015 09:48:31 -0800
Subject: perf/x86: Dump DEBUGCTL in PMU dump

LBRs and LBR freezing are controlled through the DEBUGCTL MSR. So
dump the state of DEBUGCTL too when dumping the PMU state.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: eranian@google.com
Link: http://lkml.kernel.org/r/1425059312-18217-3-git-send-email-andi@firstfloor.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index b8b7a1277d8d..994737263daa 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -1171,7 +1171,7 @@ static void x86_pmu_start(struct perf_event *event, int flags)
 void perf_event_print_debug(void)
 {
 	u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed;
-	u64 pebs;
+	u64 pebs, debugctl;
 	struct cpu_hw_events *cpuc;
 	unsigned long flags;
 	int cpu, idx;
@@ -1197,6 +1197,10 @@ void perf_event_print_debug(void)
 		pr_info("CPU#%d: overflow:   %016llx\n", cpu, overflow);
 		pr_info("CPU#%d: fixed:      %016llx\n", cpu, fixed);
 		pr_info("CPU#%d: pebs:       %016llx\n", cpu, pebs);
+		if (x86_pmu.lbr_nr) {
+			rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
+			pr_info("CPU#%d: debugctl:   %016llx\n", cpu, debugctl);
+		}
 	}
 	pr_info("CPU#%d: active:     %016llx\n", cpu, *(u64 *)cpuc->active_mask);
 
-- 
cgit v1.2.3


From 15fde1101a1aed11958e0d86bc360f01866a74b1 Mon Sep 17 00:00:00 2001
From: Andi Kleen
Date: Fri, 27 Feb 2015 09:48:32 -0800
Subject: perf/x86: Only dump PEBS register when PEBS has been detected

Technically PEBS_ENABLED is only guaranteed to exist when we
detected PEBS. So add a check for this to the PMU dump function.
I don't think it can happen on a real CPU, but could in a VM.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: eranian@google.com
Link: http://lkml.kernel.org/r/1425059312-18217-4-git-send-email-andi@firstfloor.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 994737263daa..689e35760924 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -1189,14 +1189,16 @@ void perf_event_print_debug(void)
 		rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
 		rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow);
 		rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed);
-		rdmsrl(MSR_IA32_PEBS_ENABLE, pebs);
 
 		pr_info("\n");
 		pr_info("CPU#%d: ctrl:       %016llx\n", cpu, ctrl);
 		pr_info("CPU#%d: status:     %016llx\n", cpu, status);
 		pr_info("CPU#%d: overflow:   %016llx\n", cpu, overflow);
 		pr_info("CPU#%d: fixed:      %016llx\n", cpu, fixed);
-		pr_info("CPU#%d: pebs:       %016llx\n", cpu, pebs);
+		if (x86_pmu.pebs_constraints) {
+			rdmsrl(MSR_IA32_PEBS_ENABLE, pebs);
+			pr_info("CPU#%d: pebs:       %016llx\n", cpu, pebs);
+		}
 		if (x86_pmu.lbr_nr) {
 			rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
 			pr_info("CPU#%d: debugctl:   %016llx\n", cpu, debugctl);
-- 
cgit v1.2.3


From 1a78d93750bb5f61abdc59a91fc3bd06a214542a Mon Sep 17 00:00:00 2001
From: Andi Kleen
Date: Fri, 20 Mar 2015 10:11:23 -0700
Subject: perf/x86/intel: Streamline LBR MSR handling in PMI

The perf PMI currently does unnecessary MSR accesses when
LBRs are enabled. We use LBR freezing, or when in callstack
mode force the LBRs to only filter on ring 3.

So there is no need to disable the LBRs explicitely in the
PMI handler.

Also we always unnecessarily rewrite LBR_SELECT in the LBR
handler, even though it can never change.

 5)               |  /* write_msr: MSR_LBR_SELECT(1c8), value 0 */
 5)               |  /* read_msr: MSR_IA32_DEBUGCTLMSR(1d9), value 1801 */
 5)               |  /* write_msr: MSR_IA32_DEBUGCTLMSR(1d9), value 1801 */
 5)               |  /* write_msr: MSR_CORE_PERF_GLOBAL_CTRL(38f), value 70000000f */
 5)               |  /* write_msr: MSR_CORE_PERF_GLOBAL_CTRL(38f), value 0 */
 5)               |  /* write_msr: MSR_LBR_SELECT(1c8), value 0 */
 5)               |  /* read_msr: MSR_IA32_DEBUGCTLMSR(1d9), value 1801 */
 5)               |  /* write_msr: MSR_IA32_DEBUGCTLMSR(1d9), value 1801 */

This patch:

  - Avoids disabling already frozen LBRs unnecessarily in the PMI
  - Avoids changing LBR_SELECT in the PMI

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: eranian@google.com
Link: http://lkml.kernel.org/r/1426871484-21285-1-git-send-email-andi@firstfloor.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event.h           |  2 +-
 arch/x86/kernel/cpu/perf_event_intel.c     | 23 ++++++++++++++++++-----
 arch/x86/kernel/cpu/perf_event_intel_lbr.c | 12 ++++++++----
 3 files changed, 27 insertions(+), 10 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index 7250c0281f9d..329f0356ad4a 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -870,7 +870,7 @@ void intel_pmu_lbr_enable(struct perf_event *event);
 
 void intel_pmu_lbr_disable(struct perf_event *event);
 
-void intel_pmu_lbr_enable_all(void);
+void intel_pmu_lbr_enable_all(bool pmi);
 
 void intel_pmu_lbr_disable_all(void);
 
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 59994602bb94..9da2400c2ec3 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -1244,7 +1244,10 @@ static __initconst const u64 slm_hw_cache_event_ids
  },
 };
 
-static void intel_pmu_disable_all(void)
+/*
+ * Use from PMIs where the LBRs are already disabled.
+ */
+static void __intel_pmu_disable_all(void)
 {
 	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
 
@@ -1256,15 +1259,20 @@ static void intel_pmu_disable_all(void)
 		intel_bts_disable_local();
 
 	intel_pmu_pebs_disable_all();
+}
+
+static void intel_pmu_disable_all(void)
+{
+	__intel_pmu_disable_all();
 	intel_pmu_lbr_disable_all();
 }
 
-static void intel_pmu_enable_all(int added)
+static void __intel_pmu_enable_all(int added, bool pmi)
 {
 	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
 
 	intel_pmu_pebs_enable_all();
-	intel_pmu_lbr_enable_all();
+	intel_pmu_lbr_enable_all(pmi);
 	wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL,
 			x86_pmu.intel_ctrl & ~cpuc->intel_ctrl_guest_mask);
 
@@ -1280,6 +1288,11 @@ static void intel_pmu_enable_all(int added)
 		intel_bts_enable_local();
 }
 
+static void intel_pmu_enable_all(int added)
+{
+	__intel_pmu_enable_all(added, false);
+}
+
 /*
  * Workaround for:
  *   Intel Errata AAK100 (model 26)
@@ -1573,7 +1586,7 @@ static int intel_pmu_handle_irq(struct pt_regs *regs)
 	 */
 	if (!x86_pmu.late_ack)
 		apic_write(APIC_LVTPC, APIC_DM_NMI);
-	intel_pmu_disable_all();
+	__intel_pmu_disable_all();
 	handled = intel_pmu_drain_bts_buffer();
 	handled += intel_bts_interrupt();
 	status = intel_pmu_get_status();
@@ -1658,7 +1671,7 @@ again:
 		goto again;
 
 done:
-	intel_pmu_enable_all(0);
+	__intel_pmu_enable_all(0, true);
 	/*
 	 * Only unmask the NMI after the overflow counters
 	 * have been reset. This avoids spurious NMIs on
diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
index 0473874109cb..3d537252f011 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
@@ -132,12 +132,16 @@ static void intel_pmu_lbr_filter(struct cpu_hw_events *cpuc);
  * otherwise it becomes near impossible to get a reliable stack.
  */
 
-static void __intel_pmu_lbr_enable(void)
+static void __intel_pmu_lbr_enable(bool pmi)
 {
 	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
 	u64 debugctl, lbr_select = 0;
 
-	if (cpuc->lbr_sel) {
+	/*
+	 * No need to reprogram LBR_SELECT in a PMI, as it
+	 * did not change.
+	 */
+	if (cpuc->lbr_sel && !pmi) {
 		lbr_select = cpuc->lbr_sel->config;
 		wrmsrl(MSR_LBR_SELECT, lbr_select);
 	}
@@ -351,12 +355,12 @@ void intel_pmu_lbr_disable(struct perf_event *event)
 	}
 }
 
-void intel_pmu_lbr_enable_all(void)
+void intel_pmu_lbr_enable_all(bool pmi)
 {
 	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
 
 	if (cpuc->lbr_users)
-		__intel_pmu_lbr_enable();
+		__intel_pmu_lbr_enable(pmi);
 }
 
 void intel_pmu_lbr_disable_all(void)
-- 
cgit v1.2.3


From cd1f11de69226cc7ce7e7f22bdab9010103ddaa6 Mon Sep 17 00:00:00 2001
From: Andi Kleen
Date: Fri, 20 Mar 2015 10:11:24 -0700
Subject: perf/x86/intel: Avoid rewriting DEBUGCTL with the same value for LBRs

perf with LBRs on has a tendency to rewrite the DEBUGCTL MSR with
the same value. Add a little optimization to skip the unnecessary
write.

Signed-off-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: eranian@google.com
Link: http://lkml.kernel.org/r/1426871484-21285-2-git-send-email-andi@firstfloor.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event_intel_lbr.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
index 3d537252f011..94e5b506caa6 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
@@ -135,7 +135,7 @@ static void intel_pmu_lbr_filter(struct cpu_hw_events *cpuc);
 static void __intel_pmu_lbr_enable(bool pmi)
 {
 	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-	u64 debugctl, lbr_select = 0;
+	u64 debugctl, lbr_select = 0, orig_debugctl;
 
 	/*
 	 * No need to reprogram LBR_SELECT in a PMI, as it
@@ -147,6 +147,7 @@ static void __intel_pmu_lbr_enable(bool pmi)
 	}
 
 	rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
+	orig_debugctl = debugctl;
 	debugctl |= DEBUGCTLMSR_LBR;
 	/*
 	 * LBR callstack does not work well with FREEZE_LBRS_ON_PMI.
@@ -155,7 +156,8 @@ static void __intel_pmu_lbr_enable(bool pmi)
 	 */
 	if (!(lbr_select & LBR_CALL_STACK))
 		debugctl |= DEBUGCTLMSR_FREEZE_LBRS_ON_PMI;
-	wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
+	if (orig_debugctl != debugctl)
+		wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
 }
 
 static void __intel_pmu_lbr_disable(void)
-- 
cgit v1.2.3


From 2e54a5bdba107f80a9ba90065fb43bba0498147d Mon Sep 17 00:00:00 2001
From: Ingo Molnar
Date: Thu, 2 Apr 2015 17:57:59 +0200
Subject: perf/x86/intel/pt: Fix the 32-bit build

On a 32-bit build I got:

  arch/x86/kernel/cpu/perf_event_intel_pt.c:413:5: warning: cast to pointer from integer of different size [-Wint-to-pointer-cast]
  arch/x86/kernel/cpu/perf_event_intel_bts.c:162:24: warning: cast from pointer to integer of different size [-Wpointer-to-int-cast]

Fix it. The code should probably be (re-)tested on 32-bit systems to make
sure all is fine.

Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kaixu Xia <kaixu.xia@linaro.org>
Cc: linux-kernel@vger.kernel.org
Cc: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Robert Richter <rric@kernel.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: acme@infradead.org
Cc: adrian.hunter@intel.com
Cc: kan.liang@intel.com
Cc: markus.t.metzger@intel.com
Cc: mathieu.poirier@linaro.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event_intel_bts.c | 2 +-
 arch/x86/kernel/cpu/perf_event_intel_pt.c  | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event_intel_bts.c b/arch/x86/kernel/cpu/perf_event_intel_bts.c
index fb1a4c28f3e1..ac1f0c55f379 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_bts.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_bts.c
@@ -159,7 +159,7 @@ bts_config_buffer(struct bts_buffer *buf)
 			thresh = end;
 	}
 
-	ds->bts_buffer_base = (u64)page_address(page) + phys->displacement;
+	ds->bts_buffer_base = (u64)(long)page_address(page) + phys->displacement;
 	ds->bts_index = ds->bts_buffer_base + index;
 	ds->bts_absolute_maximum = ds->bts_buffer_base + end;
 	ds->bts_interrupt_threshold = !buf->snapshot
diff --git a/arch/x86/kernel/cpu/perf_event_intel_pt.c b/arch/x86/kernel/cpu/perf_event_intel_pt.c
index a9a1092cf836..f5a3afc65371 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_pt.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_pt.c
@@ -409,8 +409,8 @@ static void pt_topa_dump(struct pt_buffer *buf)
 	list_for_each_entry(topa, &buf->tables, list) {
 		int i;
 
-		pr_debug("# table @%p (%p), off %llx size %zx\n", topa->table,
-			 (void *)topa->phys, topa->offset, topa->size);
+		pr_debug("# table @%p (%016Lx), off %llx size %zx\n", topa->table,
+			 topa->phys, topa->offset, topa->size);
 		for (i = 0; i < TENTS_PER_PAGE; i++) {
 			pr_debug("# entry @%p (%lx sz %u %c%c%c) raw=%16llx\n",
 				 &topa->table[i],
-- 
cgit v1.2.3


From d9dc64f30abe42f71bc7e9eb9d38c41006cf39f9 Mon Sep 17 00:00:00 2001
From: Ross Zwisler
Date: Tue, 27 Jan 2015 09:53:51 -0700
Subject: x86/asm: Add support for the CLWB instruction

Add support for the new CLWB (cache line write back)
instruction.  This instruction was announced in the document
"Intel Architecture Instruction Set Extensions Programming
Reference" with reference number 319433-022.

  https://software.intel.com/sites/default/files/managed/0d/53/319433-022.pdf

The CLWB instruction is used to write back the contents of
dirtied cache lines to memory without evicting the cache lines
from the processor's cache hierarchy.  This should be used in
favor of clflushopt or clflush in cases where you require the
cache line to be written to memory but plan to access the data
again in the near future.

One of the main use cases for this is with persistent memory
where CLWB can be used with PCOMMIT to ensure that data has been
accepted to memory and is durable on the DIMM.

This function shows how to properly use CLWB/CLFLUSHOPT/CLFLUSH
and PCOMMIT with appropriate fencing:

void flush_and_commit_buffer(void *vaddr, unsigned int size)
{
	void *vend = vaddr + size - 1;

	for (; vaddr < vend; vaddr += boot_cpu_data.x86_clflush_size)
		clwb(vaddr);

	/* Flush any possible final partial cacheline */
	clwb(vend);

	/*
	 * Use SFENCE to order CLWB/CLFLUSHOPT/CLFLUSH cache flushes.
	 * (MFENCE via mb() also works)
	 */
	wmb();

	/* PCOMMIT and the required SFENCE for ordering */
	pcommit_sfence();
}

After this function completes the data pointed to by vaddr is
has been accepted to memory and will be durable if the vaddr
points to persistent memory.

Regarding the details of how the alternatives assembly is set
up, we need one additional byte at the beginning of the CLFLUSH
so that we can flip it into a CLFLUSHOPT by changing that byte
into a 0x66 prefix.  Two options are to either insert a 1 byte
ASM_NOP1, or to add a 1 byte NOP_DS_PREFIX.  Both have no
functional effect with the plain CLFLUSH, but I've been told
that executing a CLFLUSH + prefix should be faster than
executing a CLFLUSH + NOP.

We had to hard code the assembly for CLWB because, lacking the
ability to assemble the CLWB instruction itself, the next
closest thing is to have an xsaveopt instruction with a 0x66
prefix.  Unfortunately XSAVEOPT itself is also relatively new,
and isn't included by all the GCC versions that the kernel needs
to support.

Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Acked-by: Borislav Petkov <bp@suse.de>
Acked-by: H. Peter Anvin <hpa@linux.intel.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1422377631-8986-3-git-send-email-ross.zwisler@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/cpufeature.h    |  1 +
 arch/x86/include/asm/special_insns.h | 14 ++++++++++++++
 2 files changed, 15 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 0f7a5a1a8db2..854c04b3c9c2 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -233,6 +233,7 @@
 #define X86_FEATURE_SMAP	( 9*32+20) /* Supervisor Mode Access Prevention */
 #define X86_FEATURE_PCOMMIT	( 9*32+22) /* PCOMMIT instruction */
 #define X86_FEATURE_CLFLUSHOPT	( 9*32+23) /* CLFLUSHOPT instruction */
+#define X86_FEATURE_CLWB	( 9*32+24) /* CLWB instruction */
 #define X86_FEATURE_AVX512PF	( 9*32+26) /* AVX-512 Prefetch */
 #define X86_FEATURE_AVX512ER	( 9*32+27) /* AVX-512 Exponential and Reciprocal */
 #define X86_FEATURE_AVX512CD	( 9*32+28) /* AVX-512 Conflict Detection */
diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h
index 2ec1a5392542..aeb4666e0c0a 100644
--- a/arch/x86/include/asm/special_insns.h
+++ b/arch/x86/include/asm/special_insns.h
@@ -201,6 +201,20 @@ static inline void clflushopt(volatile void *__p)
 		       "+m" (*(volatile char __force *)__p));
 }
 
+static inline void clwb(volatile void *__p)
+{
+	volatile struct { char x[64]; } *p = __p;
+
+	asm volatile(ALTERNATIVE_2(
+		".byte " __stringify(NOP_DS_PREFIX) "; clflush (%[pax])",
+		".byte 0x66; clflush (%[pax])", /* clflushopt (%%rax) */
+		X86_FEATURE_CLFLUSHOPT,
+		".byte 0x66, 0x0f, 0xae, 0x30",  /* clwb (%%rax) */
+		X86_FEATURE_CLWB)
+		: [p] "+m" (*p)
+		: [pax] "a" (p));
+}
+
 static inline void pcommit_sfence(void)
 {
 	alternative(ASM_NOP7,
-- 
cgit v1.2.3


From ff8287f36381deff729aa4e7b02296a080519fd0 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski
Date: Thu, 2 Apr 2015 12:41:44 -0700
Subject: x86/asm/entry/32: Improve a TOP_OF_KERNEL_STACK_PADDING comment

At Denys' request, clean up the comment describing stack padding
in the 32-bit sysenter path.

No code changes.

Signed-off-by: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/41fee7bb8490ae840fe7ef2699f9c2feb932e729.1428002830.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/entry_32.S | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 4c8cc34e6d68..effa2793feba 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -395,10 +395,13 @@ sysenter_past_esp:
 	/*CFI_REL_OFFSET cs, 0*/
 	/*
 	 * Push current_thread_info()->sysenter_return to the stack.
-	 * A tiny bit of offset fixup is necessary - 4*4 means the 4 words
-	 * pushed above; +8 corresponds to copy_thread's esp0 setting.
+	 * A tiny bit of offset fixup is necessary: TI_sysenter_return
+	 * is relative to thread_info, which is at the bottom of the
+	 * kernel stack page.  4*4 means the 4 words pushed above;
+	 * TOP_OF_KERNEL_STACK_PADDING takes us to the top of the stack;
+	 * and THREAD_SIZE takes us to the bottom.
 	 */
-	pushl_cfi ((TI_sysenter_return)-THREAD_SIZE+TOP_OF_KERNEL_STACK_PADDING+4*4)(%esp)
+	pushl_cfi ((TI_sysenter_return) - THREAD_SIZE + TOP_OF_KERNEL_STACK_PADDING + 4*4)(%esp)
 	CFI_REL_OFFSET eip, 0
 
 	pushl_cfi %eax
-- 
cgit v1.2.3


From cf9328cc9989e028fdc64d8c0a7b1b043dc96735 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski
Date: Thu, 2 Apr 2015 12:41:45 -0700
Subject: x86/asm/entry/32: Stop caching MSR_IA32_SYSENTER_ESP in tss.sp1

We write a stack pointer to MSR_IA32_SYSENTER_ESP exactly once,
and we unnecessarily cache the value in tss.sp1.  We never
read the cached value.

Remove all of the caching.  It serves no purpose.

Suggested-by: Denys Vlasenko <dvlasenk@redhat.com>
Signed-off-by: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/05a0163eb33ef5208363f0015496855da7cebadd.1428002830.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/processor.h | 22 +++++++++++-----------
 arch/x86/kernel/cpu/common.c     |  9 +++++----
 2 files changed, 16 insertions(+), 15 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 572099710ba2..d2203b5d9538 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -209,21 +209,21 @@ struct x86_hw_tss {
 	unsigned short		back_link, __blh;
 	unsigned long		sp0;
 	unsigned short		ss0, __ss0h;
+	unsigned long		sp1;
 
 	/*
-	 * We don't use ring 1, so sp1 and ss1 are convenient scratch
-	 * spaces in the same cacheline as sp0.  We use them to cache
-	 * some MSR values to avoid unnecessary wrmsr instructions.
+	 * We don't use ring 1, so ss1 is a convenient scratch space in
+	 * the same cacheline as sp0.  We use ss1 to cache the value in
+	 * MSR_IA32_SYSENTER_CS.  When we context switch
+	 * MSR_IA32_SYSENTER_CS, we first check if the new value being
+	 * written matches ss1, and, if it's not, then we wrmsr the new
+	 * value and update ss1.
 	 *
-	 * We use SYSENTER_ESP to find sp0 and for the NMI emergency
-	 * stack, but we need to context switch it because we do
-	 * horrible things to the kernel stack in vm86 mode.
-	 *
-	 * We use SYSENTER_CS to disable sysenter in vm86 mode to avoid
-	 * corrupting the stack if we went through the sysenter path
-	 * from vm86 mode.
+	 * The only reason we context switch MSR_IA32_SYSENTER_CS is
+	 * that we set it to zero in vm86 tasks to avoid corrupting the
+	 * stack if we were to go through the sysenter path from vm86
+	 * mode.
 	 */
-	unsigned long		sp1;	/* MSR_IA32_SYSENTER_ESP */
 	unsigned short		ss1;	/* MSR_IA32_SYSENTER_CS */
 
 	unsigned short		__ss1h;
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 71e4adcb15f1..a383d53bf0ed 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -976,15 +976,16 @@ void enable_sep_cpu(void)
 		goto out;
 
 	/*
-	 * The struct::SS1 and tss_struct::SP1 fields are not used by the hardware,
-	 * we cache the SYSENTER CS and ESP values there for easy access:
+	 * We cache MSR_IA32_SYSENTER_CS's value in the TSS's ss1 field --
+	 * see the big comment in struct x86_hw_tss's definition.
 	 */
 
 	tss->x86_tss.ss1 = __KERNEL_CS;
 	wrmsr(MSR_IA32_SYSENTER_CS, tss->x86_tss.ss1, 0);
 
-	tss->x86_tss.sp1 = (unsigned long)tss + offsetofend(struct tss_struct, SYSENTER_stack);
-	wrmsr(MSR_IA32_SYSENTER_ESP, tss->x86_tss.sp1, 0);
+	wrmsr(MSR_IA32_SYSENTER_ESP,
+	      (unsigned long)tss + offsetofend(struct tss_struct, SYSENTER_stack),
+	      0);
 
 	wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long)ia32_sysenter_target, 0);
 
-- 
cgit v1.2.3


From 162a688e84df49c5bcc855a5e5bf812d0ec89ad5 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Fri, 3 Apr 2015 02:01:28 +0200
Subject: x86/amd/idle, clockevents: Use explicit broadcast control function

Replace the clockevents_notify() call with an explicit function call.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1528188.S1pjqkSL1P@vostro.rjw.lan
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/process.c | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 046e2d620bbe..e94b733de302 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -9,7 +9,7 @@
 #include <linux/sched.h>
 #include <linux/module.h>
 #include <linux/pm.h>
-#include <linux/clockchips.h>
+#include <linux/tick.h>
 #include <linux/random.h>
 #include <linux/user-return-notifier.h>
 #include <linux/dmi.h>
@@ -377,11 +377,8 @@ static void amd_e400_idle(void)
 
 		if (!cpumask_test_cpu(cpu, amd_e400_c1e_mask)) {
 			cpumask_set_cpu(cpu, amd_e400_c1e_mask);
-			/*
-			 * Force broadcast so ACPI can not interfere.
-			 */
-			clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_FORCE,
-					   &cpu);
+			/* Force broadcast so ACPI can not interfere. */
+			tick_broadcast_force();
 			pr_info("Switch to broadcast mode on CPU%d\n", cpu);
 		}
 		clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu);
-- 
cgit v1.2.3


From 435c350e8197488f12c97e7df28a9c2199bd1673 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Fri, 3 Apr 2015 02:05:53 +0200
Subject: x86/amd/idle, clockevents: Use explicit broadcast oneshot control
 functions

Replace the clockevents_notify() call with an explicit function call.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/8569669.lgxIty9PKW@vostro.rjw.lan
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/process.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index e94b733de302..2f43c62fcd6d 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -381,7 +381,7 @@ static void amd_e400_idle(void)
 			tick_broadcast_force();
 			pr_info("Switch to broadcast mode on CPU%d\n", cpu);
 		}
-		clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu);
+		tick_broadcast_enter();
 
 		default_idle();
 
@@ -390,7 +390,7 @@ static void amd_e400_idle(void)
 		 * called with interrupts disabled.
 		 */
 		local_irq_disable();
-		clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &cpu);
+		tick_broadcast_exit();
 		local_irq_enable();
 	} else
 		default_idle();
-- 
cgit v1.2.3


From 4214a16b02971c60960afd675d03544e109e0d75 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski
Date: Thu, 2 Apr 2015 17:12:12 -0700
Subject: x86/asm/entry/64/compat: Use SYSRETL to return from compat mode
 SYSENTER

SYSEXIT is scary on 64-bit kernels -- SYSEXIT must be invoked
with usergs and IRQs on.  That means that we rely on STI to
correctly mask interrupts for one instruction.  This is okay by
itself, but the semantics with respect to NMIs are unclear.

Avoid the whole issue by using SYSRETL instead.  For background,
Intel CPUs don't allow SYSCALL from compat mode, but they do
allow SYSRETL back to compat mode.  Go figure.

To avoid doing too much at once, this doesn't revamp the calling
convention.  We still return with EBP, EDX, and ECX on the user
stack.

Oddly this seems to be 30 cycles or so faster.  Avoiding POPFQ
and STI will account for under half of that, I think, so my best
guess is that Intel just optimizes SYSRET much better than
SYSEXIT.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Denys Vlasenko <vda.linux@googlemail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/57a0bf1b5230b2716a64ebe48e9bc1110f7ab433.1428019097.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/ia32/ia32entry.S | 53 +++++++++++++++++++++++++++++++++++------------
 1 file changed, 40 insertions(+), 13 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index 8d01cce7b6b8..5d8f987a340d 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -186,28 +186,55 @@ sysenter_dispatch:
 	testl	$_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
 	jnz	sysexit_audit
 sysexit_from_sys_call:
+	/*
+	 * NB: SYSEXIT is not obviously safe for 64-bit kernels -- an
+	 * NMI between STI and SYSEXIT has poorly specified behavior,
+	 * and and NMI followed by an IRQ with usergs is fatal.  So
+	 * we just pretend we're using SYSEXIT but we really use
+	 * SYSRETL instead.
+	 *
+	 * This code path is still called 'sysexit' because it pairs
+	 * with 'sysenter' and it uses the SYSENTER calling convention.
+	 */
 	andl    $~TS_COMPAT,ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
-	/* clear IF, that popfq doesn't enable interrupts early */
-	andl	$~0x200,EFLAGS(%rsp)
-	movl	RIP(%rsp),%edx		/* User %eip */
-	CFI_REGISTER rip,rdx
+	movl	RIP(%rsp),%ecx		/* User %eip */
+	CFI_REGISTER rip,rcx
 	RESTORE_RSI_RDI
-	/* pop everything except ss,rsp,rflags slots */
-	REMOVE_PT_GPREGS_FROM_STACK 3*8
+	xorl	%edx,%edx		/* avoid info leaks */
 	xorq	%r8,%r8
 	xorq	%r9,%r9
 	xorq	%r10,%r10
-	xorq	%r11,%r11
-	popfq_cfi
+	movl	EFLAGS(%rsp),%r11d	/* User eflags */
 	/*CFI_RESTORE rflags*/
-	popq_cfi %rcx				/* User %esp */
-	CFI_REGISTER rsp,rcx
 	TRACE_IRQS_ON
+
 	/*
-	 * 32bit SYSEXIT restores eip from edx, esp from ecx.
-	 * cs and ss are loaded from MSRs.
+	 * SYSRETL works even on Intel CPUs.  Use it in preference to SYSEXIT,
+	 * since it avoids a dicey window with interrupts enabled.
 	 */
-	ENABLE_INTERRUPTS_SYSEXIT32
+	movl	RSP(%rsp),%esp
+
+	/*
+	 * USERGS_SYSRET32 does:
+	 *  gsbase = user's gs base
+	 *  eip = ecx
+	 *  rflags = r11
+	 *  cs = __USER32_CS
+	 *  ss = __USER_DS
+	 *
+	 * The prologue set RIP(%rsp) to VDSO32_SYSENTER_RETURN, which does:
+	 *
+	 *  pop %ebp
+	 *  pop %edx
+	 *  pop %ecx
+	 *
+	 * Therefore, we invoke SYSRETL with EDX and R8-R10 zeroed to
+	 * avoid info leaks.  R11 ends up with VDSO32_SYSENTER_RETURN's
+	 * address (already known to user code), and R12-R15 are
+	 * callee-saved and therefore don't contain any interesting
+	 * kernel data.
+	 */
+	USERGS_SYSRET32
 
 	CFI_RESTORE_STATE
 
-- 
cgit v1.2.3


From 47091e3c5b072daca29a15d2a3caf40359b0d140 Mon Sep 17 00:00:00 2001
From: Borislav Petkov
Date: Fri, 3 Apr 2015 10:28:34 +0200
Subject: x86/asm/entry: Drop now unused ENABLE_INTERRUPTS_SYSEXIT32

Commit:

  4214a16b0297 ("x86/asm/entry/64/compat: Use SYSRETL to return from compat mode SYSENTER")

removed the last user of ENABLE_INTERRUPTS_SYSEXIT32. Kill the
macro now too.

Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: virtualization@lists.linux-foundation.org
Link: http://lkml.kernel.org/r/1428049714-829-1-git-send-email-bp@alien8.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/irqflags.h | 4 ----
 arch/x86/include/asm/paravirt.h | 5 -----
 2 files changed, 9 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h
index 9a63eae04e4b..b77f5edb03b0 100644
--- a/arch/x86/include/asm/irqflags.h
+++ b/arch/x86/include/asm/irqflags.h
@@ -136,10 +136,6 @@ static inline notrace unsigned long arch_local_irq_save(void)
 #define USERGS_SYSRET32				\
 	swapgs;					\
 	sysretl
-#define ENABLE_INTERRUPTS_SYSEXIT32		\
-	swapgs;					\
-	sti;					\
-	sysexit
 
 #else
 #define INTERRUPT_RETURN		iret
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 965c47d254aa..5f6051d5d139 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -976,11 +976,6 @@ extern void default_banner(void);
 	PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_usergs_sysret64),	\
 		  CLBR_NONE,						\
 		  jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_usergs_sysret64))
-
-#define ENABLE_INTERRUPTS_SYSEXIT32					\
-	PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_irq_enable_sysexit),	\
-		  CLBR_NONE,						\
-		  jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_irq_enable_sysexit))
 #endif	/* CONFIG_X86_32 */
 
 #endif /* __ASSEMBLY__ */
-- 
cgit v1.2.3


From cee8f5a6c8c917613dd021552909d071b1dab592 Mon Sep 17 00:00:00 2001
From: Aravind Gopalakrishnan
Date: Tue, 31 Mar 2015 10:04:41 -0500
Subject: x86/mce/severity: Fix warning about indented braces

Dan reported compiler warnings about missing curly braces in
mce_severity_amd(). Reindent the catch-all "return MCE_AR_SEVERITY"
correctly to single tab.

While at it, chain ctx == IN_KERNEL check with mcgstatus check to make
it cleaner, as suggested by Boris.

No functional changes are introduced by this patch.

Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Aravind Gopalakrishnan <Aravind.Gopalakrishnan@amd.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tony Luck <tony.luck@intel.com>
Cc: linux-edac <linux-edac@vger.kernel.org>
Link: http://lkml.kernel.org/r/1427814281-18192-1-git-send-email-Aravind.Gopalakrishnan@amd.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/mcheck/mce-severity.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c
index 155c9261d3ef..9c682c222071 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-severity.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c
@@ -208,12 +208,11 @@ static int mce_severity_amd(struct mce *m, int tolerant, char **msg, bool is_exc
 		 */
 		if (mce_flags.overflow_recov) {
 			/* software can try to contain */
-			if (!(m->mcgstatus & MCG_STATUS_RIPV))
-				if (ctx == IN_KERNEL)
-					return MCE_PANIC_SEVERITY;
+			if (!(m->mcgstatus & MCG_STATUS_RIPV) && (ctx == IN_KERNEL))
+				return MCE_PANIC_SEVERITY;
 
-				/* kill current process */
-				return MCE_AR_SEVERITY;
+			/* kill current process */
+			return MCE_AR_SEVERITY;
 		} else {
 			/* at least one error was not logged */
 			if (m->status & MCI_STATUS_OVER)
-- 
cgit v1.2.3


From 7f99f8b94c9355acdb14f1be28cb19aac741da68 Mon Sep 17 00:00:00 2001
From: Mark Einon
Date: Wed, 1 Apr 2015 22:32:04 +0100
Subject: x86/earlyprintk: Put CONFIG_PCI-only functions under the #ifdef

Two static functions are only used if CONFIG_PCI is defined, so only
build them if this is the case. Fixes the build warnings:

  arch/x86/kernel/early_printk.c:98:13: warning: ‘mem32_serial_out’ defined but not used [-Wunused-function]
   static void mem32_serial_out(unsigned long addr, int offset, int value)
             ^
  arch/x86/kernel/early_printk.c:105:21: warning: ‘mem32_serial_in’ defined but not used [-Wunused-function]
   static unsigned int mem32_serial_in(unsigned long addr, int offset)
                     ^

Also convert a few related instances of uintXX_t to kernel specific uXX
defines.

Signed-off-by: Mark Einon <mark.einon@gmail.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: stuart.r.anderson@intel.com
Link: http://lkml.kernel.org/r/1427923924-22653-1-git-send-email-mark.einon@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/early_printk.c | 32 ++++++++++++++++----------------
 1 file changed, 16 insertions(+), 16 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c
index a62536a1be88..49ff55ef9b26 100644
--- a/arch/x86/kernel/early_printk.c
+++ b/arch/x86/kernel/early_printk.c
@@ -95,20 +95,6 @@ static unsigned long early_serial_base = 0x3f8;  /* ttyS0 */
 #define DLL             0       /*  Divisor Latch Low         */
 #define DLH             1       /*  Divisor latch High        */
 
-static void mem32_serial_out(unsigned long addr, int offset, int value)
-{
-	uint32_t *vaddr = (uint32_t *)addr;
-	/* shift implied by pointer type */
-	writel(value, vaddr + offset);
-}
-
-static unsigned int mem32_serial_in(unsigned long addr, int offset)
-{
-	uint32_t *vaddr = (uint32_t *)addr;
-	/* shift implied by pointer type */
-	return readl(vaddr + offset);
-}
-
 static unsigned int io_serial_in(unsigned long addr, int offset)
 {
 	return inb(addr + offset);
@@ -205,6 +191,20 @@ static __init void early_serial_init(char *s)
 }
 
 #ifdef CONFIG_PCI
+static void mem32_serial_out(unsigned long addr, int offset, int value)
+{
+	u32 *vaddr = (u32 *)addr;
+	/* shift implied by pointer type */
+	writel(value, vaddr + offset);
+}
+
+static unsigned int mem32_serial_in(unsigned long addr, int offset)
+{
+	u32 *vaddr = (u32 *)addr;
+	/* shift implied by pointer type */
+	return readl(vaddr + offset);
+}
+
 /*
  * early_pci_serial_init()
  *
@@ -217,8 +217,8 @@ static __init void early_pci_serial_init(char *s)
 	unsigned divisor;
 	unsigned long baud = DEFAULT_BAUD;
 	u8 bus, slot, func;
-	uint32_t classcode, bar0;
-	uint16_t cmdreg;
+	u32 classcode, bar0;
+	u16 cmdreg;
 	char *e;
 
 
-- 
cgit v1.2.3


From 78cac48c0434c82e860fade3cd0420a7a4adbb08 Mon Sep 17 00:00:00 2001
From: Borislav Petkov
Date: Wed, 1 Apr 2015 12:49:52 +0200
Subject: x86/mm/KASLR: Propagate KASLR status to kernel proper

Commit:

  e2b32e678513 ("x86, kaslr: randomize module base load address")

made module base address randomization unconditional and didn't regard
disabled KKASLR due to CONFIG_HIBERNATION and command line option
"nokaslr". For more info see (now reverted) commit:

  f47233c2d34f ("x86/mm/ASLR: Propagate base load address calculation")

In order to propagate KASLR status to kernel proper, we need a single bit
in boot_params.hdr.loadflags and we've chosen bit 1 thus leaving the
top-down allocated bits for bits supposed to be used by the bootloader.

Originally-From: Jiri Kosina <jkosina@suse.cz>
Suggested-by: H. Peter Anvin <hpa@zytor.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: Kees Cook <keescook@chromium.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 Documentation/x86/boot.txt            |  6 ++++++
 arch/x86/boot/compressed/aslr.c       |  5 ++++-
 arch/x86/boot/compressed/misc.c       |  5 ++++-
 arch/x86/boot/compressed/misc.h       |  6 ++++--
 arch/x86/include/asm/setup.h          |  5 +++++
 arch/x86/include/uapi/asm/bootparam.h |  1 +
 arch/x86/kernel/module.c              | 11 ++---------
 arch/x86/kernel/setup.c               | 13 +++++++++----
 8 files changed, 35 insertions(+), 17 deletions(-)

(limited to 'arch/x86')

diff --git a/Documentation/x86/boot.txt b/Documentation/x86/boot.txt
index a75e3adaa39d..88b85899d309 100644
--- a/Documentation/x86/boot.txt
+++ b/Documentation/x86/boot.txt
@@ -406,6 +406,12 @@ Protocol:	2.00+
 	- If 0, the protected-mode code is loaded at 0x10000.
 	- If 1, the protected-mode code is loaded at 0x100000.
 
+  Bit 1 (kernel internal): ALSR_FLAG
+	- Used internally by the compressed kernel to communicate
+	  KASLR status to kernel proper.
+	  If 1, KASLR enabled.
+	  If 0, KASLR disabled.
+
   Bit 5 (write): QUIET_FLAG
 	- If 0, print early messages.
 	- If 1, suppress early messages.
diff --git a/arch/x86/boot/compressed/aslr.c b/arch/x86/boot/compressed/aslr.c
index bb1376381985..d7b1f655b3ef 100644
--- a/arch/x86/boot/compressed/aslr.c
+++ b/arch/x86/boot/compressed/aslr.c
@@ -295,7 +295,8 @@ static unsigned long find_random_addr(unsigned long minimum,
 	return slots_fetch_random();
 }
 
-unsigned char *choose_kernel_location(unsigned char *input,
+unsigned char *choose_kernel_location(struct boot_params *boot_params,
+				      unsigned char *input,
 				      unsigned long input_size,
 				      unsigned char *output,
 				      unsigned long output_size)
@@ -315,6 +316,8 @@ unsigned char *choose_kernel_location(unsigned char *input,
 	}
 #endif
 
+	boot_params->hdr.loadflags |= KASLR_FLAG;
+
 	/* Record the various known unsafe memory ranges. */
 	mem_avoid_init((unsigned long)input, input_size,
 		       (unsigned long)output, output_size);
diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index a950864a64da..a107b935e22f 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -377,6 +377,9 @@ asmlinkage __visible void *decompress_kernel(void *rmode, memptr heap,
 
 	real_mode = rmode;
 
+	/* Clear it for solely in-kernel use */
+	real_mode->hdr.loadflags &= ~KASLR_FLAG;
+
 	sanitize_boot_params(real_mode);
 
 	if (real_mode->screen_info.orig_video_mode == 7) {
@@ -401,7 +404,7 @@ asmlinkage __visible void *decompress_kernel(void *rmode, memptr heap,
 	 * the entire decompressed kernel plus relocation table, or the
 	 * entire decompressed kernel plus .bss and .brk sections.
 	 */
-	output = choose_kernel_location(input_data, input_len, output,
+	output = choose_kernel_location(real_mode, input_data, input_len, output,
 					output_len > run_size ? output_len
 							      : run_size);
 
diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h
index 04477d68403f..89dd0d78013a 100644
--- a/arch/x86/boot/compressed/misc.h
+++ b/arch/x86/boot/compressed/misc.h
@@ -57,7 +57,8 @@ int cmdline_find_option_bool(const char *option);
 
 #if CONFIG_RANDOMIZE_BASE
 /* aslr.c */
-unsigned char *choose_kernel_location(unsigned char *input,
+unsigned char *choose_kernel_location(struct boot_params *boot_params,
+				      unsigned char *input,
 				      unsigned long input_size,
 				      unsigned char *output,
 				      unsigned long output_size);
@@ -65,7 +66,8 @@ unsigned char *choose_kernel_location(unsigned char *input,
 bool has_cpuflag(int flag);
 #else
 static inline
-unsigned char *choose_kernel_location(unsigned char *input,
+unsigned char *choose_kernel_location(struct boot_params *boot_params,
+				      unsigned char *input,
 				      unsigned long input_size,
 				      unsigned char *output,
 				      unsigned long output_size)
diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h
index ff4e7b236e21..f69e06b283fb 100644
--- a/arch/x86/include/asm/setup.h
+++ b/arch/x86/include/asm/setup.h
@@ -66,6 +66,11 @@ static inline void x86_ce4100_early_setup(void) { }
  */
 extern struct boot_params boot_params;
 
+static inline bool kaslr_enabled(void)
+{
+	return !!(boot_params.hdr.loadflags & KASLR_FLAG);
+}
+
 /*
  * Do NOT EVER look at the BIOS memory size location.
  * It does not work on many machines.
diff --git a/arch/x86/include/uapi/asm/bootparam.h b/arch/x86/include/uapi/asm/bootparam.h
index 225b0988043a..ab456dc233b5 100644
--- a/arch/x86/include/uapi/asm/bootparam.h
+++ b/arch/x86/include/uapi/asm/bootparam.h
@@ -15,6 +15,7 @@
 
 /* loadflags */
 #define LOADED_HIGH	(1<<0)
+#define KASLR_FLAG	(1<<1)
 #define QUIET_FLAG	(1<<5)
 #define KEEP_SEGMENTS	(1<<6)
 #define CAN_USE_HEAP	(1<<7)
diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
index d1ac80b72c72..005c03e93fc5 100644
--- a/arch/x86/kernel/module.c
+++ b/arch/x86/kernel/module.c
@@ -33,6 +33,7 @@
 
 #include <asm/page.h>
 #include <asm/pgtable.h>
+#include <asm/setup.h>
 
 #if 0
 #define DEBUGP(fmt, ...)				\
@@ -47,21 +48,13 @@ do {							\
 
 #ifdef CONFIG_RANDOMIZE_BASE
 static unsigned long module_load_offset;
-static int randomize_modules = 1;
 
 /* Mutex protects the module_load_offset. */
 static DEFINE_MUTEX(module_kaslr_mutex);
 
-static int __init parse_nokaslr(char *p)
-{
-	randomize_modules = 0;
-	return 0;
-}
-early_param("nokaslr", parse_nokaslr);
-
 static unsigned long int get_module_load_offset(void)
 {
-	if (randomize_modules) {
+	if (kaslr_enabled()) {
 		mutex_lock(&module_kaslr_mutex);
 		/*
 		 * Calculate the module_load_offset the first time this
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 0a2421cca01f..014466b152b5 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -832,10 +832,15 @@ static void __init trim_low_memory_range(void)
 static int
 dump_kernel_offset(struct notifier_block *self, unsigned long v, void *p)
 {
-	pr_emerg("Kernel Offset: 0x%lx from 0x%lx "
-		 "(relocation range: 0x%lx-0x%lx)\n",
-		 (unsigned long)&_text - __START_KERNEL, __START_KERNEL,
-		 __START_KERNEL_map, MODULES_VADDR-1);
+	if (kaslr_enabled()) {
+		pr_emerg("Kernel Offset: 0x%lx from 0x%lx (relocation range: 0x%lx-0x%lx)\n",
+			 (unsigned long)&_text - __START_KERNEL,
+			 __START_KERNEL,
+			 __START_KERNEL_map,
+			 MODULES_VADDR-1);
+	} else {
+		pr_emerg("Kernel Offset: disabled\n");
+	}
 
 	return 0;
 }
-- 
cgit v1.2.3


From 7c74d5b7b7b63fc279ed446ebd78de20d81f2824 Mon Sep 17 00:00:00 2001
From: Borislav Petkov
Date: Fri, 3 Apr 2015 11:42:10 +0200
Subject: x86/asm/entry/64: Fix MSR_IA32_SYSENTER_CS MSR value

Commit:

  d56fe4bf5f3c ("x86/asm/entry/64: Always set up SYSENTER MSRs")

missed to add "ULL" to the 0 and wrmsrl_safe() complains:

  arch/x86/kernel/cpu/common.c: In function ‘syscall_init’:
  arch/x86/kernel/cpu/common.c:1226:2: warning: right shift count >= width of type wrmsrl_safe(MSR_IA32_SYSENTER_CS, 0);

Fix it.

Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: Alexei Starovoitov <ast@plumgrid.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Will Drewry <wad@chromium.org>
Link: http://lkml.kernel.org/r/1428054130-25847-1-git-send-email-bp@alien8.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/common.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index a383d53bf0ed..d56d30714d43 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1184,7 +1184,7 @@ void syscall_init(void)
 	wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)ia32_sysenter_target);
 #else
 	wrmsrl(MSR_CSTAR, ignore_sysret);
-	wrmsrl_safe(MSR_IA32_SYSENTER_CS, 0);
+	wrmsrl_safe(MSR_IA32_SYSENTER_CS, 0ULL);
 	wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
 	wrmsrl_safe(MSR_IA32_SYSENTER_EIP, 0ULL);
 #endif
-- 
cgit v1.2.3


From 6b51311c976593fb7311322b1647a912cd456ec4 Mon Sep 17 00:00:00 2001
From: Borislav Petkov
Date: Fri, 3 Apr 2015 14:25:28 +0200
Subject: x86/asm/entry/64: Use a define for an invalid segment selector

... instead of a naked number, for better readability.

Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: Alexei Starovoitov <ast@plumgrid.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Will Drewry <wad@chromium.org>
Link: http://lkml.kernel.org/r/1428054130-25847-1-git-send-email-bp@alien8.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/segment.h | 2 ++
 arch/x86/kernel/cpu/common.c   | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h
index d394899e055c..5a9856eb12ba 100644
--- a/arch/x86/include/asm/segment.h
+++ b/arch/x86/include/asm/segment.h
@@ -39,6 +39,8 @@
 /* ... GDT has it cleared */
 #define SEGMENT_GDT		0x0
 
+#define GDT_ENTRY_INVALID_SEG	0
+
 #ifdef CONFIG_X86_32
 /*
  * The layout of the per-CPU GDT under Linux:
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index d56d30714d43..3f70538012e2 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1184,7 +1184,7 @@ void syscall_init(void)
 	wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)ia32_sysenter_target);
 #else
 	wrmsrl(MSR_CSTAR, ignore_sysret);
-	wrmsrl_safe(MSR_IA32_SYSENTER_CS, 0ULL);
+	wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)GDT_ENTRY_INVALID_SEG);
 	wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
 	wrmsrl_safe(MSR_IA32_SYSENTER_EIP, 0ULL);
 #endif
-- 
cgit v1.2.3


From dbe4058a6a44af4ca5d146aebe01b0a1f9b7fd2a Mon Sep 17 00:00:00 2001
From: Borislav Petkov
Date: Sat, 4 Apr 2015 15:34:43 +0200
Subject: x86/alternatives: Fix ALTERNATIVE_2 padding generation properly

Quentin caught a corner case with the generation of instruction
padding in the ALTERNATIVE_2 macro: if len(orig_insn) <
len(alt1) < len(alt2), then not enough padding gets added and
that is not good(tm) as we could overwrite the beginning of the
next instruction.

Luckily, at the time of this writing, we don't have
ALTERNATIVE_2() invocations which have that problem and even if
we did, a simple fix would be to prepend the instructions with
enough prefixes so that that corner case doesn't happen.

However, best it would be if we fixed it properly. See below for
a simple, abstracted example of what we're doing.

So what we ended up doing is, we compute the

	max(len(alt1), len(alt2)) - len(orig_insn)

and feed that value to the .skip gas directive. The max() cannot
have conditionals due to gas limitations, thus the fancy integer
math.

With this patch, all ALTERNATIVE_2 sites get padded correctly;
generating obscure test cases pass too:

  #define alt_max_short(a, b)    ((a) ^ (((a) ^ (b)) & -(-((a) < (b)))))

  #define gen_skip(orig, alt1, alt2, marker)	\
  	.skip -((alt_max_short(alt1, alt2) - (orig)) > 0) * \
  		(alt_max_short(alt1, alt2) - (orig)),marker

  	.pushsection .text, "ax"
  .globl main
  main:
  	gen_skip(1, 2, 4, 0x09)
  	gen_skip(4, 1, 2, 0x10)
  	...
  	.popsection

Thanks to Quentin for catching it and double-checking the fix!

Reported-by: Quentin Casasnovas <quentin.casasnovas@oracle.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20150404133443.GE21152@pd.tnic
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/alternative-asm.h | 14 ++++++++++++--
 arch/x86/include/asm/alternative.h     | 16 ++++++++++++----
 arch/x86/kernel/alternative.c          |  4 ++--
 3 files changed, 26 insertions(+), 8 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/alternative-asm.h b/arch/x86/include/asm/alternative-asm.h
index 524bddce0b76..bdf02eeee765 100644
--- a/arch/x86/include/asm/alternative-asm.h
+++ b/arch/x86/include/asm/alternative-asm.h
@@ -45,12 +45,22 @@
 	.popsection
 .endm
 
+#define old_len			141b-140b
+#define new_len1		144f-143f
+#define new_len2		145f-144f
+
+/*
+ * max without conditionals. Idea adapted from:
+ * http://graphics.stanford.edu/~seander/bithacks.html#IntegerMinOrMax
+ */
+#define alt_max_short(a, b)	((a) ^ (((a) ^ (b)) & -(-((a) < (b)))))
+
 .macro ALTERNATIVE_2 oldinstr, newinstr1, feature1, newinstr2, feature2
 140:
 	\oldinstr
 141:
-	.skip -(((144f-143f)-(141b-140b)) > 0) * ((144f-143f)-(141b-140b)),0x90
-	.skip -(((145f-144f)-(144f-143f)-(141b-140b)) > 0) * ((145f-144f)-(144f-143f)-(141b-140b)),0x90
+	.skip -((alt_max_short(new_len1, new_len2) - (old_len)) > 0) * \
+		(alt_max_short(new_len1, new_len2) - (old_len)),0x90
 142:
 
 	.pushsection .altinstructions,"a"
diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index 5aef6a97d80e..ba32af062f61 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -95,14 +95,22 @@ static inline int alternatives_text_reserved(void *start, void *end)
 	__OLDINSTR(oldinstr, num)					\
 	alt_end_marker ":\n"
 
+/*
+ * max without conditionals. Idea adapted from:
+ * http://graphics.stanford.edu/~seander/bithacks.html#IntegerMinOrMax
+ *
+ * The additional "-" is needed because gas works with s32s.
+ */
+#define alt_max_short(a, b)	"((" a ") ^ (((" a ") ^ (" b ")) & -(-((" a ") - (" b ")))))"
+
 /*
  * Pad the second replacement alternative with additional NOPs if it is
  * additionally longer than the first replacement alternative.
  */
-#define OLDINSTR_2(oldinstr, num1, num2)					\
-	__OLDINSTR(oldinstr, num1)						\
-	".skip -(((" alt_rlen(num2) ")-(" alt_rlen(num1) ")-(662b-661b)) > 0) * " \
-		"((" alt_rlen(num2) ")-(" alt_rlen(num1) ")-(662b-661b)),0x90\n"  \
+#define OLDINSTR_2(oldinstr, num1, num2) \
+	"661:\n\t" oldinstr "\n662:\n"								\
+	".skip -((" alt_max_short(alt_rlen(num1), alt_rlen(num2)) " - (" alt_slen ")) > 0) * "	\
+		"(" alt_max_short(alt_rlen(num1), alt_rlen(num2)) " - (" alt_slen ")), 0x90\n"	\
 	alt_end_marker ":\n"
 
 #define ALTINSTR_ENTRY(feature, num)					      \
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 5c993c94255e..7c4ad005d7a0 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -369,11 +369,11 @@ void __init_or_module apply_alternatives(struct alt_instr *start,
 			continue;
 		}
 
-		DPRINTK("feat: %d*32+%d, old: (%p, len: %d), repl: (%p, len: %d)",
+		DPRINTK("feat: %d*32+%d, old: (%p, len: %d), repl: (%p, len: %d), pad: %d",
 			a->cpuid >> 5,
 			a->cpuid & 0x1f,
 			instr, a->instrlen,
-			replacement, a->replacementlen);
+			replacement, a->replacementlen, a->padlen);
 
 		DUMP_BYTES(instr, a->instrlen, "%p: old_insn: ", instr);
 		DUMP_BYTES(replacement, a->replacementlen, "%p: rpl_insn: ", replacement);
-- 
cgit v1.2.3


From 6a3713f001b3b53587e411ab0d3036ae9b0fb93b Mon Sep 17 00:00:00 2001
From: Brian Gerst
Date: Sat, 4 Apr 2015 08:58:23 -0400
Subject: x86/signal: Remove pax argument from restore_sigcontext

The 'pax' argument is unnecesary.  Instead, store the RAX value
directly in regs.

This pattern goes all the way back to 2.1.106pre1, when restore_sigcontext()
was changed to return an error code instead of EAX directly:

  https://git.kernel.org/cgit/linux/kernel/git/history/history.git/diff/arch/i386/kernel/signal.c?id=9a8f8b7ca3f319bd668298d447bdf32730e51174

In 2007 sigaltstack syscall support was added, where the return
value of restore_sigcontext() was changed to carry the memory-copying
failure code.

But instead of putting 'ax' into regs->ax directly, it was carried
in via a pointer and then returned, where the generic syscall return
code copied it to regs->ax.

So there was never any deeper reason for this suboptimal pattern, it
was simply never noticed after being introduced.

Signed-off-by: Brian Gerst <brgerst@gmail.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1428152303-17154-1-git-send-email-brgerst@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/ia32/ia32_signal.c        | 17 ++++++-----------
 arch/x86/include/asm/sighandling.h |  4 +---
 arch/x86/kernel/signal.c           | 22 ++++++++--------------
 3 files changed, 15 insertions(+), 28 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c
index 1f5e2b0e09ff..c81d35e6c7f1 100644
--- a/arch/x86/ia32/ia32_signal.c
+++ b/arch/x86/ia32/ia32_signal.c
@@ -161,8 +161,7 @@ int copy_siginfo_from_user32(siginfo_t *to, compat_siginfo_t __user *from)
 }
 
 static int ia32_restore_sigcontext(struct pt_regs *regs,
-				   struct sigcontext_ia32 __user *sc,
-				   unsigned int *pax)
+				   struct sigcontext_ia32 __user *sc)
 {
 	unsigned int tmpflags, err = 0;
 	void __user *buf;
@@ -184,7 +183,7 @@ static int ia32_restore_sigcontext(struct pt_regs *regs,
 		RELOAD_SEG(es);
 
 		COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx);
-		COPY(dx); COPY(cx); COPY(ip);
+		COPY(dx); COPY(cx); COPY(ip); COPY(ax);
 		/* Don't touch extended registers */
 
 		COPY_SEG_CPL3(cs);
@@ -197,8 +196,6 @@ static int ia32_restore_sigcontext(struct pt_regs *regs,
 
 		get_user_ex(tmp, &sc->fpstate);
 		buf = compat_ptr(tmp);
-
-		get_user_ex(*pax, &sc->ax);
 	} get_user_catch(err);
 
 	err |= restore_xstate_sig(buf, 1);
@@ -213,7 +210,6 @@ asmlinkage long sys32_sigreturn(void)
 	struct pt_regs *regs = current_pt_regs();
 	struct sigframe_ia32 __user *frame = (struct sigframe_ia32 __user *)(regs->sp-8);
 	sigset_t set;
-	unsigned int ax;
 
 	if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
 		goto badframe;
@@ -226,9 +222,9 @@ asmlinkage long sys32_sigreturn(void)
 
 	set_current_blocked(&set);
 
-	if (ia32_restore_sigcontext(regs, &frame->sc, &ax))
+	if (ia32_restore_sigcontext(regs, &frame->sc))
 		goto badframe;
-	return ax;
+	return regs->ax;
 
 badframe:
 	signal_fault(regs, frame, "32bit sigreturn");
@@ -240,7 +236,6 @@ asmlinkage long sys32_rt_sigreturn(void)
 	struct pt_regs *regs = current_pt_regs();
 	struct rt_sigframe_ia32 __user *frame;
 	sigset_t set;
-	unsigned int ax;
 
 	frame = (struct rt_sigframe_ia32 __user *)(regs->sp - 4);
 
@@ -251,13 +246,13 @@ asmlinkage long sys32_rt_sigreturn(void)
 
 	set_current_blocked(&set);
 
-	if (ia32_restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax))
+	if (ia32_restore_sigcontext(regs, &frame->uc.uc_mcontext))
 		goto badframe;
 
 	if (compat_restore_altstack(&frame->uc.uc_stack))
 		goto badframe;
 
-	return ax;
+	return regs->ax;
 
 badframe:
 	signal_fault(regs, frame, "32bit rt sigreturn");
diff --git a/arch/x86/include/asm/sighandling.h b/arch/x86/include/asm/sighandling.h
index 7a958164088c..89db46752a8f 100644
--- a/arch/x86/include/asm/sighandling.h
+++ b/arch/x86/include/asm/sighandling.h
@@ -13,9 +13,7 @@
 			 X86_EFLAGS_CF | X86_EFLAGS_RF)
 
 void signal_fault(struct pt_regs *regs, void __user *frame, char *where);
-
-int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
-		       unsigned long *pax);
+int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc);
 int setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate,
 		     struct pt_regs *regs, unsigned long mask);
 
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index eaa2c5e3f2cd..53cc4085c3d7 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -61,8 +61,7 @@
 	regs->seg = GET_SEG(seg) | 3;			\
 } while (0)
 
-int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
-		       unsigned long *pax)
+int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc)
 {
 	void __user *buf;
 	unsigned int tmpflags;
@@ -81,7 +80,7 @@ int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
 #endif /* CONFIG_X86_32 */
 
 		COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx);
-		COPY(dx); COPY(cx); COPY(ip);
+		COPY(dx); COPY(cx); COPY(ip); COPY(ax);
 
 #ifdef CONFIG_X86_64
 		COPY(r8);
@@ -102,8 +101,6 @@ int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
 		regs->orig_ax = -1;		/* disable syscall checks */
 
 		get_user_ex(buf, &sc->fpstate);
-
-		get_user_ex(*pax, &sc->ax);
 	} get_user_catch(err);
 
 	err |= restore_xstate_sig(buf, config_enabled(CONFIG_X86_32));
@@ -545,7 +542,6 @@ asmlinkage unsigned long sys_sigreturn(void)
 {
 	struct pt_regs *regs = current_pt_regs();
 	struct sigframe __user *frame;
-	unsigned long ax;
 	sigset_t set;
 
 	frame = (struct sigframe __user *)(regs->sp - 8);
@@ -559,9 +555,9 @@ asmlinkage unsigned long sys_sigreturn(void)
 
 	set_current_blocked(&set);
 
-	if (restore_sigcontext(regs, &frame->sc, &ax))
+	if (restore_sigcontext(regs, &frame->sc))
 		goto badframe;
-	return ax;
+	return regs->ax;
 
 badframe:
 	signal_fault(regs, frame, "sigreturn");
@@ -574,7 +570,6 @@ asmlinkage long sys_rt_sigreturn(void)
 {
 	struct pt_regs *regs = current_pt_regs();
 	struct rt_sigframe __user *frame;
-	unsigned long ax;
 	sigset_t set;
 
 	frame = (struct rt_sigframe __user *)(regs->sp - sizeof(long));
@@ -585,13 +580,13 @@ asmlinkage long sys_rt_sigreturn(void)
 
 	set_current_blocked(&set);
 
-	if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax))
+	if (restore_sigcontext(regs, &frame->uc.uc_mcontext))
 		goto badframe;
 
 	if (restore_altstack(&frame->uc.uc_stack))
 		goto badframe;
 
-	return ax;
+	return regs->ax;
 
 badframe:
 	signal_fault(regs, frame, "rt_sigreturn");
@@ -786,7 +781,6 @@ asmlinkage long sys32_x32_rt_sigreturn(void)
 	struct pt_regs *regs = current_pt_regs();
 	struct rt_sigframe_x32 __user *frame;
 	sigset_t set;
-	unsigned long ax;
 
 	frame = (struct rt_sigframe_x32 __user *)(regs->sp - 8);
 
@@ -797,13 +791,13 @@ asmlinkage long sys32_x32_rt_sigreturn(void)
 
 	set_current_blocked(&set);
 
-	if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax))
+	if (restore_sigcontext(regs, &frame->uc.uc_mcontext))
 		goto badframe;
 
 	if (compat_restore_altstack(&frame->uc.uc_stack))
 		goto badframe;
 
-	return ax;
+	return regs->ax;
 
 badframe:
 	signal_fault(regs, frame, "x32 rt_sigreturn");
-- 
cgit v1.2.3


From fc3e958a2b552fe6210e6de3bebb4348156a0b5f Mon Sep 17 00:00:00 2001
From: Denys Vlasenko
Date: Sat, 4 Apr 2015 20:55:19 +0200
Subject: x86/asm/entry: Clear EXTRA_REGS for all executable formats

On failure, sys_execve() does not clobber EXTRA_REGS, so we can
just return to userpsace without saving/restoring them.

On success, ELF_PLAT_INIT() in sys_execve() clears all these
registers.

On other executable formats:

  - binfmt_flat.c has similar FLAT_PLAT_INIT, but x86 (and everyone
    else except sh) doesn't define it.

  - binfmt_elf_fdpic.c has ELF_FDPIC_PLAT_INIT, but x86 (and most
    others) doesn't define it.

  - There are no such hooks in binfmt_aout.c et al. We inherit
    EXTRA_REGS from the prior executable.

This inconsistency was not intended.

This change removes SAVE/RESTORE_EXTRA_REGS in stub_execve,
removes register clearing in ELF_PLAT_INIT(),
and instead simply clears them on success in stub_execve.

Run-tested.

Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Alexei Starovoitov <ast@plumgrid.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Will Drewry <wad@chromium.org>
Link: http://lkml.kernel.org/r/1428173719-7637-1-git-send-email-dvlasenk@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/calling.h |  9 ++++++++
 arch/x86/include/asm/elf.h     |  7 +++---
 arch/x86/kernel/entry_64.S     | 50 +++++++++++++++++++-----------------------
 3 files changed, 35 insertions(+), 31 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/calling.h b/arch/x86/include/asm/calling.h
index 4b5f7bf2b780..1c8b50edb2db 100644
--- a/arch/x86/include/asm/calling.h
+++ b/arch/x86/include/asm/calling.h
@@ -151,6 +151,15 @@ For 32-bit we have the following conventions - kernel is built with
 	movq_cfi_restore 5*8+\offset, rbx
 	.endm
 
+	.macro ZERO_EXTRA_REGS
+	xorl	%r15d, %r15d
+	xorl	%r14d, %r14d
+	xorl	%r13d, %r13d
+	xorl	%r12d, %r12d
+	xorl	%ebp, %ebp
+	xorl	%ebx, %ebx
+	.endm
+
 	.macro RESTORE_C_REGS_HELPER rstor_rax=1, rstor_rcx=1, rstor_r11=1, rstor_r8910=1, rstor_rdx=1
 	.if \rstor_r11
 	movq_cfi_restore 6*8, r11
diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
index ca3347a9dab5..3563107b5060 100644
--- a/arch/x86/include/asm/elf.h
+++ b/arch/x86/include/asm/elf.h
@@ -171,10 +171,11 @@ do {						\
 static inline void elf_common_init(struct thread_struct *t,
 				   struct pt_regs *regs, const u16 ds)
 {
-	regs->ax = regs->bx = regs->cx = regs->dx = 0;
-	regs->si = regs->di = regs->bp = 0;
+	/* Commented-out registers are cleared in stub_execve */
+	/*regs->ax = regs->bx =*/ regs->cx = regs->dx = 0;
+	regs->si = regs->di /*= regs->bp*/ = 0;
 	regs->r8 = regs->r9 = regs->r10 = regs->r11 = 0;
-	regs->r12 = regs->r13 = regs->r14 = regs->r15 = 0;
+	/*regs->r12 = regs->r13 = regs->r14 = regs->r15 = 0;*/
 	t->fs = t->gs = 0;
 	t->fsindex = t->gsindex = 0;
 	t->ds = t->es = ds;
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 386375d43d14..f4270ff73f2a 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -419,25 +419,27 @@ END(stub_\func)
 
 ENTRY(stub_execve)
 	CFI_STARTPROC
-	addq $8, %rsp
-	DEFAULT_FRAME 0
-	SAVE_EXTRA_REGS
-	call sys_execve
-	movq %rax,RAX(%rsp)
-	RESTORE_EXTRA_REGS
-	jmp int_ret_from_sys_call
+	DEFAULT_FRAME 0, 8
+	call	sys_execve
+return_from_execve:
+	testl	%eax, %eax
+	jz	1f
+	/* exec failed, can use fast SYSRET code path in this case */
+	ret
+1:
+	/* must use IRET code path (pt_regs->cs may have changed) */
+	addq	$8, %rsp
+	ZERO_EXTRA_REGS
+	movq	%rax,RAX(%rsp)
+	jmp	int_ret_from_sys_call
 	CFI_ENDPROC
 END(stub_execve)
 
 ENTRY(stub_execveat)
 	CFI_STARTPROC
-	addq $8, %rsp
-	DEFAULT_FRAME 0
-	SAVE_EXTRA_REGS
-	call sys_execveat
-	movq %rax,RAX(%rsp)
-	RESTORE_EXTRA_REGS
-	jmp int_ret_from_sys_call
+	DEFAULT_FRAME 0, 8
+	call	sys_execveat
+	jmp	return_from_execve
 	CFI_ENDPROC
 END(stub_execveat)
 
@@ -472,25 +474,17 @@ END(stub_x32_rt_sigreturn)
 
 ENTRY(stub_x32_execve)
 	CFI_STARTPROC
-	addq $8, %rsp
-	DEFAULT_FRAME 0
-	SAVE_EXTRA_REGS
-	call compat_sys_execve
-	movq %rax,RAX(%rsp)
-	RESTORE_EXTRA_REGS
-	jmp int_ret_from_sys_call
+	DEFAULT_FRAME 0, 8
+	call	compat_sys_execve
+	jmp	return_from_execve
 	CFI_ENDPROC
 END(stub_x32_execve)
 
 ENTRY(stub_x32_execveat)
 	CFI_STARTPROC
-	addq $8, %rsp
-	DEFAULT_FRAME 0
-	SAVE_EXTRA_REGS
-	call compat_sys_execveat
-	movq %rax,RAX(%rsp)
-	RESTORE_EXTRA_REGS
-	jmp int_ret_from_sys_call
+	DEFAULT_FRAME 0, 8
+	call	compat_sys_execveat
+	jmp	return_from_execve
 	CFI_ENDPROC
 END(stub_x32_execveat)
 
-- 
cgit v1.2.3


From 69df353ff305805fc16082d0c5bfa6e20fa8b863 Mon Sep 17 00:00:00 2001
From: Borislav Petkov
Date: Sat, 4 Apr 2015 23:07:42 +0200
Subject: x86/alternatives: Guard NOPs optimization

Take a look at the first instruction byte before optimizing the NOP -
there might be something else there already, like the ALTERNATIVE_2()
in rdtsc_barrier() which NOPs out on AMD even though we just
patched in an MFENCE.

This happens because the alternatives sees X86_FEATURE_MFENCE_RDTSC,
AMD CPUs set it, we patch in the MFENCE and right afterwards it sees
X86_FEATURE_LFENCE_RDTSC which AMD CPUs don't set and we blindly
optimize the NOP.

Checking whether at least the first byte is 0x90 prevents that.

Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1428181662-18020-1-git-send-email-bp@alien8.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/alternative.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 7c4ad005d7a0..aef653193160 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -325,6 +325,9 @@ done:
 
 static void __init_or_module optimize_nops(struct alt_instr *a, u8 *instr)
 {
+	if (instr[0] != 0x90)
+		return;
+
 	add_nops(instr + (a->instrlen - a->padlen), a->padlen);
 
 	DUMP_BYTES(instr, a->instrlen, "%p: [%d:%d) optimized NOPs: ",
-- 
cgit v1.2.3


From 22ef882e6b5bd2bf668d10b1e2be3dc2fc365b99 Mon Sep 17 00:00:00 2001
From: Dave Young
Date: Tue, 7 Apr 2015 21:41:32 +0800
Subject: x86/mm/numa: Fix kernel stack corruption in
 numa_init()->numa_clear_kernel_node_hotplug()

I got below kernel panic during kdump test on Thinkpad T420
laptop:

[    0.000000] No NUMA configuration found
[    0.000000] Faking a node at [mem 0x0000000000000000-0x0000000037ba4fff]
[    0.000000] Kernel panic - not syncing: stack-protector: Kernel stack is corrupted in: ffffffff81d21910
 ...
[    0.000000] Call Trace:
[    0.000000]  [<ffffffff817c2a26>] dump_stack+0x45/0x57
[    0.000000]  [<ffffffff817bc8d2>] panic+0xd0/0x204
[    0.000000]  [<ffffffff81d21910>] ? numa_clear_kernel_node_hotplug+0xe6/0xf2
[    0.000000]  [<ffffffff8107741b>] __stack_chk_fail+0x1b/0x20
[    0.000000]  [<ffffffff81d21910>] numa_clear_kernel_node_hotplug+0xe6/0xf2
[    0.000000]  [<ffffffff81d21e5d>] numa_init+0x1a5/0x520
[    0.000000]  [<ffffffff81d222b1>] x86_numa_init+0x19/0x3d
[    0.000000]  [<ffffffff81d22460>] initmem_init+0x9/0xb
[    0.000000]  [<ffffffff81d0d00c>] setup_arch+0x94f/0xc82
[    0.000000]  [<ffffffff81d05120>] ? early_idt_handlers+0x120/0x120
[    0.000000]  [<ffffffff817bd0bb>] ? printk+0x55/0x6b
[    0.000000]  [<ffffffff81d05120>] ? early_idt_handlers+0x120/0x120
[    0.000000]  [<ffffffff81d05d9b>] start_kernel+0xe8/0x4d6
[    0.000000]  [<ffffffff81d05120>] ? early_idt_handlers+0x120/0x120
[    0.000000]  [<ffffffff81d05120>] ? early_idt_handlers+0x120/0x120
[    0.000000]  [<ffffffff81d055ee>] x86_64_start_reservations+0x2a/0x2c
[    0.000000]  [<ffffffff81d05751>] x86_64_start_kernel+0x161/0x184
[    0.000000] ---[ end Kernel panic - not syncing: stack-protector: Kernel sta

This is caused by writing over the end of numa mask bitmap
in numa_clear_kernel_node().

numa_clear_kernel_node() tries to set the node id in a mask bitmap,
by iterating all reserved regions and assuming that every region
has a valid nid.

This assumption is not true because there's an exception for some
graphic memory quirks. See trim_snb_memory() in arch/x86/kernel/setup.c

It is easily to reproduce the bug in the kdump kernel because kdump
kernel use pre-reserved memory instead of the whole memory, but
kexec pass other reserved memory ranges to 2nd kernel as well.
like below in my test:

kdump kernel ram 0x2d000000 - 0x37bfffff
One of the reserved regions: 0x40000000 - 0x40100000 which
includes 0x40004000, a page excluded in trim_snb_memory(). For
this memblock reserved region the nid is not set, it is still
default value MAX_NUMNODES. later node_set will set bit
MAX_NUMNODES thus stack corruption happen.

This also happens when booting with mem= kernel commandline
during my test.

Fixing it by adding a check, do not call node_set in case nid is
MAX_NUMNODES.

Signed-off-by: Dave Young <dyoung@redhat.com>
Reviewed-by: Yasuaki Ishimatsu <isimatu.yasuaki@jp.fujitsu.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: bhe@redhat.com
Cc: qiuxishi@huawei.com
Link: http://lkml.kernel.org/r/20150407134132.GA23522@dhcp-16-198.nay.redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/mm/numa.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index cd4785bbacb9..4053bb58bf92 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -482,9 +482,16 @@ static void __init numa_clear_kernel_node_hotplug(void)
 				  &memblock.reserved, mb->nid);
 	}
 
-	/* Mark all kernel nodes. */
+	/*
+	 * Mark all kernel nodes.
+	 *
+	 * When booting with mem=nn[kMG] or in a kdump kernel, numa_meminfo
+	 * may not include all the memblock.reserved memory ranges because
+	 * trim_snb_memory() reserves specific pages for Sandy Bridge graphics.
+	 */
 	for_each_memblock(reserved, r)
-		node_set(r->nid, numa_kernel_nodes);
+		if (r->nid != MAX_NUMNODES)
+			node_set(r->nid, numa_kernel_nodes);
 
 	/* Clear MEMBLOCK_HOTPLUG flag for memory in kernel nodes. */
 	for (i = 0; i < numa_meminfo.nr_blks; i++) {
-- 
cgit v1.2.3


From fffbb5dcfd29f8831e41b4dd2ab938bd36d35283 Mon Sep 17 00:00:00 2001
From: Denys Vlasenko
Date: Thu, 2 Apr 2015 18:46:59 +0200
Subject: x86/asm/entry/64: Move opportunistic sysret code to syscall code path

This change does two things:

Copy-pastes "retint_swapgs:" code into syscall handling code,
the copy is under "syscall_return:" label. The code is unchanged
apart from some label renames.

Removes "opportunistic sysret" code from "retint_swapgs:" code
block, since now it won't be reached by syscall return. This in
fact removes most of the code in question.

   text	   data	    bss	    dec	    hex	filename
  12530	      0	      0	  12530	   30f2	entry_64.o.before
  12562	      0	      0	  12562	   3112	entry_64.o

Run-tested.

Acked-and-Tested-by: Borislav Petkov <bp@suse.de>
Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Alexei Starovoitov <ast@plumgrid.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Will Drewry <wad@chromium.org>
Link: http://lkml.kernel.org/r/1427993219-7291-1-git-send-email-dvlasenk@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/entry_64.S | 158 ++++++++++++++++++++++++---------------------
 1 file changed, 86 insertions(+), 72 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 65485b3baa59..e4c810395bae 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -354,8 +354,8 @@ GLOBAL(int_with_check)
 	movl TI_flags(%rcx),%edx
 	andl %edi,%edx
 	jnz   int_careful
-	andl    $~TS_COMPAT,TI_status(%rcx)
-	jmp   retint_swapgs
+	andl	$~TS_COMPAT,TI_status(%rcx)
+	jmp	syscall_return
 
 	/* Either reschedule or signal or syscall exit tracking needed. */
 	/* First do a reschedule test. */
@@ -399,9 +399,86 @@ int_restore_rest:
 	DISABLE_INTERRUPTS(CLBR_NONE)
 	TRACE_IRQS_OFF
 	jmp int_with_check
+
+syscall_return:
+	/* The IRETQ could re-enable interrupts: */
+	DISABLE_INTERRUPTS(CLBR_ANY)
+	TRACE_IRQS_IRETQ
+
+	/*
+	 * Try to use SYSRET instead of IRET if we're returning to
+	 * a completely clean 64-bit userspace context.
+	 */
+	movq RCX(%rsp),%rcx
+	cmpq %rcx,RIP(%rsp)		/* RCX == RIP */
+	jne opportunistic_sysret_failed
+
+	/*
+	 * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP
+	 * in kernel space.  This essentially lets the user take over
+	 * the kernel, since userspace controls RSP.  It's not worth
+	 * testing for canonicalness exactly -- this check detects any
+	 * of the 17 high bits set, which is true for non-canonical
+	 * or kernel addresses.  (This will pessimize vsyscall=native.
+	 * Big deal.)
+	 *
+	 * If virtual addresses ever become wider, this will need
+	 * to be updated to remain correct on both old and new CPUs.
+	 */
+	.ifne __VIRTUAL_MASK_SHIFT - 47
+	.error "virtual address width changed -- SYSRET checks need update"
+	.endif
+	shr $__VIRTUAL_MASK_SHIFT, %rcx
+	jnz opportunistic_sysret_failed
+
+	cmpq $__USER_CS,CS(%rsp)	/* CS must match SYSRET */
+	jne opportunistic_sysret_failed
+
+	movq R11(%rsp),%r11
+	cmpq %r11,EFLAGS(%rsp)		/* R11 == RFLAGS */
+	jne opportunistic_sysret_failed
+
+	/*
+	 * SYSRET can't restore RF.  SYSRET can restore TF, but unlike IRET,
+	 * restoring TF results in a trap from userspace immediately after
+	 * SYSRET.  This would cause an infinite loop whenever #DB happens
+	 * with register state that satisfies the opportunistic SYSRET
+	 * conditions.  For example, single-stepping this user code:
+	 *
+	 *           movq $stuck_here,%rcx
+	 *           pushfq
+	 *           popq %r11
+	 *   stuck_here:
+	 *
+	 * would never get past 'stuck_here'.
+	 */
+	testq $(X86_EFLAGS_RF|X86_EFLAGS_TF), %r11
+	jnz opportunistic_sysret_failed
+
+	/* nothing to check for RSP */
+
+	cmpq $__USER_DS,SS(%rsp)	/* SS must match SYSRET */
+	jne opportunistic_sysret_failed
+
+	/*
+	 * We win!  This label is here just for ease of understanding
+	 * perf profiles.  Nothing jumps here.
+	 */
+syscall_return_via_sysret:
+	CFI_REMEMBER_STATE
+	/* r11 is already restored (see code above) */
+	RESTORE_C_REGS_EXCEPT_R11
+	movq RSP(%rsp),%rsp
+	USERGS_SYSRET64
+	CFI_RESTORE_STATE
+
+opportunistic_sysret_failed:
+	SWAPGS
+	jmp	restore_c_regs_and_iret
 	CFI_ENDPROC
 END(system_call)
 
+
 	.macro FORK_LIKE func
 ENTRY(stub_\func)
 	CFI_STARTPROC
@@ -673,76 +750,8 @@ retint_swapgs:		/* return to user-space */
 	DISABLE_INTERRUPTS(CLBR_ANY)
 	TRACE_IRQS_IRETQ
 
-	/*
-	 * Try to use SYSRET instead of IRET if we're returning to
-	 * a completely clean 64-bit userspace context.
-	 */
-	movq RCX(%rsp),%rcx
-	cmpq %rcx,RIP(%rsp)		/* RCX == RIP */
-	jne opportunistic_sysret_failed
-
-	/*
-	 * On Intel CPUs, sysret with non-canonical RCX/RIP will #GP
-	 * in kernel space.  This essentially lets the user take over
-	 * the kernel, since userspace controls RSP.  It's not worth
-	 * testing for canonicalness exactly -- this check detects any
-	 * of the 17 high bits set, which is true for non-canonical
-	 * or kernel addresses.  (This will pessimize vsyscall=native.
-	 * Big deal.)
-	 *
-	 * If virtual addresses ever become wider, this will need
-	 * to be updated to remain correct on both old and new CPUs.
-	 */
-	.ifne __VIRTUAL_MASK_SHIFT - 47
-	.error "virtual address width changed -- sysret checks need update"
-	.endif
-	shr $__VIRTUAL_MASK_SHIFT, %rcx
-	jnz opportunistic_sysret_failed
-
-	cmpq $__USER_CS,CS(%rsp)	/* CS must match SYSRET */
-	jne opportunistic_sysret_failed
-
-	movq R11(%rsp),%r11
-	cmpq %r11,EFLAGS(%rsp)		/* R11 == RFLAGS */
-	jne opportunistic_sysret_failed
-
-	/*
-	 * SYSRET can't restore RF.  SYSRET can restore TF, but unlike IRET,
-	 * restoring TF results in a trap from userspace immediately after
-	 * SYSRET.  This would cause an infinite loop whenever #DB happens
-	 * with register state that satisfies the opportunistic SYSRET
-	 * conditions.  For example, single-stepping this user code:
-	 *
-	 *           movq $stuck_here,%rcx
-	 *           pushfq
-	 *           popq %r11
-	 *   stuck_here:
-	 *
-	 * would never get past 'stuck_here'.
-	 */
-	testq $(X86_EFLAGS_RF|X86_EFLAGS_TF), %r11
-	jnz opportunistic_sysret_failed
-
-	/* nothing to check for RSP */
-
-	cmpq $__USER_DS,SS(%rsp)	/* SS must match SYSRET */
-	jne opportunistic_sysret_failed
-
-	/*
-	 * We win!  This label is here just for ease of understanding
-	 * perf profiles.  Nothing jumps here.
-	 */
-irq_return_via_sysret:
-	CFI_REMEMBER_STATE
-	/* r11 is already restored (see code above) */
-	RESTORE_C_REGS_EXCEPT_R11
-	movq RSP(%rsp),%rsp
-	USERGS_SYSRET64
-	CFI_RESTORE_STATE
-
-opportunistic_sysret_failed:
 	SWAPGS
-	jmp restore_args
+	jmp	restore_c_regs_and_iret
 
 /* Returning to kernel space */
 retint_kernel:
@@ -761,7 +770,12 @@ retint_kernel:
 	 * The iretq could re-enable interrupts:
 	 */
 	TRACE_IRQS_IRETQ
-restore_args:
+
+/*
+ * At this label, code paths which return to kernel and to user,
+ * which come from interrupts/exception and from syscalls, merge.
+ */
+restore_c_regs_and_iret:
 	RESTORE_C_REGS
 	REMOVE_PT_GPREGS_FROM_STACK 8
 
-- 
cgit v1.2.3


From 3304c9c37bef30ebd2ef71d986e6568372ce80f8 Mon Sep 17 00:00:00 2001
From: Denys Vlasenko
Date: Fri, 3 Apr 2015 21:49:13 +0200
Subject: x86/asm/entry/irq: Simplify interrupt dispatch table (IDT) layout

Interrupt entry points are handled with the following code,
each 32-byte code block contains seven entry points:

		...
		[push][jump 22] // 4 bytes
		[push][jump 18] // 4 bytes
		[push][jump 14] // 4 bytes
		[push][jump 10] // 4 bytes
		[push][jump  6] // 4 bytes
		[push][jump  2] // 4 bytes
		[push][jump common_interrupt][padding] // 8 bytes

		[push][jump]
		[push][jump]
		[push][jump]
		[push][jump]
		[push][jump]
		[push][jump]
		[push][jump common_interrupt][padding]

		[padding_2]
	common_interrupt:

And there is a table which holds pointers to every entry point,
IOW: to every push.

In cold cache, two jumps are still costlier than one, even
though we get the benefit of them residing in the same
cacheline.

This change replaces short jumps with near ones to
'common_interrupt', and pads every push+jump pair to 8 bytes. This
way, each interrupt takes only one jump.

This change replaces ".p2align CONFIG_X86_L1_CACHE_SHIFT" before
dispatch table with ".align 8" - we do not need anything
stronger than that.

The table of entry addresses (the interrupt[] array) is no
longer necessary, the address of entries can be easily
calculated as (irq_entries_start + i*8).

   text	   data	    bss	    dec	    hex	filename
  12546	      0	      0	  12546	   3102	entry_64.o.before
  11626	      0	      0	  11626	   2d6a	entry_64.o

The size decrease is because 1656 bytes of .init.rodata are
gone. That's initdata, though. The resident size does go up a
bit.

Run-tested (32 and 64 bits).

Acked-and-Tested-by: Borislav Petkov <bp@suse.de>
Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Alexei Starovoitov <ast@plumgrid.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Will Drewry <wad@chromium.org>
Link: http://lkml.kernel.org/r/1428090553-7283-1-git-send-email-dvlasenk@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/hw_irq.h |  5 ++---
 arch/x86/kernel/entry_32.S    | 41 ++++++++++-------------------------------
 arch/x86/kernel/entry_64.S    | 41 ++++++++++-------------------------------
 arch/x86/kernel/irqinit.c     |  3 ++-
 arch/x86/lguest/boot.c        |  3 ++-
 5 files changed, 26 insertions(+), 67 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index 9662290e0b20..e9571ddabc4f 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -181,10 +181,9 @@ extern __visible void smp_call_function_single_interrupt(struct pt_regs *);
 extern __visible void smp_invalidate_interrupt(struct pt_regs *);
 #endif
 
-extern void (*__initconst interrupt[FIRST_SYSTEM_VECTOR
-				    - FIRST_EXTERNAL_VECTOR])(void);
+extern char irq_entries_start[];
 #ifdef CONFIG_TRACING
-#define trace_interrupt interrupt
+#define trace_irq_entries_start irq_entries_start
 #endif
 
 #define VECTOR_UNDEFINED	(-1)
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index effa2793feba..02bec0f1d1e1 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -723,43 +723,22 @@ END(sysenter_badsys)
 .endm
 
 /*
- * Build the entry stubs and pointer table with some assembler magic.
- * We pack 7 stubs into a single 32-byte chunk, which will fit in a
- * single cache line on all modern x86 implementations.
+ * Build the entry stubs with some assembler magic.
+ * We pack 1 stub into every 8-byte block.
  */
-.section .init.rodata,"a"
-ENTRY(interrupt)
-.section .entry.text, "ax"
-	.p2align 5
-	.p2align CONFIG_X86_L1_CACHE_SHIFT
+	.align 8
 ENTRY(irq_entries_start)
 	RING0_INT_FRAME
-vector=FIRST_EXTERNAL_VECTOR
-.rept (FIRST_SYSTEM_VECTOR-FIRST_EXTERNAL_VECTOR+6)/7
-	.balign 32
-  .rept	7
-    .if vector < FIRST_SYSTEM_VECTOR
-      .if vector <> FIRST_EXTERNAL_VECTOR
+    vector=FIRST_EXTERNAL_VECTOR
+    .rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR)
+	pushl_cfi $(~vector+0x80)	/* Note: always in signed byte range */
+    vector=vector+1
+	jmp	common_interrupt
 	CFI_ADJUST_CFA_OFFSET -4
-      .endif
-1:	pushl_cfi $(~vector+0x80)	/* Note: always in signed byte range */
-      .if ((vector-FIRST_EXTERNAL_VECTOR)%7) <> 6
-	jmp 2f
-      .endif
-      .previous
-	.long 1b
-      .section .entry.text, "ax"
-vector=vector+1
-    .endif
-  .endr
-2:	jmp common_interrupt
-.endr
+	.align	8
+    .endr
 END(irq_entries_start)
 
-.previous
-END(interrupt)
-.previous
-
 /*
  * the CPU automatically disables interrupts when executing an IRQ vector,
  * so IRQ-flags tracing has to follow that:
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index e4c810395bae..4ca03c518ab4 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -608,44 +608,23 @@ ENTRY(ret_from_fork)
 END(ret_from_fork)
 
 /*
- * Build the entry stubs and pointer table with some assembler magic.
- * We pack 7 stubs into a single 32-byte chunk, which will fit in a
- * single cache line on all modern x86 implementations.
+ * Build the entry stubs with some assembler magic.
+ * We pack 1 stub into every 8-byte block.
  */
-	.section .init.rodata,"a"
-ENTRY(interrupt)
-	.section .entry.text
-	.p2align 5
-	.p2align CONFIG_X86_L1_CACHE_SHIFT
+	.align 8
 ENTRY(irq_entries_start)
 	INTR_FRAME
-vector=FIRST_EXTERNAL_VECTOR
-.rept (FIRST_SYSTEM_VECTOR-FIRST_EXTERNAL_VECTOR+6)/7
-	.balign 32
-  .rept	7
-    .if vector < FIRST_SYSTEM_VECTOR
-      .if vector <> FIRST_EXTERNAL_VECTOR
+    vector=FIRST_EXTERNAL_VECTOR
+    .rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR)
+	pushq_cfi $(~vector+0x80)	/* Note: always in signed byte range */
+    vector=vector+1
+	jmp	common_interrupt
 	CFI_ADJUST_CFA_OFFSET -8
-      .endif
-1:	pushq_cfi $(~vector+0x80)	/* Note: always in signed byte range */
-      .if ((vector-FIRST_EXTERNAL_VECTOR)%7) <> 6
-	jmp 2f
-      .endif
-      .previous
-	.quad 1b
-      .section .entry.text
-vector=vector+1
-    .endif
-  .endr
-2:	jmp common_interrupt
-.endr
+	.align	8
+    .endr
 	CFI_ENDPROC
 END(irq_entries_start)
 
-.previous
-END(interrupt)
-.previous
-
 /*
  * Interrupt entry/exit.
  *
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index 70e181ea1eac..cd10a6437264 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -178,7 +178,8 @@ void __init native_init_IRQ(void)
 #endif
 	for_each_clear_bit_from(i, used_vectors, first_system_vector) {
 		/* IA32_SYSCALL_VECTOR could be used in trap_init already. */
-		set_intr_gate(i, interrupt[i - FIRST_EXTERNAL_VECTOR]);
+		set_intr_gate(i, irq_entries_start +
+				8 * (i - FIRST_EXTERNAL_VECTOR));
 	}
 #ifdef CONFIG_X86_LOCAL_APIC
 	for_each_clear_bit_from(i, used_vectors, NR_VECTORS)
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index 8561585ee2c6..717908b16037 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -868,7 +868,8 @@ static void __init lguest_init_IRQ(void)
 		/* Some systems map "vectors" to interrupts weirdly.  Not us! */
 		__this_cpu_write(vector_irq[i], i - FIRST_EXTERNAL_VECTOR);
 		if (i != SYSCALL_VECTOR)
-			set_intr_gate(i, interrupt[i - FIRST_EXTERNAL_VECTOR]);
+			set_intr_gate(i, irq_entries_start +
+					8 * (i - FIRST_EXTERNAL_VECTOR));
 	}
 
 	/*
-- 
cgit v1.2.3


From 8b3607b5b8c591d8bf045911cb7c90ecaa6e7b73 Mon Sep 17 00:00:00 2001
From: Denys Vlasenko
Date: Tue, 7 Apr 2015 18:42:47 +0200
Subject: x86/asm/entry/64: Add forgotten CFI annotation

Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Alexei Starovoitov <ast@plumgrid.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Will Drewry <wad@chromium.org>
Link: http://lkml.kernel.org/r/1428424967-14460-1-git-send-email-dvlasenk@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/entry_64.S | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 4ca03c518ab4..3197f41ce320 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -506,6 +506,7 @@ return_from_execve:
 1:
 	/* must use IRET code path (pt_regs->cs may have changed) */
 	addq	$8, %rsp
+	CFI_ADJUST_CFA_OFFSET -8
 	ZERO_EXTRA_REGS
 	movq	%rax,RAX(%rsp)
 	jmp	int_ret_from_sys_call
-- 
cgit v1.2.3


From 35fd68a38d574188835110cde2937d18fe9b46dd Mon Sep 17 00:00:00 2001
From: Wanpeng Li
Date: Wed, 8 Apr 2015 14:08:14 +0800
Subject: kvm: x86: fix x86 eflags fixed bit

Guest can't be booted w/ ept=0, there is a message dumped as below:

If you're running a guest on an Intel machine without unrestricted mode
support, the failure can be most likely due to the guest entering an invalid
state for Intel VT. For example, the guest maybe running in big real mode
which is not supported on less recent Intel processors.

EAX=00000011 EBX=f000d2f6 ECX=00006cac EDX=000f8956
ESI=bffbdf62 EDI=00000000 EBP=00006c68 ESP=00006c68
EIP=0000d187 EFL=00000004 [-----P-] CPL=0 II=0 A20=1 SMM=0 HLT=0
ES =e000 000e0000 ffffffff 00809300 DPL=0 DS16 [-WA]
CS =f000 000f0000 ffffffff 00809b00 DPL=0 CS16 [-RA]
SS =0000 00000000 ffffffff 00809300 DPL=0 DS16 [-WA]
DS =0000 00000000 ffffffff 00809300 DPL=0 DS16 [-WA]
FS =0000 00000000 ffffffff 00809300 DPL=0 DS16 [-WA]
GS =0000 00000000 ffffffff 00809300 DPL=0 DS16 [-WA]
LDT=0000 00000000 0000ffff 00008200 DPL=0 LDT
TR =0000 00000000 0000ffff 00008b00 DPL=0 TSS32-busy
GDT=     000f6a80 00000037
IDT=     000f6abe 00000000
CR0=00000011 CR2=00000000 CR3=00000000 CR4=00000000
DR0=0000000000000000 DR1=0000000000000000 DR2=0000000000000000 DR3=0000000000000000
DR6=00000000ffff0ff0 DR7=0000000000000400
EFER=0000000000000000
Code=01 1e b8 6a 2e 0f 01 16 74 6a 0f 20 c0 66 83 c8 01 0f 22 c0 <66> ea 8f d1 0f 00 08 00 b8 10 00 00 00 8e d8 8e c0 8e d0 8e e0 8e e8 89 c8 ff e2 89 c1 b8X

X86 eflags bit 1 is fixed set, which means that 1 << 1 is set instead of 1,
this patch fix it.

Signed-off-by: Wanpeng Li <wanpeng.li@linux.intel.com>
Message-Id: <1428473294-6633-1-git-send-email-wanpeng.li@linux.intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/emulate.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index b304728aabe3..630bcb0d7a04 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -2033,7 +2033,7 @@ static int emulate_iret_real(struct x86_emulate_ctxt *ctxt)
 			     X86_EFLAGS_IF | X86_EFLAGS_DF | X86_EFLAGS_OF |
 			     X86_EFLAGS_IOPL | X86_EFLAGS_NT | X86_EFLAGS_RF |
 			     X86_EFLAGS_AC | X86_EFLAGS_ID |
-			     X86_EFLAGS_FIXED_BIT;
+			     X86_EFLAGS_FIXED;
 	unsigned long vm86_mask = X86_EFLAGS_VM | X86_EFLAGS_VIF |
 				  X86_EFLAGS_VIP;
 
@@ -2072,7 +2072,7 @@ static int emulate_iret_real(struct x86_emulate_ctxt *ctxt)
 	}
 
 	ctxt->eflags &= ~EFLG_RESERVED_ZEROS_MASK; /* Clear reserved zeros */
-	ctxt->eflags |= X86_EFLAGS_FIXED_BIT;
+	ctxt->eflags |= X86_EFLAGS_FIXED;
 	ctxt->ops->set_nmi_mask(ctxt, false);
 
 	return rc;
@@ -2390,7 +2390,7 @@ static int em_syscall(struct x86_emulate_ctxt *ctxt)
 
 		ops->get_msr(ctxt, MSR_SYSCALL_MASK, &msr_data);
 		ctxt->eflags &= ~msr_data;
-		ctxt->eflags |= X86_EFLAGS_FIXED_BIT;
+		ctxt->eflags |= X86_EFLAGS_FIXED;
 #endif
 	} else {
 		/* legacy mode */
-- 
cgit v1.2.3


From 362c698f8220e636edf1c40b1935715fa57f492f Mon Sep 17 00:00:00 2001
From: Paolo Bonzini
Date: Fri, 6 Feb 2015 12:48:04 +0100
Subject: KVM: x86: extract blocking logic from __vcpu_run

Rename the old __vcpu_run to vcpu_run, and extract part of it to a new
function vcpu_block.

The next patch will add a new condition in vcpu_block, avoid extra
indentation.

Reviewed-by: David Matlack <dmatlack@google.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/x86.c | 62 ++++++++++++++++++++++++++++++------------------------
 1 file changed, 34 insertions(+), 28 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index a284c927551e..6256dfa598a1 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -6186,7 +6186,7 @@ void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm,
 }
 
 /*
- * Returns 1 to let __vcpu_run() continue the guest execution loop without
+ * Returns 1 to let vcpu_run() continue the guest execution loop without
  * exiting to the userspace.  Otherwise, the value will be returned to the
  * userspace.
  */
@@ -6404,42 +6404,46 @@ out:
 	return r;
 }
 
+static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu)
+{
+	srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
+	kvm_vcpu_block(vcpu);
+	vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
+
+	if (!kvm_check_request(KVM_REQ_UNHALT, vcpu))
+		return 1;
+
+	kvm_apic_accept_events(vcpu);
+	switch(vcpu->arch.mp_state) {
+	case KVM_MP_STATE_HALTED:
+		vcpu->arch.pv.pv_unhalted = false;
+		vcpu->arch.mp_state =
+			KVM_MP_STATE_RUNNABLE;
+	case KVM_MP_STATE_RUNNABLE:
+		vcpu->arch.apf.halted = false;
+		break;
+	case KVM_MP_STATE_INIT_RECEIVED:
+		break;
+	default:
+		return -EINTR;
+		break;
+	}
+	return 1;
+}
 
-static int __vcpu_run(struct kvm_vcpu *vcpu)
+static int vcpu_run(struct kvm_vcpu *vcpu)
 {
 	int r;
 	struct kvm *kvm = vcpu->kvm;
 
 	vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
 
-	r = 1;
-	while (r > 0) {
+	for (;;) {
 		if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
 		    !vcpu->arch.apf.halted)
 			r = vcpu_enter_guest(vcpu);
-		else {
-			srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
-			kvm_vcpu_block(vcpu);
-			vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
-			if (kvm_check_request(KVM_REQ_UNHALT, vcpu)) {
-				kvm_apic_accept_events(vcpu);
-				switch(vcpu->arch.mp_state) {
-				case KVM_MP_STATE_HALTED:
-					vcpu->arch.pv.pv_unhalted = false;
-					vcpu->arch.mp_state =
-						KVM_MP_STATE_RUNNABLE;
-				case KVM_MP_STATE_RUNNABLE:
-					vcpu->arch.apf.halted = false;
-					break;
-				case KVM_MP_STATE_INIT_RECEIVED:
-					break;
-				default:
-					r = -EINTR;
-					break;
-				}
-			}
-		}
-
+		else
+			r = vcpu_block(kvm, vcpu);
 		if (r <= 0)
 			break;
 
@@ -6451,6 +6455,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
 			r = -EINTR;
 			vcpu->run->exit_reason = KVM_EXIT_INTR;
 			++vcpu->stat.request_irq_exits;
+			break;
 		}
 
 		kvm_check_async_pf_completion(vcpu);
@@ -6459,6 +6464,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
 			r = -EINTR;
 			vcpu->run->exit_reason = KVM_EXIT_INTR;
 			++vcpu->stat.signal_exits;
+			break;
 		}
 		if (need_resched()) {
 			srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
@@ -6590,7 +6596,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	} else
 		WARN_ON(vcpu->arch.pio.count || vcpu->mmio_needed);
 
-	r = __vcpu_run(vcpu);
+	r = vcpu_run(vcpu);
 
 out:
 	post_kvm_run_save(vcpu);
-- 
cgit v1.2.3


From 9c8fd1ba2201c072bd3cf6940e2ca4d0a7aed723 Mon Sep 17 00:00:00 2001
From: Paolo Bonzini
Date: Fri, 6 Feb 2015 12:58:42 +0100
Subject: KVM: x86: optimize delivery of TSC deadline timer interrupt

The newly-added tracepoint shows the following results on
the tscdeadline_latency test:

        qemu-kvm-8387  [002]  6425.558974: kvm_vcpu_wakeup:      poll time 10407 ns
        qemu-kvm-8387  [002]  6425.558984: kvm_vcpu_wakeup:      poll time 0 ns
        qemu-kvm-8387  [002]  6425.561242: kvm_vcpu_wakeup:      poll time 10477 ns
        qemu-kvm-8387  [002]  6425.561251: kvm_vcpu_wakeup:      poll time 0 ns

and so on.  This is because we need to go through kvm_vcpu_block again
after the timer IRQ is injected.  Avoid it by polling once before
entering kvm_vcpu_block.

On my machine (Xeon E5 Sandy Bridge) this removes about 500 cycles (7%)
from the latency of the TSC deadline timer.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/x86.c | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 6256dfa598a1..50861dd15a94 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -6406,12 +6406,13 @@ out:
 
 static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu)
 {
-	srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
-	kvm_vcpu_block(vcpu);
-	vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
-
-	if (!kvm_check_request(KVM_REQ_UNHALT, vcpu))
-		return 1;
+	if (!kvm_arch_vcpu_runnable(vcpu)) {
+		srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
+		kvm_vcpu_block(vcpu);
+		vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
+		if (!kvm_check_request(KVM_REQ_UNHALT, vcpu))
+			return 1;
+	}
 
 	kvm_apic_accept_events(vcpu);
 	switch(vcpu->arch.mp_state) {
-- 
cgit v1.2.3


From 80f7fdb1c7f0f9266421f823964fd1962681f6ce Mon Sep 17 00:00:00 2001
From: Radim Krčmář
Date: Thu, 2 Apr 2015 20:44:23 +0200
Subject: x86: vdso: fix pvclock races with task migration

If we were migrated right after __getcpu, but before reading the
migration_count, we wouldn't notice that we read TSC of a different
VCPU, nor that KVM's bug made pvti invalid, as only migration_count
on source VCPU is increased.

Change vdso instead of updating migration_count on destination.

Cc: stable@vger.kernel.org
Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
Fixes: 0a4e6be9ca17 ("x86: kvm: Revert "remove sched notifier for cross-cpu migrations"")
Message-Id: <1428000263-11892-1-git-send-email-rkrcmar@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/vdso/vclock_gettime.c | 20 ++++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c
index 30933760ee5f..40d2473836c9 100644
--- a/arch/x86/vdso/vclock_gettime.c
+++ b/arch/x86/vdso/vclock_gettime.c
@@ -99,21 +99,25 @@ static notrace cycle_t vread_pvclock(int *mode)
 		 * __getcpu() calls (Gleb).
 		 */
 
-		pvti = get_pvti(cpu);
+		/* Make sure migrate_count will change if we leave the VCPU. */
+		do {
+			pvti = get_pvti(cpu);
+			migrate_count = pvti->migrate_count;
 
-		migrate_count = pvti->migrate_count;
+			cpu1 = cpu;
+			cpu = __getcpu() & VGETCPU_CPU_MASK;
+		} while (unlikely(cpu != cpu1));
 
 		version = __pvclock_read_cycles(&pvti->pvti, &ret, &flags);
 
 		/*
 		 * Test we're still on the cpu as well as the version.
-		 * We could have been migrated just after the first
-		 * vgetcpu but before fetching the version, so we
-		 * wouldn't notice a version change.
+		 * - We must read TSC of pvti's VCPU.
+		 * - KVM doesn't follow the versioning protocol, so data could
+		 *   change before version if we left the VCPU.
 		 */
-		cpu1 = __getcpu() & VGETCPU_CPU_MASK;
-	} while (unlikely(cpu != cpu1 ||
-			  (pvti->pvti.version & 1) ||
+		smp_rmb();
+	} while (unlikely((pvti->pvti.version & 1) ||
 			  pvti->pvti.version != version ||
 			  pvti->migrate_count != migrate_count));
 
-- 
cgit v1.2.3


From 80f0e95d1b221688b2608e0a80134e7acccd9a89 Mon Sep 17 00:00:00 2001
From: Radim Krčmář
Date: Thu, 2 Apr 2015 21:11:05 +0200
Subject: KVM: vmx: pass error code with internal error #2

Exposing the on-stack error code with internal error is cheap and
potentially useful.

Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
Message-Id: <1428001865-32280-1-git-send-email-rkrcmar@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/vmx.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index b5a6425d8d97..9e4b12b5bff6 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -5089,9 +5089,10 @@ static int handle_exception(struct kvm_vcpu *vcpu)
 	    !(is_page_fault(intr_info) && !(error_code & PFERR_RSVD_MASK))) {
 		vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
 		vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX;
-		vcpu->run->internal.ndata = 2;
+		vcpu->run->internal.ndata = 3;
 		vcpu->run->internal.data[0] = vect_info;
 		vcpu->run->internal.data[1] = intr_info;
+		vcpu->run->internal.data[2] = error_code;
 		return 0;
 	}
 
-- 
cgit v1.2.3


From 5a4f55cde81f1633cb7ae9f0963b722e47acdc36 Mon Sep 17 00:00:00 2001
From: Eugene Korenevsky
Date: Sun, 29 Mar 2015 23:56:12 +0300
Subject: KVM: x86: cache maxphyaddr CPUID leaf in struct kvm_vcpu

cpuid_maxphyaddr(), which performs lot of memory accesses is called
extensively across KVM, especially in nVMX code.

This patch adds a cached value of maxphyaddr to vcpu.arch to reduce the
pressure onto CPU cache and simplify the code of cpuid_maxphyaddr()
callers. The cached value is initialized in kvm_arch_vcpu_init() and
reloaded every time CPUID is updated by usermode. It is obvious that
these reloads occur infrequently.

Signed-off-by: Eugene Korenevsky <ekorenevsky@gmail.com>
Message-Id: <20150329205612.GA1223@gnote>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  4 +++-
 arch/x86/kvm/cpuid.c            | 33 ++++++++++++++++++---------------
 arch/x86/kvm/cpuid.h            |  6 ++++++
 arch/x86/kvm/x86.c              |  2 ++
 4 files changed, 29 insertions(+), 16 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 30b28dc76411..8d92e3bab118 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -426,6 +426,9 @@ struct kvm_vcpu_arch {
 
 	int cpuid_nent;
 	struct kvm_cpuid_entry2 cpuid_entries[KVM_MAX_CPUID_ENTRIES];
+
+	int maxphyaddr;
+
 	/* emulate context */
 
 	struct x86_emulate_ctxt emulate_ctxt;
@@ -1124,7 +1127,6 @@ int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end)
 int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end);
 int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
 void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
-int cpuid_maxphyaddr(struct kvm_vcpu *vcpu);
 int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v);
 int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu);
 int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 8a80737ee6e6..59b69f6a2844 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -104,6 +104,9 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu)
 		((best->eax & 0xff00) >> 8) != 0)
 		return -EINVAL;
 
+	/* Update physical-address width */
+	vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
+
 	kvm_pmu_cpuid_update(vcpu);
 	return 0;
 }
@@ -135,6 +138,21 @@ static void cpuid_fix_nx_cap(struct kvm_vcpu *vcpu)
 	}
 }
 
+int cpuid_query_maxphyaddr(struct kvm_vcpu *vcpu)
+{
+	struct kvm_cpuid_entry2 *best;
+
+	best = kvm_find_cpuid_entry(vcpu, 0x80000000, 0);
+	if (!best || best->eax < 0x80000008)
+		goto not_found;
+	best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0);
+	if (best)
+		return best->eax & 0xff;
+not_found:
+	return 36;
+}
+EXPORT_SYMBOL_GPL(cpuid_query_maxphyaddr);
+
 /* when an old userspace process fills a new kernel module */
 int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
 			     struct kvm_cpuid *cpuid,
@@ -757,21 +775,6 @@ struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
 }
 EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry);
 
-int cpuid_maxphyaddr(struct kvm_vcpu *vcpu)
-{
-	struct kvm_cpuid_entry2 *best;
-
-	best = kvm_find_cpuid_entry(vcpu, 0x80000000, 0);
-	if (!best || best->eax < 0x80000008)
-		goto not_found;
-	best = kvm_find_cpuid_entry(vcpu, 0x80000008, 0);
-	if (best)
-		return best->eax & 0xff;
-not_found:
-	return 36;
-}
-EXPORT_SYMBOL_GPL(cpuid_maxphyaddr);
-
 /*
  * If no match is found, check whether we exceed the vCPU's limit
  * and return the content of the highest valid _standard_ leaf instead.
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index 26228466f3f8..c3b1ad9fca81 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -20,6 +20,12 @@ int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
 			      struct kvm_cpuid_entry2 __user *entries);
 void kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx);
 
+int cpuid_query_maxphyaddr(struct kvm_vcpu *vcpu);
+
+static inline int cpuid_maxphyaddr(struct kvm_vcpu *vcpu)
+{
+	return vcpu->arch.maxphyaddr;
+}
 
 static inline bool guest_cpuid_has_xsave(struct kvm_vcpu *vcpu)
 {
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 50861dd15a94..a578629acb42 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -7317,6 +7317,8 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
 	vcpu->arch.guest_supported_xcr0 = 0;
 	vcpu->arch.guest_xstate_size = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET;
 
+	vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
+
 	kvm_async_pf_hash_reset(vcpu);
 	kvm_pmu_init(vcpu);
 
-- 
cgit v1.2.3


From 9090422f1ca5270795738549cf91a4ae7cb47662 Mon Sep 17 00:00:00 2001
From: Eugene Korenevsky
Date: Sun, 29 Mar 2015 23:56:27 +0300
Subject: KVM: nVMX: checks for address bits beyond MAXPHYADDR on VM-entry

On each VM-entry CPU should check the following VMCS fields for zero bits
beyond physical address width:
-  APIC-access address
-  virtual-APIC address
-  posted-interrupt descriptor address
This patch adds these checks required by Intel SDM.

Signed-off-by: Eugene Korenevsky <ekorenevsky@gmail.com>
Message-Id: <20150329205627.GA1244@gnote>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/vmx.c | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 9e4b12b5bff6..6f770e875936 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -8622,10 +8622,11 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu,
 					struct vmcs12 *vmcs12)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	int maxphyaddr = cpuid_maxphyaddr(vcpu);
 
 	if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
-		/* TODO: Also verify bits beyond physical address width are 0 */
-		if (!PAGE_ALIGNED(vmcs12->apic_access_addr))
+		if (!PAGE_ALIGNED(vmcs12->apic_access_addr) ||
+		    vmcs12->apic_access_addr >> maxphyaddr)
 			return false;
 
 		/*
@@ -8641,8 +8642,8 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu,
 	}
 
 	if (nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW)) {
-		/* TODO: Also verify bits beyond physical address width are 0 */
-		if (!PAGE_ALIGNED(vmcs12->virtual_apic_page_addr))
+		if (!PAGE_ALIGNED(vmcs12->virtual_apic_page_addr) ||
+		    vmcs12->virtual_apic_page_addr >> maxphyaddr)
 			return false;
 
 		if (vmx->nested.virtual_apic_page) /* shouldn't happen */
@@ -8665,7 +8666,8 @@ static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu,
 	}
 
 	if (nested_cpu_has_posted_intr(vmcs12)) {
-		if (!IS_ALIGNED(vmcs12->posted_intr_desc_addr, 64))
+		if (!IS_ALIGNED(vmcs12->posted_intr_desc_addr, 64) ||
+		    vmcs12->posted_intr_desc_addr >> maxphyaddr)
 			return false;
 
 		if (vmx->nested.pi_desc_page) { /* shouldn't happen */
@@ -9386,7 +9388,6 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
 	}
 
 	if (!nested_get_vmcs12_pages(vcpu, vmcs12)) {
-		/*TODO: Also verify bits beyond physical address width are 0*/
 		nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
 		return 1;
 	}
-- 
cgit v1.2.3


From 92d71bc6951364c20f7d8c70a83cd93a68a63ea7 Mon Sep 17 00:00:00 2001
From: Eugene Korenevsky
Date: Sun, 29 Mar 2015 23:56:44 +0300
Subject: KVM: nVMX: remove unnecessary double caching of MAXPHYADDR

After speed-up of cpuid_maxphyaddr() it can be called frequently:
instead of heavyweight enumeration of CPUID entries it returns a cached
pre-computed value. It is also inlined now. So caching its result became
unnecessary and can be removed.

Signed-off-by: Eugene Korenevsky <ekorenevsky@gmail.com>
Message-Id: <20150329205644.GA1258@gnote>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/vmx.c | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 6f770e875936..ddce07e8bef8 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -8866,9 +8866,9 @@ static int nested_vmx_check_apicv_controls(struct kvm_vcpu *vcpu,
 
 static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu,
 				       unsigned long count_field,
-				       unsigned long addr_field,
-				       int maxphyaddr)
+				       unsigned long addr_field)
 {
+	int maxphyaddr;
 	u64 count, addr;
 
 	if (vmcs12_read_any(vcpu, count_field, &count) ||
@@ -8878,6 +8878,7 @@ static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu,
 	}
 	if (count == 0)
 		return 0;
+	maxphyaddr = cpuid_maxphyaddr(vcpu);
 	if (!IS_ALIGNED(addr, 16) || addr >> maxphyaddr ||
 	    (addr + count * sizeof(struct vmx_msr_entry) - 1) >> maxphyaddr) {
 		pr_warn_ratelimited(
@@ -8891,19 +8892,16 @@ static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu,
 static int nested_vmx_check_msr_switch_controls(struct kvm_vcpu *vcpu,
 						struct vmcs12 *vmcs12)
 {
-	int maxphyaddr;
-
 	if (vmcs12->vm_exit_msr_load_count == 0 &&
 	    vmcs12->vm_exit_msr_store_count == 0 &&
 	    vmcs12->vm_entry_msr_load_count == 0)
 		return 0; /* Fast path */
-	maxphyaddr = cpuid_maxphyaddr(vcpu);
 	if (nested_vmx_check_msr_switch(vcpu, VM_EXIT_MSR_LOAD_COUNT,
-					VM_EXIT_MSR_LOAD_ADDR, maxphyaddr) ||
+					VM_EXIT_MSR_LOAD_ADDR) ||
 	    nested_vmx_check_msr_switch(vcpu, VM_EXIT_MSR_STORE_COUNT,
-					VM_EXIT_MSR_STORE_ADDR, maxphyaddr) ||
+					VM_EXIT_MSR_STORE_ADDR) ||
 	    nested_vmx_check_msr_switch(vcpu, VM_ENTRY_MSR_LOAD_COUNT,
-					VM_ENTRY_MSR_LOAD_ADDR, maxphyaddr))
+					VM_ENTRY_MSR_LOAD_ADDR))
 		return -EINVAL;
 	return 0;
 }
-- 
cgit v1.2.3


From 03d2249ea60818e97475ac529aa183cf130935bb Mon Sep 17 00:00:00 2001
From: Radim Krčmář
Date: Thu, 12 Feb 2015 19:41:31 +0100
Subject: KVM: x86: use MDA for interrupt matching

In mixed modes, we musn't deliver xAPIC IPIs like x2APIC and vice versa.
Instead of preserving the information in apic_send_ipi(), we regain it
by converting all destinations into correct MDA in the slow path.
This allows easier reasoning about subsequent matching.

Our kvm_apic_broadcast() had an interesting design decision: it didn't
consider IOxAPIC 0xff as broadcast in x2APIC mode ...
everything worked because IOxAPIC can't set that in physical mode and
logical mode considered it as a message for first 8 VCPUs.
This patch interprets IOxAPIC 0xff as x2APIC broadcast.

Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
Message-Id: <1423766494-26150-2-git-send-email-rkrcmar@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/lapic.c | 40 +++++++++++++++++++++++++++++++++-------
 1 file changed, 33 insertions(+), 7 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 44f7b9afbedb..69569744745d 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -588,15 +588,23 @@ static void apic_set_tpr(struct kvm_lapic *apic, u32 tpr)
 	apic_update_ppr(apic);
 }
 
-static bool kvm_apic_broadcast(struct kvm_lapic *apic, u32 dest)
+static bool kvm_apic_broadcast(struct kvm_lapic *apic, u32 mda)
 {
-	return dest == (apic_x2apic_mode(apic) ?
-			X2APIC_BROADCAST : APIC_BROADCAST);
+	if (apic_x2apic_mode(apic))
+		return mda == X2APIC_BROADCAST;
+
+	return GET_APIC_DEST_FIELD(mda) == APIC_BROADCAST;
 }
 
-static bool kvm_apic_match_physical_addr(struct kvm_lapic *apic, u32 dest)
+static bool kvm_apic_match_physical_addr(struct kvm_lapic *apic, u32 mda)
 {
-	return kvm_apic_id(apic) == dest || kvm_apic_broadcast(apic, dest);
+	if (kvm_apic_broadcast(apic, mda))
+		return true;
+
+	if (apic_x2apic_mode(apic))
+		return mda == kvm_apic_id(apic);
+
+	return mda == SET_APIC_DEST_FIELD(kvm_apic_id(apic));
 }
 
 static bool kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda)
@@ -613,6 +621,7 @@ static bool kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda)
 		       && (logical_id & mda & 0xffff) != 0;
 
 	logical_id = GET_APIC_LOGICAL_ID(logical_id);
+	mda = GET_APIC_DEST_FIELD(mda);
 
 	switch (kvm_apic_get_reg(apic, APIC_DFR)) {
 	case APIC_DFR_FLAT:
@@ -627,10 +636,27 @@ static bool kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda)
 	}
 }
 
+/* KVM APIC implementation has two quirks
+ *  - dest always begins at 0 while xAPIC MDA has offset 24,
+ *  - IOxAPIC messages have to be delivered (directly) to x2APIC.
+ */
+static u32 kvm_apic_mda(unsigned int dest_id, struct kvm_lapic *source,
+                                              struct kvm_lapic *target)
+{
+	bool ipi = source != NULL;
+	bool x2apic_mda = apic_x2apic_mode(ipi ? source : target);
+
+	if (!ipi && dest_id == APIC_BROADCAST && x2apic_mda)
+		return X2APIC_BROADCAST;
+
+	return x2apic_mda ? dest_id : SET_APIC_DEST_FIELD(dest_id);
+}
+
 bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
 			   int short_hand, unsigned int dest, int dest_mode)
 {
 	struct kvm_lapic *target = vcpu->arch.apic;
+	u32 mda = kvm_apic_mda(dest, source, target);
 
 	apic_debug("target %p, source %p, dest 0x%x, "
 		   "dest_mode 0x%x, short_hand 0x%x\n",
@@ -640,9 +666,9 @@ bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
 	switch (short_hand) {
 	case APIC_DEST_NOSHORT:
 		if (dest_mode == APIC_DEST_PHYSICAL)
-			return kvm_apic_match_physical_addr(target, dest);
+			return kvm_apic_match_physical_addr(target, mda);
 		else
-			return kvm_apic_match_logical_addr(target, dest);
+			return kvm_apic_match_logical_addr(target, mda);
 	case APIC_DEST_SELF:
 		return target == source;
 	case APIC_DEST_ALLINC:
-- 
cgit v1.2.3


From 9ea369b032d87b88f1a47187b51ad4321dea5766 Mon Sep 17 00:00:00 2001
From: Radim Krčmář
Date: Thu, 12 Feb 2015 19:41:32 +0100
Subject: KVM: x86: fix mixed APIC mode broadcast

Broadcast allowed only one global APIC mode, but mixed modes are
theoretically possible.  x2APIC IPI doesn't mean 0xff as broadcast,
the rest does.

x2APIC broadcasts are accepted by xAPIC.  If we take SDM to be logical,
even addreses beginning with 0xff should be accepted, but real hardware
disagrees.  This patch aims for simple code by considering most of real
behavior as undefined.

Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
Message-Id: <1423766494-26150-3-git-send-email-rkrcmar@redhat.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/asm/kvm_host.h | 2 +-
 arch/x86/kvm/lapic.c            | 9 ++++-----
 2 files changed, 5 insertions(+), 6 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 8d92e3bab118..5b1bc97a258a 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -552,7 +552,7 @@ struct kvm_apic_map {
 	struct rcu_head rcu;
 	u8 ldr_bits;
 	/* fields bellow are used to decode ldr values in different modes */
-	u32 cid_shift, cid_mask, lid_mask, broadcast;
+	u32 cid_shift, cid_mask, lid_mask;
 	struct kvm_lapic *phys_map[256];
 	/* first index is cluster id second is cpu id in a cluster */
 	struct kvm_lapic *logical_map[16][16];
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 69569744745d..d45f00ba7440 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -151,7 +151,6 @@ static void recalculate_apic_map(struct kvm *kvm)
 	new->cid_shift = 8;
 	new->cid_mask = 0;
 	new->lid_mask = 0xff;
-	new->broadcast = APIC_BROADCAST;
 
 	kvm_for_each_vcpu(i, vcpu, kvm) {
 		struct kvm_lapic *apic = vcpu->arch.apic;
@@ -163,7 +162,6 @@ static void recalculate_apic_map(struct kvm *kvm)
 			new->ldr_bits = 32;
 			new->cid_shift = 16;
 			new->cid_mask = new->lid_mask = 0xffff;
-			new->broadcast = X2APIC_BROADCAST;
 		} else if (kvm_apic_get_reg(apic, APIC_LDR)) {
 			if (kvm_apic_get_reg(apic, APIC_DFR) ==
 							APIC_DFR_CLUSTER) {
@@ -690,6 +688,7 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
 	struct kvm_lapic **dst;
 	int i;
 	bool ret = false;
+	bool x2apic_ipi = src && apic_x2apic_mode(src);
 
 	*r = -1;
 
@@ -701,15 +700,15 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
 	if (irq->shorthand)
 		return false;
 
+	if (irq->dest_id == (x2apic_ipi ? X2APIC_BROADCAST : APIC_BROADCAST))
+		return false;
+
 	rcu_read_lock();
 	map = rcu_dereference(kvm->arch.apic_map);
 
 	if (!map)
 		goto out;
 
-	if (irq->dest_id == map->broadcast)
-		goto out;
-
 	ret = true;
 
 	if (irq->dest_mode == APIC_DEST_PHYSICAL) {
-- 
cgit v1.2.3


From 3548a259f6990d8cb4f520e6c14f4b45b1f2fd38 Mon Sep 17 00:00:00 2001
From: Radim Krčmář
Date: Thu, 12 Feb 2015 19:41:33 +0100
Subject: KVM: x86: avoid logical_map when it is invalid

We want to support mixed modes and the easiest solution is to avoid
optimizing those weird and unlikely scenarios.

Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
Message-Id: <1423766494-26150-4-git-send-email-rkrcmar@redhat.com>
[Add comment above KVM_APIC_MODE_* defines. - Paolo]
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/asm/kvm_host.h | 12 ++++++++++++
 arch/x86/kvm/lapic.c            | 24 +++++++++++++++++++++++-
 2 files changed, 35 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 5b1bc97a258a..9477b27e8e1e 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -548,8 +548,20 @@ struct kvm_arch_memory_slot {
 	struct kvm_lpage_info *lpage_info[KVM_NR_PAGE_SIZES - 1];
 };
 
+/*
+ * We use as the mode the number of bits allocated in the LDR for the
+ * logical processor ID.  It happens that these are all powers of two.
+ * This makes it is very easy to detect cases where the APICs are
+ * configured for multiple modes; in that case, we cannot use the map and
+ * hence cannot use kvm_irq_delivery_to_apic_fast either.
+ */
+#define KVM_APIC_MODE_XAPIC_CLUSTER          4
+#define KVM_APIC_MODE_XAPIC_FLAT             8
+#define KVM_APIC_MODE_X2APIC                16
+
 struct kvm_apic_map {
 	struct rcu_head rcu;
+	u8 mode;
 	u8 ldr_bits;
 	/* fields bellow are used to decode ldr values in different modes */
 	u32 cid_shift, cid_mask, lid_mask;
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index d45f00ba7440..fa4bd89ea381 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -133,6 +133,14 @@ static inline int kvm_apic_id(struct kvm_lapic *apic)
 	return (kvm_apic_get_reg(apic, APIC_ID) >> 24) & 0xff;
 }
 
+/* The logical map is definitely wrong if we have multiple
+ * modes at the same time.  (Physical map is always right.)
+ */
+static inline bool kvm_apic_logical_map_valid(struct kvm_apic_map *map)
+{
+	return !(map->mode & (map->mode - 1));
+}
+
 static void recalculate_apic_map(struct kvm *kvm)
 {
 	struct kvm_apic_map *new, *old = NULL;
@@ -162,16 +170,19 @@ static void recalculate_apic_map(struct kvm *kvm)
 			new->ldr_bits = 32;
 			new->cid_shift = 16;
 			new->cid_mask = new->lid_mask = 0xffff;
+			new->mode |= KVM_APIC_MODE_X2APIC;
 		} else if (kvm_apic_get_reg(apic, APIC_LDR)) {
 			if (kvm_apic_get_reg(apic, APIC_DFR) ==
 							APIC_DFR_CLUSTER) {
 				new->cid_shift = 4;
 				new->cid_mask = 0xf;
 				new->lid_mask = 0xf;
+				new->mode |= KVM_APIC_MODE_XAPIC_CLUSTER;
 			} else {
 				new->cid_shift = 8;
 				new->cid_mask = 0;
 				new->lid_mask = 0xff;
+				new->mode |= KVM_APIC_MODE_XAPIC_FLAT;
 			}
 		}
 
@@ -201,6 +212,10 @@ static void recalculate_apic_map(struct kvm *kvm)
 
 		if (aid < ARRAY_SIZE(new->phys_map))
 			new->phys_map[aid] = apic;
+
+		if (!kvm_apic_logical_map_valid(new));
+			continue;
+
 		if (lid && cid < ARRAY_SIZE(new->logical_map))
 			new->logical_map[cid][ffs(lid) - 1] = apic;
 	}
@@ -718,7 +733,14 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
 		dst = &map->phys_map[irq->dest_id];
 	} else {
 		u32 mda = irq->dest_id << (32 - map->ldr_bits);
-		u16 cid = apic_cluster_id(map, mda);
+		u16 cid;
+
+		if (!kvm_apic_logical_map_valid(map)) {
+			ret = false;
+			goto out;
+		}
+
+		cid = apic_cluster_id(map, mda);
 
 		if (cid >= ARRAY_SIZE(map->logical_map))
 			goto out;
-- 
cgit v1.2.3


From 3b5a5ffa928a3f875b0d5dd284eeb7c322e1688a Mon Sep 17 00:00:00 2001
From: Radim Krčmář
Date: Thu, 12 Feb 2015 19:41:34 +0100
Subject: KVM: x86: simplify kvm_apic_map

recalculate_apic_map() uses two passes over all VCPUs.  This is a relic
from time when we selected a global mode in the first pass and set up
the optimized table in the second pass (to have a consistent mode).

Recent changes made mixed mode unoptimized and we can do it in one pass.
Format of logical MDA is a function of the mode, so we encode it in
apic_logical_id() and drop obsoleted variables from the struct.

Signed-off-by: Radim Krčmář <rkrcmar@redhat.com>
Message-Id: <1423766494-26150-5-git-send-email-rkrcmar@redhat.com>
[Add lid_bits temporary in apic_logical_id. - Paolo]
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  3 --
 arch/x86/kvm/lapic.c            | 78 +++++++++++++++--------------------------
 arch/x86/kvm/lapic.h            | 15 --------
 3 files changed, 28 insertions(+), 68 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 9477b27e8e1e..d0cfdb08b4c2 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -562,9 +562,6 @@ struct kvm_arch_memory_slot {
 struct kvm_apic_map {
 	struct rcu_head rcu;
 	u8 mode;
-	u8 ldr_bits;
-	/* fields bellow are used to decode ldr values in different modes */
-	u32 cid_shift, cid_mask, lid_mask;
 	struct kvm_lapic *phys_map[256];
 	/* first index is cluster id second is cpu id in a cluster */
 	struct kvm_lapic *logical_map[16][16];
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index fa4bd89ea381..11a0af113f27 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -141,6 +141,20 @@ static inline bool kvm_apic_logical_map_valid(struct kvm_apic_map *map)
 	return !(map->mode & (map->mode - 1));
 }
 
+static inline void
+apic_logical_id(struct kvm_apic_map *map, u32 dest_id, u16 *cid, u16 *lid)
+{
+	unsigned lid_bits;
+
+	BUILD_BUG_ON(KVM_APIC_MODE_XAPIC_CLUSTER !=  4);
+	BUILD_BUG_ON(KVM_APIC_MODE_XAPIC_FLAT    !=  8);
+	BUILD_BUG_ON(KVM_APIC_MODE_X2APIC        != 16);
+	lid_bits = map->mode;
+
+	*cid = dest_id >> lid_bits;
+	*lid = dest_id & ((1 << lid_bits) - 1);
+}
+
 static void recalculate_apic_map(struct kvm *kvm)
 {
 	struct kvm_apic_map *new, *old = NULL;
@@ -154,49 +168,6 @@ static void recalculate_apic_map(struct kvm *kvm)
 	if (!new)
 		goto out;
 
-	new->ldr_bits = 8;
-	/* flat mode is default */
-	new->cid_shift = 8;
-	new->cid_mask = 0;
-	new->lid_mask = 0xff;
-
-	kvm_for_each_vcpu(i, vcpu, kvm) {
-		struct kvm_lapic *apic = vcpu->arch.apic;
-
-		if (!kvm_apic_present(vcpu))
-			continue;
-
-		if (apic_x2apic_mode(apic)) {
-			new->ldr_bits = 32;
-			new->cid_shift = 16;
-			new->cid_mask = new->lid_mask = 0xffff;
-			new->mode |= KVM_APIC_MODE_X2APIC;
-		} else if (kvm_apic_get_reg(apic, APIC_LDR)) {
-			if (kvm_apic_get_reg(apic, APIC_DFR) ==
-							APIC_DFR_CLUSTER) {
-				new->cid_shift = 4;
-				new->cid_mask = 0xf;
-				new->lid_mask = 0xf;
-				new->mode |= KVM_APIC_MODE_XAPIC_CLUSTER;
-			} else {
-				new->cid_shift = 8;
-				new->cid_mask = 0;
-				new->lid_mask = 0xff;
-				new->mode |= KVM_APIC_MODE_XAPIC_FLAT;
-			}
-		}
-
-		/*
-		 * All APICs have to be configured in the same mode by an OS.
-		 * We take advatage of this while building logical id loockup
-		 * table. After reset APICs are in software disabled mode, so if
-		 * we find apic with different setting we assume this is the mode
-		 * OS wants all apics to be in; build lookup table accordingly.
-		 */
-		if (kvm_apic_sw_enabled(apic))
-			break;
-	}
-
 	kvm_for_each_vcpu(i, vcpu, kvm) {
 		struct kvm_lapic *apic = vcpu->arch.apic;
 		u16 cid, lid;
@@ -207,15 +178,25 @@ static void recalculate_apic_map(struct kvm *kvm)
 
 		aid = kvm_apic_id(apic);
 		ldr = kvm_apic_get_reg(apic, APIC_LDR);
-		cid = apic_cluster_id(new, ldr);
-		lid = apic_logical_id(new, ldr);
 
 		if (aid < ARRAY_SIZE(new->phys_map))
 			new->phys_map[aid] = apic;
 
-		if (!kvm_apic_logical_map_valid(new));
+		if (apic_x2apic_mode(apic)) {
+			new->mode |= KVM_APIC_MODE_X2APIC;
+		} else if (ldr) {
+			ldr = GET_APIC_LOGICAL_ID(ldr);
+			if (kvm_apic_get_reg(apic, APIC_DFR) == APIC_DFR_FLAT)
+				new->mode |= KVM_APIC_MODE_XAPIC_FLAT;
+			else
+				new->mode |= KVM_APIC_MODE_XAPIC_CLUSTER;
+		}
+
+		if (!kvm_apic_logical_map_valid(new))
 			continue;
 
+		apic_logical_id(new, ldr, &cid, &lid);
+
 		if (lid && cid < ARRAY_SIZE(new->logical_map))
 			new->logical_map[cid][ffs(lid) - 1] = apic;
 	}
@@ -732,7 +713,6 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
 
 		dst = &map->phys_map[irq->dest_id];
 	} else {
-		u32 mda = irq->dest_id << (32 - map->ldr_bits);
 		u16 cid;
 
 		if (!kvm_apic_logical_map_valid(map)) {
@@ -740,15 +720,13 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
 			goto out;
 		}
 
-		cid = apic_cluster_id(map, mda);
+		apic_logical_id(map, irq->dest_id, &cid, (u16 *)&bitmap);
 
 		if (cid >= ARRAY_SIZE(map->logical_map))
 			goto out;
 
 		dst = map->logical_map[cid];
 
-		bitmap = apic_logical_id(map, mda);
-
 		if (irq->delivery_mode == APIC_DM_LOWEST) {
 			int l = -1;
 			for_each_set_bit(i, &bitmap, 16) {
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index e284c2880c56..9d28383fc1e7 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -148,21 +148,6 @@ static inline bool kvm_apic_vid_enabled(struct kvm *kvm)
 	return kvm_x86_ops->vm_has_apicv(kvm);
 }
 
-static inline u16 apic_cluster_id(struct kvm_apic_map *map, u32 ldr)
-{
-	u16 cid;
-	ldr >>= 32 - map->ldr_bits;
-	cid = (ldr >> map->cid_shift) & map->cid_mask;
-
-	return cid;
-}
-
-static inline u16 apic_logical_id(struct kvm_apic_map *map, u32 ldr)
-{
-	ldr >>= (32 - map->ldr_bits);
-	return ldr & map->lid_mask;
-}
-
 static inline bool kvm_apic_has_events(struct kvm_vcpu *vcpu)
 {
 	return vcpu->arch.apic->pending_events;
-- 
cgit v1.2.3


From 58d269d8cccc53643f1a0900cfc0940e85ec9691 Mon Sep 17 00:00:00 2001
From: Nadav Amit
Date: Thu, 2 Apr 2015 03:10:36 +0300
Subject: KVM: x86: BSP in MSR_IA32_APICBASE is writable

After reset, the CPU can change the BSP, which will be used upon INIT.  Reset
should return the BSP which QEMU asked for, and therefore handled accordingly.

To quote: "If the MP protocol has completed and a BSP is chosen, subsequent
INITs (either to a specific processor or system wide) do not cause the MP
protocol to be repeated."
[Intel SDM 8.4.2: MP Initialization Protocol Requirements and Restrictions]

Signed-off-by: Nadav Amit <namit@cs.technion.ac.il>
Message-Id: <1427933438-12782-3-git-send-email-namit@cs.technion.ac.il>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/lapic.c     | 2 --
 arch/x86/kvm/svm.c       | 2 +-
 arch/x86/kvm/vmx.c       | 2 +-
 arch/x86/kvm/x86.c       | 2 +-
 include/linux/kvm_host.h | 7 ++++++-
 5 files changed, 9 insertions(+), 6 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 11a0af113f27..4a6e58a967f7 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1523,8 +1523,6 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
 		return;
 	}
 
-	if (!kvm_vcpu_is_bsp(apic->vcpu))
-		value &= ~MSR_IA32_APICBASE_BSP;
 	vcpu->arch.apic_base = value;
 
 	/* update jump label if enable bit changes */
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 155534c0f5e8..ce741b8650f6 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1261,7 +1261,7 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
 
 	svm->vcpu.arch.apic_base = APIC_DEFAULT_PHYS_BASE |
 				   MSR_IA32_APICBASE_ENABLE;
-	if (kvm_vcpu_is_bsp(&svm->vcpu))
+	if (kvm_vcpu_is_reset_bsp(&svm->vcpu))
 		svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP;
 
 	svm_init_osvw(&svm->vcpu);
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index ddce07e8bef8..8c14d6a455b0 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -4706,7 +4706,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu)
 	vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val();
 	kvm_set_cr8(&vmx->vcpu, 0);
 	apic_base_msr.data = APIC_DEFAULT_PHYS_BASE | MSR_IA32_APICBASE_ENABLE;
-	if (kvm_vcpu_is_bsp(&vmx->vcpu))
+	if (kvm_vcpu_is_reset_bsp(&vmx->vcpu))
 		apic_base_msr.data |= MSR_IA32_APICBASE_BSP;
 	apic_base_msr.host_initiated = true;
 	kvm_set_apic_base(&vmx->vcpu, &apic_base_msr);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index a578629acb42..f7a78c62ab87 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -7269,7 +7269,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
 
 	vcpu->arch.pv.pv_unhalted = false;
 	vcpu->arch.emulate_ctxt.ops = &emulate_ops;
-	if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_bsp(vcpu))
+	if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_reset_bsp(vcpu))
 		vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
 	else
 		vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED;
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 27bd53b69080..82af5d0b996e 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -982,11 +982,16 @@ static inline int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
 #endif /* CONFIG_HAVE_KVM_EVENTFD */
 
 #ifdef CONFIG_KVM_APIC_ARCHITECTURE
-static inline bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu)
+static inline bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu)
 {
 	return vcpu->kvm->bsp_vcpu_id == vcpu->vcpu_id;
 }
 
+static inline bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu)
+{
+	return (vcpu->arch.apic_base & MSR_IA32_APICBASE_BSP) != 0;
+}
+
 bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu);
 
 #else
-- 
cgit v1.2.3


From ae561edeb421fbc24f97df7af8607c14009c16b2 Mon Sep 17 00:00:00 2001
From: Nadav Amit
Date: Thu, 2 Apr 2015 03:10:37 +0300
Subject: KVM: x86: DR0-DR3 are not clear on reset

DR0-DR3 are not cleared as they should during reset and when they are set from
userspace.  It appears to be caused by c77fb5fe6f03 ("KVM: x86: Allow the guest
to run with dirty debug registers").

Force their reload on these situations.

Signed-off-by: Nadav Amit <namit@cs.technion.ac.il>
Message-Id: <1427933438-12782-4-git-send-email-namit@cs.technion.ac.il>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  1 +
 arch/x86/kvm/x86.c              | 14 ++++++++++++++
 2 files changed, 15 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index d0cfdb08b4c2..9f1d66e2e3b5 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -340,6 +340,7 @@ struct kvm_pmu {
 enum {
 	KVM_DEBUGREG_BP_ENABLED = 1,
 	KVM_DEBUGREG_WONT_EXIT = 2,
+	KVM_DEBUGREG_RELOAD = 4,
 };
 
 struct kvm_vcpu_arch {
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index f7a78c62ab87..ad3809df7d0a 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -801,6 +801,17 @@ unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
 }
 EXPORT_SYMBOL_GPL(kvm_get_cr8);
 
+static void kvm_update_dr0123(struct kvm_vcpu *vcpu)
+{
+	int i;
+
+	if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) {
+		for (i = 0; i < KVM_NR_DB_REGS; i++)
+			vcpu->arch.eff_db[i] = vcpu->arch.db[i];
+		vcpu->arch.switch_db_regs |= KVM_DEBUGREG_RELOAD;
+	}
+}
+
 static void kvm_update_dr6(struct kvm_vcpu *vcpu)
 {
 	if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
@@ -3150,6 +3161,7 @@ static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
 		return -EINVAL;
 
 	memcpy(vcpu->arch.db, dbgregs->db, sizeof(vcpu->arch.db));
+	kvm_update_dr0123(vcpu);
 	vcpu->arch.dr6 = dbgregs->dr6;
 	kvm_update_dr6(vcpu);
 	vcpu->arch.dr7 = dbgregs->dr7;
@@ -6323,6 +6335,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 		set_debugreg(vcpu->arch.eff_db[2], 2);
 		set_debugreg(vcpu->arch.eff_db[3], 3);
 		set_debugreg(vcpu->arch.dr6, 6);
+		vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_RELOAD;
 	}
 
 	trace_kvm_entry(vcpu->vcpu_id);
@@ -7104,6 +7117,7 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu)
 	kvm_clear_exception_queue(vcpu);
 
 	memset(vcpu->arch.db, 0, sizeof(vcpu->arch.db));
+	kvm_update_dr0123(vcpu);
 	vcpu->arch.dr6 = DR6_INIT;
 	kvm_update_dr6(vcpu);
 	vcpu->arch.dr7 = DR7_FIXED_1;
-- 
cgit v1.2.3


From 1119022c71fb11826041787cf0ebffc1a1b0ba5b Mon Sep 17 00:00:00 2001
From: Nadav Amit
Date: Thu, 2 Apr 2015 03:10:38 +0300
Subject: KVM: x86: Clear CR2 on VCPU reset

CR2 is not cleared as it should after reset.  See Intel SDM table named "IA-32
Processor States Following Power-up, Reset, or INIT".

Signed-off-by: Nadav Amit <namit@cs.technion.ac.il>
Message-Id: <1427933438-12782-5-git-send-email-namit@cs.technion.ac.il>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/x86.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index ad3809df7d0a..faf044dba60c 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -7123,6 +7123,8 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu)
 	vcpu->arch.dr7 = DR7_FIXED_1;
 	kvm_update_dr7(vcpu);
 
+	vcpu->arch.cr2 = 0;
+
 	kvm_make_request(KVM_REQ_EVENT, vcpu);
 	vcpu->arch.apf.msr_val = 0;
 	vcpu->arch.st.msr_val = 0;
-- 
cgit v1.2.3


From 3ea3b7fa9af067982f34b6745584558821eea79d Mon Sep 17 00:00:00 2001
From: Wanpeng Li
Date: Fri, 3 Apr 2015 15:40:25 +0800
Subject: kvm: mmu: lazy collapse small sptes into large sptes

Dirty logging tracks sptes in 4k granularity, meaning that large sptes
have to be split.  If live migration is successful, the guest in the
source machine will be destroyed and large sptes will be created in the
destination. However, the guest continues to run in the source machine
(for example if live migration fails), small sptes will remain around
and cause bad performance.

This patch introduce lazy collapsing of small sptes into large sptes.
The rmap will be scanned in ioctl context when dirty logging is stopped,
dropping those sptes which can be collapsed into a single large-page spte.
Later page faults will create the large-page sptes.

Reviewed-by: Xiao Guangrong <guangrong.xiao@linux.intel.com>
Signed-off-by: Wanpeng Li <wanpeng.li@linux.intel.com>
Message-Id: <1428046825-6905-1-git-send-email-wanpeng.li@linux.intel.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  2 ++
 arch/x86/kvm/mmu.c              | 73 +++++++++++++++++++++++++++++++++++++++++
 arch/x86/kvm/x86.c              | 17 ++++++++++
 3 files changed, 92 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 9f1d66e2e3b5..dea2e7e962e3 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -867,6 +867,8 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
 void kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
 void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
 				      struct kvm_memory_slot *memslot);
+void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
+					struct kvm_memory_slot *memslot);
 void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
 				   struct kvm_memory_slot *memslot);
 void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm,
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index cee759299a35..146f295ee322 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -4465,6 +4465,79 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
 		kvm_flush_remote_tlbs(kvm);
 }
 
+static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm,
+		unsigned long *rmapp)
+{
+	u64 *sptep;
+	struct rmap_iterator iter;
+	int need_tlb_flush = 0;
+	pfn_t pfn;
+	struct kvm_mmu_page *sp;
+
+	for (sptep = rmap_get_first(*rmapp, &iter); sptep;) {
+		BUG_ON(!(*sptep & PT_PRESENT_MASK));
+
+		sp = page_header(__pa(sptep));
+		pfn = spte_to_pfn(*sptep);
+
+		/*
+		 * Only EPT supported for now; otherwise, one would need to
+		 * find out efficiently whether the guest page tables are
+		 * also using huge pages.
+		 */
+		if (sp->role.direct &&
+			!kvm_is_reserved_pfn(pfn) &&
+			PageTransCompound(pfn_to_page(pfn))) {
+			drop_spte(kvm, sptep);
+			sptep = rmap_get_first(*rmapp, &iter);
+			need_tlb_flush = 1;
+		} else
+			sptep = rmap_get_next(&iter);
+	}
+
+	return need_tlb_flush;
+}
+
+void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
+			struct kvm_memory_slot *memslot)
+{
+	bool flush = false;
+	unsigned long *rmapp;
+	unsigned long last_index, index;
+	gfn_t gfn_start, gfn_end;
+
+	spin_lock(&kvm->mmu_lock);
+
+	gfn_start = memslot->base_gfn;
+	gfn_end = memslot->base_gfn + memslot->npages - 1;
+
+	if (gfn_start >= gfn_end)
+		goto out;
+
+	rmapp = memslot->arch.rmap[0];
+	last_index = gfn_to_index(gfn_end, memslot->base_gfn,
+					PT_PAGE_TABLE_LEVEL);
+
+	for (index = 0; index <= last_index; ++index, ++rmapp) {
+		if (*rmapp)
+			flush |= kvm_mmu_zap_collapsible_spte(kvm, rmapp);
+
+		if (need_resched() || spin_needbreak(&kvm->mmu_lock)) {
+			if (flush) {
+				kvm_flush_remote_tlbs(kvm);
+				flush = false;
+			}
+			cond_resched_lock(&kvm->mmu_lock);
+		}
+	}
+
+	if (flush)
+		kvm_flush_remote_tlbs(kvm);
+
+out:
+	spin_unlock(&kvm->mmu_lock);
+}
+
 void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
 				   struct kvm_memory_slot *memslot)
 {
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index faf044dba60c..b8cb1d091697 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -7664,6 +7664,23 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
 	/* It's OK to get 'new' slot here as it has already been installed */
 	new = id_to_memslot(kvm->memslots, mem->slot);
 
+	/*
+	 * Dirty logging tracks sptes in 4k granularity, meaning that large
+	 * sptes have to be split.  If live migration is successful, the guest
+	 * in the source machine will be destroyed and large sptes will be
+	 * created in the destination. However, if the guest continues to run
+	 * in the source machine (for example if live migration fails), small
+	 * sptes will remain around and cause bad performance.
+	 *
+	 * Scan sptes if dirty logging has been stopped, dropping those
+	 * which can be collapsed into a single large-page spte.  Later
+	 * page faults will create the large-page sptes.
+	 */
+	if ((change != KVM_MR_DELETE) &&
+		(old->flags & KVM_MEM_LOG_DIRTY_PAGES) &&
+		!(new->flags & KVM_MEM_LOG_DIRTY_PAGES))
+		kvm_mmu_zap_collapsible_sptes(kvm, new);
+
 	/*
 	 * Set up write protection and/or dirty logging for the new slot.
 	 *
-- 
cgit v1.2.3


From cae2a173fe94ab3a437416af6f092fae2e65837e Mon Sep 17 00:00:00 2001
From: Linus Torvalds
Date: Mon, 6 Apr 2015 10:26:17 -0700
Subject: x86: clean up/fix 'copy_in_user()' tail zeroing

The rule for 'copy_from_user()' is that it zeroes the remaining kernel
buffer even when the copy fails halfway, just to make sure that we don't
leave uninitialized kernel memory around.  Because even if we check for
errors, some kernel buffers stay around after thge copy (think page
cache).

However, the x86-64 logic for user copies uses a copy_user_generic()
function for all the cases, that set the "zerorest" flag for any fault
on the source buffer.  Which meant that it didn't just try to clear the
kernel buffer after a failure in copy_from_user(), it also tried to
clear the destination user buffer for the "copy_in_user()" case.

Not only is that pointless, it also means that the clearing code has to
worry about the tail clearing taking page faults for the user buffer
case.  Which is just stupid, since that case shouldn't happen in the
first place.

Get rid of the whole "zerorest" thing entirely, and instead just check
if the destination is in kernel space or not.  And then just use
memset() to clear the tail of the kernel buffer if necessary.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/include/asm/uaccess_64.h |  2 +-
 arch/x86/lib/usercopy_64.c        | 15 +++++++--------
 2 files changed, 8 insertions(+), 9 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h
index 12a26b979bf1..f2f9b39b274a 100644
--- a/arch/x86/include/asm/uaccess_64.h
+++ b/arch/x86/include/asm/uaccess_64.h
@@ -231,6 +231,6 @@ __copy_from_user_inatomic_nocache(void *dst, const void __user *src,
 }
 
 unsigned long
-copy_user_handle_tail(char *to, char *from, unsigned len, unsigned zerorest);
+copy_user_handle_tail(char *to, char *from, unsigned len);
 
 #endif /* _ASM_X86_UACCESS_64_H */
diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c
index c905e89e19fe..1f33b3d1fd68 100644
--- a/arch/x86/lib/usercopy_64.c
+++ b/arch/x86/lib/usercopy_64.c
@@ -69,21 +69,20 @@ EXPORT_SYMBOL(copy_in_user);
  * it is not necessary to optimize tail handling.
  */
 __visible unsigned long
-copy_user_handle_tail(char *to, char *from, unsigned len, unsigned zerorest)
+copy_user_handle_tail(char *to, char *from, unsigned len)
 {
-	char c;
-	unsigned zero_len;
-
 	for (; len; --len, to++) {
+		char c;
+
 		if (__get_user_nocheck(c, from++, sizeof(char)))
 			break;
 		if (__put_user_nocheck(c, to, sizeof(char)))
 			break;
 	}
-
-	for (c = 0, zero_len = len; zerorest && zero_len; --zero_len)
-		if (__put_user_nocheck(c, to++, sizeof(char)))
-			break;
 	clac();
+
+	/* If the destination is a kernel buffer, we always clear the end */
+	if ((unsigned long)to >= TASK_SIZE_MAX)
+		memset(to, 0, len);
 	return len;
 }
-- 
cgit v1.2.3


From 55dd0df781e58ec23d218376ea4a676e7362a98c Mon Sep 17 00:00:00 2001
From: Anton Blanchard
Date: Thu, 9 Apr 2015 13:51:30 +1000
Subject: jump_label: Allow asm/jump_label.h to be included in assembly

Wrap asm/jump_label.h for all archs with #ifndef __ASSEMBLY__.
Since these are kernel only headers, we don't need #ifdef
__KERNEL__ so can simplify things a bit.

If an architecture wants to use jump labels in assembly, it
will still need to define a macro to create the __jump_table
entries (see ARCH_STATIC_BRANCH in the powerpc asm/jump_label.h
for an example).

Signed-off-by: Anton Blanchard <anton@samba.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: benh@kernel.crashing.org
Cc: catalin.marinas@arm.com
Cc: davem@davemloft.net
Cc: heiko.carstens@de.ibm.com
Cc: jbaron@akamai.com
Cc: linux@arm.linux.org.uk
Cc: linuxppc-dev@lists.ozlabs.org
Cc: liuj97@gmail.com
Cc: mgorman@suse.de
Cc: mmarek@suse.cz
Cc: mpe@ellerman.id.au
Cc: paulus@samba.org
Cc: ralf@linux-mips.org
Cc: rostedt@goodmis.org
Cc: schwidefsky@de.ibm.com
Cc: will.deacon@arm.com
Link: http://lkml.kernel.org/r/1428551492-21977-1-git-send-email-anton@samba.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/arm/include/asm/jump_label.h   | 5 ++---
 arch/arm64/include/asm/jump_label.h | 8 ++++----
 arch/mips/include/asm/jump_label.h  | 7 +++----
 arch/s390/include/asm/jump_label.h  | 3 +++
 arch/sparc/include/asm/jump_label.h | 5 ++---
 arch/x86/include/asm/jump_label.h   | 5 ++---
 6 files changed, 16 insertions(+), 17 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/arm/include/asm/jump_label.h b/arch/arm/include/asm/jump_label.h
index 70f9b9bfb1f9..5f337dc5c108 100644
--- a/arch/arm/include/asm/jump_label.h
+++ b/arch/arm/include/asm/jump_label.h
@@ -1,7 +1,7 @@
 #ifndef _ASM_ARM_JUMP_LABEL_H
 #define _ASM_ARM_JUMP_LABEL_H
 
-#ifdef __KERNEL__
+#ifndef __ASSEMBLY__
 
 #include <linux/types.h>
 
@@ -27,8 +27,6 @@ l_yes:
 	return true;
 }
 
-#endif /* __KERNEL__ */
-
 typedef u32 jump_label_t;
 
 struct jump_entry {
@@ -37,4 +35,5 @@ struct jump_entry {
 	jump_label_t key;
 };
 
+#endif  /* __ASSEMBLY__ */
 #endif
diff --git a/arch/arm64/include/asm/jump_label.h b/arch/arm64/include/asm/jump_label.h
index 076a1c714049..c0e5165c2f76 100644
--- a/arch/arm64/include/asm/jump_label.h
+++ b/arch/arm64/include/asm/jump_label.h
@@ -18,11 +18,12 @@
  */
 #ifndef __ASM_JUMP_LABEL_H
 #define __ASM_JUMP_LABEL_H
+
+#ifndef __ASSEMBLY__
+
 #include <linux/types.h>
 #include <asm/insn.h>
 
-#ifdef __KERNEL__
-
 #define JUMP_LABEL_NOP_SIZE		AARCH64_INSN_SIZE
 
 static __always_inline bool arch_static_branch(struct static_key *key)
@@ -39,8 +40,6 @@ l_yes:
 	return true;
 }
 
-#endif /* __KERNEL__ */
-
 typedef u64 jump_label_t;
 
 struct jump_entry {
@@ -49,4 +48,5 @@ struct jump_entry {
 	jump_label_t key;
 };
 
+#endif  /* __ASSEMBLY__ */
 #endif	/* __ASM_JUMP_LABEL_H */
diff --git a/arch/mips/include/asm/jump_label.h b/arch/mips/include/asm/jump_label.h
index fdbff44e5482..608aa57799c8 100644
--- a/arch/mips/include/asm/jump_label.h
+++ b/arch/mips/include/asm/jump_label.h
@@ -8,9 +8,9 @@
 #ifndef _ASM_MIPS_JUMP_LABEL_H
 #define _ASM_MIPS_JUMP_LABEL_H
 
-#include <linux/types.h>
+#ifndef __ASSEMBLY__
 
-#ifdef __KERNEL__
+#include <linux/types.h>
 
 #define JUMP_LABEL_NOP_SIZE 4
 
@@ -39,8 +39,6 @@ l_yes:
 	return true;
 }
 
-#endif /* __KERNEL__ */
-
 #ifdef CONFIG_64BIT
 typedef u64 jump_label_t;
 #else
@@ -53,4 +51,5 @@ struct jump_entry {
 	jump_label_t key;
 };
 
+#endif  /* __ASSEMBLY__ */
 #endif /* _ASM_MIPS_JUMP_LABEL_H */
diff --git a/arch/s390/include/asm/jump_label.h b/arch/s390/include/asm/jump_label.h
index 58642fd29c87..2b77e235b5fb 100644
--- a/arch/s390/include/asm/jump_label.h
+++ b/arch/s390/include/asm/jump_label.h
@@ -1,6 +1,8 @@
 #ifndef _ASM_S390_JUMP_LABEL_H
 #define _ASM_S390_JUMP_LABEL_H
 
+#ifndef __ASSEMBLY__
+
 #include <linux/types.h>
 
 #define JUMP_LABEL_NOP_SIZE 6
@@ -39,4 +41,5 @@ struct jump_entry {
 	jump_label_t key;
 };
 
+#endif  /* __ASSEMBLY__ */
 #endif
diff --git a/arch/sparc/include/asm/jump_label.h b/arch/sparc/include/asm/jump_label.h
index ec2e2e2aba7d..cc9b04a2b11b 100644
--- a/arch/sparc/include/asm/jump_label.h
+++ b/arch/sparc/include/asm/jump_label.h
@@ -1,7 +1,7 @@
 #ifndef _ASM_SPARC_JUMP_LABEL_H
 #define _ASM_SPARC_JUMP_LABEL_H
 
-#ifdef __KERNEL__
+#ifndef __ASSEMBLY__
 
 #include <linux/types.h>
 
@@ -22,8 +22,6 @@ l_yes:
 	return true;
 }
 
-#endif /* __KERNEL__ */
-
 typedef u32 jump_label_t;
 
 struct jump_entry {
@@ -32,4 +30,5 @@ struct jump_entry {
 	jump_label_t key;
 };
 
+#endif  /* __ASSEMBLY__ */
 #endif
diff --git a/arch/x86/include/asm/jump_label.h b/arch/x86/include/asm/jump_label.h
index 6a2cefb4395a..a4c1cf7e93f8 100644
--- a/arch/x86/include/asm/jump_label.h
+++ b/arch/x86/include/asm/jump_label.h
@@ -1,7 +1,7 @@
 #ifndef _ASM_X86_JUMP_LABEL_H
 #define _ASM_X86_JUMP_LABEL_H
 
-#ifdef __KERNEL__
+#ifndef __ASSEMBLY__
 
 #include <linux/stringify.h>
 #include <linux/types.h>
@@ -30,8 +30,6 @@ l_yes:
 	return true;
 }
 
-#endif /* __KERNEL__ */
-
 #ifdef CONFIG_X86_64
 typedef u64 jump_label_t;
 #else
@@ -44,4 +42,5 @@ struct jump_entry {
 	jump_label_t key;
 };
 
+#endif  /* __ASSEMBLY__ */
 #endif
-- 
cgit v1.2.3


From 31f0119b817f6474a7b4c48fed7588af1b62c543 Mon Sep 17 00:00:00 2001
From: Denys Vlasenko
Date: Tue, 7 Apr 2015 22:43:37 +0200
Subject: x86/asm/entry/64: Use common code for rt_sigreturn() epilogue

Similarly to stub_execve, we can reuse the epilogue in
stub_rt_sigreturn() and stub_x32_rt_sigreturn().

Add a comment explaining why we can't eliminage SAVE_EXTRA_REGS
here.

Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Alexei Starovoitov <ast@plumgrid.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Will Drewry <wad@chromium.org>
Link: http://lkml.kernel.org/r/1428439424-7258-1-git-send-email-dvlasenk@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/entry_64.S | 26 ++++++++++++++++----------
 1 file changed, 16 insertions(+), 10 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 3197f41ce320..5252e6021826 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -527,12 +527,21 @@ END(stub_execveat)
  */
 ENTRY(stub_rt_sigreturn)
 	CFI_STARTPROC
-	addq $8, %rsp
-	DEFAULT_FRAME 0
-	SAVE_EXTRA_REGS
+	DEFAULT_FRAME 0, 8
+	/*
+	 * SAVE_EXTRA_REGS result is not normally needed:
+	 * sigreturn overwrites all pt_regs->GPREGS.
+	 * But sigreturn can fail (!), and there is no easy way to detect that.
+	 * To make sure RESTORE_EXTRA_REGS doesn't restore garbage on error,
+	 * we SAVE_EXTRA_REGS here.
+	 */
+	SAVE_EXTRA_REGS 8
 	call sys_rt_sigreturn
-	movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer
+return_from_stub:
+	addq	$8, %rsp
+	CFI_ADJUST_CFA_OFFSET -8
 	RESTORE_EXTRA_REGS
+	movq %rax,RAX(%rsp)
 	jmp int_ret_from_sys_call
 	CFI_ENDPROC
 END(stub_rt_sigreturn)
@@ -540,13 +549,10 @@ END(stub_rt_sigreturn)
 #ifdef CONFIG_X86_X32_ABI
 ENTRY(stub_x32_rt_sigreturn)
 	CFI_STARTPROC
-	addq $8, %rsp
-	DEFAULT_FRAME 0
-	SAVE_EXTRA_REGS
+	DEFAULT_FRAME 0, 8
+	SAVE_EXTRA_REGS 8
 	call sys32_x32_rt_sigreturn
-	movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer
-	RESTORE_EXTRA_REGS
-	jmp int_ret_from_sys_call
+	jmp  return_from_stub
 	CFI_ENDPROC
 END(stub_x32_rt_sigreturn)
 
-- 
cgit v1.2.3


From 05f1752d195c145d02ae40881d0985c2cfbee473 Mon Sep 17 00:00:00 2001
From: Denys Vlasenko
Date: Tue, 7 Apr 2015 22:43:38 +0200
Subject: x86/asm/entry/64: Move stub_x32_execvecloser() to stub_execveat()

This is a preparatory patch for moving stub32_execve[at]() to this
file. It makes sense to have all execve stubs in one place, so
that they can reuse code.

Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Alexei Starovoitov <ast@plumgrid.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Will Drewry <wad@chromium.org>
Link: http://lkml.kernel.org/r/1428439424-7258-2-git-send-email-dvlasenk@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/entry_64.S | 34 +++++++++++++++++-----------------
 1 file changed, 17 insertions(+), 17 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 5252e6021826..f7d9ba6f73a3 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -521,6 +521,23 @@ ENTRY(stub_execveat)
 	CFI_ENDPROC
 END(stub_execveat)
 
+#ifdef CONFIG_X86_X32_ABI
+ENTRY(stub_x32_execve)
+	CFI_STARTPROC
+	DEFAULT_FRAME 0, 8
+	call	compat_sys_execve
+	jmp	return_from_execve
+	CFI_ENDPROC
+END(stub_x32_execve)
+ENTRY(stub_x32_execveat)
+	CFI_STARTPROC
+	DEFAULT_FRAME 0, 8
+	call	compat_sys_execveat
+	jmp	return_from_execve
+	CFI_ENDPROC
+END(stub_x32_execveat)
+#endif
+
 /*
  * sigreturn is special because it needs to restore all registers on return.
  * This cannot be done with SYSRET, so use the IRET return path instead.
@@ -555,23 +572,6 @@ ENTRY(stub_x32_rt_sigreturn)
 	jmp  return_from_stub
 	CFI_ENDPROC
 END(stub_x32_rt_sigreturn)
-
-ENTRY(stub_x32_execve)
-	CFI_STARTPROC
-	DEFAULT_FRAME 0, 8
-	call	compat_sys_execve
-	jmp	return_from_execve
-	CFI_ENDPROC
-END(stub_x32_execve)
-
-ENTRY(stub_x32_execveat)
-	CFI_STARTPROC
-	DEFAULT_FRAME 0, 8
-	call	compat_sys_execveat
-	jmp	return_from_execve
-	CFI_ENDPROC
-END(stub_x32_execveat)
-
 #endif
 
 /*
-- 
cgit v1.2.3


From 0f90fb979d7b53d80a6d5cb6e127b4b4b249907e Mon Sep 17 00:00:00 2001
From: Denys Vlasenko
Date: Tue, 7 Apr 2015 22:43:39 +0200
Subject: x86/asm/entry: Zero EXTRA_REGS for stub32_execve() too

The change which affected how execve clears EXTRA_REGS missed
32-bit execve syscalls.

Fix this by using 64-bit execve stub epilogue for them too.

Run-tested.

Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Alexei Starovoitov <ast@plumgrid.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Will Drewry <wad@chromium.org>
Link: http://lkml.kernel.org/r/1428439424-7258-3-git-send-email-dvlasenk@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/ia32/ia32entry.S  |  2 --
 arch/x86/kernel/entry_64.S | 15 +++++++++++++++
 2 files changed, 15 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index 5d8f987a340d..a821b1cd4fa7 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -571,8 +571,6 @@ GLOBAL(\label)
 
 	PTREGSCALL stub32_rt_sigreturn, sys32_rt_sigreturn
 	PTREGSCALL stub32_sigreturn, sys32_sigreturn
-	PTREGSCALL stub32_execve, compat_sys_execve
-	PTREGSCALL stub32_execveat, compat_sys_execveat
 	PTREGSCALL stub32_fork, sys_fork
 	PTREGSCALL stub32_vfork, sys_vfork
 
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index f7d9ba6f73a3..5380b3a8f3e5 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -538,6 +538,21 @@ ENTRY(stub_x32_execveat)
 END(stub_x32_execveat)
 #endif
 
+#ifdef CONFIG_IA32_EMULATION
+ENTRY(stub32_execve)
+	CFI_STARTPROC
+	call	compat_sys_execve
+	jmp	return_from_execve
+	CFI_ENDPROC
+END(stub32_execve)
+ENTRY(stub32_execveat)
+	CFI_STARTPROC
+	call	compat_sys_execveat
+	jmp	return_from_execve
+	CFI_ENDPROC
+END(stub32_execveat)
+#endif
+
 /*
  * sigreturn is special because it needs to restore all registers on return.
  * This cannot be done with SYSRET, so use the IRET return path instead.
-- 
cgit v1.2.3


From 772951c4e4b06cdffeff499259dba07b544f3166 Mon Sep 17 00:00:00 2001
From: Denys Vlasenko
Date: Tue, 7 Apr 2015 22:43:40 +0200
Subject: x86/asm/entry/64: Optimize [v]fork/clone stubs

Replace "call func; ret" with "jmp func".

Run-tested.

Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Alexei Starovoitov <ast@plumgrid.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Will Drewry <wad@chromium.org>
Link: http://lkml.kernel.org/r/1428439424-7258-4-git-send-email-dvlasenk@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/entry_64.S | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 5380b3a8f3e5..ce852565443a 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -484,8 +484,7 @@ ENTRY(stub_\func)
 	CFI_STARTPROC
 	DEFAULT_FRAME 0, 8		/* offset 8: return address */
 	SAVE_EXTRA_REGS 8
-	call sys_\func
-	ret
+	jmp sys_\func
 	CFI_ENDPROC
 END(stub_\func)
 	.endm
-- 
cgit v1.2.3


From a30b0085f54efae11f6256df4e4a16af7eefc1c4 Mon Sep 17 00:00:00 2001
From: Denys Vlasenko
Date: Tue, 7 Apr 2015 22:43:41 +0200
Subject: x86/asm/entry/64: Remove a redundant jump

Jumping to the very next instruction is not very useful:

        jmp label
    label:

Removing the jump.

Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Alexei Starovoitov <ast@plumgrid.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Will Drewry <wad@chromium.org>
Link: http://lkml.kernel.org/r/1428439424-7258-5-git-send-email-dvlasenk@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/entry_64.S | 1 -
 1 file changed, 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index ce852565443a..e8ddd5196ce7 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1448,7 +1448,6 @@ ENTRY(nmi)
 	/* If it is below the NMI stack, it is a normal NMI */
 	jb	first_nmi
 	/* Ah, it is within the NMI stack, treat it as nested */
-	jmp	nested_nmi
 
 	CFI_REMEMBER_STATE
 
-- 
cgit v1.2.3


From 66ad4efa51805964521db03d8aa827a8dd9058b9 Mon Sep 17 00:00:00 2001
From: Denys Vlasenko
Date: Tue, 7 Apr 2015 22:43:42 +0200
Subject: x86/asm/entry/64: Simplify jumps in ret_from_fork

Replace
        test
        jz  1f
        jmp label
    1:

with
        test
        jnz label

Run-tested.

Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Alexei Starovoitov <ast@plumgrid.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Will Drewry <wad@chromium.org>
Link: http://lkml.kernel.org/r/1428439424-7258-6-git-send-email-dvlasenk@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/entry_64.S | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index e8ddd5196ce7..a35e5e4435ef 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -608,18 +608,18 @@ ENTRY(ret_from_fork)
 	RESTORE_EXTRA_REGS
 
 	testl $3,CS(%rsp)			# from kernel_thread?
-	jz   1f
 
 	/*
 	 * By the time we get here, we have no idea whether our pt_regs,
 	 * ti flags, and ti status came from the 64-bit SYSCALL fast path,
 	 * the slow path, or one of the ia32entry paths.
-	 * Use int_ret_from_sys_call to return, since it can safely handle
+	 * Use IRET code path to return, since it can safely handle
 	 * all of the above.
 	 */
-	jmp  int_ret_from_sys_call
+	jnz	int_ret_from_sys_call
 
-1:
+	/* We came from kernel_thread */
+	/* nb: we depend on RESTORE_EXTRA_REGS above */
 	movq %rbp, %rdi
 	call *%rbx
 	movl $0, RAX(%rsp)
-- 
cgit v1.2.3


From 54a81e914b2432a86dd49cf611b0f71ef44ca7ad Mon Sep 17 00:00:00 2001
From: Denys Vlasenko
Date: Tue, 7 Apr 2015 22:43:43 +0200
Subject: x86/asm/entry/64: Remove GET_THREAD_INFO() in ret_from_fork

It used to be used to check for _TIF_IA32, but the check has
been removed.

Remove GET_THREAD_INFO() too.

Run-tested.

Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Alexei Starovoitov <ast@plumgrid.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Will Drewry <wad@chromium.org>
Link: http://lkml.kernel.org/r/1428439424-7258-7-git-send-email-dvlasenk@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/entry_64.S | 2 --
 1 file changed, 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index a35e5e4435ef..b67f2fc0d160 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -603,8 +603,6 @@ ENTRY(ret_from_fork)
 
 	call schedule_tail			# rdi: 'prev' task parameter
 
-	GET_THREAD_INFO(%rcx)
-
 	RESTORE_EXTRA_REGS
 
 	testl $3,CS(%rsp)			# from kernel_thread?
-- 
cgit v1.2.3


From a37f34a325d90856314ccd4994e1070dcc6bdcc4 Mon Sep 17 00:00:00 2001
From: Denys Vlasenko
Date: Tue, 7 Apr 2015 22:43:44 +0200
Subject: x86/asm/entry/64: Reduce padding in execve stubs

execve stubs are 7 bytes only. Padding them to 16 bytes is a
waste.

   text	   data	    bss	    dec	    hex	filename
  12594	      0	      0	  12594	   3132	entry_64.o.before
  12530	      0	      0	  12530	   30f2	entry_64.o

Run-tested.

Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Alexei Starovoitov <ast@plumgrid.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Will Drewry <wad@chromium.org>
Link: http://lkml.kernel.org/r/1428439424-7258-8-git-send-email-dvlasenk@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/entry_64.S | 20 ++++++++++++++------
 1 file changed, 14 insertions(+), 6 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index b67f2fc0d160..c7b238494b31 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -511,8 +511,12 @@ return_from_execve:
 	jmp	int_ret_from_sys_call
 	CFI_ENDPROC
 END(stub_execve)
-
-ENTRY(stub_execveat)
+/*
+ * Remaining execve stubs are only 7 bytes long.
+ * ENTRY() often aligns to 16 bytes, which in this case has no benefits.
+ */
+	.align	8
+GLOBAL(stub_execveat)
 	CFI_STARTPROC
 	DEFAULT_FRAME 0, 8
 	call	sys_execveat
@@ -521,14 +525,16 @@ ENTRY(stub_execveat)
 END(stub_execveat)
 
 #ifdef CONFIG_X86_X32_ABI
-ENTRY(stub_x32_execve)
+	.align	8
+GLOBAL(stub_x32_execve)
 	CFI_STARTPROC
 	DEFAULT_FRAME 0, 8
 	call	compat_sys_execve
 	jmp	return_from_execve
 	CFI_ENDPROC
 END(stub_x32_execve)
-ENTRY(stub_x32_execveat)
+	.align	8
+GLOBAL(stub_x32_execveat)
 	CFI_STARTPROC
 	DEFAULT_FRAME 0, 8
 	call	compat_sys_execveat
@@ -538,13 +544,15 @@ END(stub_x32_execveat)
 #endif
 
 #ifdef CONFIG_IA32_EMULATION
-ENTRY(stub32_execve)
+	.align	8
+GLOBAL(stub32_execve)
 	CFI_STARTPROC
 	call	compat_sys_execve
 	jmp	return_from_execve
 	CFI_ENDPROC
 END(stub32_execve)
-ENTRY(stub32_execveat)
+	.align	8
+GLOBAL(stub32_execveat)
 	CFI_STARTPROC
 	call	compat_sys_execveat
 	jmp	return_from_execve
-- 
cgit v1.2.3


From b44915927ca88084a7292e4ddd4cf91036f365e1 Mon Sep 17 00:00:00 2001
From: Aravind Gopalakrishnan
Date: Thu, 9 Apr 2015 10:51:48 +0200
Subject: x86/iommu: Fix header comments regarding standard and _FINISH macros

The comment line regarding IOMMU_INIT and IOMMU_INIT_FINISH
macros is incorrect:

  "The standard vs the _FINISH differs in that the _FINISH variant
  will continue detecting other IOMMUs in the call list..."

It should be "..the *standard* variant will continue
detecting..."

Fix that. Also, make it readable while at it.

Signed-off-by: Aravind Gopalakrishnan <Aravind.Gopalakrishnan@amd.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: konrad.wilk@oracle.com
Fixes: 6e9636693373 ("x86, iommu: Update header comments with appropriate naming")
Link: http://lkml.kernel.org/r/1428508017-5316-1-git-send-email-Aravind.Gopalakrishnan@amd.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/include/asm/iommu_table.h | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/iommu_table.h b/arch/x86/include/asm/iommu_table.h
index f42a04735a0a..e37d6b3ad983 100644
--- a/arch/x86/include/asm/iommu_table.h
+++ b/arch/x86/include/asm/iommu_table.h
@@ -79,11 +79,12 @@ struct iommu_table_entry {
  *  d). Similar to the 'init', except that this gets called from pci_iommu_init
  *      where we do have a memory allocator.
  *
- * The standard vs the _FINISH differs in that the _FINISH variant will
- * continue detecting other IOMMUs in the call list after the
- * the detection routine returns a positive number. The _FINISH will
- * stop the execution chain. Both will still call the 'init' and
- * 'late_init' functions if they are set.
+ * The standard IOMMU_INIT differs from the IOMMU_INIT_FINISH variant
+ * in that the former will continue detecting other IOMMUs in the call
+ * list after the detection routine returns a positive number, while the
+ * latter will stop the execution chain upon first successful detection.
+ * Both variants will still call the 'init' and 'late_init' functions if
+ * they are set.
  */
 #define IOMMU_INIT_FINISH(_detect, _depend, _init, _late_init)		\
 	__IOMMU_INIT(_detect, _depend, _init, _late_init, 1)
-- 
cgit v1.2.3


From 7a4e017041136de05527722b97e0c1f8702a5cbe Mon Sep 17 00:00:00 2001
From: Mike Travis
Date: Thu, 9 Apr 2015 13:26:29 -0500
Subject: x86/apic/uv: Update the APIC UV OEM check

Optimize the first "SGI" OEM check to return faster if the
system is not an SGI or UV system.

Signed-off-by: Mike Travis <travis@sgi.com>
Acked-by: Hedi Berriche <hedi@sgi.com>
Acked-by: Dimitri Sivanich <sivanich@sgi.com>
Link: http://lkml.kernel.org/r/20150409182628.952357922@asylum.americas.sgi.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/apic/x2apic_uv_x.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index 8e9dcfd630e4..2a739a90010e 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -146,6 +146,9 @@ static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
 {
 	int pnodeid, is_uv1, is_uv2, is_uv3;
 
+	if (strncmp(oem_id, "SGI", 3) != 0)
+		return 0;
+
 	is_uv1 = !strcmp(oem_id, "SGI");
 	is_uv2 = !strcmp(oem_id, "SGI2");
 	is_uv3 = !strncmp(oem_id, "SGI3", 4);	/* there are varieties of UV3 */
-- 
cgit v1.2.3


From 379b97e280971ef7673db5166c7e0f7ab49b81de Mon Sep 17 00:00:00 2001
From: Mike Travis
Date: Thu, 9 Apr 2015 13:26:30 -0500
Subject: x86/apic/uv: Update the UV APIC driver check

Fix a bug in the OEM check function that determines if the
system is a UV system and the BIOS is compatible with the
kernel's UV apic driver.  This prevents some possibly obscure
panics and guards the system against being started on SGI
hardware that does not have the required kernel support.

Signed-off-by: Mike Travis <travis@sgi.com>
Acked-by: Hedi Berriche <hedi@sgi.com>
Acked-by: Dimitri Sivanich <sivanich@sgi.com>
Link: http://lkml.kernel.org/r/20150409182629.112998930@asylum.americas.sgi.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/apic/x2apic_uv_x.c | 74 +++++++++++++++++++++++++-------------
 1 file changed, 49 insertions(+), 25 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index 2a739a90010e..e18962c8b007 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -144,36 +144,60 @@ static void __init uv_set_apicid_hibit(void)
 
 static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
 {
-	int pnodeid, is_uv1, is_uv2, is_uv3;
+	int pnodeid;
+	int uv_apic;
 
 	if (strncmp(oem_id, "SGI", 3) != 0)
 		return 0;
 
-	is_uv1 = !strcmp(oem_id, "SGI");
-	is_uv2 = !strcmp(oem_id, "SGI2");
-	is_uv3 = !strncmp(oem_id, "SGI3", 4);	/* there are varieties of UV3 */
-	if (is_uv1 || is_uv2 || is_uv3) {
-		uv_hub_info->hub_revision =
-			(is_uv1 ? UV1_HUB_REVISION_BASE :
-			(is_uv2 ? UV2_HUB_REVISION_BASE :
-				  UV3_HUB_REVISION_BASE));
-		pnodeid = early_get_pnodeid();
-		early_get_apic_pnode_shift();
-		x86_platform.is_untracked_pat_range =  uv_is_untracked_pat_range;
-		x86_platform.nmi_init = uv_nmi_init;
-		if (!strcmp(oem_table_id, "UVL"))
-			uv_system_type = UV_LEGACY_APIC;
-		else if (!strcmp(oem_table_id, "UVX"))
-			uv_system_type = UV_X2APIC;
-		else if (!strcmp(oem_table_id, "UVH")) {
-			__this_cpu_write(x2apic_extra_bits,
-				pnodeid << uvh_apicid.s.pnode_shift);
-			uv_system_type = UV_NON_UNIQUE_APIC;
-			uv_set_apicid_hibit();
-			return 1;
-		}
+	/*
+	 * Determine UV arch type.
+	 *   SGI: UV100/1000
+	 *   SGI2: UV2000/3000
+	 *   SGI3: UV300 (truncated to 4 chars because of different varieties)
+	 */
+	uv_hub_info->hub_revision =
+		!strncmp(oem_id, "SGI3", 4) ? UV3_HUB_REVISION_BASE :
+		!strcmp(oem_id, "SGI2") ? UV2_HUB_REVISION_BASE :
+		!strcmp(oem_id, "SGI") ? UV1_HUB_REVISION_BASE : 0;
+
+	if (uv_hub_info->hub_revision == 0)
+		goto badbios;
+
+	pnodeid = early_get_pnodeid();
+	early_get_apic_pnode_shift();
+	x86_platform.is_untracked_pat_range =  uv_is_untracked_pat_range;
+	x86_platform.nmi_init = uv_nmi_init;
+
+	if (!strcmp(oem_table_id, "UVX")) {		/* most common */
+		uv_system_type = UV_X2APIC;
+		uv_apic = 0;
+
+	} else if (!strcmp(oem_table_id, "UVH")) {	/* only UV1 systems */
+		uv_system_type = UV_NON_UNIQUE_APIC;
+		__this_cpu_write(x2apic_extra_bits,
+			pnodeid << uvh_apicid.s.pnode_shift);
+		uv_set_apicid_hibit();
+		uv_apic = 1;
+
+	} else	if (!strcmp(oem_table_id, "UVL")) {	/* only used for */
+		uv_system_type = UV_LEGACY_APIC;	/* very small systems */
+		uv_apic = 0;
+
+	} else {
+		goto badbios;
 	}
-	return 0;
+
+	pr_info("UV: OEM IDs %s/%s, System/HUB Types %d/%d, uv_apic %d\n",
+		oem_id, oem_table_id, uv_system_type,
+		uv_min_hub_revision_id, uv_apic);
+
+	return uv_apic;
+
+badbios:
+	pr_err("UV: OEM_ID:%s OEM_TABLE_ID:%s\n", oem_id, oem_table_id);
+	pr_err("Current BIOS not supported, update kernel and/or BIOS\n");
+	BUG();
 }
 
 enum uv_system_type get_uv_system_type(void)
-- 
cgit v1.2.3


From 1912c7afa39d2683a574011ff455fe49ada8016c Mon Sep 17 00:00:00 2001
From: Mike Travis
Date: Thu, 9 Apr 2015 13:26:31 -0500
Subject: x86/apic/uv: Update the UV APIC HUB check

Update the check for UV2000/3000.  Note when the HUB is not recognized.

Signed-off-by: Mike Travis <travis@sgi.com>
Acked-by: Hedi Berriche <hedi@sgi.com>
Acked-by: Dimitri Sivanich <sivanich@sgi.com>
Link: http://lkml.kernel.org/r/20150409182629.267239403@asylum.americas.sgi.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/apic/x2apic_uv_x.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index e18962c8b007..c8d92950bc04 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -881,10 +881,14 @@ void __init uv_system_init(void)
 	unsigned long mmr_base, present, paddr;
 	unsigned short pnode_mask;
 	unsigned char n_lshift;
-	char *hub = (is_uv1_hub() ? "UV1" :
-		    (is_uv2_hub() ? "UV2" :
-				    "UV3"));
+	char *hub = (is_uv1_hub() ? "UV100/1000" :
+		    (is_uv2_hub() ? "UV2000/3000" :
+		    (is_uv3_hub() ? "UV300" : NULL)));
 
+	if (!hub) {
+		pr_err("UV: Unknown/unsupported UV hub\n");
+		return;
+	}
 	pr_info("UV: Found %s hub\n", hub);
 	map_low_mmrs();
 
-- 
cgit v1.2.3


From 824b43763c562ee2feec16bb4017785528f3b54c Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel
Date: Thu, 9 Apr 2015 12:55:46 +0200
Subject: crypto: x86/sha1_ssse3 - move SHA-1 SSSE3 implementation to base
 layer

This removes all the boilerplate from the existing implementation,
and replaces it with calls into the base layer.

Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/sha1_ssse3_glue.c | 139 ++++++++------------------------------
 1 file changed, 28 insertions(+), 111 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/crypto/sha1_ssse3_glue.c b/arch/x86/crypto/sha1_ssse3_glue.c
index 6c20fe04a738..33d1b9dc14cc 100644
--- a/arch/x86/crypto/sha1_ssse3_glue.c
+++ b/arch/x86/crypto/sha1_ssse3_glue.c
@@ -28,7 +28,7 @@
 #include <linux/cryptohash.h>
 #include <linux/types.h>
 #include <crypto/sha.h>
-#include <asm/byteorder.h>
+#include <crypto/sha1_base.h>
 #include <asm/i387.h>
 #include <asm/xcr.h>
 #include <asm/xsave.h>
@@ -44,132 +44,51 @@ asmlinkage void sha1_transform_avx(u32 *digest, const char *data,
 #define SHA1_AVX2_BLOCK_OPTSIZE	4	/* optimal 4*64 bytes of SHA1 blocks */
 
 asmlinkage void sha1_transform_avx2(u32 *digest, const char *data,
-				unsigned int rounds);
+				    unsigned int rounds);
 #endif
 
-static asmlinkage void (*sha1_transform_asm)(u32 *, const char *, unsigned int);
-
-
-static int sha1_ssse3_init(struct shash_desc *desc)
-{
-	struct sha1_state *sctx = shash_desc_ctx(desc);
-
-	*sctx = (struct sha1_state){
-		.state = { SHA1_H0, SHA1_H1, SHA1_H2, SHA1_H3, SHA1_H4 },
-	};
-
-	return 0;
-}
-
-static int __sha1_ssse3_update(struct shash_desc *desc, const u8 *data,
-			       unsigned int len, unsigned int partial)
-{
-	struct sha1_state *sctx = shash_desc_ctx(desc);
-	unsigned int done = 0;
-
-	sctx->count += len;
-
-	if (partial) {
-		done = SHA1_BLOCK_SIZE - partial;
-		memcpy(sctx->buffer + partial, data, done);
-		sha1_transform_asm(sctx->state, sctx->buffer, 1);
-	}
-
-	if (len - done >= SHA1_BLOCK_SIZE) {
-		const unsigned int rounds = (len - done) / SHA1_BLOCK_SIZE;
-
-		sha1_transform_asm(sctx->state, data + done, rounds);
-		done += rounds * SHA1_BLOCK_SIZE;
-	}
-
-	memcpy(sctx->buffer, data + done, len - done);
-
-	return 0;
-}
+static void (*sha1_transform_asm)(u32 *, const char *, unsigned int);
 
 static int sha1_ssse3_update(struct shash_desc *desc, const u8 *data,
 			     unsigned int len)
 {
 	struct sha1_state *sctx = shash_desc_ctx(desc);
-	unsigned int partial = sctx->count % SHA1_BLOCK_SIZE;
-	int res;
 
-	/* Handle the fast case right here */
-	if (partial + len < SHA1_BLOCK_SIZE) {
-		sctx->count += len;
-		memcpy(sctx->buffer + partial, data, len);
+	if (!irq_fpu_usable() ||
+	    (sctx->count % SHA1_BLOCK_SIZE) + len < SHA1_BLOCK_SIZE)
+		return crypto_sha1_update(desc, data, len);
 
-		return 0;
-	}
+	/* make sure casting to sha1_block_fn() is safe */
+	BUILD_BUG_ON(offsetof(struct sha1_state, state) != 0);
 
-	if (!irq_fpu_usable()) {
-		res = crypto_sha1_update(desc, data, len);
-	} else {
-		kernel_fpu_begin();
-		res = __sha1_ssse3_update(desc, data, len, partial);
-		kernel_fpu_end();
-	}
-
-	return res;
-}
-
-
-/* Add padding and return the message digest. */
-static int sha1_ssse3_final(struct shash_desc *desc, u8 *out)
-{
-	struct sha1_state *sctx = shash_desc_ctx(desc);
-	unsigned int i, index, padlen;
-	__be32 *dst = (__be32 *)out;
-	__be64 bits;
-	static const u8 padding[SHA1_BLOCK_SIZE] = { 0x80, };
-
-	bits = cpu_to_be64(sctx->count << 3);
-
-	/* Pad out to 56 mod 64 and append length */
-	index = sctx->count % SHA1_BLOCK_SIZE;
-	padlen = (index < 56) ? (56 - index) : ((SHA1_BLOCK_SIZE+56) - index);
-	if (!irq_fpu_usable()) {
-		crypto_sha1_update(desc, padding, padlen);
-		crypto_sha1_update(desc, (const u8 *)&bits, sizeof(bits));
-	} else {
-		kernel_fpu_begin();
-		/* We need to fill a whole block for __sha1_ssse3_update() */
-		if (padlen <= 56) {
-			sctx->count += padlen;
-			memcpy(sctx->buffer + index, padding, padlen);
-		} else {
-			__sha1_ssse3_update(desc, padding, padlen, index);
-		}
-		__sha1_ssse3_update(desc, (const u8 *)&bits, sizeof(bits), 56);
-		kernel_fpu_end();
-	}
-
-	/* Store state in digest */
-	for (i = 0; i < 5; i++)
-		dst[i] = cpu_to_be32(sctx->state[i]);
-
-	/* Wipe context */
-	memset(sctx, 0, sizeof(*sctx));
+	kernel_fpu_begin();
+	sha1_base_do_update(desc, data, len,
+			    (sha1_block_fn *)sha1_transform_asm);
+	kernel_fpu_end();
 
 	return 0;
 }
 
-static int sha1_ssse3_export(struct shash_desc *desc, void *out)
+static int sha1_ssse3_finup(struct shash_desc *desc, const u8 *data,
+			      unsigned int len, u8 *out)
 {
-	struct sha1_state *sctx = shash_desc_ctx(desc);
+	if (!irq_fpu_usable())
+		return crypto_sha1_finup(desc, data, len, out);
 
-	memcpy(out, sctx, sizeof(*sctx));
+	kernel_fpu_begin();
+	if (len)
+		sha1_base_do_update(desc, data, len,
+				    (sha1_block_fn *)sha1_transform_asm);
+	sha1_base_do_finalize(desc, (sha1_block_fn *)sha1_transform_asm);
+	kernel_fpu_end();
 
-	return 0;
+	return sha1_base_finish(desc, out);
 }
 
-static int sha1_ssse3_import(struct shash_desc *desc, const void *in)
+/* Add padding and return the message digest. */
+static int sha1_ssse3_final(struct shash_desc *desc, u8 *out)
 {
-	struct sha1_state *sctx = shash_desc_ctx(desc);
-
-	memcpy(sctx, in, sizeof(*sctx));
-
-	return 0;
+	return sha1_ssse3_finup(desc, NULL, 0, out);
 }
 
 #ifdef CONFIG_AS_AVX2
@@ -186,13 +105,11 @@ static void sha1_apply_transform_avx2(u32 *digest, const char *data,
 
 static struct shash_alg alg = {
 	.digestsize	=	SHA1_DIGEST_SIZE,
-	.init		=	sha1_ssse3_init,
+	.init		=	sha1_base_init,
 	.update		=	sha1_ssse3_update,
 	.final		=	sha1_ssse3_final,
-	.export		=	sha1_ssse3_export,
-	.import		=	sha1_ssse3_import,
+	.finup		=	sha1_ssse3_finup,
 	.descsize	=	sizeof(struct sha1_state),
-	.statesize	=	sizeof(struct sha1_state),
 	.base		=	{
 		.cra_name	=	"sha1",
 		.cra_driver_name=	"sha1-ssse3",
-- 
cgit v1.2.3


From 1631030ae63aef0a54fe08813e0f4e26c8ef9c78 Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel
Date: Thu, 9 Apr 2015 12:55:47 +0200
Subject: crypto: x86/sha256_ssse3 - move SHA-224/256 SSSE3 implementation to
 base layer

This removes all the boilerplate from the existing implementation,
and replaces it with calls into the base layer. It also changes the
prototypes of the core asm functions to be compatible with the base
prototype

  void (sha256_block_fn)(struct sha256_state *sst, u8 const *src, int blocks)

so that they can be passed to the base layer directly.

Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/sha256-avx-asm.S    |  10 +-
 arch/x86/crypto/sha256-avx2-asm.S   |  10 +-
 arch/x86/crypto/sha256-ssse3-asm.S  |  10 +-
 arch/x86/crypto/sha256_ssse3_glue.c | 193 +++++++-----------------------------
 4 files changed, 50 insertions(+), 173 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/crypto/sha256-avx-asm.S b/arch/x86/crypto/sha256-avx-asm.S
index 642f15687a0a..92b3b5d75ba9 100644
--- a/arch/x86/crypto/sha256-avx-asm.S
+++ b/arch/x86/crypto/sha256-avx-asm.S
@@ -96,10 +96,10 @@ SHUF_DC00 = %xmm12      # shuffle xDxC -> DC00
 BYTE_FLIP_MASK = %xmm13
 
 NUM_BLKS = %rdx   # 3rd arg
-CTX = %rsi        # 2nd arg
-INP = %rdi        # 1st arg
+INP = %rsi        # 2nd arg
+CTX = %rdi        # 1st arg
 
-SRND = %rdi       # clobbers INP
+SRND = %rsi       # clobbers INP
 c = %ecx
 d = %r8d
 e = %edx
@@ -342,8 +342,8 @@ a = TMP_
 
 ########################################################################
 ## void sha256_transform_avx(void *input_data, UINT32 digest[8], UINT64 num_blks)
-## arg 1 : pointer to input data
-## arg 2 : pointer to digest
+## arg 1 : pointer to digest
+## arg 2 : pointer to input data
 ## arg 3 : Num blocks
 ########################################################################
 .text
diff --git a/arch/x86/crypto/sha256-avx2-asm.S b/arch/x86/crypto/sha256-avx2-asm.S
index 9e86944c539d..570ec5ec62d7 100644
--- a/arch/x86/crypto/sha256-avx2-asm.S
+++ b/arch/x86/crypto/sha256-avx2-asm.S
@@ -91,12 +91,12 @@ BYTE_FLIP_MASK = %ymm13
 X_BYTE_FLIP_MASK = %xmm13 # XMM version of BYTE_FLIP_MASK
 
 NUM_BLKS = %rdx	# 3rd arg
-CTX	= %rsi  # 2nd arg
-INP	= %rdi	# 1st arg
+INP	= %rsi  # 2nd arg
+CTX	= %rdi	# 1st arg
 c	= %ecx
 d	= %r8d
 e       = %edx	# clobbers NUM_BLKS
-y3	= %edi	# clobbers INP
+y3	= %esi	# clobbers INP
 
 
 TBL	= %rbp
@@ -523,8 +523,8 @@ STACK_SIZE	= _RSP      + _RSP_SIZE
 
 ########################################################################
 ## void sha256_transform_rorx(void *input_data, UINT32 digest[8], UINT64 num_blks)
-## arg 1 : pointer to input data
-## arg 2 : pointer to digest
+## arg 1 : pointer to digest
+## arg 2 : pointer to input data
 ## arg 3 : Num blocks
 ########################################################################
 .text
diff --git a/arch/x86/crypto/sha256-ssse3-asm.S b/arch/x86/crypto/sha256-ssse3-asm.S
index f833b74d902b..2cedc44e8121 100644
--- a/arch/x86/crypto/sha256-ssse3-asm.S
+++ b/arch/x86/crypto/sha256-ssse3-asm.S
@@ -88,10 +88,10 @@ SHUF_DC00 = %xmm11      # shuffle xDxC -> DC00
 BYTE_FLIP_MASK = %xmm12
 
 NUM_BLKS = %rdx   # 3rd arg
-CTX = %rsi        # 2nd arg
-INP = %rdi        # 1st arg
+INP = %rsi        # 2nd arg
+CTX = %rdi        # 1st arg
 
-SRND = %rdi       # clobbers INP
+SRND = %rsi       # clobbers INP
 c = %ecx
 d = %r8d
 e = %edx
@@ -348,8 +348,8 @@ a = TMP_
 
 ########################################################################
 ## void sha256_transform_ssse3(void *input_data, UINT32 digest[8], UINT64 num_blks)
-## arg 1 : pointer to input data
-## arg 2 : pointer to digest
+## arg 1 : pointer to digest
+## arg 2 : pointer to input data
 ## arg 3 : Num blocks
 ########################################################################
 .text
diff --git a/arch/x86/crypto/sha256_ssse3_glue.c b/arch/x86/crypto/sha256_ssse3_glue.c
index 8fad72f4dfd2..ccc338881ee8 100644
--- a/arch/x86/crypto/sha256_ssse3_glue.c
+++ b/arch/x86/crypto/sha256_ssse3_glue.c
@@ -36,195 +36,74 @@
 #include <linux/cryptohash.h>
 #include <linux/types.h>
 #include <crypto/sha.h>
-#include <asm/byteorder.h>
+#include <crypto/sha256_base.h>
 #include <asm/i387.h>
 #include <asm/xcr.h>
 #include <asm/xsave.h>
 #include <linux/string.h>
 
-asmlinkage void sha256_transform_ssse3(const char *data, u32 *digest,
-				     u64 rounds);
+asmlinkage void sha256_transform_ssse3(u32 *digest, const char *data,
+				       u64 rounds);
 #ifdef CONFIG_AS_AVX
-asmlinkage void sha256_transform_avx(const char *data, u32 *digest,
+asmlinkage void sha256_transform_avx(u32 *digest, const char *data,
 				     u64 rounds);
 #endif
 #ifdef CONFIG_AS_AVX2
-asmlinkage void sha256_transform_rorx(const char *data, u32 *digest,
-				     u64 rounds);
+asmlinkage void sha256_transform_rorx(u32 *digest, const char *data,
+				      u64 rounds);
 #endif
 
-static asmlinkage void (*sha256_transform_asm)(const char *, u32 *, u64);
-
-
-static int sha256_ssse3_init(struct shash_desc *desc)
-{
-	struct sha256_state *sctx = shash_desc_ctx(desc);
-
-	sctx->state[0] = SHA256_H0;
-	sctx->state[1] = SHA256_H1;
-	sctx->state[2] = SHA256_H2;
-	sctx->state[3] = SHA256_H3;
-	sctx->state[4] = SHA256_H4;
-	sctx->state[5] = SHA256_H5;
-	sctx->state[6] = SHA256_H6;
-	sctx->state[7] = SHA256_H7;
-	sctx->count = 0;
-
-	return 0;
-}
-
-static int __sha256_ssse3_update(struct shash_desc *desc, const u8 *data,
-			       unsigned int len, unsigned int partial)
-{
-	struct sha256_state *sctx = shash_desc_ctx(desc);
-	unsigned int done = 0;
-
-	sctx->count += len;
-
-	if (partial) {
-		done = SHA256_BLOCK_SIZE - partial;
-		memcpy(sctx->buf + partial, data, done);
-		sha256_transform_asm(sctx->buf, sctx->state, 1);
-	}
-
-	if (len - done >= SHA256_BLOCK_SIZE) {
-		const unsigned int rounds = (len - done) / SHA256_BLOCK_SIZE;
-
-		sha256_transform_asm(data + done, sctx->state, (u64) rounds);
-
-		done += rounds * SHA256_BLOCK_SIZE;
-	}
-
-	memcpy(sctx->buf, data + done, len - done);
-
-	return 0;
-}
+static void (*sha256_transform_asm)(u32 *, const char *, u64);
 
 static int sha256_ssse3_update(struct shash_desc *desc, const u8 *data,
 			     unsigned int len)
 {
 	struct sha256_state *sctx = shash_desc_ctx(desc);
-	unsigned int partial = sctx->count % SHA256_BLOCK_SIZE;
-	int res;
 
-	/* Handle the fast case right here */
-	if (partial + len < SHA256_BLOCK_SIZE) {
-		sctx->count += len;
-		memcpy(sctx->buf + partial, data, len);
+	if (!irq_fpu_usable() ||
+	    (sctx->count % SHA256_BLOCK_SIZE) + len < SHA256_BLOCK_SIZE)
+		return crypto_sha256_update(desc, data, len);
 
-		return 0;
-	}
-
-	if (!irq_fpu_usable()) {
-		res = crypto_sha256_update(desc, data, len);
-	} else {
-		kernel_fpu_begin();
-		res = __sha256_ssse3_update(desc, data, len, partial);
-		kernel_fpu_end();
-	}
-
-	return res;
-}
+	/* make sure casting to sha256_block_fn() is safe */
+	BUILD_BUG_ON(offsetof(struct sha256_state, state) != 0);
 
-
-/* Add padding and return the message digest. */
-static int sha256_ssse3_final(struct shash_desc *desc, u8 *out)
-{
-	struct sha256_state *sctx = shash_desc_ctx(desc);
-	unsigned int i, index, padlen;
-	__be32 *dst = (__be32 *)out;
-	__be64 bits;
-	static const u8 padding[SHA256_BLOCK_SIZE] = { 0x80, };
-
-	bits = cpu_to_be64(sctx->count << 3);
-
-	/* Pad out to 56 mod 64 and append length */
-	index = sctx->count % SHA256_BLOCK_SIZE;
-	padlen = (index < 56) ? (56 - index) : ((SHA256_BLOCK_SIZE+56)-index);
-
-	if (!irq_fpu_usable()) {
-		crypto_sha256_update(desc, padding, padlen);
-		crypto_sha256_update(desc, (const u8 *)&bits, sizeof(bits));
-	} else {
-		kernel_fpu_begin();
-		/* We need to fill a whole block for __sha256_ssse3_update() */
-		if (padlen <= 56) {
-			sctx->count += padlen;
-			memcpy(sctx->buf + index, padding, padlen);
-		} else {
-			__sha256_ssse3_update(desc, padding, padlen, index);
-		}
-		__sha256_ssse3_update(desc, (const u8 *)&bits,
-					sizeof(bits), 56);
-		kernel_fpu_end();
-	}
-
-	/* Store state in digest */
-	for (i = 0; i < 8; i++)
-		dst[i] = cpu_to_be32(sctx->state[i]);
-
-	/* Wipe context */
-	memset(sctx, 0, sizeof(*sctx));
+	kernel_fpu_begin();
+	sha256_base_do_update(desc, data, len,
+			      (sha256_block_fn *)sha256_transform_asm);
+	kernel_fpu_end();
 
 	return 0;
 }
 
-static int sha256_ssse3_export(struct shash_desc *desc, void *out)
+static int sha256_ssse3_finup(struct shash_desc *desc, const u8 *data,
+			      unsigned int len, u8 *out)
 {
-	struct sha256_state *sctx = shash_desc_ctx(desc);
+	if (!irq_fpu_usable())
+		return crypto_sha256_finup(desc, data, len, out);
 
-	memcpy(out, sctx, sizeof(*sctx));
+	kernel_fpu_begin();
+	if (len)
+		sha256_base_do_update(desc, data, len,
+				      (sha256_block_fn *)sha256_transform_asm);
+	sha256_base_do_finalize(desc, (sha256_block_fn *)sha256_transform_asm);
+	kernel_fpu_end();
 
-	return 0;
+	return sha256_base_finish(desc, out);
 }
 
-static int sha256_ssse3_import(struct shash_desc *desc, const void *in)
-{
-	struct sha256_state *sctx = shash_desc_ctx(desc);
-
-	memcpy(sctx, in, sizeof(*sctx));
-
-	return 0;
-}
-
-static int sha224_ssse3_init(struct shash_desc *desc)
-{
-	struct sha256_state *sctx = shash_desc_ctx(desc);
-
-	sctx->state[0] = SHA224_H0;
-	sctx->state[1] = SHA224_H1;
-	sctx->state[2] = SHA224_H2;
-	sctx->state[3] = SHA224_H3;
-	sctx->state[4] = SHA224_H4;
-	sctx->state[5] = SHA224_H5;
-	sctx->state[6] = SHA224_H6;
-	sctx->state[7] = SHA224_H7;
-	sctx->count = 0;
-
-	return 0;
-}
-
-static int sha224_ssse3_final(struct shash_desc *desc, u8 *hash)
+/* Add padding and return the message digest. */
+static int sha256_ssse3_final(struct shash_desc *desc, u8 *out)
 {
-	u8 D[SHA256_DIGEST_SIZE];
-
-	sha256_ssse3_final(desc, D);
-
-	memcpy(hash, D, SHA224_DIGEST_SIZE);
-	memzero_explicit(D, SHA256_DIGEST_SIZE);
-
-	return 0;
+	return sha256_ssse3_finup(desc, NULL, 0, out);
 }
 
 static struct shash_alg algs[] = { {
 	.digestsize	=	SHA256_DIGEST_SIZE,
-	.init		=	sha256_ssse3_init,
+	.init		=	sha256_base_init,
 	.update		=	sha256_ssse3_update,
 	.final		=	sha256_ssse3_final,
-	.export		=	sha256_ssse3_export,
-	.import		=	sha256_ssse3_import,
+	.finup		=	sha256_ssse3_finup,
 	.descsize	=	sizeof(struct sha256_state),
-	.statesize	=	sizeof(struct sha256_state),
 	.base		=	{
 		.cra_name	=	"sha256",
 		.cra_driver_name =	"sha256-ssse3",
@@ -235,13 +114,11 @@ static struct shash_alg algs[] = { {
 	}
 }, {
 	.digestsize	=	SHA224_DIGEST_SIZE,
-	.init		=	sha224_ssse3_init,
+	.init		=	sha224_base_init,
 	.update		=	sha256_ssse3_update,
-	.final		=	sha224_ssse3_final,
-	.export		=	sha256_ssse3_export,
-	.import		=	sha256_ssse3_import,
+	.final		=	sha256_ssse3_final,
+	.finup		=	sha256_ssse3_finup,
 	.descsize	=	sizeof(struct sha256_state),
-	.statesize	=	sizeof(struct sha256_state),
 	.base		=	{
 		.cra_name	=	"sha224",
 		.cra_driver_name =	"sha224-ssse3",
-- 
cgit v1.2.3


From e68410ebf62676dfb93aafff7c55b76644f37072 Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel
Date: Thu, 9 Apr 2015 12:55:48 +0200
Subject: crypto: x86/sha512_ssse3 - move SHA-384/512 SSSE3 implementation to
 base layer

This removes all the boilerplate from the existing implementation,
and replaces it with calls into the base layer.  It also changes the
prototypes of the core asm functions to be compatible with the base
prototype

  void (sha512_block_fn)(struct sha256_state *sst, u8 const *src, int blocks)

so that they can be passed to the base layer directly.

Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 arch/x86/crypto/sha512-avx-asm.S    |   6 +-
 arch/x86/crypto/sha512-avx2-asm.S   |   6 +-
 arch/x86/crypto/sha512-ssse3-asm.S  |   6 +-
 arch/x86/crypto/sha512_ssse3_glue.c | 202 +++++++-----------------------------
 4 files changed, 44 insertions(+), 176 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/crypto/sha512-avx-asm.S b/arch/x86/crypto/sha512-avx-asm.S
index 974dde9bc6cd..565274d6a641 100644
--- a/arch/x86/crypto/sha512-avx-asm.S
+++ b/arch/x86/crypto/sha512-avx-asm.S
@@ -54,9 +54,9 @@
 
 # Virtual Registers
 # ARG1
-msg	= %rdi
+digest	= %rdi
 # ARG2
-digest	= %rsi
+msg	= %rsi
 # ARG3
 msglen	= %rdx
 T1	= %rcx
@@ -271,7 +271,7 @@ frame_size = frame_GPRSAVE + GPRSAVE_SIZE
 .endm
 
 ########################################################################
-# void sha512_transform_avx(const void* M, void* D, u64 L)
+# void sha512_transform_avx(void* D, const void* M, u64 L)
 # Purpose: Updates the SHA512 digest stored at D with the message stored in M.
 # The size of the message pointed to by M must be an integer multiple of SHA512
 # message blocks.
diff --git a/arch/x86/crypto/sha512-avx2-asm.S b/arch/x86/crypto/sha512-avx2-asm.S
index 568b96105f5c..a4771dcd1fcf 100644
--- a/arch/x86/crypto/sha512-avx2-asm.S
+++ b/arch/x86/crypto/sha512-avx2-asm.S
@@ -70,9 +70,9 @@ XFER  = YTMP0
 BYTE_FLIP_MASK  = %ymm9
 
 # 1st arg
-INP         = %rdi
+CTX         = %rdi
 # 2nd arg
-CTX         = %rsi
+INP         = %rsi
 # 3rd arg
 NUM_BLKS    = %rdx
 
@@ -562,7 +562,7 @@ frame_size = frame_GPRSAVE + GPRSAVE_SIZE
 .endm
 
 ########################################################################
-# void sha512_transform_rorx(const void* M, void* D, uint64_t L)#
+# void sha512_transform_rorx(void* D, const void* M, uint64_t L)#
 # Purpose: Updates the SHA512 digest stored at D with the message stored in M.
 # The size of the message pointed to by M must be an integer multiple of SHA512
 #   message blocks.
diff --git a/arch/x86/crypto/sha512-ssse3-asm.S b/arch/x86/crypto/sha512-ssse3-asm.S
index fb56855d51f5..e610e29cbc81 100644
--- a/arch/x86/crypto/sha512-ssse3-asm.S
+++ b/arch/x86/crypto/sha512-ssse3-asm.S
@@ -53,9 +53,9 @@
 
 # Virtual Registers
 # ARG1
-msg =		%rdi
+digest =	%rdi
 # ARG2
-digest =	%rsi
+msg =		%rsi
 # ARG3
 msglen =	%rdx
 T1 =		%rcx
@@ -269,7 +269,7 @@ frame_size = frame_GPRSAVE + GPRSAVE_SIZE
 .endm
 
 ########################################################################
-# void sha512_transform_ssse3(const void* M, void* D, u64 L)#
+# void sha512_transform_ssse3(void* D, const void* M, u64 L)#
 # Purpose: Updates the SHA512 digest stored at D with the message stored in M.
 # The size of the message pointed to by M must be an integer multiple of SHA512
 #   message blocks.
diff --git a/arch/x86/crypto/sha512_ssse3_glue.c b/arch/x86/crypto/sha512_ssse3_glue.c
index 0b6af26832bf..d9fa4c1e063f 100644
--- a/arch/x86/crypto/sha512_ssse3_glue.c
+++ b/arch/x86/crypto/sha512_ssse3_glue.c
@@ -34,205 +34,75 @@
 #include <linux/cryptohash.h>
 #include <linux/types.h>
 #include <crypto/sha.h>
-#include <asm/byteorder.h>
+#include <crypto/sha512_base.h>
 #include <asm/i387.h>
 #include <asm/xcr.h>
 #include <asm/xsave.h>
 
 #include <linux/string.h>
 
-asmlinkage void sha512_transform_ssse3(const char *data, u64 *digest,
-				     u64 rounds);
+asmlinkage void sha512_transform_ssse3(u64 *digest, const char *data,
+				       u64 rounds);
 #ifdef CONFIG_AS_AVX
-asmlinkage void sha512_transform_avx(const char *data, u64 *digest,
+asmlinkage void sha512_transform_avx(u64 *digest, const char *data,
 				     u64 rounds);
 #endif
 #ifdef CONFIG_AS_AVX2
-asmlinkage void sha512_transform_rorx(const char *data, u64 *digest,
-				     u64 rounds);
+asmlinkage void sha512_transform_rorx(u64 *digest, const char *data,
+				      u64 rounds);
 #endif
 
-static asmlinkage void (*sha512_transform_asm)(const char *, u64 *, u64);
-
-
-static int sha512_ssse3_init(struct shash_desc *desc)
-{
-	struct sha512_state *sctx = shash_desc_ctx(desc);
-
-	sctx->state[0] = SHA512_H0;
-	sctx->state[1] = SHA512_H1;
-	sctx->state[2] = SHA512_H2;
-	sctx->state[3] = SHA512_H3;
-	sctx->state[4] = SHA512_H4;
-	sctx->state[5] = SHA512_H5;
-	sctx->state[6] = SHA512_H6;
-	sctx->state[7] = SHA512_H7;
-	sctx->count[0] = sctx->count[1] = 0;
-
-	return 0;
-}
+static void (*sha512_transform_asm)(u64 *, const char *, u64);
 
-static int __sha512_ssse3_update(struct shash_desc *desc, const u8 *data,
-			       unsigned int len, unsigned int partial)
+static int sha512_ssse3_update(struct shash_desc *desc, const u8 *data,
+			       unsigned int len)
 {
 	struct sha512_state *sctx = shash_desc_ctx(desc);
-	unsigned int done = 0;
-
-	sctx->count[0] += len;
-	if (sctx->count[0] < len)
-		sctx->count[1]++;
 
-	if (partial) {
-		done = SHA512_BLOCK_SIZE - partial;
-		memcpy(sctx->buf + partial, data, done);
-		sha512_transform_asm(sctx->buf, sctx->state, 1);
-	}
-
-	if (len - done >= SHA512_BLOCK_SIZE) {
-		const unsigned int rounds = (len - done) / SHA512_BLOCK_SIZE;
+	if (!irq_fpu_usable() ||
+	    (sctx->count[0] % SHA512_BLOCK_SIZE) + len < SHA512_BLOCK_SIZE)
+		return crypto_sha512_update(desc, data, len);
 
-		sha512_transform_asm(data + done, sctx->state, (u64) rounds);
-
-		done += rounds * SHA512_BLOCK_SIZE;
-	}
+	/* make sure casting to sha512_block_fn() is safe */
+	BUILD_BUG_ON(offsetof(struct sha512_state, state) != 0);
 
-	memcpy(sctx->buf, data + done, len - done);
+	kernel_fpu_begin();
+	sha512_base_do_update(desc, data, len,
+			      (sha512_block_fn *)sha512_transform_asm);
+	kernel_fpu_end();
 
 	return 0;
 }
 
-static int sha512_ssse3_update(struct shash_desc *desc, const u8 *data,
-			     unsigned int len)
+static int sha512_ssse3_finup(struct shash_desc *desc, const u8 *data,
+			      unsigned int len, u8 *out)
 {
-	struct sha512_state *sctx = shash_desc_ctx(desc);
-	unsigned int partial = sctx->count[0] % SHA512_BLOCK_SIZE;
-	int res;
-
-	/* Handle the fast case right here */
-	if (partial + len < SHA512_BLOCK_SIZE) {
-		sctx->count[0] += len;
-		if (sctx->count[0] < len)
-			sctx->count[1]++;
-		memcpy(sctx->buf + partial, data, len);
-
-		return 0;
-	}
+	if (!irq_fpu_usable())
+		return crypto_sha512_finup(desc, data, len, out);
 
-	if (!irq_fpu_usable()) {
-		res = crypto_sha512_update(desc, data, len);
-	} else {
-		kernel_fpu_begin();
-		res = __sha512_ssse3_update(desc, data, len, partial);
-		kernel_fpu_end();
-	}
+	kernel_fpu_begin();
+	if (len)
+		sha512_base_do_update(desc, data, len,
+				      (sha512_block_fn *)sha512_transform_asm);
+	sha512_base_do_finalize(desc, (sha512_block_fn *)sha512_transform_asm);
+	kernel_fpu_end();
 
-	return res;
+	return sha512_base_finish(desc, out);
 }
 
-
 /* Add padding and return the message digest. */
 static int sha512_ssse3_final(struct shash_desc *desc, u8 *out)
 {
-	struct sha512_state *sctx = shash_desc_ctx(desc);
-	unsigned int i, index, padlen;
-	__be64 *dst = (__be64 *)out;
-	__be64 bits[2];
-	static const u8 padding[SHA512_BLOCK_SIZE] = { 0x80, };
-
-	/* save number of bits */
-	bits[1] = cpu_to_be64(sctx->count[0] << 3);
-	bits[0] = cpu_to_be64(sctx->count[1] << 3 | sctx->count[0] >> 61);
-
-	/* Pad out to 112 mod 128 and append length */
-	index = sctx->count[0] & 0x7f;
-	padlen = (index < 112) ? (112 - index) : ((128+112) - index);
-
-	if (!irq_fpu_usable()) {
-		crypto_sha512_update(desc, padding, padlen);
-		crypto_sha512_update(desc, (const u8 *)&bits, sizeof(bits));
-	} else {
-		kernel_fpu_begin();
-		/* We need to fill a whole block for __sha512_ssse3_update() */
-		if (padlen <= 112) {
-			sctx->count[0] += padlen;
-			if (sctx->count[0] < padlen)
-				sctx->count[1]++;
-			memcpy(sctx->buf + index, padding, padlen);
-		} else {
-			__sha512_ssse3_update(desc, padding, padlen, index);
-		}
-		__sha512_ssse3_update(desc, (const u8 *)&bits,
-					sizeof(bits), 112);
-		kernel_fpu_end();
-	}
-
-	/* Store state in digest */
-	for (i = 0; i < 8; i++)
-		dst[i] = cpu_to_be64(sctx->state[i]);
-
-	/* Wipe context */
-	memset(sctx, 0, sizeof(*sctx));
-
-	return 0;
-}
-
-static int sha512_ssse3_export(struct shash_desc *desc, void *out)
-{
-	struct sha512_state *sctx = shash_desc_ctx(desc);
-
-	memcpy(out, sctx, sizeof(*sctx));
-
-	return 0;
-}
-
-static int sha512_ssse3_import(struct shash_desc *desc, const void *in)
-{
-	struct sha512_state *sctx = shash_desc_ctx(desc);
-
-	memcpy(sctx, in, sizeof(*sctx));
-
-	return 0;
-}
-
-static int sha384_ssse3_init(struct shash_desc *desc)
-{
-	struct sha512_state *sctx = shash_desc_ctx(desc);
-
-	sctx->state[0] = SHA384_H0;
-	sctx->state[1] = SHA384_H1;
-	sctx->state[2] = SHA384_H2;
-	sctx->state[3] = SHA384_H3;
-	sctx->state[4] = SHA384_H4;
-	sctx->state[5] = SHA384_H5;
-	sctx->state[6] = SHA384_H6;
-	sctx->state[7] = SHA384_H7;
-
-	sctx->count[0] = sctx->count[1] = 0;
-
-	return 0;
-}
-
-static int sha384_ssse3_final(struct shash_desc *desc, u8 *hash)
-{
-	u8 D[SHA512_DIGEST_SIZE];
-
-	sha512_ssse3_final(desc, D);
-
-	memcpy(hash, D, SHA384_DIGEST_SIZE);
-	memzero_explicit(D, SHA512_DIGEST_SIZE);
-
-	return 0;
+	return sha512_ssse3_finup(desc, NULL, 0, out);
 }
 
 static struct shash_alg algs[] = { {
 	.digestsize	=	SHA512_DIGEST_SIZE,
-	.init		=	sha512_ssse3_init,
+	.init		=	sha512_base_init,
 	.update		=	sha512_ssse3_update,
 	.final		=	sha512_ssse3_final,
-	.export		=	sha512_ssse3_export,
-	.import		=	sha512_ssse3_import,
+	.finup		=	sha512_ssse3_finup,
 	.descsize	=	sizeof(struct sha512_state),
-	.statesize	=	sizeof(struct sha512_state),
 	.base		=	{
 		.cra_name	=	"sha512",
 		.cra_driver_name =	"sha512-ssse3",
@@ -243,13 +113,11 @@ static struct shash_alg algs[] = { {
 	}
 },  {
 	.digestsize	=	SHA384_DIGEST_SIZE,
-	.init		=	sha384_ssse3_init,
+	.init		=	sha384_base_init,
 	.update		=	sha512_ssse3_update,
-	.final		=	sha384_ssse3_final,
-	.export		=	sha512_ssse3_export,
-	.import		=	sha512_ssse3_import,
+	.final		=	sha512_ssse3_final,
+	.finup		=	sha512_ssse3_finup,
 	.descsize	=	sizeof(struct sha512_state),
-	.statesize	=	sizeof(struct sha512_state),
 	.base		=	{
 		.cra_name	=	"sha384",
 		.cra_driver_name =	"sha384-ssse3",
-- 
cgit v1.2.3


From 54cfa458b492fcf76d653698d362743bcb329055 Mon Sep 17 00:00:00 2001
From: Denys Vlasenko
Date: Fri, 10 Apr 2015 20:13:40 +0200
Subject: x86/asm/entry/32: Tidy up JNZ instructions after TESTs

After TESTs, use logically correct JNZ mnemonic instead of JNE.

This doesn't change code:

  md5:
   c3005b39a11fe582b7df7908561ad4ee  entry_32.o.before.asm
   c3005b39a11fe582b7df7908561ad4ee  entry_32.o.after.asm

Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
Acked-by: Andy Lutomirski <luto@kernel.org>
Cc: Alexei Starovoitov <ast@plumgrid.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Will Drewry <wad@chromium.org>
Link: http://lkml.kernel.org/r/1428689620-21881-1-git-send-email-dvlasenk@redhat.com
[ Added object file comparison. ]
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/entry_32.S | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 02bec0f1d1e1..1c309763e321 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -435,7 +435,7 @@ sysenter_after_call:
 	TRACE_IRQS_OFF
 	movl TI_flags(%ebp), %ecx
 	testl $_TIF_ALLWORK_MASK, %ecx
-	jne sysexit_audit
+	jnz sysexit_audit
 sysenter_exit:
 /* if something modifies registers it must also disable sysexit */
 	movl PT_EIP(%esp), %edx
@@ -463,7 +463,7 @@ sysenter_audit:
 
 sysexit_audit:
 	testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx
-	jne syscall_exit_work
+	jnz syscall_exit_work
 	TRACE_IRQS_ON
 	ENABLE_INTERRUPTS(CLBR_ANY)
 	movl %eax,%edx		/* second arg, syscall return value */
@@ -475,7 +475,7 @@ sysexit_audit:
 	TRACE_IRQS_OFF
 	movl TI_flags(%ebp), %ecx
 	testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx
-	jne syscall_exit_work
+	jnz syscall_exit_work
 	movl PT_EAX(%esp),%eax	/* reload syscall return value */
 	jmp sysenter_exit
 #endif
@@ -513,7 +513,7 @@ syscall_exit:
 	TRACE_IRQS_OFF
 	movl TI_flags(%ebp), %ecx
 	testl $_TIF_ALLWORK_MASK, %ecx	# current->work
-	jne syscall_exit_work
+	jnz syscall_exit_work
 
 restore_all:
 	TRACE_IRQS_IRET
@@ -615,7 +615,7 @@ work_notifysig:				# deal with pending signals and
 #ifdef CONFIG_VM86
 	testl $X86_EFLAGS_VM, PT_EFLAGS(%esp)
 	movl %esp, %eax
-	jne work_notifysig_v86		# returning to kernel-space or
+	jnz work_notifysig_v86		# returning to kernel-space or
 					# vm86-space
 1:
 #else
-- 
cgit v1.2.3


From aa21df0424468dbd991440d41ba0f700c5997103 Mon Sep 17 00:00:00 2001
From: Denys Vlasenko
Date: Fri, 10 Apr 2015 15:06:56 +0200
Subject: perf/x86/64: Do not guess user_regs->cs, ss, sp in get_regs_user()

After recent changes to syscall entry points,
user_regs->{cs,ss,sp} are always correct. (They used to be
undefined while in syscalls).

We can report them reliably, without guessing.

Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/1428671219-29341-1-git-send-email-dvlasenk@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/perf_regs.c | 23 +++++++++--------------
 1 file changed, 9 insertions(+), 14 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/perf_regs.c b/arch/x86/kernel/perf_regs.c
index 02a8720414c0..7ab198acc5b8 100644
--- a/arch/x86/kernel/perf_regs.c
+++ b/arch/x86/kernel/perf_regs.c
@@ -131,8 +131,8 @@ void perf_get_regs_user(struct perf_regs *regs_user,
 	}
 
 	/*
-	 * RIP, flags, and the argument registers are usually saved.
-	 * orig_ax is probably okay, too.
+	 * These registers are always saved on 64-bit syscall entry.
+	 * On 32-bit entry points, they are saved too except r8..r11.
 	 */
 	regs_user_copy->ip = user_regs->ip;
 	regs_user_copy->cx = user_regs->cx;
@@ -145,9 +145,12 @@ void perf_get_regs_user(struct perf_regs *regs_user,
 	regs_user_copy->r11 = user_regs->r11;
 	regs_user_copy->orig_ax = user_regs->orig_ax;
 	regs_user_copy->flags = user_regs->flags;
+	regs_user_copy->sp = user_regs->sp;
+	regs_user_copy->cs = user_regs->cs;
+	regs_user_copy->ss = user_regs->ss;
 
 	/*
-	 * Don't even try to report the "rest" regs.
+	 * Most system calls don't save these registers, don't report them.
 	 */
 	regs_user_copy->bx = -1;
 	regs_user_copy->bp = -1;
@@ -158,7 +161,7 @@ void perf_get_regs_user(struct perf_regs *regs_user,
 
 	/*
 	 * For this to be at all useful, we need a reasonable guess for
-	 * sp and the ABI.  Be careful: we're in NMI context, and we're
+	 * the ABI.  Be careful: we're in NMI context, and we're
 	 * considering current to be the current task, so we should
 	 * be careful not to look at any other percpu variables that might
 	 * change during context switches.
@@ -167,9 +170,6 @@ void perf_get_regs_user(struct perf_regs *regs_user,
 	    task_thread_info(current)->status & TS_COMPAT) {
 		/* Easy case: we're in a compat syscall. */
 		regs_user->abi = PERF_SAMPLE_REGS_ABI_32;
-		regs_user_copy->sp = user_regs->sp;
-		regs_user_copy->cs = user_regs->cs;
-		regs_user_copy->ss = user_regs->ss;
 	} else if (user_regs->orig_ax != -1) {
 		/*
 		 * We're probably in a 64-bit syscall.
@@ -177,17 +177,12 @@ void perf_get_regs_user(struct perf_regs *regs_user,
 		 * than just blindly copying user_regs.
 		 */
 		regs_user->abi = PERF_SAMPLE_REGS_ABI_64;
-		regs_user_copy->sp = user_regs->sp;
-		regs_user_copy->cs = __USER_CS;
-		regs_user_copy->ss = __USER_DS;
-		regs_user_copy->cx = -1;  /* usually contains garbage */
+		/* usually contains return address (same as ->ip) */
+		regs_user_copy->cx = -1;
 	} else {
 		/* We're probably in an interrupt or exception. */
 		regs_user->abi = user_64bit_mode(user_regs) ?
 			PERF_SAMPLE_REGS_ABI_64 : PERF_SAMPLE_REGS_ABI_32;
-		regs_user_copy->sp = user_regs->sp;
-		regs_user_copy->cs = user_regs->cs;
-		regs_user_copy->ss = user_regs->ss;
 	}
 
 	regs_user->regs = regs_user_copy;
-- 
cgit v1.2.3


From 5df71b396b2d1fdd9d9f5a33e2eda5dc27c5632d Mon Sep 17 00:00:00 2001
From: Denys Vlasenko
Date: Fri, 10 Apr 2015 15:06:57 +0200
Subject: perf/x86/64: Do report user_regs->cx while we are in syscall, in
 get_regs_user()

Yes, it is true that cx contains return address.
It's not clear why we trash it.
Stop doing that.

Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/1428671219-29341-2-git-send-email-dvlasenk@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/perf_regs.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/perf_regs.c b/arch/x86/kernel/perf_regs.c
index 7ab198acc5b8..a8d4e4862ace 100644
--- a/arch/x86/kernel/perf_regs.c
+++ b/arch/x86/kernel/perf_regs.c
@@ -177,8 +177,6 @@ void perf_get_regs_user(struct perf_regs *regs_user,
 		 * than just blindly copying user_regs.
 		 */
 		regs_user->abi = PERF_SAMPLE_REGS_ABI_64;
-		/* usually contains return address (same as ->ip) */
-		regs_user_copy->cx = -1;
 	} else {
 		/* We're probably in an interrupt or exception. */
 		regs_user->abi = user_64bit_mode(user_regs) ?
-- 
cgit v1.2.3


From 32caa06091cc59651222cdc971dc21eaab36b097 Mon Sep 17 00:00:00 2001
From: Denys Vlasenko
Date: Fri, 10 Apr 2015 15:06:58 +0200
Subject: perf/x86/64: Simplify regs_user->abi setting code in get_regs_user()

user_64bit_mode(regs) basically checks regs->cs to point to a
64-bit segment. This check used to be unreliable here because
regs->cs was not always correct in syscalls.

Now regs->cs is always correct: in syscalls, in interrupts, in
exceptions. No need to emply heuristics here.

Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/1428671219-29341-3-git-send-email-dvlasenk@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/perf_regs.c | 18 ++----------------
 1 file changed, 2 insertions(+), 16 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/perf_regs.c b/arch/x86/kernel/perf_regs.c
index a8d4e4862ace..8157d3900bc2 100644
--- a/arch/x86/kernel/perf_regs.c
+++ b/arch/x86/kernel/perf_regs.c
@@ -166,22 +166,8 @@ void perf_get_regs_user(struct perf_regs *regs_user,
 	 * be careful not to look at any other percpu variables that might
 	 * change during context switches.
 	 */
-	if (IS_ENABLED(CONFIG_IA32_EMULATION) &&
-	    task_thread_info(current)->status & TS_COMPAT) {
-		/* Easy case: we're in a compat syscall. */
-		regs_user->abi = PERF_SAMPLE_REGS_ABI_32;
-	} else if (user_regs->orig_ax != -1) {
-		/*
-		 * We're probably in a 64-bit syscall.
-		 * Warning: this code is severely racy.  At least it's better
-		 * than just blindly copying user_regs.
-		 */
-		regs_user->abi = PERF_SAMPLE_REGS_ABI_64;
-	} else {
-		/* We're probably in an interrupt or exception. */
-		regs_user->abi = user_64bit_mode(user_regs) ?
-			PERF_SAMPLE_REGS_ABI_64 : PERF_SAMPLE_REGS_ABI_32;
-	}
+	regs_user->abi = user_64bit_mode(user_regs) ?
+		PERF_SAMPLE_REGS_ABI_64 : PERF_SAMPLE_REGS_ABI_32;
 
 	regs_user->regs = regs_user_copy;
 }
-- 
cgit v1.2.3


From 3b75232d55680ca166dffa274d0587d5faf0a016 Mon Sep 17 00:00:00 2001
From: Denys Vlasenko
Date: Fri, 10 Apr 2015 15:06:59 +0200
Subject: perf/x86/64: Report regs_user->ax too in get_regs_user()

I don't see why we report e.g. orix_ax, which is not always
meaningful, but don't report ax, which is meaningful.

Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/1428671219-29341-4-git-send-email-dvlasenk@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/perf_regs.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/perf_regs.c b/arch/x86/kernel/perf_regs.c
index 8157d3900bc2..da8cb987b973 100644
--- a/arch/x86/kernel/perf_regs.c
+++ b/arch/x86/kernel/perf_regs.c
@@ -135,6 +135,7 @@ void perf_get_regs_user(struct perf_regs *regs_user,
 	 * On 32-bit entry points, they are saved too except r8..r11.
 	 */
 	regs_user_copy->ip = user_regs->ip;
+	regs_user_copy->ax = user_regs->ax;
 	regs_user_copy->cx = user_regs->cx;
 	regs_user_copy->dx = user_regs->dx;
 	regs_user_copy->si = user_regs->si;
-- 
cgit v1.2.3


From 066450be419fa48007a9f29e19828f2a86198754 Mon Sep 17 00:00:00 2001
From: Ingo Molnar
Date: Sun, 12 Apr 2015 11:11:21 +0200
Subject: perf/x86/intel/pt: Clean up the control flow in pt_pmu_hw_init()

Dan Carpenter pointed out that the control flow in pt_pmu_hw_init()
is a bit messy: for example the kfree(de_attrs) is entirely
superfluous.

Another problem is the inconsistent mixing of label based and
direct return error handling.

Add modern, label based error handling instead and clean up the code
a bit as well.

Note that we'll still do a kfree(NULL) in the normal case - this does
not matter as this is an init path and kfree() returns early if it
sees a NULL.

Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/20150409090805.GG17605@mwanda
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event_intel_pt.c | 53 +++++++++++++++++--------------
 1 file changed, 30 insertions(+), 23 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event_intel_pt.c b/arch/x86/kernel/cpu/perf_event_intel_pt.c
index f5a3afc65371..f2770641c0fd 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_pt.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_pt.c
@@ -119,48 +119,55 @@ static int __init pt_pmu_hw_init(void)
 	struct dev_ext_attribute *de_attrs;
 	struct attribute **attrs;
 	size_t size;
+	int ret;
 	long i;
 
-	if (test_cpu_cap(&boot_cpu_data, X86_FEATURE_INTEL_PT)) {
-		for (i = 0; i < PT_CPUID_LEAVES; i++)
-			cpuid_count(20, i,
-				    &pt_pmu.caps[CR_EAX + i * 4],
-				    &pt_pmu.caps[CR_EBX + i * 4],
-				    &pt_pmu.caps[CR_ECX + i * 4],
-				    &pt_pmu.caps[CR_EDX + i * 4]);
-	} else {
-		return -ENODEV;
+	attrs = NULL;
+	ret = -ENODEV;
+	if (!test_cpu_cap(&boot_cpu_data, X86_FEATURE_INTEL_PT))
+		goto fail;
+
+	for (i = 0; i < PT_CPUID_LEAVES; i++) {
+		cpuid_count(20, i,
+			    &pt_pmu.caps[CR_EAX + i*4],
+			    &pt_pmu.caps[CR_EBX + i*4],
+			    &pt_pmu.caps[CR_ECX + i*4],
+			    &pt_pmu.caps[CR_EDX + i*4]);
 	}
 
-	size = sizeof(struct attribute *) * (ARRAY_SIZE(pt_caps) + 1);
+	ret = -ENOMEM;
+	size = sizeof(struct attribute *) * (ARRAY_SIZE(pt_caps)+1);
 	attrs = kzalloc(size, GFP_KERNEL);
 	if (!attrs)
-		goto err_attrs;
+		goto fail;
 
-	size = sizeof(struct dev_ext_attribute) * (ARRAY_SIZE(pt_caps) + 1);
+	size = sizeof(struct dev_ext_attribute) * (ARRAY_SIZE(pt_caps)+1);
 	de_attrs = kzalloc(size, GFP_KERNEL);
 	if (!de_attrs)
-		goto err_de_attrs;
+		goto fail;
 
 	for (i = 0; i < ARRAY_SIZE(pt_caps); i++) {
-		de_attrs[i].attr.attr.name = pt_caps[i].name;
+		struct dev_ext_attribute *de_attr = de_attrs + i;
+
+		de_attr->attr.attr.name = pt_caps[i].name;
+
+		sysfs_attr_init(&de_attrs->attr.attr);
 
-		sysfs_attr_init(&de_attrs[i].attr.attr);
-		de_attrs[i].attr.attr.mode = S_IRUGO;
-		de_attrs[i].attr.show = pt_cap_show;
-		de_attrs[i].var = (void *)i;
-		attrs[i] = &de_attrs[i].attr.attr;
+		de_attr->attr.attr.mode		= S_IRUGO;
+		de_attr->attr.show		= pt_cap_show;
+		de_attr->var			= (void *)i;
+
+		attrs[i] = &de_attr->attr.attr;
 	}
 
 	pt_cap_group.attrs = attrs;
+
 	return 0;
 
-err_de_attrs:
-	kfree(de_attrs);
-err_attrs:
+fail:
 	kfree(attrs);
 
-	return -ENOMEM;
+	return ret;
 }
 
 #define PT_CONFIG_MASK (RTIT_CTL_TSC_EN | RTIT_CTL_DISRETC)
-- 
cgit v1.2.3


From fd223849f10a28fa40201652b5f13d52fa8f2bb0 Mon Sep 17 00:00:00 2001
From: Richard Weinberger
Date: Sun, 13 Jul 2014 17:41:57 +0200
Subject: um: Remove signal translation and exec_domain

As execution domain support is gone we can remove
signal translation from the signal code and remove
exec_domain from thread_info.

Signed-off-by: Richard Weinberger <richard@nod.at>
---
 arch/um/include/asm/thread_info.h | 2 --
 arch/x86/um/signal.c              | 7 -------
 2 files changed, 9 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/um/include/asm/thread_info.h b/arch/um/include/asm/thread_info.h
index e04114c4fcd9..b30c85b141d9 100644
--- a/arch/um/include/asm/thread_info.h
+++ b/arch/um/include/asm/thread_info.h
@@ -14,7 +14,6 @@
 
 struct thread_info {
 	struct task_struct	*task;		/* main task structure */
-	struct exec_domain	*exec_domain;	/* execution domain */
 	unsigned long		flags;		/* low level flags */
 	__u32			cpu;		/* current CPU */
 	int			preempt_count;  /* 0 => preemptable,
@@ -28,7 +27,6 @@ struct thread_info {
 #define INIT_THREAD_INFO(tsk)			\
 {						\
 	.task =		&tsk,			\
-	.exec_domain =	&default_exec_domain,	\
 	.flags =		0,		\
 	.cpu =		0,			\
 	.preempt_count = INIT_PREEMPT_COUNT,	\
diff --git a/arch/x86/um/signal.c b/arch/x86/um/signal.c
index 0c8c32bfd792..592491d1d70d 100644
--- a/arch/x86/um/signal.c
+++ b/arch/x86/um/signal.c
@@ -549,13 +549,6 @@ int setup_signal_stack_si(unsigned long stack_top, struct ksignal *ksig,
 	if (err)
 		return err;
 
-	/* Set up registers for signal handler */
-	{
-		struct exec_domain *ed = current_thread_info()->exec_domain;
-		if (unlikely(ed && ed->signal_invmap && sig < 32))
-			sig = ed->signal_invmap[sig];
-	}
-
 	PT_REGS_SP(regs) = (unsigned long) frame;
 	PT_REGS_DI(regs) = sig;
 	/* In case the signal handler was declared without prototypes */
-- 
cgit v1.2.3


From 3050a35fba296196cb00e87f4a96aa7d9ed17a7b Mon Sep 17 00:00:00 2001
From: Richard Weinberger
Date: Sun, 13 Jul 2014 17:43:51 +0200
Subject: x86: Remove signal translation and exec_domain

As execution domain support is gone we can remove
signal translation from the signal code and remove
exec_domain from thread_info.

Signed-off-by: Richard Weinberger <richard@nod.at>
---
 arch/x86/include/asm/thread_info.h |  3 ---
 arch/x86/kernel/signal.c           | 16 +---------------
 2 files changed, 1 insertion(+), 18 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 1d4e4f279a32..2df52baf5228 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -19,13 +19,11 @@
  */
 #ifndef __ASSEMBLY__
 struct task_struct;
-struct exec_domain;
 #include <asm/processor.h>
 #include <linux/atomic.h>
 
 struct thread_info {
 	struct task_struct	*task;		/* main task structure */
-	struct exec_domain	*exec_domain;	/* execution domain */
 	__u32			flags;		/* low level flags */
 	__u32			status;		/* thread synchronous flags */
 	__u32			cpu;		/* current CPU */
@@ -39,7 +37,6 @@ struct thread_info {
 #define INIT_THREAD_INFO(tsk)			\
 {						\
 	.task		= &tsk,			\
-	.exec_domain	= &default_exec_domain,	\
 	.flags		= 0,			\
 	.cpu		= 0,			\
 	.saved_preempt_count = INIT_PREEMPT_COUNT,	\
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index e5042463c1bc..5ddc7ec20e75 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -592,24 +592,10 @@ badframe:
 	return 0;
 }
 
-/*
- * OK, we're invoking a handler:
- */
-static int signr_convert(int sig)
-{
-#ifdef CONFIG_X86_32
-	struct thread_info *info = current_thread_info();
-
-	if (info->exec_domain && info->exec_domain->signal_invmap && sig < 32)
-		return info->exec_domain->signal_invmap[sig];
-#endif /* CONFIG_X86_32 */
-	return sig;
-}
-
 static int
 setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs)
 {
-	int usig = signr_convert(ksig->sig);
+	int usig = ksig->sig;
 	sigset_t *set = sigmask_to_save();
 	compat_sigset_t *cset = (compat_sigset_t *) set;
 
-- 
cgit v1.2.3


From d0b5e15f0c0fdd759dd3dd48dc2dc2e7199e0da0 Mon Sep 17 00:00:00 2001
From: Richard Weinberger
Date: Wed, 18 Mar 2015 21:31:27 +0100
Subject: um: Remove SKAS3/4 support

Before we had SKAS0 UML had two modes of operation
TT (tracing thread) and SKAS3/4 (separated kernel address space).
TT was known to be insecure and got removed a long time ago.
SKAS3/4 required a few (3 or 4) patches on the host side which never went
mainline. The last host patch is 10 years old.

With SKAS0 mode (separated kernel address space using 0 host patches),
default since 2005, SKAS3/4 is obsolete and can be removed.

Signed-off-by: Richard Weinberger <richard@nod.at>
---
 arch/um/include/shared/os.h              |   2 -
 arch/um/include/shared/skas/proc_mm.h    |  44 ------
 arch/um/include/shared/skas/skas.h       |   3 -
 arch/um/include/shared/skas_ptrace.h     |  14 --
 arch/um/kernel/ptrace.c                  |  32 -----
 arch/um/kernel/reboot.c                  |  35 ++---
 arch/um/kernel/skas/mmu.c                |  68 ++++-----
 arch/um/kernel/skas/process.c            |  27 ----
 arch/um/kernel/trap.c                    |   2 +-
 arch/um/kernel/um_arch.c                 |  10 --
 arch/um/os-Linux/process.c               |  16 ---
 arch/um/os-Linux/skas/mem.c              | 100 ++------------
 arch/um/os-Linux/skas/process.c          | 200 +++++++--------------------
 arch/um/os-Linux/start_up.c              | 154 ---------------------
 arch/x86/um/ldt.c                        | 227 +++++++------------------------
 arch/x86/um/shared/sysdep/faultinfo_32.h |   3 -
 arch/x86/um/shared/sysdep/faultinfo_64.h |   3 -
 arch/x86/um/shared/sysdep/skas_ptrace.h  |  22 ---
 18 files changed, 148 insertions(+), 814 deletions(-)
 delete mode 100644 arch/um/include/shared/skas/proc_mm.h
 delete mode 100644 arch/um/include/shared/skas_ptrace.h
 delete mode 100644 arch/x86/um/shared/sysdep/skas_ptrace.h

(limited to 'arch/x86')

diff --git a/arch/um/include/shared/os.h b/arch/um/include/shared/os.h
index 08eec0b691b0..d824528f6f62 100644
--- a/arch/um/include/shared/os.h
+++ b/arch/um/include/shared/os.h
@@ -174,7 +174,6 @@ extern unsigned long long os_makedev(unsigned major, unsigned minor);
 
 /* start_up.c */
 extern void os_early_checks(void);
-extern void can_do_skas(void);
 extern void os_check_bugs(void);
 extern void check_host_supports_tls(int *supports_tls, int *tls_min);
 
@@ -187,7 +186,6 @@ extern int os_process_parent(int pid);
 extern void os_stop_process(int pid);
 extern void os_kill_process(int pid, int reap_child);
 extern void os_kill_ptraced_process(int pid, int reap_child);
-extern long os_ptrace_ldt(long pid, long addr, long data);
 
 extern int os_getpid(void);
 extern int os_getpgrp(void);
diff --git a/arch/um/include/shared/skas/proc_mm.h b/arch/um/include/shared/skas/proc_mm.h
deleted file mode 100644
index 902809209603..000000000000
--- a/arch/um/include/shared/skas/proc_mm.h
+++ /dev/null
@@ -1,44 +0,0 @@
-/*
- * Copyright (C) 2002 Jeff Dike (jdike@karaya.com)
- * Licensed under the GPL
- */
-
-#ifndef __SKAS_PROC_MM_H
-#define __SKAS_PROC_MM_H
-
-#define MM_MMAP 54
-#define MM_MUNMAP 55
-#define MM_MPROTECT 56
-#define MM_COPY_SEGMENTS 57
-
-struct mm_mmap {
-	unsigned long addr;
-	unsigned long len;
-	unsigned long prot;
-	unsigned long flags;
-	unsigned long fd;
-	unsigned long offset;
-};
-
-struct mm_munmap {
-	unsigned long addr;
-	unsigned long len;
-};
-
-struct mm_mprotect {
-	unsigned long addr;
-	unsigned long len;
-	unsigned int prot;
-};
-
-struct proc_mm_op {
-	int op;
-	union {
-		struct mm_mmap mmap;
-		struct mm_munmap munmap;
-	        struct mm_mprotect mprotect;
-		int copy_segments;
-	} u;
-};
-
-#endif
diff --git a/arch/um/include/shared/skas/skas.h b/arch/um/include/shared/skas/skas.h
index c45df961c874..911f3c45ad1f 100644
--- a/arch/um/include/shared/skas/skas.h
+++ b/arch/um/include/shared/skas/skas.h
@@ -9,13 +9,10 @@
 #include <sysdep/ptrace.h>
 
 extern int userspace_pid[];
-extern int proc_mm, ptrace_faultinfo, ptrace_ldt;
-extern int skas_needs_stub;
 
 extern int user_thread(unsigned long stack, int flags);
 extern void new_thread_handler(void);
 extern void handle_syscall(struct uml_pt_regs *regs);
-extern int new_mm(unsigned long stack);
 extern long execute_syscall_skas(void *r);
 extern unsigned long current_stub_stack(void);
 
diff --git a/arch/um/include/shared/skas_ptrace.h b/arch/um/include/shared/skas_ptrace.h
deleted file mode 100644
index 630a9c92b93c..000000000000
--- a/arch/um/include/shared/skas_ptrace.h
+++ /dev/null
@@ -1,14 +0,0 @@
-/* 
- * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
- * Licensed under the GPL
- */
-
-#ifndef __SKAS_PTRACE_H
-#define __SKAS_PTRACE_H
-
-#define PTRACE_FAULTINFO 52
-#define PTRACE_SWITCH_MM 55
-
-#include <sysdep/skas_ptrace.h>
-
-#endif
diff --git a/arch/um/kernel/ptrace.c b/arch/um/kernel/ptrace.c
index 62435ef003d9..174ee5017264 100644
--- a/arch/um/kernel/ptrace.c
+++ b/arch/um/kernel/ptrace.c
@@ -8,9 +8,6 @@
 #include <linux/sched.h>
 #include <linux/tracehook.h>
 #include <asm/uaccess.h>
-#include <skas_ptrace.h>
-
-
 
 void user_enable_single_step(struct task_struct *child)
 {
@@ -104,35 +101,6 @@ long arch_ptrace(struct task_struct *child, long request,
 		ret = ptrace_set_thread_area(child, addr, vp);
 		break;
 
-	case PTRACE_FAULTINFO: {
-		/*
-		 * Take the info from thread->arch->faultinfo,
-		 * but transfer max. sizeof(struct ptrace_faultinfo).
-		 * On i386, ptrace_faultinfo is smaller!
-		 */
-		ret = copy_to_user(p, &child->thread.arch.faultinfo,
-				   sizeof(struct ptrace_faultinfo)) ?
-			-EIO : 0;
-		break;
-	}
-
-#ifdef PTRACE_LDT
-	case PTRACE_LDT: {
-		struct ptrace_ldt ldt;
-
-		if (copy_from_user(&ldt, p, sizeof(ldt))) {
-			ret = -EIO;
-			break;
-		}
-
-		/*
-		 * This one is confusing, so just punt and return -EIO for
-		 * now
-		 */
-		ret = -EIO;
-		break;
-	}
-#endif
 	default:
 		ret = ptrace_request(child, request, addr, data);
 		if (ret == -EIO)
diff --git a/arch/um/kernel/reboot.c b/arch/um/kernel/reboot.c
index ced8903921ae..9bdf67a092a5 100644
--- a/arch/um/kernel/reboot.c
+++ b/arch/um/kernel/reboot.c
@@ -15,28 +15,21 @@ void (*pm_power_off)(void);
 
 static void kill_off_processes(void)
 {
-	if (proc_mm)
-		/*
-		 * FIXME: need to loop over userspace_pids
-		 */
-		os_kill_ptraced_process(userspace_pid[0], 1);
-	else {
-		struct task_struct *p;
-		int pid;
-
-		read_lock(&tasklist_lock);
-		for_each_process(p) {
-			struct task_struct *t;
-
-			t = find_lock_task_mm(p);
-			if (!t)
-				continue;
-			pid = t->mm->context.id.u.pid;
-			task_unlock(t);
-			os_kill_ptraced_process(pid, 1);
-		}
-		read_unlock(&tasklist_lock);
+	struct task_struct *p;
+	int pid;
+
+	read_lock(&tasklist_lock);
+	for_each_process(p) {
+		struct task_struct *t;
+
+		t = find_lock_task_mm(p);
+		if (!t)
+			continue;
+		pid = t->mm->context.id.u.pid;
+		task_unlock(t);
+		os_kill_ptraced_process(pid, 1);
 	}
+	read_unlock(&tasklist_lock);
 }
 
 void uml_cleanup(void)
diff --git a/arch/um/kernel/skas/mmu.c b/arch/um/kernel/skas/mmu.c
index 007d5503f49b..94abdcc1d6ad 100644
--- a/arch/um/kernel/skas/mmu.c
+++ b/arch/um/kernel/skas/mmu.c
@@ -54,35 +54,22 @@ int init_new_context(struct task_struct *task, struct mm_struct *mm)
 	unsigned long stack = 0;
 	int ret = -ENOMEM;
 
-	if (skas_needs_stub) {
-		stack = get_zeroed_page(GFP_KERNEL);
-		if (stack == 0)
-			goto out;
-	}
+	stack = get_zeroed_page(GFP_KERNEL);
+	if (stack == 0)
+		goto out;
 
 	to_mm->id.stack = stack;
 	if (current->mm != NULL && current->mm != &init_mm)
 		from_mm = &current->mm->context;
 
-	if (proc_mm) {
-		ret = new_mm(stack);
-		if (ret < 0) {
-			printk(KERN_ERR "init_new_context_skas - "
-			       "new_mm failed, errno = %d\n", ret);
-			goto out_free;
-		}
-		to_mm->id.u.mm_fd = ret;
-	}
-	else {
-		if (from_mm)
-			to_mm->id.u.pid = copy_context_skas0(stack,
-							     from_mm->id.u.pid);
-		else to_mm->id.u.pid = start_userspace(stack);
-
-		if (to_mm->id.u.pid < 0) {
-			ret = to_mm->id.u.pid;
-			goto out_free;
-		}
+	if (from_mm)
+		to_mm->id.u.pid = copy_context_skas0(stack,
+						     from_mm->id.u.pid);
+	else to_mm->id.u.pid = start_userspace(stack);
+
+	if (to_mm->id.u.pid < 0) {
+		ret = to_mm->id.u.pid;
+		goto out_free;
 	}
 
 	ret = init_new_ldt(to_mm, from_mm);
@@ -105,9 +92,6 @@ void uml_setup_stubs(struct mm_struct *mm)
 {
 	int err, ret;
 
-	if (!skas_needs_stub)
-		return;
-
 	ret = init_stub_pte(mm, STUB_CODE,
 			    (unsigned long) &__syscall_stub_start);
 	if (ret)
@@ -154,25 +138,19 @@ void destroy_context(struct mm_struct *mm)
 {
 	struct mm_context *mmu = &mm->context;
 
-	if (proc_mm)
-		os_close_file(mmu->id.u.mm_fd);
-	else {
-		/*
-		 * If init_new_context wasn't called, this will be
-		 * zero, resulting in a kill(0), which will result in the
-		 * whole UML suddenly dying.  Also, cover negative and
-		 * 1 cases, since they shouldn't happen either.
-		 */
-		if (mmu->id.u.pid < 2) {
-			printk(KERN_ERR "corrupt mm_context - pid = %d\n",
-			       mmu->id.u.pid);
-			return;
-		}
-		os_kill_ptraced_process(mmu->id.u.pid, 1);
+	/*
+	 * If init_new_context wasn't called, this will be
+	 * zero, resulting in a kill(0), which will result in the
+	 * whole UML suddenly dying.  Also, cover negative and
+	 * 1 cases, since they shouldn't happen either.
+	 */
+	if (mmu->id.u.pid < 2) {
+		printk(KERN_ERR "corrupt mm_context - pid = %d\n",
+		       mmu->id.u.pid);
+		return;
 	}
+	os_kill_ptraced_process(mmu->id.u.pid, 1);
 
-	if (skas_needs_stub)
-		free_page(mmu->id.stack);
-
+	free_page(mmu->id.stack);
 	free_ldt(mmu);
 }
diff --git a/arch/um/kernel/skas/process.c b/arch/um/kernel/skas/process.c
index 4da11b3c8ddb..082955d694f3 100644
--- a/arch/um/kernel/skas/process.c
+++ b/arch/um/kernel/skas/process.c
@@ -10,25 +10,6 @@
 #include <os.h>
 #include <skas.h>
 
-int new_mm(unsigned long stack)
-{
-	int fd, err;
-
-	fd = os_open_file("/proc/mm", of_cloexec(of_write(OPENFLAGS())), 0);
-	if (fd < 0)
-		return fd;
-
-	if (skas_needs_stub) {
-		err = map_stub_pages(fd, STUB_CODE, STUB_DATA, stack);
-		if (err) {
-			os_close_file(fd);
-			return err;
-		}
-	}
-
-	return fd;
-}
-
 extern void start_kernel(void);
 
 static int __init start_kernel_proc(void *unused)
@@ -55,14 +36,6 @@ int __init start_uml(void)
 {
 	stack_protections((unsigned long) &cpu0_irqstack);
 	set_sigstack(cpu0_irqstack, THREAD_SIZE);
-	if (proc_mm) {
-		userspace_pid[0] = start_userspace(0);
-		if (userspace_pid[0] < 0) {
-			printf("start_uml - start_userspace returned %d\n",
-			       userspace_pid[0]);
-			exit(1);
-		}
-	}
 
 	init_new_thread_signals();
 
diff --git a/arch/um/kernel/trap.c b/arch/um/kernel/trap.c
index 209617302df8..8e4daf44e980 100644
--- a/arch/um/kernel/trap.c
+++ b/arch/um/kernel/trap.c
@@ -220,7 +220,7 @@ unsigned long segv(struct faultinfo fi, unsigned long ip, int is_user,
 		panic("Segfault with no mm");
 	}
 
-	if (SEGV_IS_FIXABLE(&fi) || SEGV_MAYBE_FIXABLE(&fi))
+	if (SEGV_IS_FIXABLE(&fi))
 		err = handle_page_fault(address, ip, is_write, is_user,
 					&si.si_code);
 	else {
diff --git a/arch/um/kernel/um_arch.c b/arch/um/kernel/um_arch.c
index 9274eae6ae7b..dbd5bda1f184 100644
--- a/arch/um/kernel/um_arch.c
+++ b/arch/um/kernel/um_arch.c
@@ -268,7 +268,6 @@ int __init linux_main(int argc, char **argv)
 	unsigned long stack;
 	unsigned int i;
 	int add;
-	char * mode;
 
 	for (i = 1; i < argc; i++) {
 		if ((i == 1) && (argv[i][0] == ' '))
@@ -291,15 +290,6 @@ int __init linux_main(int argc, char **argv)
 	/* OS sanity checks that need to happen before the kernel runs */
 	os_early_checks();
 
-	can_do_skas();
-
-	if (proc_mm && ptrace_faultinfo)
-		mode = "SKAS3";
-	else
-		mode = "SKAS0";
-
-	printf("UML running in %s mode\n", mode);
-
 	brk_start = (unsigned long) sbrk(0);
 
 	/*
diff --git a/arch/um/os-Linux/process.c b/arch/um/os-Linux/process.c
index 33496fe2bb52..8408aba915b2 100644
--- a/arch/um/os-Linux/process.c
+++ b/arch/um/os-Linux/process.c
@@ -16,7 +16,6 @@
 #include <init.h>
 #include <longjmp.h>
 #include <os.h>
-#include <skas_ptrace.h>
 
 #define ARBITRARY_ADDR -1
 #define FAILURE_PID    -1
@@ -102,21 +101,6 @@ void os_kill_process(int pid, int reap_child)
 		CATCH_EINTR(waitpid(pid, NULL, __WALL));
 }
 
-/* This is here uniquely to have access to the userspace errno, i.e. the one
- * used by ptrace in case of error.
- */
-
-long os_ptrace_ldt(long pid, long addr, long data)
-{
-	int ret;
-
-	ret = ptrace(PTRACE_LDT, pid, addr, data);
-
-	if (ret < 0)
-		return -errno;
-	return ret;
-}
-
 /* Kill off a ptraced child by all means available.  kill it normally first,
  * then PTRACE_KILL it, then PTRACE_CONT it in case it's in a run state from
  * which it can't exit directly.
diff --git a/arch/um/os-Linux/skas/mem.c b/arch/um/os-Linux/skas/mem.c
index 689b18db798f..e7f8c945a573 100644
--- a/arch/um/os-Linux/skas/mem.c
+++ b/arch/um/os-Linux/skas/mem.c
@@ -12,7 +12,6 @@
 #include <as-layout.h>
 #include <mm_id.h>
 #include <os.h>
-#include <proc_mm.h>
 #include <ptrace_user.h>
 #include <registers.h>
 #include <skas.h>
@@ -46,8 +45,6 @@ static int __init init_syscall_regs(void)
 
 __initcall(init_syscall_regs);
 
-extern int proc_mm;
-
 static inline long do_syscall_stub(struct mm_id * mm_idp, void **addr)
 {
 	int n, i;
@@ -56,10 +53,6 @@ static inline long do_syscall_stub(struct mm_id * mm_idp, void **addr)
 	unsigned long * syscall;
 	int err, pid = mm_idp->u.pid;
 
-	if (proc_mm)
-		/* FIXME: Need to look up userspace_pid by cpu */
-		pid = userspace_pid[0];
-
 	n = ptrace_setregs(pid, syscall_regs);
 	if (n < 0) {
 		printk(UM_KERN_ERR "Registers - \n");
@@ -178,38 +171,12 @@ int map(struct mm_id * mm_idp, unsigned long virt, unsigned long len, int prot,
 	int phys_fd, unsigned long long offset, int done, void **data)
 {
 	int ret;
+	unsigned long args[] = { virt, len, prot,
+				 MAP_SHARED | MAP_FIXED, phys_fd,
+				 MMAP_OFFSET(offset) };
 
-	if (proc_mm) {
-		struct proc_mm_op map;
-		int fd = mm_idp->u.mm_fd;
-
-		map = ((struct proc_mm_op) { .op	= MM_MMAP,
-				       .u		=
-				       { .mmap	=
-					 { .addr	= virt,
-					   .len	= len,
-					   .prot	= prot,
-					   .flags	= MAP_SHARED |
-					   MAP_FIXED,
-					   .fd	= phys_fd,
-					   .offset= offset
-					 } } } );
-		CATCH_EINTR(ret = write(fd, &map, sizeof(map)));
-		if (ret != sizeof(map)) {
-			ret = -errno;
-			printk(UM_KERN_ERR "map : /proc/mm map failed, "
-			       "err = %d\n", -ret);
-		}
-		else ret = 0;
-	}
-	else {
-		unsigned long args[] = { virt, len, prot,
-					 MAP_SHARED | MAP_FIXED, phys_fd,
-					 MMAP_OFFSET(offset) };
-
-		ret = run_syscall_stub(mm_idp, STUB_MMAP_NR, args, virt,
-				       data, done);
-	}
+	ret = run_syscall_stub(mm_idp, STUB_MMAP_NR, args, virt,
+			       data, done);
 
 	return ret;
 }
@@ -218,32 +185,11 @@ int unmap(struct mm_id * mm_idp, unsigned long addr, unsigned long len,
 	  int done, void **data)
 {
 	int ret;
+	unsigned long args[] = { (unsigned long) addr, len, 0, 0, 0,
+				 0 };
 
-	if (proc_mm) {
-		struct proc_mm_op unmap;
-		int fd = mm_idp->u.mm_fd;
-
-		unmap = ((struct proc_mm_op) { .op	= MM_MUNMAP,
-					 .u	=
-					 { .munmap	=
-					   { .addr	=
-					     (unsigned long) addr,
-					     .len		= len } } } );
-		CATCH_EINTR(ret = write(fd, &unmap, sizeof(unmap)));
-		if (ret != sizeof(unmap)) {
-			ret = -errno;
-			printk(UM_KERN_ERR "unmap - proc_mm write returned "
-			       "%d\n", ret);
-		}
-		else ret = 0;
-	}
-	else {
-		unsigned long args[] = { (unsigned long) addr, len, 0, 0, 0,
-					 0 };
-
-		ret = run_syscall_stub(mm_idp, __NR_munmap, args, 0,
-				       data, done);
-	}
+	ret = run_syscall_stub(mm_idp, __NR_munmap, args, 0,
+			       data, done);
 
 	return ret;
 }
@@ -251,33 +197,11 @@ int unmap(struct mm_id * mm_idp, unsigned long addr, unsigned long len,
 int protect(struct mm_id * mm_idp, unsigned long addr, unsigned long len,
 	    unsigned int prot, int done, void **data)
 {
-	struct proc_mm_op protect;
 	int ret;
+	unsigned long args[] = { addr, len, prot, 0, 0, 0 };
 
-	if (proc_mm) {
-		int fd = mm_idp->u.mm_fd;
-
-		protect = ((struct proc_mm_op) { .op	= MM_MPROTECT,
-					   .u	=
-					   { .mprotect	=
-					     { .addr	=
-					       (unsigned long) addr,
-					       .len	= len,
-					       .prot	= prot } } } );
-
-		CATCH_EINTR(ret = write(fd, &protect, sizeof(protect)));
-		if (ret != sizeof(protect)) {
-			ret = -errno;
-			printk(UM_KERN_ERR "protect failed, err = %d", -ret);
-		}
-		else ret = 0;
-	}
-	else {
-		unsigned long args[] = { addr, len, prot, 0, 0, 0 };
-
-		ret = run_syscall_stub(mm_idp, __NR_mprotect, args, 0,
-				       data, done);
-	}
+	ret = run_syscall_stub(mm_idp, __NR_mprotect, args, 0,
+			       data, done);
 
 	return ret;
 }
diff --git a/arch/um/os-Linux/skas/process.c b/arch/um/os-Linux/skas/process.c
index 908579f2b0ab..50ebeae5cbb3 100644
--- a/arch/um/os-Linux/skas/process.c
+++ b/arch/um/os-Linux/skas/process.c
@@ -16,11 +16,9 @@
 #include <kern_util.h>
 #include <mem.h>
 #include <os.h>
-#include <proc_mm.h>
 #include <ptrace_user.h>
 #include <registers.h>
 #include <skas.h>
-#include <skas_ptrace.h>
 #include <sysdep/stub.h>
 
 int is_skas_winch(int pid, int fd, void *data)
@@ -91,50 +89,33 @@ extern unsigned long current_stub_stack(void);
 static void get_skas_faultinfo(int pid, struct faultinfo *fi)
 {
 	int err;
+	unsigned long fpregs[FP_SIZE];
 
-	if (ptrace_faultinfo) {
-		err = ptrace(PTRACE_FAULTINFO, pid, 0, fi);
-		if (err) {
-			printk(UM_KERN_ERR "get_skas_faultinfo - "
-			       "PTRACE_FAULTINFO failed, errno = %d\n", errno);
-			fatal_sigsegv();
-		}
-
-		/* Special handling for i386, which has different structs */
-		if (sizeof(struct ptrace_faultinfo) < sizeof(struct faultinfo))
-			memset((char *)fi + sizeof(struct ptrace_faultinfo), 0,
-			       sizeof(struct faultinfo) -
-			       sizeof(struct ptrace_faultinfo));
+	err = get_fp_registers(pid, fpregs);
+	if (err < 0) {
+		printk(UM_KERN_ERR "save_fp_registers returned %d\n",
+		       err);
+		fatal_sigsegv();
 	}
-	else {
-		unsigned long fpregs[FP_SIZE];
-
-		err = get_fp_registers(pid, fpregs);
-		if (err < 0) {
-			printk(UM_KERN_ERR "save_fp_registers returned %d\n",
-			       err);
-			fatal_sigsegv();
-		}
-		err = ptrace(PTRACE_CONT, pid, 0, SIGSEGV);
-		if (err) {
-			printk(UM_KERN_ERR "Failed to continue stub, pid = %d, "
-			       "errno = %d\n", pid, errno);
-			fatal_sigsegv();
-		}
-		wait_stub_done(pid);
+	err = ptrace(PTRACE_CONT, pid, 0, SIGSEGV);
+	if (err) {
+		printk(UM_KERN_ERR "Failed to continue stub, pid = %d, "
+		       "errno = %d\n", pid, errno);
+		fatal_sigsegv();
+	}
+	wait_stub_done(pid);
 
-		/*
-		 * faultinfo is prepared by the stub-segv-handler at start of
-		 * the stub stack page. We just have to copy it.
-		 */
-		memcpy(fi, (void *)current_stub_stack(), sizeof(*fi));
+	/*
+	 * faultinfo is prepared by the stub-segv-handler at start of
+	 * the stub stack page. We just have to copy it.
+	 */
+	memcpy(fi, (void *)current_stub_stack(), sizeof(*fi));
 
-		err = put_fp_registers(pid, fpregs);
-		if (err < 0) {
-			printk(UM_KERN_ERR "put_fp_registers returned %d\n",
-			       err);
-			fatal_sigsegv();
-		}
+	err = put_fp_registers(pid, fpregs);
+	if (err < 0) {
+		printk(UM_KERN_ERR "put_fp_registers returned %d\n",
+		       err);
+		fatal_sigsegv();
 	}
 }
 
@@ -198,7 +179,8 @@ extern int __syscall_stub_start;
 static int userspace_tramp(void *stack)
 {
 	void *addr;
-	int err;
+	int err, fd;
+	unsigned long long offset;
 
 	ptrace(PTRACE_TRACEME, 0, 0, 0);
 
@@ -211,36 +193,32 @@ static int userspace_tramp(void *stack)
 		exit(1);
 	}
 
-	if (!proc_mm) {
-		/*
-		 * This has a pte, but it can't be mapped in with the usual
-		 * tlb_flush mechanism because this is part of that mechanism
-		 */
-		int fd;
-		unsigned long long offset;
-		fd = phys_mapping(to_phys(&__syscall_stub_start), &offset);
-		addr = mmap64((void *) STUB_CODE, UM_KERN_PAGE_SIZE,
-			      PROT_EXEC, MAP_FIXED | MAP_PRIVATE, fd, offset);
+	/*
+	 * This has a pte, but it can't be mapped in with the usual
+	 * tlb_flush mechanism because this is part of that mechanism
+	 */
+	fd = phys_mapping(to_phys(&__syscall_stub_start), &offset);
+	addr = mmap64((void *) STUB_CODE, UM_KERN_PAGE_SIZE,
+		      PROT_EXEC, MAP_FIXED | MAP_PRIVATE, fd, offset);
+	if (addr == MAP_FAILED) {
+		printk(UM_KERN_ERR "mapping mmap stub at 0x%lx failed, "
+		       "errno = %d\n", STUB_CODE, errno);
+		exit(1);
+	}
+
+	if (stack != NULL) {
+		fd = phys_mapping(to_phys(stack), &offset);
+		addr = mmap((void *) STUB_DATA,
+			    UM_KERN_PAGE_SIZE, PROT_READ | PROT_WRITE,
+			    MAP_FIXED | MAP_SHARED, fd, offset);
 		if (addr == MAP_FAILED) {
-			printk(UM_KERN_ERR "mapping mmap stub at 0x%lx failed, "
-			       "errno = %d\n", STUB_CODE, errno);
+			printk(UM_KERN_ERR "mapping segfault stack "
+			       "at 0x%lx failed, errno = %d\n",
+			       STUB_DATA, errno);
 			exit(1);
 		}
-
-		if (stack != NULL) {
-			fd = phys_mapping(to_phys(stack), &offset);
-			addr = mmap((void *) STUB_DATA,
-				    UM_KERN_PAGE_SIZE, PROT_READ | PROT_WRITE,
-				    MAP_FIXED | MAP_SHARED, fd, offset);
-			if (addr == MAP_FAILED) {
-				printk(UM_KERN_ERR "mapping segfault stack "
-				       "at 0x%lx failed, errno = %d\n",
-				       STUB_DATA, errno);
-				exit(1);
-			}
-		}
 	}
-	if (!ptrace_faultinfo && (stack != NULL)) {
+	if (stack != NULL) {
 		struct sigaction sa;
 
 		unsigned long v = STUB_CODE +
@@ -286,11 +264,7 @@ int start_userspace(unsigned long stub_stack)
 
 	sp = (unsigned long) stack + UM_KERN_PAGE_SIZE - sizeof(void *);
 
-	flags = CLONE_FILES;
-	if (proc_mm)
-		flags |= CLONE_VM;
-	else
-		flags |= SIGCHLD;
+	flags = CLONE_FILES | SIGCHLD;
 
 	pid = clone(userspace_tramp, (void *) sp, flags, (void *) stub_stack);
 	if (pid < 0) {
@@ -413,8 +387,7 @@ void userspace(struct uml_pt_regs *regs)
 
 			switch (sig) {
 			case SIGSEGV:
-				if (PTRACE_FULL_FAULTINFO ||
-				    !ptrace_faultinfo) {
+				if (PTRACE_FULL_FAULTINFO) {
 					get_skas_faultinfo(pid,
 							   &regs->faultinfo);
 					(*sig_info[SIGSEGV])(SIGSEGV, (struct siginfo *)&si,
@@ -571,67 +544,6 @@ int copy_context_skas0(unsigned long new_stack, int pid)
 	return err;
 }
 
-/*
- * This is used only, if stub pages are needed, while proc_mm is
- * available. Opening /proc/mm creates a new mm_context, which lacks
- * the stub-pages. Thus, we map them using /proc/mm-fd
- */
-int map_stub_pages(int fd, unsigned long code, unsigned long data,
-		   unsigned long stack)
-{
-	struct proc_mm_op mmop;
-	int n;
-	unsigned long long code_offset;
-	int code_fd = phys_mapping(to_phys((void *) &__syscall_stub_start),
-				   &code_offset);
-
-	mmop = ((struct proc_mm_op) { .op        = MM_MMAP,
-				      .u         =
-				      { .mmap    =
-					{ .addr    = code,
-					  .len     = UM_KERN_PAGE_SIZE,
-					  .prot    = PROT_EXEC,
-					  .flags   = MAP_FIXED | MAP_PRIVATE,
-					  .fd      = code_fd,
-					  .offset  = code_offset
-	} } });
-	CATCH_EINTR(n = write(fd, &mmop, sizeof(mmop)));
-	if (n != sizeof(mmop)) {
-		n = errno;
-		printk(UM_KERN_ERR "mmap args - addr = 0x%lx, fd = %d, "
-		       "offset = %llx\n", code, code_fd,
-		       (unsigned long long) code_offset);
-		printk(UM_KERN_ERR "map_stub_pages : /proc/mm map for code "
-		       "failed, err = %d\n", n);
-		return -n;
-	}
-
-	if (stack) {
-		unsigned long long map_offset;
-		int map_fd = phys_mapping(to_phys((void *)stack), &map_offset);
-		mmop = ((struct proc_mm_op)
-				{ .op        = MM_MMAP,
-				  .u         =
-				  { .mmap    =
-				    { .addr    = data,
-				      .len     = UM_KERN_PAGE_SIZE,
-				      .prot    = PROT_READ | PROT_WRITE,
-				      .flags   = MAP_FIXED | MAP_SHARED,
-				      .fd      = map_fd,
-				      .offset  = map_offset
-		} } });
-		CATCH_EINTR(n = write(fd, &mmop, sizeof(mmop)));
-		if (n != sizeof(mmop)) {
-			n = errno;
-			printk(UM_KERN_ERR "map_stub_pages : /proc/mm map for "
-			       "data failed, err = %d\n", n);
-			return -n;
-		}
-	}
-
-	return 0;
-}
-
 void new_thread(void *stack, jmp_buf *buf, void (*handler)(void))
 {
 	(*buf)[0].JB_IP = (unsigned long) handler;
@@ -728,17 +640,5 @@ void reboot_skas(void)
 
 void __switch_mm(struct mm_id *mm_idp)
 {
-	int err;
-
-	/* FIXME: need cpu pid in __switch_mm */
-	if (proc_mm) {
-		err = ptrace(PTRACE_SWITCH_MM, userspace_pid[0], 0,
-			     mm_idp->u.mm_fd);
-		if (err) {
-			printk(UM_KERN_ERR "__switch_mm - PTRACE_SWITCH_MM "
-			       "failed, errno = %d\n", errno);
-			fatal_sigsegv();
-		}
-	}
-	else userspace_pid[0] = mm_idp->u.pid;
+	userspace_pid[0] = mm_idp->u.pid;
 }
diff --git a/arch/um/os-Linux/start_up.c b/arch/um/os-Linux/start_up.c
index 337518c5042a..47f1ff056a54 100644
--- a/arch/um/os-Linux/start_up.c
+++ b/arch/um/os-Linux/start_up.c
@@ -24,7 +24,6 @@
 #include <ptrace_user.h>
 #include <registers.h>
 #include <skas.h>
-#include <skas_ptrace.h>
 
 static void ptrace_child(void)
 {
@@ -142,44 +141,6 @@ static int stop_ptraced_child(int pid, int exitcode, int mustexit)
 	return ret;
 }
 
-/* Changed only during early boot */
-int ptrace_faultinfo;
-static int disable_ptrace_faultinfo;
-
-int ptrace_ldt;
-static int disable_ptrace_ldt;
-
-int proc_mm;
-static int disable_proc_mm;
-
-int have_switch_mm;
-static int disable_switch_mm;
-
-int skas_needs_stub;
-
-static int __init skas0_cmd_param(char *str, int* add)
-{
-	disable_ptrace_faultinfo = 1;
-	disable_ptrace_ldt = 1;
-	disable_proc_mm = 1;
-	disable_switch_mm = 1;
-
-	return 0;
-}
-
-/* The two __uml_setup would conflict, without this stupid alias. */
-
-static int __init mode_skas0_cmd_param(char *str, int* add)
-	__attribute__((alias("skas0_cmd_param")));
-
-__uml_setup("skas0", skas0_cmd_param,
-"skas0\n"
-"    Disables SKAS3 and SKAS4 usage, so that SKAS0 is used\n\n");
-
-__uml_setup("mode=skas0", mode_skas0_cmd_param,
-"mode=skas0\n"
-"    Disables SKAS3 and SKAS4 usage, so that SKAS0 is used.\n\n");
-
 /* Changed only during early boot */
 static int force_sysemu_disabled = 0;
 
@@ -376,121 +337,6 @@ void __init os_early_checks(void)
 	stop_ptraced_child(pid, 1, 1);
 }
 
-static int __init noprocmm_cmd_param(char *str, int* add)
-{
-	disable_proc_mm = 1;
-	return 0;
-}
-
-__uml_setup("noprocmm", noprocmm_cmd_param,
-"noprocmm\n"
-"    Turns off usage of /proc/mm, even if host supports it.\n"
-"    To support /proc/mm, the host needs to be patched using\n"
-"    the current skas3 patch.\n\n");
-
-static int __init noptracefaultinfo_cmd_param(char *str, int* add)
-{
-	disable_ptrace_faultinfo = 1;
-	return 0;
-}
-
-__uml_setup("noptracefaultinfo", noptracefaultinfo_cmd_param,
-"noptracefaultinfo\n"
-"    Turns off usage of PTRACE_FAULTINFO, even if host supports\n"
-"    it. To support PTRACE_FAULTINFO, the host needs to be patched\n"
-"    using the current skas3 patch.\n\n");
-
-static int __init noptraceldt_cmd_param(char *str, int* add)
-{
-	disable_ptrace_ldt = 1;
-	return 0;
-}
-
-__uml_setup("noptraceldt", noptraceldt_cmd_param,
-"noptraceldt\n"
-"    Turns off usage of PTRACE_LDT, even if host supports it.\n"
-"    To support PTRACE_LDT, the host needs to be patched using\n"
-"    the current skas3 patch.\n\n");
-
-static inline void check_skas3_ptrace_faultinfo(void)
-{
-	struct ptrace_faultinfo fi;
-	int pid, n;
-
-	non_fatal("  - PTRACE_FAULTINFO...");
-	pid = start_ptraced_child();
-
-	n = ptrace(PTRACE_FAULTINFO, pid, 0, &fi);
-	if (n < 0) {
-		if (errno == EIO)
-			non_fatal("not found\n");
-		else
-			perror("not found");
-	} else if (disable_ptrace_faultinfo)
-		non_fatal("found but disabled on command line\n");
-	else {
-		ptrace_faultinfo = 1;
-		non_fatal("found\n");
-	}
-
-	stop_ptraced_child(pid, 1, 1);
-}
-
-static inline void check_skas3_ptrace_ldt(void)
-{
-#ifdef PTRACE_LDT
-	int pid, n;
-	unsigned char ldtbuf[40];
-	struct ptrace_ldt ldt_op = (struct ptrace_ldt) {
-		.func = 2, /* read default ldt */
-		.ptr = ldtbuf,
-		.bytecount = sizeof(ldtbuf)};
-
-	non_fatal("  - PTRACE_LDT...");
-	pid = start_ptraced_child();
-
-	n = ptrace(PTRACE_LDT, pid, 0, (unsigned long) &ldt_op);
-	if (n < 0) {
-		if (errno == EIO)
-			non_fatal("not found\n");
-		else
-			perror("not found");
-	} else if (disable_ptrace_ldt)
-		non_fatal("found, but use is disabled\n");
-	else {
-		ptrace_ldt = 1;
-		non_fatal("found\n");
-	}
-
-	stop_ptraced_child(pid, 1, 1);
-#endif
-}
-
-static inline void check_skas3_proc_mm(void)
-{
-	non_fatal("  - /proc/mm...");
-	if (access("/proc/mm", W_OK) < 0)
-		perror("not found");
-	else if (disable_proc_mm)
-		non_fatal("found but disabled on command line\n");
-	else {
-		proc_mm = 1;
-		non_fatal("found\n");
-	}
-}
-
-void can_do_skas(void)
-{
-	non_fatal("Checking for the skas3 patch in the host:\n");
-
-	check_skas3_proc_mm();
-	check_skas3_ptrace_faultinfo();
-	check_skas3_ptrace_ldt();
-
-	if (!proc_mm || !ptrace_faultinfo || !ptrace_ldt)
-		skas_needs_stub = 1;
-}
-
 int __init parse_iomem(char *str, int *add)
 {
 	struct iomem_region *new;
diff --git a/arch/x86/um/ldt.c b/arch/x86/um/ldt.c
index 8e08176f0bcb..5c0b711d2433 100644
--- a/arch/x86/um/ldt.c
+++ b/arch/x86/um/ldt.c
@@ -8,9 +8,7 @@
 #include <linux/slab.h>
 #include <asm/unistd.h>
 #include <os.h>
-#include <proc_mm.h>
 #include <skas.h>
-#include <skas_ptrace.h>
 #include <sysdep/tls.h>
 
 extern int modify_ldt(int func, void *ptr, unsigned long bytecount);
@@ -19,105 +17,20 @@ static long write_ldt_entry(struct mm_id *mm_idp, int func,
 		     struct user_desc *desc, void **addr, int done)
 {
 	long res;
-
-	if (proc_mm) {
-		/*
-		 * This is a special handling for the case, that the mm to
-		 * modify isn't current->active_mm.
-		 * If this is called directly by modify_ldt,
-		 *     (current->active_mm->context.skas.u == mm_idp)
-		 * will be true. So no call to __switch_mm(mm_idp) is done.
-		 * If this is called in case of init_new_ldt or PTRACE_LDT,
-		 * mm_idp won't belong to current->active_mm, but child->mm.
-		 * So we need to switch child's mm into our userspace, then
-		 * later switch back.
-		 *
-		 * Note: I'm unsure: should interrupts be disabled here?
-		 */
-		if (!current->active_mm || current->active_mm == &init_mm ||
-		    mm_idp != &current->active_mm->context.id)
-			__switch_mm(mm_idp);
-	}
-
-	if (ptrace_ldt) {
-		struct ptrace_ldt ldt_op = (struct ptrace_ldt) {
-			.func = func,
-			.ptr = desc,
-			.bytecount = sizeof(*desc)};
-		u32 cpu;
-		int pid;
-
-		if (!proc_mm)
-			pid = mm_idp->u.pid;
-		else {
-			cpu = get_cpu();
-			pid = userspace_pid[cpu];
-		}
-
-		res = os_ptrace_ldt(pid, 0, (unsigned long) &ldt_op);
-
-		if (proc_mm)
-			put_cpu();
-	}
-	else {
-		void *stub_addr;
-		res = syscall_stub_data(mm_idp, (unsigned long *)desc,
-					(sizeof(*desc) + sizeof(long) - 1) &
-					    ~(sizeof(long) - 1),
-					addr, &stub_addr);
-		if (!res) {
-			unsigned long args[] = { func,
-						 (unsigned long)stub_addr,
-						 sizeof(*desc),
-						 0, 0, 0 };
-			res = run_syscall_stub(mm_idp, __NR_modify_ldt, args,
-					       0, addr, done);
-		}
+	void *stub_addr;
+	res = syscall_stub_data(mm_idp, (unsigned long *)desc,
+				(sizeof(*desc) + sizeof(long) - 1) &
+				    ~(sizeof(long) - 1),
+				addr, &stub_addr);
+	if (!res) {
+		unsigned long args[] = { func,
+					 (unsigned long)stub_addr,
+					 sizeof(*desc),
+					 0, 0, 0 };
+		res = run_syscall_stub(mm_idp, __NR_modify_ldt, args,
+				       0, addr, done);
 	}
 
-	if (proc_mm) {
-		/*
-		 * This is the second part of special handling, that makes
-		 * PTRACE_LDT possible to implement.
-		 */
-		if (current->active_mm && current->active_mm != &init_mm &&
-		    mm_idp != &current->active_mm->context.id)
-			__switch_mm(&current->active_mm->context.id);
-	}
-
-	return res;
-}
-
-static long read_ldt_from_host(void __user * ptr, unsigned long bytecount)
-{
-	int res, n;
-	struct ptrace_ldt ptrace_ldt = (struct ptrace_ldt) {
-			.func = 0,
-			.bytecount = bytecount,
-			.ptr = kmalloc(bytecount, GFP_KERNEL)};
-	u32 cpu;
-
-	if (ptrace_ldt.ptr == NULL)
-		return -ENOMEM;
-
-	/*
-	 * This is called from sys_modify_ldt only, so userspace_pid gives
-	 * us the right number
-	 */
-
-	cpu = get_cpu();
-	res = os_ptrace_ldt(userspace_pid[cpu], 0, (unsigned long) &ptrace_ldt);
-	put_cpu();
-	if (res < 0)
-		goto out;
-
-	n = copy_to_user(ptr, ptrace_ldt.ptr, res);
-	if (n != 0)
-		res = -EFAULT;
-
-  out:
-	kfree(ptrace_ldt.ptr);
-
 	return res;
 }
 
@@ -145,9 +58,6 @@ static int read_ldt(void __user * ptr, unsigned long bytecount)
 		bytecount = LDT_ENTRY_SIZE*LDT_ENTRIES;
 	err = bytecount;
 
-	if (ptrace_ldt)
-		return read_ldt_from_host(ptr, bytecount);
-
 	mutex_lock(&ldt->lock);
 	if (ldt->entry_count <= LDT_DIRECT_ENTRIES) {
 		size = LDT_ENTRY_SIZE*LDT_DIRECT_ENTRIES;
@@ -229,17 +139,11 @@ static int write_ldt(void __user * ptr, unsigned long bytecount, int func)
 			goto out;
 	}
 
-	if (!ptrace_ldt)
-		mutex_lock(&ldt->lock);
+	mutex_lock(&ldt->lock);
 
 	err = write_ldt_entry(mm_idp, func, &ldt_info, &addr, 1);
 	if (err)
 		goto out_unlock;
-	else if (ptrace_ldt) {
-		/* With PTRACE_LDT available, this is used as a flag only */
-		ldt->entry_count = 1;
-		goto out;
-	}
 
 	if (ldt_info.entry_number >= ldt->entry_count &&
 	    ldt_info.entry_number >= LDT_DIRECT_ENTRIES) {
@@ -393,91 +297,56 @@ long init_new_ldt(struct mm_context *new_mm, struct mm_context *from_mm)
 	int i;
 	long page, err=0;
 	void *addr = NULL;
-	struct proc_mm_op copy;
 
 
-	if (!ptrace_ldt)
-		mutex_init(&new_mm->arch.ldt.lock);
+	mutex_init(&new_mm->arch.ldt.lock);
 
 	if (!from_mm) {
 		memset(&desc, 0, sizeof(desc));
 		/*
-		 * We have to initialize a clean ldt.
+		 * Now we try to retrieve info about the ldt, we
+		 * inherited from the host. All ldt-entries found
+		 * will be reset in the following loop
 		 */
-		if (proc_mm) {
-			/*
-			 * If the new mm was created using proc_mm, host's
-			 * default-ldt currently is assigned, which normally
-			 * contains the call-gates for lcall7 and lcall27.
-			 * To remove these gates, we simply write an empty
-			 * entry as number 0 to the host.
-			 */
-			err = write_ldt_entry(&new_mm->id, 1, &desc, &addr, 1);
-		}
-		else{
-			/*
-			 * Now we try to retrieve info about the ldt, we
-			 * inherited from the host. All ldt-entries found
-			 * will be reset in the following loop
-			 */
-			ldt_get_host_info();
-			for (num_p=host_ldt_entries; *num_p != -1; num_p++) {
-				desc.entry_number = *num_p;
-				err = write_ldt_entry(&new_mm->id, 1, &desc,
-						      &addr, *(num_p + 1) == -1);
-				if (err)
-					break;
-			}
+		ldt_get_host_info();
+		for (num_p=host_ldt_entries; *num_p != -1; num_p++) {
+			desc.entry_number = *num_p;
+			err = write_ldt_entry(&new_mm->id, 1, &desc,
+					      &addr, *(num_p + 1) == -1);
+			if (err)
+				break;
 		}
 		new_mm->arch.ldt.entry_count = 0;
 
 		goto out;
 	}
 
-	if (proc_mm) {
-		/*
-		 * We have a valid from_mm, so we now have to copy the LDT of
-		 * from_mm to new_mm, because using proc_mm an new mm with
-		 * an empty/default LDT was created in new_mm()
-		 */
-		copy = ((struct proc_mm_op) { .op 	= MM_COPY_SEGMENTS,
-					      .u 	=
-					      { .copy_segments =
-							from_mm->id.u.mm_fd } } );
-		i = os_write_file(new_mm->id.u.mm_fd, &copy, sizeof(copy));
-		if (i != sizeof(copy))
-			printk(KERN_ERR "new_mm : /proc/mm copy_segments "
-			       "failed, err = %d\n", -i);
-	}
-
-	if (!ptrace_ldt) {
-		/*
-		 * Our local LDT is used to supply the data for
-		 * modify_ldt(READLDT), if PTRACE_LDT isn't available,
-		 * i.e., we have to use the stub for modify_ldt, which
-		 * can't handle the big read buffer of up to 64kB.
-		 */
-		mutex_lock(&from_mm->arch.ldt.lock);
-		if (from_mm->arch.ldt.entry_count <= LDT_DIRECT_ENTRIES)
-			memcpy(new_mm->arch.ldt.u.entries, from_mm->arch.ldt.u.entries,
-			       sizeof(new_mm->arch.ldt.u.entries));
-		else {
-			i = from_mm->arch.ldt.entry_count / LDT_ENTRIES_PER_PAGE;
-			while (i-->0) {
-				page = __get_free_page(GFP_KERNEL|__GFP_ZERO);
-				if (!page) {
-					err = -ENOMEM;
-					break;
-				}
-				new_mm->arch.ldt.u.pages[i] =
-					(struct ldt_entry *) page;
-				memcpy(new_mm->arch.ldt.u.pages[i],
-				       from_mm->arch.ldt.u.pages[i], PAGE_SIZE);
+	/*
+	 * Our local LDT is used to supply the data for
+	 * modify_ldt(READLDT), if PTRACE_LDT isn't available,
+	 * i.e., we have to use the stub for modify_ldt, which
+	 * can't handle the big read buffer of up to 64kB.
+	 */
+	mutex_lock(&from_mm->arch.ldt.lock);
+	if (from_mm->arch.ldt.entry_count <= LDT_DIRECT_ENTRIES)
+		memcpy(new_mm->arch.ldt.u.entries, from_mm->arch.ldt.u.entries,
+		       sizeof(new_mm->arch.ldt.u.entries));
+	else {
+		i = from_mm->arch.ldt.entry_count / LDT_ENTRIES_PER_PAGE;
+		while (i-->0) {
+			page = __get_free_page(GFP_KERNEL|__GFP_ZERO);
+			if (!page) {
+				err = -ENOMEM;
+				break;
 			}
+			new_mm->arch.ldt.u.pages[i] =
+				(struct ldt_entry *) page;
+			memcpy(new_mm->arch.ldt.u.pages[i],
+			       from_mm->arch.ldt.u.pages[i], PAGE_SIZE);
 		}
-		new_mm->arch.ldt.entry_count = from_mm->arch.ldt.entry_count;
-		mutex_unlock(&from_mm->arch.ldt.lock);
 	}
+	new_mm->arch.ldt.entry_count = from_mm->arch.ldt.entry_count;
+	mutex_unlock(&from_mm->arch.ldt.lock);
 
     out:
 	return err;
@@ -488,7 +357,7 @@ void free_ldt(struct mm_context *mm)
 {
 	int i;
 
-	if (!ptrace_ldt && mm->arch.ldt.entry_count > LDT_DIRECT_ENTRIES) {
+	if (mm->arch.ldt.entry_count > LDT_DIRECT_ENTRIES) {
 		i = mm->arch.ldt.entry_count / LDT_ENTRIES_PER_PAGE;
 		while (i-- > 0)
 			free_page((long) mm->arch.ldt.u.pages[i]);
diff --git a/arch/x86/um/shared/sysdep/faultinfo_32.h b/arch/x86/um/shared/sysdep/faultinfo_32.h
index a26086b8a800..b6f2437ec29c 100644
--- a/arch/x86/um/shared/sysdep/faultinfo_32.h
+++ b/arch/x86/um/shared/sysdep/faultinfo_32.h
@@ -27,9 +27,6 @@ struct faultinfo {
 /* This is Page Fault */
 #define SEGV_IS_FIXABLE(fi)	((fi)->trap_no == 14)
 
-/* SKAS3 has no trap_no on i386, but get_skas_faultinfo() sets it to 0. */
-#define SEGV_MAYBE_FIXABLE(fi)	((fi)->trap_no == 0 && ptrace_faultinfo)
-
 #define PTRACE_FULL_FAULTINFO 0
 
 #endif
diff --git a/arch/x86/um/shared/sysdep/faultinfo_64.h b/arch/x86/um/shared/sysdep/faultinfo_64.h
index f811cbe15d62..ee88f88974ea 100644
--- a/arch/x86/um/shared/sysdep/faultinfo_64.h
+++ b/arch/x86/um/shared/sysdep/faultinfo_64.h
@@ -27,9 +27,6 @@ struct faultinfo {
 /* This is Page Fault */
 #define SEGV_IS_FIXABLE(fi)	((fi)->trap_no == 14)
 
-/* No broken SKAS API, which doesn't pass trap_no, here. */
-#define SEGV_MAYBE_FIXABLE(fi)	0
-
 #define PTRACE_FULL_FAULTINFO 1
 
 #endif
diff --git a/arch/x86/um/shared/sysdep/skas_ptrace.h b/arch/x86/um/shared/sysdep/skas_ptrace.h
deleted file mode 100644
index 453febe98993..000000000000
--- a/arch/x86/um/shared/sysdep/skas_ptrace.h
+++ /dev/null
@@ -1,22 +0,0 @@
-/*
- * Copyright (C) 2000, 2001, 2002 Jeff Dike (jdike@karaya.com)
- * Licensed under the GPL
- */
-
-#ifndef __SYSDEP_X86_SKAS_PTRACE_H
-#define __SYSDEP_X86_SKAS_PTRACE_H
-
-struct ptrace_faultinfo {
-        int is_write;
-        unsigned long addr;
-};
-
-struct ptrace_ldt {
-        int func;
-        void *ptr;
-        unsigned long bytecount;
-};
-
-#define PTRACE_LDT 54
-
-#endif
-- 
cgit v1.2.3


From 28fa468f53163bc0b867b4cc75a9e36e7ed4dbbd Mon Sep 17 00:00:00 2001
From: Richard Weinberger
Date: Wed, 18 Mar 2015 21:42:54 +0100
Subject: um: Remove broken SMP support

At times where UML used the TT mode to operate it had
kind of SMP support. It never got finished nor was
stable.
Let's rip out that cruft and stop confusing developers
which do tree-wide SMP cleanups.

If someone wants SMP support UML it has do be done from scratch.

Signed-off-by: Richard Weinberger <richard@nod.at>
---
 arch/um/Kconfig.um                      |  30 ----
 arch/um/include/asm/processor-generic.h |   8 --
 arch/um/include/asm/smp.h               |  26 ----
 arch/um/kernel/Makefile                 |   2 +-
 arch/um/kernel/irq.c                    |   3 -
 arch/um/kernel/process.c                |  11 --
 arch/um/kernel/skas/process.c           |   4 +-
 arch/um/kernel/smp.c                    | 238 --------------------------------
 arch/um/kernel/um_arch.c                |  35 -----
 arch/x86/um/asm/barrier.h               |  11 --
 10 files changed, 2 insertions(+), 366 deletions(-)
 delete mode 100644 arch/um/kernel/smp.c

(limited to 'arch/x86')

diff --git a/arch/um/Kconfig.um b/arch/um/Kconfig.um
index a7520c90f62d..ff86fbedc2fc 100644
--- a/arch/um/Kconfig.um
+++ b/arch/um/Kconfig.um
@@ -95,36 +95,6 @@ config MAGIC_SYSRQ
 	  The keys are documented in <file:Documentation/sysrq.txt>. Don't say Y
 	  unless you really know what this hack does.
 
-config SMP
-	bool "Symmetric multi-processing support"
-	default n
-	depends on BROKEN
-	help
-	  This option enables UML SMP support.
-	  It is NOT related to having a real SMP box. Not directly, at least.
-
-	  UML implements virtual SMP by allowing as many processes to run
-	  simultaneously on the host as there are virtual processors configured.
-
-	  Obviously, if the host is a uniprocessor, those processes will
-	  timeshare, but, inside UML, will appear to be running simultaneously.
-	  If the host is a multiprocessor, then UML processes may run
-	  simultaneously, depending on the host scheduler.
-
-	  This, however, is supported only in TT mode. So, if you use the SKAS
-	  patch on your host, switching to TT mode and enabling SMP usually
-	  gives	you worse performances.
-	  Also, since the support for SMP has been under-developed, there could
-	  be some bugs being exposed by enabling SMP.
-
-	  If you don't know what to do, say N.
-
-config NR_CPUS
-	int "Maximum number of CPUs (2-32)"
-	range 2 32
-	depends on SMP
-	default "32"
-
 config HIGHMEM
 	bool "Highmem support"
 	depends on !64BIT && BROKEN
diff --git a/arch/um/include/asm/processor-generic.h b/arch/um/include/asm/processor-generic.h
index cbc5edd5a901..2d1e0dd5bb0b 100644
--- a/arch/um/include/asm/processor-generic.h
+++ b/arch/um/include/asm/processor-generic.h
@@ -98,16 +98,8 @@ struct cpuinfo_um {
 
 extern struct cpuinfo_um boot_cpu_data;
 
-#define my_cpu_data		cpu_data[smp_processor_id()]
-
-#ifdef CONFIG_SMP
-extern struct cpuinfo_um cpu_data[];
-#define current_cpu_data cpu_data[smp_processor_id()]
-#else
 #define cpu_data (&boot_cpu_data)
 #define current_cpu_data boot_cpu_data
-#endif
-
 
 #define KSTK_REG(tsk, reg) get_thread_reg(reg, &tsk->thread.switch_buf)
 extern unsigned long get_wchan(struct task_struct *p);
diff --git a/arch/um/include/asm/smp.h b/arch/um/include/asm/smp.h
index e4507938d8cf..9c3be355ed01 100644
--- a/arch/um/include/asm/smp.h
+++ b/arch/um/include/asm/smp.h
@@ -1,32 +1,6 @@
 #ifndef __UM_SMP_H
 #define __UM_SMP_H
 
-#ifdef CONFIG_SMP
-
-#include <linux/bitops.h>
-#include <asm/current.h>
-#include <linux/cpumask.h>
-
-#define raw_smp_processor_id() (current_thread->cpu)
-
-#define cpu_logical_map(n) (n)
-#define cpu_number_map(n) (n)
-extern int hard_smp_processor_id(void);
-#define NO_PROC_ID -1
-
-extern int ncpus;
-
-
-static inline void smp_cpus_done(unsigned int maxcpus)
-{
-}
-
-extern struct task_struct *idle_threads[NR_CPUS];
-
-#else
-
 #define hard_smp_processor_id()		0
 
 #endif
-
-#endif
diff --git a/arch/um/kernel/Makefile b/arch/um/kernel/Makefile
index 2d840a070c8b..b7e31e5a8c0c 100644
--- a/arch/um/kernel/Makefile
+++ b/arch/um/kernel/Makefile
@@ -12,7 +12,7 @@ clean-files :=
 
 obj-y = config.o exec.o exitcode.o irq.o ksyms.o mem.o \
 	physmem.o process.o ptrace.o reboot.o sigio.o \
-	signal.o smp.o syscall.o sysrq.o time.o tlb.o trap.o \
+	signal.o syscall.o sysrq.o time.o tlb.o trap.o \
 	um_arch.o umid.o maccess.o skas/
 
 obj-$(CONFIG_BLK_DEV_INITRD) += initrd.o
diff --git a/arch/um/kernel/irq.c b/arch/um/kernel/irq.c
index 1d8505b1e290..23cb9350d47e 100644
--- a/arch/um/kernel/irq.c
+++ b/arch/um/kernel/irq.c
@@ -35,9 +35,6 @@ void sigio_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs)
 	struct irq_fd *irq_fd;
 	int n;
 
-	if (smp_sigio_handler())
-		return;
-
 	while (1) {
 		n = os_waiting_for_events(active_fds);
 		if (n <= 0) {
diff --git a/arch/um/kernel/process.c b/arch/um/kernel/process.c
index f17bca8ed2ce..68b9119841cd 100644
--- a/arch/um/kernel/process.c
+++ b/arch/um/kernel/process.c
@@ -259,17 +259,6 @@ int strlen_user_proc(char __user *str)
 	return strlen_user(str);
 }
 
-int smp_sigio_handler(void)
-{
-#ifdef CONFIG_SMP
-	int cpu = current_thread_info()->cpu;
-	IPI_handler(cpu);
-	if (cpu != 0)
-		return 1;
-#endif
-	return 0;
-}
-
 int cpu(void)
 {
 	return current_thread_info()->cpu;
diff --git a/arch/um/kernel/skas/process.c b/arch/um/kernel/skas/process.c
index 082955d694f3..527fa5881915 100644
--- a/arch/um/kernel/skas/process.c
+++ b/arch/um/kernel/skas/process.c
@@ -21,9 +21,7 @@ static int __init start_kernel_proc(void *unused)
 
 	cpu_tasks[0].pid = pid;
 	cpu_tasks[0].task = current;
-#ifdef CONFIG_SMP
-	init_cpu_online(get_cpu_mask(0));
-#endif
+
 	start_kernel();
 	return 0;
 }
diff --git a/arch/um/kernel/smp.c b/arch/um/kernel/smp.c
deleted file mode 100644
index 5c8c3ea7db7b..000000000000
--- a/arch/um/kernel/smp.c
+++ /dev/null
@@ -1,238 +0,0 @@
-/*
- * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
- * Licensed under the GPL
- */
-
-#include <linux/percpu.h>
-#include <asm/pgalloc.h>
-#include <asm/tlb.h>
-
-#ifdef CONFIG_SMP
-
-#include <linux/sched.h>
-#include <linux/module.h>
-#include <linux/threads.h>
-#include <linux/interrupt.h>
-#include <linux/err.h>
-#include <linux/hardirq.h>
-#include <asm/smp.h>
-#include <asm/processor.h>
-#include <asm/spinlock.h>
-#include <kern.h>
-#include <irq_user.h>
-#include <os.h>
-
-/* Per CPU bogomips and other parameters
- * The only piece used here is the ipi pipe, which is set before SMP is
- * started and never changed.
- */
-struct cpuinfo_um cpu_data[NR_CPUS];
-
-/* A statistic, can be a little off */
-int num_reschedules_sent = 0;
-
-/* Not changed after boot */
-struct task_struct *idle_threads[NR_CPUS];
-
-void smp_send_reschedule(int cpu)
-{
-	os_write_file(cpu_data[cpu].ipi_pipe[1], "R", 1);
-	num_reschedules_sent++;
-}
-
-void smp_send_stop(void)
-{
-	int i;
-
-	printk(KERN_INFO "Stopping all CPUs...");
-	for (i = 0; i < num_online_cpus(); i++) {
-		if (i == current_thread->cpu)
-			continue;
-		os_write_file(cpu_data[i].ipi_pipe[1], "S", 1);
-	}
-	printk(KERN_CONT "done\n");
-}
-
-static cpumask_t smp_commenced_mask = CPU_MASK_NONE;
-static cpumask_t cpu_callin_map = CPU_MASK_NONE;
-
-static int idle_proc(void *cpup)
-{
-	int cpu = (int) cpup, err;
-
-	err = os_pipe(cpu_data[cpu].ipi_pipe, 1, 1);
-	if (err < 0)
-		panic("CPU#%d failed to create IPI pipe, err = %d", cpu, -err);
-
-	os_set_fd_async(cpu_data[cpu].ipi_pipe[0]);
-
-	wmb();
-	if (cpu_test_and_set(cpu, cpu_callin_map)) {
-		printk(KERN_ERR "huh, CPU#%d already present??\n", cpu);
-		BUG();
-	}
-
-	while (!cpu_isset(cpu, smp_commenced_mask))
-		cpu_relax();
-
-	notify_cpu_starting(cpu);
-	set_cpu_online(cpu, true);
-	default_idle();
-	return 0;
-}
-
-static struct task_struct *idle_thread(int cpu)
-{
-	struct task_struct *new_task;
-
-	current->thread.request.u.thread.proc = idle_proc;
-	current->thread.request.u.thread.arg = (void *) cpu;
-	new_task = fork_idle(cpu);
-	if (IS_ERR(new_task))
-		panic("copy_process failed in idle_thread, error = %ld",
-		      PTR_ERR(new_task));
-
-	cpu_tasks[cpu] = ((struct cpu_task)
-		          { .pid = 	new_task->thread.mode.tt.extern_pid,
-			    .task = 	new_task } );
-	idle_threads[cpu] = new_task;
-	panic("skas mode doesn't support SMP");
-	return new_task;
-}
-
-void smp_prepare_cpus(unsigned int maxcpus)
-{
-	struct task_struct *idle;
-	unsigned long waittime;
-	int err, cpu, me = smp_processor_id();
-	int i;
-
-	for (i = 0; i < ncpus; ++i)
-		set_cpu_possible(i, true);
-
-	set_cpu_online(me, true);
-	cpu_set(me, cpu_callin_map);
-
-	err = os_pipe(cpu_data[me].ipi_pipe, 1, 1);
-	if (err < 0)
-		panic("CPU#0 failed to create IPI pipe, errno = %d", -err);
-
-	os_set_fd_async(cpu_data[me].ipi_pipe[0]);
-
-	for (cpu = 1; cpu < ncpus; cpu++) {
-		printk(KERN_INFO "Booting processor %d...\n", cpu);
-
-		idle = idle_thread(cpu);
-
-		init_idle(idle, cpu);
-
-		waittime = 200000000;
-		while (waittime-- && !cpu_isset(cpu, cpu_callin_map))
-			cpu_relax();
-
-		printk(KERN_INFO "%s\n",
-		       cpu_isset(cpu, cpu_calling_map) ? "done" : "failed");
-	}
-}
-
-void smp_prepare_boot_cpu(void)
-{
-	set_cpu_online(smp_processor_id(), true);
-}
-
-int __cpu_up(unsigned int cpu, struct task_struct *tidle)
-{
-	cpu_set(cpu, smp_commenced_mask);
-	while (!cpu_online(cpu))
-		mb();
-	return 0;
-}
-
-int setup_profiling_timer(unsigned int multiplier)
-{
-	printk(KERN_INFO "setup_profiling_timer\n");
-	return 0;
-}
-
-void smp_call_function_slave(int cpu);
-
-void IPI_handler(int cpu)
-{
-	unsigned char c;
-	int fd;
-
-	fd = cpu_data[cpu].ipi_pipe[0];
-	while (os_read_file(fd, &c, 1) == 1) {
-		switch (c) {
-		case 'C':
-			smp_call_function_slave(cpu);
-			break;
-
-		case 'R':
-			scheduler_ipi();
-			break;
-
-		case 'S':
-			printk(KERN_INFO "CPU#%d stopping\n", cpu);
-			while (1)
-				pause();
-			break;
-
-		default:
-			printk(KERN_ERR "CPU#%d received unknown IPI [%c]!\n",
-			       cpu, c);
-			break;
-		}
-	}
-}
-
-int hard_smp_processor_id(void)
-{
-	return pid_to_processor_id(os_getpid());
-}
-
-static DEFINE_SPINLOCK(call_lock);
-static atomic_t scf_started;
-static atomic_t scf_finished;
-static void (*func)(void *info);
-static void *info;
-
-void smp_call_function_slave(int cpu)
-{
-	atomic_inc(&scf_started);
-	(*func)(info);
-	atomic_inc(&scf_finished);
-}
-
-int smp_call_function(void (*_func)(void *info), void *_info, int wait)
-{
-	int cpus = num_online_cpus() - 1;
-	int i;
-
-	if (!cpus)
-		return 0;
-
-	/* Can deadlock when called with interrupts disabled */
-	WARN_ON(irqs_disabled());
-
-	spin_lock_bh(&call_lock);
-	atomic_set(&scf_started, 0);
-	atomic_set(&scf_finished, 0);
-	func = _func;
-	info = _info;
-
-	for_each_online_cpu(i)
-		os_write_file(cpu_data[i].ipi_pipe[1], "C", 1);
-
-	while (atomic_read(&scf_started) != cpus)
-		barrier();
-
-	if (wait)
-		while (atomic_read(&scf_finished) != cpus)
-			barrier();
-
-	spin_unlock_bh(&call_lock);
-	return 0;
-}
-
-#endif
diff --git a/arch/um/kernel/um_arch.c b/arch/um/kernel/um_arch.c
index dbd5bda1f184..cd09285facf4 100644
--- a/arch/um/kernel/um_arch.c
+++ b/arch/um/kernel/um_arch.c
@@ -66,12 +66,6 @@ static int show_cpuinfo(struct seq_file *m, void *v)
 {
 	int index = 0;
 
-#ifdef CONFIG_SMP
-	index = (struct cpuinfo_um *) v - cpu_data;
-	if (!cpu_online(index))
-		return 0;
-#endif
-
 	seq_printf(m, "processor\t: %d\n", index);
 	seq_printf(m, "vendor_id\t: User Mode Linux\n");
 	seq_printf(m, "model name\t: UML\n");
@@ -168,23 +162,6 @@ __uml_setup("debug", no_skas_debug_setup,
 "    this flag is not needed to run gdb on UML in skas mode\n\n"
 );
 
-#ifdef CONFIG_SMP
-static int __init uml_ncpus_setup(char *line, int *add)
-{
-	if (!sscanf(line, "%d", &ncpus)) {
-		printf("Couldn't parse [%s]\n", line);
-		return -1;
-	}
-
-	return 0;
-}
-
-__uml_setup("ncpus=", uml_ncpus_setup,
-"ncpus=<# of desired CPUs>\n"
-"    This tells an SMP kernel how many virtual processors to start.\n\n"
-);
-#endif
-
 static int __init Usage(char *line, int *add)
 {
 	const char **p;
@@ -380,15 +357,3 @@ void __init check_bugs(void)
 void apply_alternatives(struct alt_instr *start, struct alt_instr *end)
 {
 }
-
-#ifdef CONFIG_SMP
-void alternatives_smp_module_add(struct module *mod, char *name,
-				 void *locks, void *locks_end,
-				 void *text,  void *text_end)
-{
-}
-
-void alternatives_smp_module_del(struct module *mod)
-{
-}
-#endif
diff --git a/arch/x86/um/asm/barrier.h b/arch/x86/um/asm/barrier.h
index 2d7d9a1f5b53..aec75a5d8722 100644
--- a/arch/x86/um/asm/barrier.h
+++ b/arch/x86/um/asm/barrier.h
@@ -36,22 +36,11 @@
 #endif /* CONFIG_X86_PPRO_FENCE */
 #define dma_wmb()	barrier()
 
-#ifdef CONFIG_SMP
-
-#define smp_mb()	mb()
-#define smp_rmb()	dma_rmb()
-#define smp_wmb()	barrier()
-#define set_mb(var, value) do { (void)xchg(&var, value); } while (0)
-
-#else /* CONFIG_SMP */
-
 #define smp_mb()	barrier()
 #define smp_rmb()	barrier()
 #define smp_wmb()	barrier()
 #define set_mb(var, value) do { var = value; barrier(); } while (0)
 
-#endif /* CONFIG_SMP */
-
 #define read_barrier_depends()		do { } while (0)
 #define smp_read_barrier_depends()	do { } while (0)
 
-- 
cgit v1.2.3


From a98a6d864d3b84219a6ec6213b00c260fb52f9f4 Mon Sep 17 00:00:00 2001
From: Richard Weinberger
Date: Wed, 18 Mar 2015 21:59:35 +0100
Subject: um: Remove broken highmem support

Highmem was always buggy and experimental on UML(i386).
In times where 64 bit computers are default we can
remove that experimental code.

Signed-off-by: Richard Weinberger <richard@nod.at>
---
 arch/um/Kconfig.um            | 12 --------
 arch/um/include/asm/fixmap.h  |  4 ---
 arch/um/include/asm/pgtable.h |  6 +---
 arch/um/kernel/mem.c          | 66 -------------------------------------------
 arch/um/kernel/um_arch.c      |  5 ----
 arch/x86/um/Makefile          |  1 -
 6 files changed, 1 insertion(+), 93 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/um/Kconfig.um b/arch/um/Kconfig.um
index ff86fbedc2fc..8c46978ff70b 100644
--- a/arch/um/Kconfig.um
+++ b/arch/um/Kconfig.um
@@ -95,18 +95,6 @@ config MAGIC_SYSRQ
 	  The keys are documented in <file:Documentation/sysrq.txt>. Don't say Y
 	  unless you really know what this hack does.
 
-config HIGHMEM
-	bool "Highmem support"
-	depends on !64BIT && BROKEN
-	default n
-	help
-	  This was used to allow UML to run with big amounts of memory.
-	  Currently it is unstable, so if unsure say N.
-
-	  To use big amounts of memory, it is recommended enable static
-	  linking (i.e. CONFIG_STATIC_LINK) - this should allow the
-	  guest to use up to 2.75G of memory.
-
 config KERNEL_STACK_ORDER
 	int "Kernel stack size order"
 	default 1 if 64BIT
diff --git a/arch/um/include/asm/fixmap.h b/arch/um/include/asm/fixmap.h
index 3094ea3c73b0..1761fd75bf13 100644
--- a/arch/um/include/asm/fixmap.h
+++ b/arch/um/include/asm/fixmap.h
@@ -33,10 +33,6 @@
  * fix-mapped?
  */
 enum fixed_addresses {
-#ifdef CONFIG_HIGHMEM
-	FIX_KMAP_BEGIN,	/* reserved pte's for temporary kernel mappings */
-	FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1,
-#endif
 	__end_of_fixed_addresses
 };
 
diff --git a/arch/um/include/asm/pgtable.h b/arch/um/include/asm/pgtable.h
index 2324b624f195..18eb9924dda3 100644
--- a/arch/um/include/asm/pgtable.h
+++ b/arch/um/include/asm/pgtable.h
@@ -47,11 +47,7 @@ extern unsigned long end_iomem;
 #define VMALLOC_OFFSET	(__va_space)
 #define VMALLOC_START ((end_iomem + VMALLOC_OFFSET) & ~(VMALLOC_OFFSET-1))
 #define PKMAP_BASE ((FIXADDR_START - LAST_PKMAP * PAGE_SIZE) & PMD_MASK)
-#ifdef CONFIG_HIGHMEM
-# define VMALLOC_END	(PKMAP_BASE-2*PAGE_SIZE)
-#else
-# define VMALLOC_END	(FIXADDR_START-2*PAGE_SIZE)
-#endif
+#define VMALLOC_END	(FIXADDR_START-2*PAGE_SIZE)
 #define MODULES_VADDR	VMALLOC_START
 #define MODULES_END	VMALLOC_END
 #define MODULES_LEN	(MODULES_VADDR - MODULES_END)
diff --git a/arch/um/kernel/mem.c b/arch/um/kernel/mem.c
index 8636e905426f..b2a2dff50b4e 100644
--- a/arch/um/kernel/mem.c
+++ b/arch/um/kernel/mem.c
@@ -38,19 +38,6 @@ int kmalloc_ok = 0;
 /* Used during early boot */
 static unsigned long brk_end;
 
-#ifdef CONFIG_HIGHMEM
-static void setup_highmem(unsigned long highmem_start,
-			  unsigned long highmem_len)
-{
-	unsigned long highmem_pfn;
-	int i;
-
-	highmem_pfn = __pa(highmem_start) >> PAGE_SHIFT;
-	for (i = 0; i < highmem_len >> PAGE_SHIFT; i++)
-		free_highmem_page(&mem_map[highmem_pfn + i]);
-}
-#endif
-
 void __init mem_init(void)
 {
 	/* clear the zero-page */
@@ -67,9 +54,6 @@ void __init mem_init(void)
 	/* this will put all low memory onto the freelists */
 	free_all_bootmem();
 	max_low_pfn = totalram_pages;
-#ifdef CONFIG_HIGHMEM
-	setup_highmem(end_iomem, highmem);
-#endif
 	max_pfn = totalram_pages;
 	mem_init_print_info(NULL);
 	kmalloc_ok = 1;
@@ -127,49 +111,6 @@ static void __init fixrange_init(unsigned long start, unsigned long end,
 	}
 }
 
-#ifdef CONFIG_HIGHMEM
-pte_t *kmap_pte;
-pgprot_t kmap_prot;
-
-#define kmap_get_fixmap_pte(vaddr)					\
-	pte_offset_kernel(pmd_offset(pud_offset(pgd_offset_k(vaddr), (vaddr)),\
-				     (vaddr)), (vaddr))
-
-static void __init kmap_init(void)
-{
-	unsigned long kmap_vstart;
-
-	/* cache the first kmap pte */
-	kmap_vstart = __fix_to_virt(FIX_KMAP_BEGIN);
-	kmap_pte = kmap_get_fixmap_pte(kmap_vstart);
-
-	kmap_prot = PAGE_KERNEL;
-}
-
-static void __init init_highmem(void)
-{
-	pgd_t *pgd;
-	pud_t *pud;
-	pmd_t *pmd;
-	pte_t *pte;
-	unsigned long vaddr;
-
-	/*
-	 * Permanent kmaps:
-	 */
-	vaddr = PKMAP_BASE;
-	fixrange_init(vaddr, vaddr + PAGE_SIZE*LAST_PKMAP, swapper_pg_dir);
-
-	pgd = swapper_pg_dir + pgd_index(vaddr);
-	pud = pud_offset(pgd, vaddr);
-	pmd = pmd_offset(pud, vaddr);
-	pte = pte_offset_kernel(pmd, vaddr);
-	pkmap_page_table = pte;
-
-	kmap_init();
-}
-#endif /* CONFIG_HIGHMEM */
-
 static void __init fixaddr_user_init( void)
 {
 #ifdef CONFIG_ARCH_REUSE_HOST_VSYSCALL_AREA
@@ -211,9 +152,6 @@ void __init paging_init(void)
 
 	zones_size[ZONE_NORMAL] = (end_iomem >> PAGE_SHIFT) -
 		(uml_physmem >> PAGE_SHIFT);
-#ifdef CONFIG_HIGHMEM
-	zones_size[ZONE_HIGHMEM] = highmem >> PAGE_SHIFT;
-#endif
 	free_area_init(zones_size);
 
 	/*
@@ -224,10 +162,6 @@ void __init paging_init(void)
 	fixrange_init(vaddr, FIXADDR_TOP, swapper_pg_dir);
 
 	fixaddr_user_init();
-
-#ifdef CONFIG_HIGHMEM
-	init_highmem();
-#endif
 }
 
 /*
diff --git a/arch/um/kernel/um_arch.c b/arch/um/kernel/um_arch.c
index cd09285facf4..9b06957e98ce 100644
--- a/arch/um/kernel/um_arch.c
+++ b/arch/um/kernel/um_arch.c
@@ -301,11 +301,6 @@ int __init linux_main(int argc, char **argv)
 	if (physmem_size + iomem_size > max_physmem) {
 		highmem = physmem_size + iomem_size - max_physmem;
 		physmem_size -= highmem;
-#ifndef CONFIG_HIGHMEM
-		highmem = 0;
-		printf("CONFIG_HIGHMEM not enabled - physical memory shrunk "
-		       "to %Lu bytes\n", physmem_size);
-#endif
 	}
 
 	high_physmem = uml_physmem + physmem_size;
diff --git a/arch/x86/um/Makefile b/arch/x86/um/Makefile
index eafa324eb7a5..acb384d24669 100644
--- a/arch/x86/um/Makefile
+++ b/arch/x86/um/Makefile
@@ -21,7 +21,6 @@ obj-$(CONFIG_BINFMT_ELF) += elfcore.o
 
 subarch-y = ../lib/string_32.o ../lib/atomic64_32.o ../lib/atomic64_cx8_32.o
 subarch-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += ../lib/rwsem.o
-subarch-$(CONFIG_HIGHMEM) += ../mm/highmem_32.o
 
 else
 
-- 
cgit v1.2.3


From fc9bea0e28db8cbfe0a08c1bfb1796bfd7adf49b Mon Sep 17 00:00:00 2001
From: Andrey Ryabinin
Date: Tue, 24 Mar 2015 18:31:24 +0300
Subject: x86, UML: fix integer overflow in ELF_ET_DYN_BASE

Almost all arches define ELF_ET_DYN_BASE as 2/3 of TASK_SIZE.
Though it seems that some architectures do this in a wrong way.
The problem is that 2*TASK_SIZE may overflow 32-bits so
the real ELF_ET_DYN_BASE becomes wrong.
Fix this overflow by dividing TASK_SIZE prior to multiplying:
	(TASK_SIZE / 3 * 2)

Signed-off-by: Andrey Ryabinin <a.ryabinin@samsung.com>
Signed-off-by: Richard Weinberger <richard@nod.at>
---
 arch/x86/um/asm/elf.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/um/asm/elf.h b/arch/x86/um/asm/elf.h
index 25a1022dd793..0a656b727b1a 100644
--- a/arch/x86/um/asm/elf.h
+++ b/arch/x86/um/asm/elf.h
@@ -210,7 +210,7 @@ extern int elf_core_copy_fpregs(struct task_struct *t, elf_fpregset_t *fpu);
 
 #define ELF_EXEC_PAGESIZE 4096
 
-#define ELF_ET_DYN_BASE (2 * TASK_SIZE / 3)
+#define ELF_ET_DYN_BASE (TASK_SIZE / 3 * 2)
 
 extern long elf_aux_hwcap;
 #define ELF_HWCAP (elf_aux_hwcap)
-- 
cgit v1.2.3


From c4d30668da689de2f27bb0b19de4430d6c95d7cf Mon Sep 17 00:00:00 2001
From: Len Brown
Date: Fri, 10 Apr 2015 00:22:56 -0400
Subject: x86 msr-index: define MSR_TURBO_RATIO_LIMIT,1,2

MSR_TURBO_RATIO_LIMIT has grown into a set of three registers.
Add the documented names for them, in preparation
for deleting the previous ad-hoc names:

+#define MSR_TURBO_RATIO_LIMIT          0x000001ad
+#define MSR_TURBO_RATIO_LIMIT1         0x000001ae
+#define MSR_TURBO_RATIO_LIMIT2         0x000001af

Signed-off-by: Len Brown <len.brown@intel.com>
Cc: x86@kernel.org
---
 arch/x86/include/uapi/asm/msr-index.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/include/uapi/asm/msr-index.h b/arch/x86/include/uapi/asm/msr-index.h
index 3ce079136c11..c4c75272314a 100644
--- a/arch/x86/include/uapi/asm/msr-index.h
+++ b/arch/x86/include/uapi/asm/msr-index.h
@@ -61,6 +61,9 @@
 #define MSR_OFFCORE_RSP_1		0x000001a7
 #define MSR_NHM_TURBO_RATIO_LIMIT	0x000001ad
 #define MSR_IVT_TURBO_RATIO_LIMIT	0x000001ae
+#define MSR_TURBO_RATIO_LIMIT		0x000001ad
+#define MSR_TURBO_RATIO_LIMIT1		0x000001ae
+#define MSR_TURBO_RATIO_LIMIT2		0x000001af
 
 #define MSR_LBR_SELECT			0x000001c8
 #define MSR_LBR_TOS			0x000001c9
-- 
cgit v1.2.3


From 692297d8f96887f836d9049a653ed05a71cf48fb Mon Sep 17 00:00:00 2001
From: Ulrich Obergfell
Date: Tue, 14 Apr 2015 15:44:19 -0700
Subject: watchdog: introduce the hardlockup_detector_disable() function

Have kvm_guest_init() use hardlockup_detector_disable() instead of
watchdog_enable_hardlockup_detector(false).

Remove the watchdog_hardlockup_detector_is_enabled() and the
watchdog_enable_hardlockup_detector() function which are no longer needed.

Signed-off-by: Ulrich Obergfell <uobergfe@redhat.com>
Signed-off-by: Don Zickus <dzickus@redhat.com>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/kernel/kvm.c |  2 +-
 include/linux/nmi.h   |  9 ++-------
 kernel/watchdog.c     | 21 ++-------------------
 3 files changed, 5 insertions(+), 27 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index e354cc6446ab..9435620062df 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -513,7 +513,7 @@ void __init kvm_guest_init(void)
 	 * can get false positives too easily, for example if the host is
 	 * overcommitted.
 	 */
-	watchdog_enable_hardlockup_detector(false);
+	hardlockup_detector_disable();
 }
 
 static noinline uint32_t __kvm_cpuid_base(void)
diff --git a/include/linux/nmi.h b/include/linux/nmi.h
index 0426357297d5..3d46fb4708e0 100644
--- a/include/linux/nmi.h
+++ b/include/linux/nmi.h
@@ -25,16 +25,11 @@ static inline void touch_nmi_watchdog(void)
 #endif
 
 #if defined(CONFIG_HARDLOCKUP_DETECTOR)
-extern void watchdog_enable_hardlockup_detector(bool val);
-extern bool watchdog_hardlockup_detector_is_enabled(void);
+extern void hardlockup_detector_disable(void);
 #else
-static inline void watchdog_enable_hardlockup_detector(bool val)
+static inline void hardlockup_detector_disable(void)
 {
 }
-static inline bool watchdog_hardlockup_detector_is_enabled(void)
-{
-	return true;
-}
 #endif
 
 /*
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 49d02250aaac..f2be11ab7e08 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -83,8 +83,6 @@ static unsigned long soft_lockup_nmi_warn;
 #ifdef CONFIG_HARDLOCKUP_DETECTOR
 static int hardlockup_panic =
 			CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE;
-
-static bool hardlockup_detector_enabled = true;
 /*
  * We may not want to enable hard lockup detection by default in all cases,
  * for example when running the kernel as a guest on a hypervisor. In these
@@ -93,14 +91,9 @@ static bool hardlockup_detector_enabled = true;
  * kernel command line parameters are parsed, because otherwise it is not
  * possible to override this in hardlockup_panic_setup().
  */
-void watchdog_enable_hardlockup_detector(bool val)
-{
-	hardlockup_detector_enabled = val;
-}
-
-bool watchdog_hardlockup_detector_is_enabled(void)
+void hardlockup_detector_disable(void)
 {
-	return hardlockup_detector_enabled;
+	watchdog_enabled &= ~NMI_WATCHDOG_ENABLED;
 }
 
 static int __init hardlockup_panic_setup(char *str)
@@ -530,15 +523,6 @@ static int watchdog_nmi_enable(unsigned int cpu)
 	if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED))
 		goto out;
 
-	/*
-	 * Some kernels need to default hard lockup detection to
-	 * 'disabled', for example a guest on a hypervisor.
-	 */
-	if (!watchdog_hardlockup_detector_is_enabled()) {
-		event = ERR_PTR(-ENOENT);
-		goto handle_err;
-	}
-
 	/* is it already setup and enabled? */
 	if (event && event->state > PERF_EVENT_STATE_OFF)
 		goto out;
@@ -553,7 +537,6 @@ static int watchdog_nmi_enable(unsigned int cpu)
 	/* Try to register using hardware perf events */
 	event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL);
 
-handle_err:
 	/* save cpu0 error for future comparision */
 	if (cpu == 0 && IS_ERR(event))
 		cpu0_err = PTR_ERR(event);
-- 
cgit v1.2.3


From 982333683385343d8d2db9a1df69c98406f42687 Mon Sep 17 00:00:00 2001
From: Kirill A. Shutemov
Date: Tue, 14 Apr 2015 15:46:14 -0700
Subject: x86: expose number of page table levels on Kconfig level

We would want to use number of page table level to define mm_struct.
Let's expose it as CONFIG_PGTABLE_LEVELS.

Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Tested-by: Guenter Roeck <linux@roeck-us.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/Kconfig                            |  6 ++++++
 arch/x86/include/asm/paravirt.h             |  8 ++++----
 arch/x86/include/asm/paravirt_types.h       |  8 ++++----
 arch/x86/include/asm/pgalloc.h              |  8 ++++----
 arch/x86/include/asm/pgtable-2level_types.h |  1 -
 arch/x86/include/asm/pgtable-3level_types.h |  2 --
 arch/x86/include/asm/pgtable.h              |  8 ++++----
 arch/x86/include/asm/pgtable_64_types.h     |  1 -
 arch/x86/include/asm/pgtable_types.h        |  4 ++--
 arch/x86/kernel/paravirt.c                  |  6 +++---
 arch/x86/mm/pgtable.c                       | 14 +++++++-------
 arch/x86/xen/mmu.c                          | 14 +++++++-------
 include/trace/events/xen.h                  |  2 +-
 13 files changed, 42 insertions(+), 40 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index faff6934c05a..3e3aaad23414 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -277,6 +277,12 @@ config ARCH_SUPPORTS_UPROBES
 config FIX_EARLYCON_MEM
 	def_bool y
 
+config PGTABLE_LEVELS
+	int
+	default 4 if X86_64
+	default 3 if X86_PAE
+	default 2
+
 source "init/Kconfig"
 source "kernel/Kconfig.freezer"
 
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 5f6051d5d139..8957810ad7d1 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -545,7 +545,7 @@ static inline void set_pmd(pmd_t *pmdp, pmd_t pmd)
 		PVOP_VCALL2(pv_mmu_ops.set_pmd, pmdp, val);
 }
 
-#if PAGETABLE_LEVELS >= 3
+#if CONFIG_PGTABLE_LEVELS >= 3
 static inline pmd_t __pmd(pmdval_t val)
 {
 	pmdval_t ret;
@@ -585,7 +585,7 @@ static inline void set_pud(pud_t *pudp, pud_t pud)
 		PVOP_VCALL2(pv_mmu_ops.set_pud, pudp,
 			    val);
 }
-#if PAGETABLE_LEVELS == 4
+#if CONFIG_PGTABLE_LEVELS == 4
 static inline pud_t __pud(pudval_t val)
 {
 	pudval_t ret;
@@ -636,9 +636,9 @@ static inline void pud_clear(pud_t *pudp)
 	set_pud(pudp, __pud(0));
 }
 
-#endif	/* PAGETABLE_LEVELS == 4 */
+#endif	/* CONFIG_PGTABLE_LEVELS == 4 */
 
-#endif	/* PAGETABLE_LEVELS >= 3 */
+#endif	/* CONFIG_PGTABLE_LEVELS >= 3 */
 
 #ifdef CONFIG_X86_PAE
 /* Special-case pte-setting operations for PAE, which can't update a
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index 7549b8b369e4..f7b0b5c112f2 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -294,7 +294,7 @@ struct pv_mmu_ops {
 	struct paravirt_callee_save pgd_val;
 	struct paravirt_callee_save make_pgd;
 
-#if PAGETABLE_LEVELS >= 3
+#if CONFIG_PGTABLE_LEVELS >= 3
 #ifdef CONFIG_X86_PAE
 	void (*set_pte_atomic)(pte_t *ptep, pte_t pteval);
 	void (*pte_clear)(struct mm_struct *mm, unsigned long addr,
@@ -308,13 +308,13 @@ struct pv_mmu_ops {
 	struct paravirt_callee_save pmd_val;
 	struct paravirt_callee_save make_pmd;
 
-#if PAGETABLE_LEVELS == 4
+#if CONFIG_PGTABLE_LEVELS == 4
 	struct paravirt_callee_save pud_val;
 	struct paravirt_callee_save make_pud;
 
 	void (*set_pgd)(pgd_t *pudp, pgd_t pgdval);
-#endif	/* PAGETABLE_LEVELS == 4 */
-#endif	/* PAGETABLE_LEVELS >= 3 */
+#endif	/* CONFIG_PGTABLE_LEVELS == 4 */
+#endif	/* CONFIG_PGTABLE_LEVELS >= 3 */
 
 	struct pv_lazy_ops lazy_mode;
 
diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h
index c4412e972bbd..bf7f8b55b0f9 100644
--- a/arch/x86/include/asm/pgalloc.h
+++ b/arch/x86/include/asm/pgalloc.h
@@ -77,7 +77,7 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd,
 
 #define pmd_pgtable(pmd) pmd_page(pmd)
 
-#if PAGETABLE_LEVELS > 2
+#if CONFIG_PGTABLE_LEVELS > 2
 static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
 {
 	struct page *page;
@@ -116,7 +116,7 @@ static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
 }
 #endif	/* CONFIG_X86_PAE */
 
-#if PAGETABLE_LEVELS > 3
+#if CONFIG_PGTABLE_LEVELS > 3
 static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud)
 {
 	paravirt_alloc_pud(mm, __pa(pud) >> PAGE_SHIFT);
@@ -142,7 +142,7 @@ static inline void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud,
 	___pud_free_tlb(tlb, pud);
 }
 
-#endif	/* PAGETABLE_LEVELS > 3 */
-#endif	/* PAGETABLE_LEVELS > 2 */
+#endif	/* CONFIG_PGTABLE_LEVELS > 3 */
+#endif	/* CONFIG_PGTABLE_LEVELS > 2 */
 
 #endif /* _ASM_X86_PGALLOC_H */
diff --git a/arch/x86/include/asm/pgtable-2level_types.h b/arch/x86/include/asm/pgtable-2level_types.h
index daacc23e3fb9..392576433e77 100644
--- a/arch/x86/include/asm/pgtable-2level_types.h
+++ b/arch/x86/include/asm/pgtable-2level_types.h
@@ -17,7 +17,6 @@ typedef union {
 #endif	/* !__ASSEMBLY__ */
 
 #define SHARED_KERNEL_PMD	0
-#define PAGETABLE_LEVELS	2
 
 /*
  * traditional i386 two-level paging structure:
diff --git a/arch/x86/include/asm/pgtable-3level_types.h b/arch/x86/include/asm/pgtable-3level_types.h
index 1bd5876c8649..bcc89625ebe5 100644
--- a/arch/x86/include/asm/pgtable-3level_types.h
+++ b/arch/x86/include/asm/pgtable-3level_types.h
@@ -24,8 +24,6 @@ typedef union {
 #define SHARED_KERNEL_PMD	1
 #endif
 
-#define PAGETABLE_LEVELS	3
-
 /*
  * PGDIR_SHIFT determines what a top-level page table entry can map
  */
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index a0c35bf6cb92..fe57e7a98839 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -551,7 +551,7 @@ static inline unsigned long pages_to_mb(unsigned long npg)
 	return npg >> (20 - PAGE_SHIFT);
 }
 
-#if PAGETABLE_LEVELS > 2
+#if CONFIG_PGTABLE_LEVELS > 2
 static inline int pud_none(pud_t pud)
 {
 	return native_pud_val(pud) == 0;
@@ -594,9 +594,9 @@ static inline int pud_large(pud_t pud)
 {
 	return 0;
 }
-#endif	/* PAGETABLE_LEVELS > 2 */
+#endif	/* CONFIG_PGTABLE_LEVELS > 2 */
 
-#if PAGETABLE_LEVELS > 3
+#if CONFIG_PGTABLE_LEVELS > 3
 static inline int pgd_present(pgd_t pgd)
 {
 	return pgd_flags(pgd) & _PAGE_PRESENT;
@@ -633,7 +633,7 @@ static inline int pgd_none(pgd_t pgd)
 {
 	return !native_pgd_val(pgd);
 }
-#endif	/* PAGETABLE_LEVELS > 3 */
+#endif	/* CONFIG_PGTABLE_LEVELS > 3 */
 
 #endif	/* __ASSEMBLY__ */
 
diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
index 602b6028c5b6..e6844dfb4471 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -20,7 +20,6 @@ typedef struct { pteval_t pte; } pte_t;
 #endif	/* !__ASSEMBLY__ */
 
 #define SHARED_KERNEL_PMD	0
-#define PAGETABLE_LEVELS	4
 
 /*
  * PGDIR_SHIFT determines what a top-level page table entry can map
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index 8c7c10802e9c..78f0c8cbe316 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -234,7 +234,7 @@ static inline pgdval_t pgd_flags(pgd_t pgd)
 	return native_pgd_val(pgd) & PTE_FLAGS_MASK;
 }
 
-#if PAGETABLE_LEVELS > 3
+#if CONFIG_PGTABLE_LEVELS > 3
 typedef struct { pudval_t pud; } pud_t;
 
 static inline pud_t native_make_pud(pmdval_t val)
@@ -255,7 +255,7 @@ static inline pudval_t native_pud_val(pud_t pud)
 }
 #endif
 
-#if PAGETABLE_LEVELS > 2
+#if CONFIG_PGTABLE_LEVELS > 2
 typedef struct { pmdval_t pmd; } pmd_t;
 
 static inline pmd_t native_make_pmd(pmdval_t val)
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 548d25f00c90..c614dd492f5f 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -443,7 +443,7 @@ struct pv_mmu_ops pv_mmu_ops = {
 	.ptep_modify_prot_start = __ptep_modify_prot_start,
 	.ptep_modify_prot_commit = __ptep_modify_prot_commit,
 
-#if PAGETABLE_LEVELS >= 3
+#if CONFIG_PGTABLE_LEVELS >= 3
 #ifdef CONFIG_X86_PAE
 	.set_pte_atomic = native_set_pte_atomic,
 	.pte_clear = native_pte_clear,
@@ -454,13 +454,13 @@ struct pv_mmu_ops pv_mmu_ops = {
 	.pmd_val = PTE_IDENT,
 	.make_pmd = PTE_IDENT,
 
-#if PAGETABLE_LEVELS == 4
+#if CONFIG_PGTABLE_LEVELS == 4
 	.pud_val = PTE_IDENT,
 	.make_pud = PTE_IDENT,
 
 	.set_pgd = native_set_pgd,
 #endif
-#endif /* PAGETABLE_LEVELS >= 3 */
+#endif /* CONFIG_PGTABLE_LEVELS >= 3 */
 
 	.pte_val = PTE_IDENT,
 	.pgd_val = PTE_IDENT,
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 5a7e5252c878..b28edfecbdfe 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -58,7 +58,7 @@ void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
 	tlb_remove_page(tlb, pte);
 }
 
-#if PAGETABLE_LEVELS > 2
+#if CONFIG_PGTABLE_LEVELS > 2
 void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
 {
 	struct page *page = virt_to_page(pmd);
@@ -74,14 +74,14 @@ void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
 	tlb_remove_page(tlb, page);
 }
 
-#if PAGETABLE_LEVELS > 3
+#if CONFIG_PGTABLE_LEVELS > 3
 void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud)
 {
 	paravirt_release_pud(__pa(pud) >> PAGE_SHIFT);
 	tlb_remove_page(tlb, virt_to_page(pud));
 }
-#endif	/* PAGETABLE_LEVELS > 3 */
-#endif	/* PAGETABLE_LEVELS > 2 */
+#endif	/* CONFIG_PGTABLE_LEVELS > 3 */
+#endif	/* CONFIG_PGTABLE_LEVELS > 2 */
 
 static inline void pgd_list_add(pgd_t *pgd)
 {
@@ -117,9 +117,9 @@ static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd)
 	/* If the pgd points to a shared pagetable level (either the
 	   ptes in non-PAE, or shared PMD in PAE), then just copy the
 	   references from swapper_pg_dir. */
-	if (PAGETABLE_LEVELS == 2 ||
-	    (PAGETABLE_LEVELS == 3 && SHARED_KERNEL_PMD) ||
-	    PAGETABLE_LEVELS == 4) {
+	if (CONFIG_PGTABLE_LEVELS == 2 ||
+	    (CONFIG_PGTABLE_LEVELS == 3 && SHARED_KERNEL_PMD) ||
+	    CONFIG_PGTABLE_LEVELS == 4) {
 		clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY,
 				swapper_pg_dir + KERNEL_PGD_BOUNDARY,
 				KERNEL_PGD_PTRS);
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index adca9e2b6553..65083ad63b6f 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -502,7 +502,7 @@ __visible pmd_t xen_make_pmd(pmdval_t pmd)
 }
 PV_CALLEE_SAVE_REGS_THUNK(xen_make_pmd);
 
-#if PAGETABLE_LEVELS == 4
+#if CONFIG_PGTABLE_LEVELS == 4
 __visible pudval_t xen_pud_val(pud_t pud)
 {
 	return pte_mfn_to_pfn(pud.pud);
@@ -589,7 +589,7 @@ static void xen_set_pgd(pgd_t *ptr, pgd_t val)
 
 	xen_mc_issue(PARAVIRT_LAZY_MMU);
 }
-#endif	/* PAGETABLE_LEVELS == 4 */
+#endif	/* CONFIG_PGTABLE_LEVELS == 4 */
 
 /*
  * (Yet another) pagetable walker.  This one is intended for pinning a
@@ -1628,7 +1628,7 @@ static void xen_release_pmd(unsigned long pfn)
 	xen_release_ptpage(pfn, PT_PMD);
 }
 
-#if PAGETABLE_LEVELS == 4
+#if CONFIG_PGTABLE_LEVELS == 4
 static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn)
 {
 	xen_alloc_ptpage(mm, pfn, PT_PUD);
@@ -2046,7 +2046,7 @@ static void __init xen_post_allocator_init(void)
 	pv_mmu_ops.set_pte = xen_set_pte;
 	pv_mmu_ops.set_pmd = xen_set_pmd;
 	pv_mmu_ops.set_pud = xen_set_pud;
-#if PAGETABLE_LEVELS == 4
+#if CONFIG_PGTABLE_LEVELS == 4
 	pv_mmu_ops.set_pgd = xen_set_pgd;
 #endif
 
@@ -2056,7 +2056,7 @@ static void __init xen_post_allocator_init(void)
 	pv_mmu_ops.alloc_pmd = xen_alloc_pmd;
 	pv_mmu_ops.release_pte = xen_release_pte;
 	pv_mmu_ops.release_pmd = xen_release_pmd;
-#if PAGETABLE_LEVELS == 4
+#if CONFIG_PGTABLE_LEVELS == 4
 	pv_mmu_ops.alloc_pud = xen_alloc_pud;
 	pv_mmu_ops.release_pud = xen_release_pud;
 #endif
@@ -2122,14 +2122,14 @@ static const struct pv_mmu_ops xen_mmu_ops __initconst = {
 	.make_pmd = PV_CALLEE_SAVE(xen_make_pmd),
 	.pmd_val = PV_CALLEE_SAVE(xen_pmd_val),
 
-#if PAGETABLE_LEVELS == 4
+#if CONFIG_PGTABLE_LEVELS == 4
 	.pud_val = PV_CALLEE_SAVE(xen_pud_val),
 	.make_pud = PV_CALLEE_SAVE(xen_make_pud),
 	.set_pgd = xen_set_pgd_hyper,
 
 	.alloc_pud = xen_alloc_pmd_init,
 	.release_pud = xen_release_pmd_init,
-#endif	/* PAGETABLE_LEVELS == 4 */
+#endif	/* CONFIG_PGTABLE_LEVELS == 4 */
 
 	.activate_mm = xen_activate_mm,
 	.dup_mmap = xen_dup_mmap,
diff --git a/include/trace/events/xen.h b/include/trace/events/xen.h
index d06b6da5c1e3..bce990f5a35d 100644
--- a/include/trace/events/xen.h
+++ b/include/trace/events/xen.h
@@ -224,7 +224,7 @@ TRACE_EVENT(xen_mmu_pmd_clear,
 	    TP_printk("pmdp %p", __entry->pmdp)
 	);
 
-#if PAGETABLE_LEVELS >= 4
+#if CONFIG_PGTABLE_LEVELS >= 4
 
 TRACE_EVENT(xen_mmu_set_pud,
 	    TP_PROTO(pud_t *pudp, pud_t pudval),
-- 
cgit v1.2.3


From 5d72b4fba40ef4b3f7a1a11d6aacc85d9af81561 Mon Sep 17 00:00:00 2001
From: Toshi Kani
Date: Tue, 14 Apr 2015 15:47:29 -0700
Subject: x86, mm: support huge I/O mapping capability I/F

Implement huge I/O mapping capability interfaces for ioremap() on x86.

IOREMAP_MAX_ORDER is defined to PUD_SHIFT on x86/64 and PMD_SHIFT on
x86/32, which overrides the default value defined in <linux/vmalloc.h>.

Signed-off-by: Toshi Kani <toshi.kani@hp.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Robert Elliott <Elliott@hp.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/include/asm/page_types.h |  2 ++
 arch/x86/mm/ioremap.c             | 23 +++++++++++++++++++++--
 2 files changed, 23 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h
index f97fbe3abb67..c7c712f2648b 100644
--- a/arch/x86/include/asm/page_types.h
+++ b/arch/x86/include/asm/page_types.h
@@ -40,8 +40,10 @@
 
 #ifdef CONFIG_X86_64
 #include <asm/page_64_types.h>
+#define IOREMAP_MAX_ORDER       (PUD_SHIFT)
 #else
 #include <asm/page_32_types.h>
+#define IOREMAP_MAX_ORDER       (PMD_SHIFT)
 #endif	/* CONFIG_X86_64 */
 
 #ifndef __ASSEMBLY__
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index fdf617c00e2f..5ead4d6cf3a7 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -67,8 +67,13 @@ static int __ioremap_check_ram(unsigned long start_pfn, unsigned long nr_pages,
 
 /*
  * Remap an arbitrary physical address space into the kernel virtual
- * address space. Needed when the kernel wants to access high addresses
- * directly.
+ * address space. It transparently creates kernel huge I/O mapping when
+ * the physical address is aligned by a huge page size (1GB or 2MB) and
+ * the requested size is at least the huge page size.
+ *
+ * NOTE: MTRRs can override PAT memory types with a 4KB granularity.
+ * Therefore, the mapping code falls back to use a smaller page toward 4KB
+ * when a mapping range is covered by non-WB type of MTRRs.
  *
  * NOTE! We need to allow non-page-aligned mappings too: we will obviously
  * have to convert them into an offset in a page-aligned mapping, but the
@@ -326,6 +331,20 @@ void iounmap(volatile void __iomem *addr)
 }
 EXPORT_SYMBOL(iounmap);
 
+int arch_ioremap_pud_supported(void)
+{
+#ifdef CONFIG_X86_64
+	return cpu_has_gbpages;
+#else
+	return 0;
+#endif
+}
+
+int arch_ioremap_pmd_supported(void)
+{
+	return cpu_has_pse;
+}
+
 /*
  * Convert a physical pointer to a virtual kernel pointer for /dev/mem
  * access
-- 
cgit v1.2.3


From 6b6378355b925050eb6fa966742d8c2d65ff0d83 Mon Sep 17 00:00:00 2001
From: Toshi Kani
Date: Tue, 14 Apr 2015 15:47:32 -0700
Subject: x86, mm: support huge KVA mappings on x86

Implement huge KVA mapping interfaces on x86.

On x86, MTRRs can override PAT memory types with a 4KB granularity.  When
using a huge page, MTRRs can override the memory type of the huge page,
which may lead a performance penalty.  The processor can also behave in an
undefined manner if a huge page is mapped to a memory range that MTRRs
have mapped with multiple different memory types.  Therefore, the mapping
code falls back to use a smaller page size toward 4KB when a mapping range
is covered by non-WB type of MTRRs.  The WB type of MTRRs has no affect on
the PAT memory types.

pud_set_huge() and pmd_set_huge() call mtrr_type_lookup() to see if a
given range is covered by MTRRs.  MTRR_TYPE_WRBACK indicates that the
range is either covered by WB or not covered and the MTRR default value is
set to WB.  0xFF indicates that MTRRs are disabled.

HAVE_ARCH_HUGE_VMAP is selected when X86_64 or X86_32 with X86_PAE is set.
 X86_32 without X86_PAE is not supported since such config can unlikey be
benefited from this feature, and there was an issue found in testing.

[fengguang.wu@intel.com: ioremap_pud_capable can be static]
Signed-off-by: Toshi Kani <toshi.kani@hp.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Robert Elliott <Elliott@hp.com>
Signed-off-by: Fengguang Wu <fengguang.wu@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/Kconfig      |  1 +
 arch/x86/mm/pgtable.c | 65 +++++++++++++++++++++++++++++++++++++++++++++++++++
 lib/ioremap.c         |  6 ++---
 3 files changed, 69 insertions(+), 3 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 3e3aaad23414..0f948cefaeb1 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -99,6 +99,7 @@ config X86
 	select IRQ_FORCED_THREADING
 	select HAVE_BPF_JIT if X86_64
 	select HAVE_ARCH_TRANSPARENT_HUGEPAGE
+	select HAVE_ARCH_HUGE_VMAP if X86_64 || (X86_32 && X86_PAE)
 	select ARCH_HAS_SG_CHAIN
 	select CLKEVT_I8253
 	select ARCH_HAVE_NMI_SAFE_CMPXCHG
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index b28edfecbdfe..0b97d2c75df3 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -4,6 +4,7 @@
 #include <asm/pgtable.h>
 #include <asm/tlb.h>
 #include <asm/fixmap.h>
+#include <asm/mtrr.h>
 
 #define PGALLOC_GFP GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO
 
@@ -560,3 +561,67 @@ void native_set_fixmap(enum fixed_addresses idx, phys_addr_t phys,
 {
 	__native_set_fixmap(idx, pfn_pte(phys >> PAGE_SHIFT, flags));
 }
+
+#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
+int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot)
+{
+	u8 mtrr;
+
+	/*
+	 * Do not use a huge page when the range is covered by non-WB type
+	 * of MTRRs.
+	 */
+	mtrr = mtrr_type_lookup(addr, addr + PUD_SIZE);
+	if ((mtrr != MTRR_TYPE_WRBACK) && (mtrr != 0xFF))
+		return 0;
+
+	prot = pgprot_4k_2_large(prot);
+
+	set_pte((pte_t *)pud, pfn_pte(
+		(u64)addr >> PAGE_SHIFT,
+		__pgprot(pgprot_val(prot) | _PAGE_PSE)));
+
+	return 1;
+}
+
+int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot)
+{
+	u8 mtrr;
+
+	/*
+	 * Do not use a huge page when the range is covered by non-WB type
+	 * of MTRRs.
+	 */
+	mtrr = mtrr_type_lookup(addr, addr + PMD_SIZE);
+	if ((mtrr != MTRR_TYPE_WRBACK) && (mtrr != 0xFF))
+		return 0;
+
+	prot = pgprot_4k_2_large(prot);
+
+	set_pte((pte_t *)pmd, pfn_pte(
+		(u64)addr >> PAGE_SHIFT,
+		__pgprot(pgprot_val(prot) | _PAGE_PSE)));
+
+	return 1;
+}
+
+int pud_clear_huge(pud_t *pud)
+{
+	if (pud_large(*pud)) {
+		pud_clear(pud);
+		return 1;
+	}
+
+	return 0;
+}
+
+int pmd_clear_huge(pmd_t *pmd)
+{
+	if (pmd_large(*pmd)) {
+		pmd_clear(pmd);
+		return 1;
+	}
+
+	return 0;
+}
+#endif	/* CONFIG_HAVE_ARCH_HUGE_VMAP */
diff --git a/lib/ioremap.c b/lib/ioremap.c
index be249062ba6e..86c8911b0e3a 100644
--- a/lib/ioremap.c
+++ b/lib/ioremap.c
@@ -14,9 +14,9 @@
 #include <asm/pgtable.h>
 
 #ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
-int __read_mostly ioremap_pud_capable;
-int __read_mostly ioremap_pmd_capable;
-int __read_mostly ioremap_huge_disabled;
+static int __read_mostly ioremap_pud_capable;
+static int __read_mostly ioremap_pmd_capable;
+static int __read_mostly ioremap_huge_disabled;
 
 static int __init set_nohugeiomap(char *str)
 {
-- 
cgit v1.2.3


From 82168140bc4cec7ec9bad39705518541149ff8b7 Mon Sep 17 00:00:00 2001
From: Kees Cook
Date: Tue, 14 Apr 2015 15:47:45 -0700
Subject: x86: standardize mmap_rnd() usage

In preparation for splitting out ET_DYN ASLR, this refactors the use of
mmap_rnd() to be used similarly to arm, and extracts the checking of
PF_RANDOMIZE.

Signed-off-by: Kees Cook <keescook@chromium.org>
Reviewed-by: Ingo Molnar <mingo@kernel.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/mm/mmap.c | 36 ++++++++++++++++++++----------------
 1 file changed, 20 insertions(+), 16 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c
index df4552bd239e..ebfa52030d5c 100644
--- a/arch/x86/mm/mmap.c
+++ b/arch/x86/mm/mmap.c
@@ -67,22 +67,21 @@ static int mmap_is_legacy(void)
 
 static unsigned long mmap_rnd(void)
 {
-	unsigned long rnd = 0;
+	unsigned long rnd;
 
 	/*
-	*  8 bits of randomness in 32bit mmaps, 20 address space bits
-	* 28 bits of randomness in 64bit mmaps, 40 address space bits
-	*/
-	if (current->flags & PF_RANDOMIZE) {
-		if (mmap_is_ia32())
-			rnd = get_random_int() % (1<<8);
-		else
-			rnd = get_random_int() % (1<<28);
-	}
+	 *  8 bits of randomness in 32bit mmaps, 20 address space bits
+	 * 28 bits of randomness in 64bit mmaps, 40 address space bits
+	 */
+	if (mmap_is_ia32())
+		rnd = (unsigned long)get_random_int() % (1<<8);
+	else
+		rnd = (unsigned long)get_random_int() % (1<<28);
+
 	return rnd << PAGE_SHIFT;
 }
 
-static unsigned long mmap_base(void)
+static unsigned long mmap_base(unsigned long rnd)
 {
 	unsigned long gap = rlimit(RLIMIT_STACK);
 
@@ -91,19 +90,19 @@ static unsigned long mmap_base(void)
 	else if (gap > MAX_GAP)
 		gap = MAX_GAP;
 
-	return PAGE_ALIGN(TASK_SIZE - gap - mmap_rnd());
+	return PAGE_ALIGN(TASK_SIZE - gap - rnd);
 }
 
 /*
  * Bottom-up (legacy) layout on X86_32 did not support randomization, X86_64
  * does, but not when emulating X86_32
  */
-static unsigned long mmap_legacy_base(void)
+static unsigned long mmap_legacy_base(unsigned long rnd)
 {
 	if (mmap_is_ia32())
 		return TASK_UNMAPPED_BASE;
 	else
-		return TASK_UNMAPPED_BASE + mmap_rnd();
+		return TASK_UNMAPPED_BASE + rnd;
 }
 
 /*
@@ -112,13 +111,18 @@ static unsigned long mmap_legacy_base(void)
  */
 void arch_pick_mmap_layout(struct mm_struct *mm)
 {
-	mm->mmap_legacy_base = mmap_legacy_base();
-	mm->mmap_base = mmap_base();
+	unsigned long random_factor = 0UL;
+
+	if (current->flags & PF_RANDOMIZE)
+		random_factor = mmap_rnd();
+
+	mm->mmap_legacy_base = mmap_legacy_base(random_factor);
 
 	if (mmap_is_legacy()) {
 		mm->mmap_base = mm->mmap_legacy_base;
 		mm->get_unmapped_area = arch_get_unmapped_area;
 	} else {
+		mm->mmap_base = mmap_base(random_factor);
 		mm->get_unmapped_area = arch_get_unmapped_area_topdown;
 	}
 }
-- 
cgit v1.2.3


From 2b68f6caeac271620cd2f9362aeaed360e317df0 Mon Sep 17 00:00:00 2001
From: Kees Cook
Date: Tue, 14 Apr 2015 15:48:00 -0700
Subject: mm: expose arch_mmap_rnd when available

When an architecture fully supports randomizing the ELF load location,
a per-arch mmap_rnd() function is used to find a randomized mmap base.
In preparation for randomizing the location of ET_DYN binaries
separately from mmap, this renames and exports these functions as
arch_mmap_rnd(). Additionally introduces CONFIG_ARCH_HAS_ELF_RANDOMIZE
for describing this feature on architectures that support it
(which is a superset of ARCH_BINFMT_ELF_RANDOMIZE_PIE, since s390
already supports a separated ET_DYN ASLR from mmap ASLR without the
ARCH_BINFMT_ELF_RANDOMIZE_PIE logic).

Signed-off-by: Kees Cook <keescook@chromium.org>
Cc: Hector Marco-Gisbert <hecmargi@upv.es>
Cc: Russell King <linux@arm.linux.org.uk>
Reviewed-by: Ingo Molnar <mingo@kernel.org>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: "David A. Long" <dave.long@linaro.org>
Cc: Andrey Ryabinin <a.ryabinin@samsung.com>
Cc: Arun Chandran <achandran@mvista.com>
Cc: Yann Droneaud <ydroneaud@opteya.com>
Cc: Min-Hua Chen <orca.chen@gmail.com>
Cc: Paul Burton <paul.burton@imgtec.com>
Cc: Alex Smith <alex@alex-smith.me.uk>
Cc: Markos Chandras <markos.chandras@imgtec.com>
Cc: Vineeth Vijayan <vvijayan@mvista.com>
Cc: Jeff Bailey <jeffbailey@google.com>
Cc: Michael Holzheu <holzheu@linux.vnet.ibm.com>
Cc: Ben Hutchings <ben@decadent.org.uk>
Cc: Behan Webster <behanw@converseincode.com>
Cc: Ismael Ripoll <iripoll@upv.es>
Cc: Jan-Simon Mller <dl9pf@gmx.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/Kconfig                  |  7 +++++++
 arch/arm/Kconfig              |  1 +
 arch/arm/mm/mmap.c            |  4 ++--
 arch/arm64/Kconfig            |  1 +
 arch/arm64/mm/mmap.c          |  4 ++--
 arch/mips/Kconfig             |  1 +
 arch/mips/mm/mmap.c           |  4 ++--
 arch/powerpc/Kconfig          |  1 +
 arch/powerpc/mm/mmap.c        |  4 ++--
 arch/s390/Kconfig             |  1 +
 arch/s390/mm/mmap.c           |  8 ++++----
 arch/x86/Kconfig              |  1 +
 arch/x86/mm/mmap.c            |  4 ++--
 include/linux/elf-randomize.h | 10 ++++++++++
 14 files changed, 37 insertions(+), 14 deletions(-)
 create mode 100644 include/linux/elf-randomize.h

(limited to 'arch/x86')

diff --git a/arch/Kconfig b/arch/Kconfig
index c88c23f0a1da..474904a8e540 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -491,6 +491,13 @@ config PGTABLE_LEVELS
 	int
 	default 2
 
+config ARCH_HAS_ELF_RANDOMIZE
+	bool
+	help
+	  An architecture supports choosing randomized locations for
+	  stack, mmap, brk, and ET_DYN. Defined functions:
+	  - arch_mmap_rnd()
+
 #
 # ABI hall of shame
 #
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 696cf3c61e0f..f85200a63a8b 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -3,6 +3,7 @@ config ARM
 	default y
 	select ARCH_BINFMT_ELF_RANDOMIZE_PIE
 	select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE
+	select ARCH_HAS_ELF_RANDOMIZE
 	select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST
 	select ARCH_HAVE_CUSTOM_GPIO_H
 	select ARCH_HAS_GCOV_PROFILE_ALL
diff --git a/arch/arm/mm/mmap.c b/arch/arm/mm/mmap.c
index 15a8160096b3..407dc786583a 100644
--- a/arch/arm/mm/mmap.c
+++ b/arch/arm/mm/mmap.c
@@ -169,7 +169,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
 	return addr;
 }
 
-static unsigned long mmap_rnd(void)
+unsigned long arch_mmap_rnd(void)
 {
 	unsigned long rnd;
 
@@ -184,7 +184,7 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
 	unsigned long random_factor = 0UL;
 
 	if (current->flags & PF_RANDOMIZE)
-		random_factor = mmap_rnd();
+		random_factor = arch_mmap_rnd();
 
 	if (mmap_is_legacy()) {
 		mm->mmap_base = TASK_UNMAPPED_BASE + random_factor;
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 3f2fba996bc2..7c1dbeb73e8d 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -2,6 +2,7 @@ config ARM64
 	def_bool y
 	select ARCH_BINFMT_ELF_RANDOMIZE_PIE
 	select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE
+	select ARCH_HAS_ELF_RANDOMIZE
 	select ARCH_HAS_GCOV_PROFILE_ALL
 	select ARCH_HAS_SG_CHAIN
 	select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST
diff --git a/arch/arm64/mm/mmap.c b/arch/arm64/mm/mmap.c
index ba776c01b552..ed177475dd8c 100644
--- a/arch/arm64/mm/mmap.c
+++ b/arch/arm64/mm/mmap.c
@@ -47,7 +47,7 @@ static int mmap_is_legacy(void)
 	return sysctl_legacy_va_layout;
 }
 
-static unsigned long mmap_rnd(void)
+unsigned long arch_mmap_rnd(void)
 {
 	unsigned long rnd;
 
@@ -77,7 +77,7 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
 	unsigned long random_factor = 0UL;
 
 	if (current->flags & PF_RANDOMIZE)
-		random_factor = mmap_rnd();
+		random_factor = arch_mmap_rnd();
 
 	/*
 	 * Fall back to the standard layout if the personality bit is set, or
diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
index a9d112d2a135..688ce274f59d 100644
--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
@@ -24,6 +24,7 @@ config MIPS
 	select HAVE_DEBUG_KMEMLEAK
 	select HAVE_SYSCALL_TRACEPOINTS
 	select ARCH_BINFMT_ELF_RANDOMIZE_PIE
+	select ARCH_HAS_ELF_RANDOMIZE
 	select HAVE_ARCH_TRANSPARENT_HUGEPAGE if CPU_SUPPORTS_HUGEPAGES && 64BIT
 	select RTC_LIB if !MACH_LOONGSON
 	select GENERIC_ATOMIC64 if !64BIT
diff --git a/arch/mips/mm/mmap.c b/arch/mips/mm/mmap.c
index 9a4f1f5c1f0e..5c81fdd032c3 100644
--- a/arch/mips/mm/mmap.c
+++ b/arch/mips/mm/mmap.c
@@ -142,7 +142,7 @@ unsigned long arch_get_unmapped_area_topdown(struct file *filp,
 			addr0, len, pgoff, flags, DOWN);
 }
 
-static unsigned long mmap_rnd(void)
+unsigned long arch_mmap_rnd(void)
 {
 	unsigned long rnd;
 
@@ -161,7 +161,7 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
 	unsigned long random_factor = 0UL;
 
 	if (current->flags & PF_RANDOMIZE)
-		random_factor = mmap_rnd();
+		random_factor = arch_mmap_rnd();
 
 	if (mmap_is_legacy()) {
 		mm->mmap_base = TASK_UNMAPPED_BASE + random_factor;
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 91ad76f30d18..fc5fffbb331b 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -89,6 +89,7 @@ config PPC
 	select ARCH_MIGHT_HAVE_PC_SERIO
 	select BINFMT_ELF
 	select ARCH_BINFMT_ELF_RANDOMIZE_PIE
+	select ARCH_HAS_ELF_RANDOMIZE
 	select OF
 	select OF_EARLY_FLATTREE
 	select OF_RESERVED_MEM
diff --git a/arch/powerpc/mm/mmap.c b/arch/powerpc/mm/mmap.c
index 1ad2299d795d..0f0502e12f6c 100644
--- a/arch/powerpc/mm/mmap.c
+++ b/arch/powerpc/mm/mmap.c
@@ -53,7 +53,7 @@ static inline int mmap_is_legacy(void)
 	return sysctl_legacy_va_layout;
 }
 
-static unsigned long mmap_rnd(void)
+unsigned long arch_mmap_rnd(void)
 {
 	unsigned long rnd;
 
@@ -87,7 +87,7 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
 	unsigned long random_factor = 0UL;
 
 	if (current->flags & PF_RANDOMIZE)
-		random_factor = mmap_rnd();
+		random_factor = arch_mmap_rnd();
 
 	/*
 	 * Fall back to the standard layout if the personality
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index f6aebcb7a0f8..ac2b75d74cd2 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -65,6 +65,7 @@ config S390
 	def_bool y
 	select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE
 	select ARCH_HAS_DEBUG_STRICT_USER_COPY_CHECKS
+	select ARCH_HAS_ELF_RANDOMIZE
 	select ARCH_HAS_GCOV_PROFILE_ALL
 	select ARCH_HAS_SG_CHAIN
 	select ARCH_HAVE_NMI_SAFE_CMPXCHG
diff --git a/arch/s390/mm/mmap.c b/arch/s390/mm/mmap.c
index db57078075c5..a94504d99c47 100644
--- a/arch/s390/mm/mmap.c
+++ b/arch/s390/mm/mmap.c
@@ -60,7 +60,7 @@ static inline int mmap_is_legacy(void)
 	return sysctl_legacy_va_layout;
 }
 
-static unsigned long mmap_rnd(void)
+unsigned long arch_mmap_rnd(void)
 {
 	if (is_32bit_task())
 		return (get_random_int() & 0x7ff) << PAGE_SHIFT;
@@ -187,7 +187,7 @@ unsigned long randomize_et_dyn(void)
 		base &= ~((1UL << 32) - 1);
 
 	if (current->flags & PF_RANDOMIZE)
-		base += mmap_rnd();
+		base += arch_mmap_rnd();
 
 	return base;
 }
@@ -203,7 +203,7 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
 	unsigned long random_factor = 0UL;
 
 	if (current->flags & PF_RANDOMIZE)
-		random_factor = mmap_rnd();
+		random_factor = arch_mmap_rnd();
 
 	/*
 	 * Fall back to the standard layout if the personality
@@ -283,7 +283,7 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
 	unsigned long random_factor = 0UL;
 
 	if (current->flags & PF_RANDOMIZE)
-		random_factor = mmap_rnd();
+		random_factor = arch_mmap_rnd();
 
 	/*
 	 * Fall back to the standard layout if the personality
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 0f948cefaeb1..782ddbbc1c9a 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -88,6 +88,7 @@ config X86
 	select HAVE_ARCH_KASAN if X86_64 && SPARSEMEM_VMEMMAP
 	select HAVE_USER_RETURN_NOTIFIER
 	select ARCH_BINFMT_ELF_RANDOMIZE_PIE
+	select ARCH_HAS_ELF_RANDOMIZE
 	select HAVE_ARCH_JUMP_LABEL
 	select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE
 	select SPARSE_IRQ
diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c
index ebfa52030d5c..9d518d693b4b 100644
--- a/arch/x86/mm/mmap.c
+++ b/arch/x86/mm/mmap.c
@@ -65,7 +65,7 @@ static int mmap_is_legacy(void)
 	return sysctl_legacy_va_layout;
 }
 
-static unsigned long mmap_rnd(void)
+unsigned long arch_mmap_rnd(void)
 {
 	unsigned long rnd;
 
@@ -114,7 +114,7 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
 	unsigned long random_factor = 0UL;
 
 	if (current->flags & PF_RANDOMIZE)
-		random_factor = mmap_rnd();
+		random_factor = arch_mmap_rnd();
 
 	mm->mmap_legacy_base = mmap_legacy_base(random_factor);
 
diff --git a/include/linux/elf-randomize.h b/include/linux/elf-randomize.h
new file mode 100644
index 000000000000..7a4eda02d2b1
--- /dev/null
+++ b/include/linux/elf-randomize.h
@@ -0,0 +1,10 @@
+#ifndef _ELF_RANDOMIZE_H
+#define _ELF_RANDOMIZE_H
+
+#ifndef CONFIG_ARCH_HAS_ELF_RANDOMIZE
+static inline unsigned long arch_mmap_rnd(void) { return 0; }
+#else
+extern unsigned long arch_mmap_rnd(void);
+#endif
+
+#endif
-- 
cgit v1.2.3


From d1fd836dcf00d2028c700c7e44d2c23404062c90 Mon Sep 17 00:00:00 2001
From: Kees Cook
Date: Tue, 14 Apr 2015 15:48:07 -0700
Subject: mm: split ET_DYN ASLR from mmap ASLR

This fixes the "offset2lib" weakness in ASLR for arm, arm64, mips,
powerpc, and x86.  The problem is that if there is a leak of ASLR from
the executable (ET_DYN), it means a leak of shared library offset as
well (mmap), and vice versa.  Further details and a PoC of this attack
is available here:

  http://cybersecurity.upv.es/attacks/offset2lib/offset2lib.html

With this patch, a PIE linked executable (ET_DYN) has its own ASLR
region:

  $ ./show_mmaps_pie
  54859ccd6000-54859ccd7000 r-xp  ...  /tmp/show_mmaps_pie
  54859ced6000-54859ced7000 r--p  ...  /tmp/show_mmaps_pie
  54859ced7000-54859ced8000 rw-p  ...  /tmp/show_mmaps_pie
  7f75be764000-7f75be91f000 r-xp  ...  /lib/x86_64-linux-gnu/libc.so.6
  7f75be91f000-7f75beb1f000 ---p  ...  /lib/x86_64-linux-gnu/libc.so.6
  7f75beb1f000-7f75beb23000 r--p  ...  /lib/x86_64-linux-gnu/libc.so.6
  7f75beb23000-7f75beb25000 rw-p  ...  /lib/x86_64-linux-gnu/libc.so.6
  7f75beb25000-7f75beb2a000 rw-p  ...
  7f75beb2a000-7f75beb4d000 r-xp  ...  /lib64/ld-linux-x86-64.so.2
  7f75bed45000-7f75bed46000 rw-p  ...
  7f75bed46000-7f75bed47000 r-xp  ...
  7f75bed47000-7f75bed4c000 rw-p  ...
  7f75bed4c000-7f75bed4d000 r--p  ...  /lib64/ld-linux-x86-64.so.2
  7f75bed4d000-7f75bed4e000 rw-p  ...  /lib64/ld-linux-x86-64.so.2
  7f75bed4e000-7f75bed4f000 rw-p  ...
  7fffb3741000-7fffb3762000 rw-p  ...  [stack]
  7fffb377b000-7fffb377d000 r--p  ...  [vvar]
  7fffb377d000-7fffb377f000 r-xp  ...  [vdso]

The change is to add a call the newly created arch_mmap_rnd() into the
ELF loader for handling ET_DYN ASLR in a separate region from mmap ASLR,
as was already done on s390.  Removes CONFIG_BINFMT_ELF_RANDOMIZE_PIE,
which is no longer needed.

Signed-off-by: Kees Cook <keescook@chromium.org>
Reported-by: Hector Marco-Gisbert <hecmargi@upv.es>
Cc: Russell King <linux@arm.linux.org.uk>
Reviewed-by: Ingo Molnar <mingo@kernel.org>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: "David A. Long" <dave.long@linaro.org>
Cc: Andrey Ryabinin <a.ryabinin@samsung.com>
Cc: Arun Chandran <achandran@mvista.com>
Cc: Yann Droneaud <ydroneaud@opteya.com>
Cc: Min-Hua Chen <orca.chen@gmail.com>
Cc: Paul Burton <paul.burton@imgtec.com>
Cc: Alex Smith <alex@alex-smith.me.uk>
Cc: Markos Chandras <markos.chandras@imgtec.com>
Cc: Vineeth Vijayan <vvijayan@mvista.com>
Cc: Jeff Bailey <jeffbailey@google.com>
Cc: Michael Holzheu <holzheu@linux.vnet.ibm.com>
Cc: Ben Hutchings <ben@decadent.org.uk>
Cc: Behan Webster <behanw@converseincode.com>
Cc: Ismael Ripoll <iripoll@upv.es>
Cc: Jan-Simon Mller <dl9pf@gmx.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/arm/Kconfig            |  1 -
 arch/arm64/Kconfig          |  1 -
 arch/mips/Kconfig           |  1 -
 arch/powerpc/Kconfig        |  1 -
 arch/s390/include/asm/elf.h |  5 ++---
 arch/s390/mm/mmap.c         |  8 --------
 arch/x86/Kconfig            |  1 -
 fs/Kconfig.binfmt           |  3 ---
 fs/binfmt_elf.c             | 18 ++++--------------
 9 files changed, 6 insertions(+), 33 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index f85200a63a8b..4b62f4caf0ce 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -1,7 +1,6 @@
 config ARM
 	bool
 	default y
-	select ARCH_BINFMT_ELF_RANDOMIZE_PIE
 	select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE
 	select ARCH_HAS_ELF_RANDOMIZE
 	select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 7c1dbeb73e8d..34f487d5d84e 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -1,6 +1,5 @@
 config ARM64
 	def_bool y
-	select ARCH_BINFMT_ELF_RANDOMIZE_PIE
 	select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE
 	select ARCH_HAS_ELF_RANDOMIZE
 	select ARCH_HAS_GCOV_PROFILE_ALL
diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
index 688ce274f59d..a326c4cb8cf0 100644
--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
@@ -23,7 +23,6 @@ config MIPS
 	select HAVE_KRETPROBES
 	select HAVE_DEBUG_KMEMLEAK
 	select HAVE_SYSCALL_TRACEPOINTS
-	select ARCH_BINFMT_ELF_RANDOMIZE_PIE
 	select ARCH_HAS_ELF_RANDOMIZE
 	select HAVE_ARCH_TRANSPARENT_HUGEPAGE if CPU_SUPPORTS_HUGEPAGES && 64BIT
 	select RTC_LIB if !MACH_LOONGSON
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index fc5fffbb331b..e99014adf017 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -88,7 +88,6 @@ config PPC
 	select ARCH_MIGHT_HAVE_PC_PARPORT
 	select ARCH_MIGHT_HAVE_PC_SERIO
 	select BINFMT_ELF
-	select ARCH_BINFMT_ELF_RANDOMIZE_PIE
 	select ARCH_HAS_ELF_RANDOMIZE
 	select OF
 	select OF_EARLY_FLATTREE
diff --git a/arch/s390/include/asm/elf.h b/arch/s390/include/asm/elf.h
index f8db4781a4c2..ff662155b2c4 100644
--- a/arch/s390/include/asm/elf.h
+++ b/arch/s390/include/asm/elf.h
@@ -163,10 +163,9 @@ extern unsigned int vdso_enabled;
    the loader.  We need to make sure that it is out of the way of the program
    that it will "exec", and that there is sufficient room for the brk. 64-bit
    tasks are aligned to 4GB. */
-extern unsigned long randomize_et_dyn(void);
-#define ELF_ET_DYN_BASE (randomize_et_dyn() + (is_32bit_task() ? \
+#define ELF_ET_DYN_BASE (is_32bit_task() ? \
 				(STACK_TOP / 3 * 2) : \
-				(STACK_TOP / 3 * 2) & ~((1UL << 32) - 1)))
+				(STACK_TOP / 3 * 2) & ~((1UL << 32) - 1))
 
 /* This yields a mask that user programs can use to figure out what
    instruction set this CPU supports. */
diff --git a/arch/s390/mm/mmap.c b/arch/s390/mm/mmap.c
index 8c11536f972d..bb3367c5cb0b 100644
--- a/arch/s390/mm/mmap.c
+++ b/arch/s390/mm/mmap.c
@@ -177,14 +177,6 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
 	return addr;
 }
 
-unsigned long randomize_et_dyn(void)
-{
-	if (current->flags & PF_RANDOMIZE)
-		return arch_mmap_rnd();
-
-	return 0UL;
-}
-
 #ifndef CONFIG_64BIT
 
 /*
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 782ddbbc1c9a..1f7f185934a5 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -87,7 +87,6 @@ config X86
 	select HAVE_ARCH_KMEMCHECK
 	select HAVE_ARCH_KASAN if X86_64 && SPARSEMEM_VMEMMAP
 	select HAVE_USER_RETURN_NOTIFIER
-	select ARCH_BINFMT_ELF_RANDOMIZE_PIE
 	select ARCH_HAS_ELF_RANDOMIZE
 	select HAVE_ARCH_JUMP_LABEL
 	select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE
diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt
index 270c48148f79..2d0cbbd14cfc 100644
--- a/fs/Kconfig.binfmt
+++ b/fs/Kconfig.binfmt
@@ -27,9 +27,6 @@ config COMPAT_BINFMT_ELF
 	bool
 	depends on COMPAT && BINFMT_ELF
 
-config ARCH_BINFMT_ELF_RANDOMIZE_PIE
-	bool
-
 config ARCH_BINFMT_ELF_STATE
 	bool
 
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index d925f55e4857..b20c05477e90 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -31,6 +31,7 @@
 #include <linux/security.h>
 #include <linux/random.h>
 #include <linux/elf.h>
+#include <linux/elf-randomize.h>
 #include <linux/utsname.h>
 #include <linux/coredump.h>
 #include <linux/sched.h>
@@ -910,21 +911,10 @@ static int load_elf_binary(struct linux_binprm *bprm)
 			 * default mmap base, as well as whatever program they
 			 * might try to exec.  This is because the brk will
 			 * follow the loader, and is not movable.  */
-#ifdef CONFIG_ARCH_BINFMT_ELF_RANDOMIZE_PIE
-			/* Memory randomization might have been switched off
-			 * in runtime via sysctl or explicit setting of
-			 * personality flags.
-			 * If that is the case, retain the original non-zero
-			 * load_bias value in order to establish proper
-			 * non-randomized mappings.
-			 */
+			load_bias = ELF_ET_DYN_BASE - vaddr;
 			if (current->flags & PF_RANDOMIZE)
-				load_bias = 0;
-			else
-				load_bias = ELF_PAGESTART(ELF_ET_DYN_BASE - vaddr);
-#else
-			load_bias = ELF_PAGESTART(ELF_ET_DYN_BASE - vaddr);
-#endif
+				load_bias += arch_mmap_rnd();
+			load_bias = ELF_PAGESTART(load_bias);
 			total_size = total_mapping_size(elf_phdata,
 							loc->elf_ex.e_phnum);
 			if (!total_size) {
-- 
cgit v1.2.3


From 204db6ed17743000691d930368a5abd6ea541c58 Mon Sep 17 00:00:00 2001
From: Kees Cook
Date: Tue, 14 Apr 2015 15:48:12 -0700
Subject: mm: fold arch_randomize_brk into ARCH_HAS_ELF_RANDOMIZE

The arch_randomize_brk() function is used on several architectures,
even those that don't support ET_DYN ASLR. To avoid bulky extern/#define
tricks, consolidate the support under CONFIG_ARCH_HAS_ELF_RANDOMIZE for
the architectures that support it, while still handling CONFIG_COMPAT_BRK.

Signed-off-by: Kees Cook <keescook@chromium.org>
Cc: Hector Marco-Gisbert <hecmargi@upv.es>
Cc: Russell King <linux@arm.linux.org.uk>
Reviewed-by: Ingo Molnar <mingo@kernel.org>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: "David A. Long" <dave.long@linaro.org>
Cc: Andrey Ryabinin <a.ryabinin@samsung.com>
Cc: Arun Chandran <achandran@mvista.com>
Cc: Yann Droneaud <ydroneaud@opteya.com>
Cc: Min-Hua Chen <orca.chen@gmail.com>
Cc: Paul Burton <paul.burton@imgtec.com>
Cc: Alex Smith <alex@alex-smith.me.uk>
Cc: Markos Chandras <markos.chandras@imgtec.com>
Cc: Vineeth Vijayan <vvijayan@mvista.com>
Cc: Jeff Bailey <jeffbailey@google.com>
Cc: Michael Holzheu <holzheu@linux.vnet.ibm.com>
Cc: Ben Hutchings <ben@decadent.org.uk>
Cc: Behan Webster <behanw@converseincode.com>
Cc: Ismael Ripoll <iripoll@upv.es>
Cc: Jan-Simon Mller <dl9pf@gmx.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/Kconfig                   |  1 +
 arch/arm/include/asm/elf.h     |  4 ----
 arch/arm64/include/asm/elf.h   |  4 ----
 arch/mips/include/asm/elf.h    |  4 ----
 arch/powerpc/include/asm/elf.h |  4 ----
 arch/s390/include/asm/elf.h    |  3 ---
 arch/x86/include/asm/elf.h     |  3 ---
 fs/binfmt_elf.c                |  4 +---
 include/linux/elf-randomize.h  | 12 ++++++++++++
 9 files changed, 14 insertions(+), 25 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/Kconfig b/arch/Kconfig
index 474904a8e540..e1068987bad1 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -497,6 +497,7 @@ config ARCH_HAS_ELF_RANDOMIZE
 	  An architecture supports choosing randomized locations for
 	  stack, mmap, brk, and ET_DYN. Defined functions:
 	  - arch_mmap_rnd()
+	  - arch_randomize_brk()
 
 #
 # ABI hall of shame
diff --git a/arch/arm/include/asm/elf.h b/arch/arm/include/asm/elf.h
index afb9cafd3786..c1ff8ab12914 100644
--- a/arch/arm/include/asm/elf.h
+++ b/arch/arm/include/asm/elf.h
@@ -125,10 +125,6 @@ int dump_task_regs(struct task_struct *t, elf_gregset_t *elfregs);
 extern void elf_set_personality(const struct elf32_hdr *);
 #define SET_PERSONALITY(ex)	elf_set_personality(&(ex))
 
-struct mm_struct;
-extern unsigned long arch_randomize_brk(struct mm_struct *mm);
-#define arch_randomize_brk arch_randomize_brk
-
 #ifdef CONFIG_MMU
 #define ARCH_HAS_SETUP_ADDITIONAL_PAGES 1
 struct linux_binprm;
diff --git a/arch/arm64/include/asm/elf.h b/arch/arm64/include/asm/elf.h
index f724db00b235..faad6df49e5b 100644
--- a/arch/arm64/include/asm/elf.h
+++ b/arch/arm64/include/asm/elf.h
@@ -156,10 +156,6 @@ extern int arch_setup_additional_pages(struct linux_binprm *bprm,
 #define STACK_RND_MASK			(0x3ffff >> (PAGE_SHIFT - 12))
 #endif
 
-struct mm_struct;
-extern unsigned long arch_randomize_brk(struct mm_struct *mm);
-#define arch_randomize_brk arch_randomize_brk
-
 #ifdef CONFIG_COMPAT
 
 #ifdef __AARCH64EB__
diff --git a/arch/mips/include/asm/elf.h b/arch/mips/include/asm/elf.h
index 535f196ffe02..31d747d46a23 100644
--- a/arch/mips/include/asm/elf.h
+++ b/arch/mips/include/asm/elf.h
@@ -410,10 +410,6 @@ struct linux_binprm;
 extern int arch_setup_additional_pages(struct linux_binprm *bprm,
 				       int uses_interp);
 
-struct mm_struct;
-extern unsigned long arch_randomize_brk(struct mm_struct *mm);
-#define arch_randomize_brk arch_randomize_brk
-
 struct arch_elf_state {
 	int fp_abi;
 	int interp_fp_abi;
diff --git a/arch/powerpc/include/asm/elf.h b/arch/powerpc/include/asm/elf.h
index 57d289acb803..ee46ffef608e 100644
--- a/arch/powerpc/include/asm/elf.h
+++ b/arch/powerpc/include/asm/elf.h
@@ -128,10 +128,6 @@ extern int arch_setup_additional_pages(struct linux_binprm *bprm,
 	(0x7ff >> (PAGE_SHIFT - 12)) : \
 	(0x3ffff >> (PAGE_SHIFT - 12)))
 
-extern unsigned long arch_randomize_brk(struct mm_struct *mm);
-#define arch_randomize_brk arch_randomize_brk
-
-
 #ifdef CONFIG_SPU_BASE
 /* Notes used in ET_CORE. Note name is "SPU/<fd>/<filename>". */
 #define NT_SPU		1
diff --git a/arch/s390/include/asm/elf.h b/arch/s390/include/asm/elf.h
index ff662155b2c4..a5c4978462c1 100644
--- a/arch/s390/include/asm/elf.h
+++ b/arch/s390/include/asm/elf.h
@@ -226,9 +226,6 @@ struct linux_binprm;
 #define ARCH_HAS_SETUP_ADDITIONAL_PAGES 1
 int arch_setup_additional_pages(struct linux_binprm *, int);
 
-extern unsigned long arch_randomize_brk(struct mm_struct *mm);
-#define arch_randomize_brk arch_randomize_brk
-
 void *fill_cpu_elf_notes(void *ptr, struct save_area *sa, __vector128 *vxrs);
 
 #endif
diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
index 935588d95c82..f161c189c27b 100644
--- a/arch/x86/include/asm/elf.h
+++ b/arch/x86/include/asm/elf.h
@@ -339,9 +339,6 @@ extern int compat_arch_setup_additional_pages(struct linux_binprm *bprm,
 					      int uses_interp);
 #define compat_arch_setup_additional_pages compat_arch_setup_additional_pages
 
-extern unsigned long arch_randomize_brk(struct mm_struct *mm);
-#define arch_randomize_brk arch_randomize_brk
-
 /*
  * True on X86_32 or when emulating IA32 on X86_64
  */
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index b20c05477e90..241ef68d2893 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -1050,15 +1050,13 @@ static int load_elf_binary(struct linux_binprm *bprm)
 	current->mm->end_data = end_data;
 	current->mm->start_stack = bprm->p;
 
-#ifdef arch_randomize_brk
 	if ((current->flags & PF_RANDOMIZE) && (randomize_va_space > 1)) {
 		current->mm->brk = current->mm->start_brk =
 			arch_randomize_brk(current->mm);
-#ifdef CONFIG_COMPAT_BRK
+#ifdef compat_brk_randomized
 		current->brk_randomized = 1;
 #endif
 	}
-#endif
 
 	if (current->personality & MMAP_PAGE_ZERO) {
 		/* Why this, you ask???  Well SVr4 maps page 0 as read-only,
diff --git a/include/linux/elf-randomize.h b/include/linux/elf-randomize.h
index 7a4eda02d2b1..b5f0bda9472e 100644
--- a/include/linux/elf-randomize.h
+++ b/include/linux/elf-randomize.h
@@ -1,10 +1,22 @@
 #ifndef _ELF_RANDOMIZE_H
 #define _ELF_RANDOMIZE_H
 
+struct mm_struct;
+
 #ifndef CONFIG_ARCH_HAS_ELF_RANDOMIZE
 static inline unsigned long arch_mmap_rnd(void) { return 0; }
+# if defined(arch_randomize_brk) && defined(CONFIG_COMPAT_BRK)
+#  define compat_brk_randomized
+# endif
+# ifndef arch_randomize_brk
+#  define arch_randomize_brk(mm)	(mm->brk)
+# endif
 #else
 extern unsigned long arch_mmap_rnd(void);
+extern unsigned long arch_randomize_brk(struct mm_struct *mm);
+# ifdef CONFIG_COMPAT_BRK
+#  define compat_brk_randomized
+# endif
 #endif
 
 #endif
-- 
cgit v1.2.3


From 4a20799d11f64e6b8725cacc7619b1ae1dbf9acd Mon Sep 17 00:00:00 2001
From: Vladimir Murzin
Date: Tue, 14 Apr 2015 15:48:27 -0700
Subject: mm: move memtest under mm

Memtest is a simple feature which fills the memory with a given set of
patterns and validates memory contents, if bad memory regions is detected
it reserves them via memblock API.  Since memblock API is widely used by
other architectures this feature can be enabled outside of x86 world.

This patch set promotes memtest to live under generic mm umbrella and
enables memtest feature for arm/arm64.

It was reported that this patch set was useful for tracking down an issue
with some errant DMA on an arm64 platform.

This patch (of 6):

There is nothing platform dependent in the core memtest code, so other
platforms might benefit from this feature too.

[linux@roeck-us.net: MEMTEST depends on MEMBLOCK]
Signed-off-by: Vladimir Murzin <vladimir.murzin@arm.com>
Acked-by: Will Deacon <will.deacon@arm.com>
Tested-by: Mark Rutland <mark.rutland@arm.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Russell King <rmk@arm.linux.org.uk>
Cc: Paul Bolle <pebolle@tiscali.nl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/Kconfig            |  11 -----
 arch/x86/include/asm/e820.h |   8 ---
 arch/x86/mm/Makefile        |   2 -
 arch/x86/mm/memtest.c       | 118 --------------------------------------------
 include/linux/memblock.h    |   8 +++
 lib/Kconfig.debug           |  12 +++++
 mm/Makefile                 |   1 +
 mm/memtest.c                | 118 ++++++++++++++++++++++++++++++++++++++++++++
 8 files changed, 139 insertions(+), 139 deletions(-)
 delete mode 100644 arch/x86/mm/memtest.c
 create mode 100644 mm/memtest.c

(limited to 'arch/x86')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 1f7f185934a5..d43e7e1c784b 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -721,17 +721,6 @@ endif #HYPERVISOR_GUEST
 config NO_BOOTMEM
 	def_bool y
 
-config MEMTEST
-	bool "Memtest"
-	---help---
-	  This option adds a kernel parameter 'memtest', which allows memtest
-	  to be set.
-	        memtest=0, mean disabled; -- default
-	        memtest=1, mean do 1 test pattern;
-	        ...
-	        memtest=4, mean do 4 test patterns.
-	  If you are unsure how to answer this question, answer N.
-
 source "arch/x86/Kconfig.cpu"
 
 config HPET_TIMER
diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h
index 779c2efe2e97..3ab0537872fb 100644
--- a/arch/x86/include/asm/e820.h
+++ b/arch/x86/include/asm/e820.h
@@ -40,14 +40,6 @@ static inline void e820_mark_nosave_regions(unsigned long limit_pfn)
 }
 #endif
 
-#ifdef CONFIG_MEMTEST
-extern void early_memtest(unsigned long start, unsigned long end);
-#else
-static inline void early_memtest(unsigned long start, unsigned long end)
-{
-}
-#endif
-
 extern unsigned long e820_end_of_ram_pfn(void);
 extern unsigned long e820_end_of_low_ram_pfn(void);
 extern u64 early_reserve_e820(u64 sizet, u64 align);
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index c4cc74006c61..a482d105172b 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -32,6 +32,4 @@ obj-$(CONFIG_AMD_NUMA)		+= amdtopology.o
 obj-$(CONFIG_ACPI_NUMA)		+= srat.o
 obj-$(CONFIG_NUMA_EMU)		+= numa_emulation.o
 
-obj-$(CONFIG_MEMTEST)		+= memtest.o
-
 obj-$(CONFIG_X86_INTEL_MPX)	+= mpx.o
diff --git a/arch/x86/mm/memtest.c b/arch/x86/mm/memtest.c
deleted file mode 100644
index 1e9da795767a..000000000000
--- a/arch/x86/mm/memtest.c
+++ /dev/null
@@ -1,118 +0,0 @@
-#include <linux/kernel.h>
-#include <linux/errno.h>
-#include <linux/string.h>
-#include <linux/types.h>
-#include <linux/mm.h>
-#include <linux/smp.h>
-#include <linux/init.h>
-#include <linux/pfn.h>
-#include <linux/memblock.h>
-
-static u64 patterns[] __initdata = {
-	/* The first entry has to be 0 to leave memtest with zeroed memory */
-	0,
-	0xffffffffffffffffULL,
-	0x5555555555555555ULL,
-	0xaaaaaaaaaaaaaaaaULL,
-	0x1111111111111111ULL,
-	0x2222222222222222ULL,
-	0x4444444444444444ULL,
-	0x8888888888888888ULL,
-	0x3333333333333333ULL,
-	0x6666666666666666ULL,
-	0x9999999999999999ULL,
-	0xccccccccccccccccULL,
-	0x7777777777777777ULL,
-	0xbbbbbbbbbbbbbbbbULL,
-	0xddddddddddddddddULL,
-	0xeeeeeeeeeeeeeeeeULL,
-	0x7a6c7258554e494cULL, /* yeah ;-) */
-};
-
-static void __init reserve_bad_mem(u64 pattern, u64 start_bad, u64 end_bad)
-{
-	printk(KERN_INFO "  %016llx bad mem addr %010llx - %010llx reserved\n",
-	       (unsigned long long) pattern,
-	       (unsigned long long) start_bad,
-	       (unsigned long long) end_bad);
-	memblock_reserve(start_bad, end_bad - start_bad);
-}
-
-static void __init memtest(u64 pattern, u64 start_phys, u64 size)
-{
-	u64 *p, *start, *end;
-	u64 start_bad, last_bad;
-	u64 start_phys_aligned;
-	const size_t incr = sizeof(pattern);
-
-	start_phys_aligned = ALIGN(start_phys, incr);
-	start = __va(start_phys_aligned);
-	end = start + (size - (start_phys_aligned - start_phys)) / incr;
-	start_bad = 0;
-	last_bad = 0;
-
-	for (p = start; p < end; p++)
-		*p = pattern;
-
-	for (p = start; p < end; p++, start_phys_aligned += incr) {
-		if (*p == pattern)
-			continue;
-		if (start_phys_aligned == last_bad + incr) {
-			last_bad += incr;
-			continue;
-		}
-		if (start_bad)
-			reserve_bad_mem(pattern, start_bad, last_bad + incr);
-		start_bad = last_bad = start_phys_aligned;
-	}
-	if (start_bad)
-		reserve_bad_mem(pattern, start_bad, last_bad + incr);
-}
-
-static void __init do_one_pass(u64 pattern, u64 start, u64 end)
-{
-	u64 i;
-	phys_addr_t this_start, this_end;
-
-	for_each_free_mem_range(i, NUMA_NO_NODE, &this_start, &this_end, NULL) {
-		this_start = clamp_t(phys_addr_t, this_start, start, end);
-		this_end = clamp_t(phys_addr_t, this_end, start, end);
-		if (this_start < this_end) {
-			printk(KERN_INFO "  %010llx - %010llx pattern %016llx\n",
-			       (unsigned long long)this_start,
-			       (unsigned long long)this_end,
-			       (unsigned long long)cpu_to_be64(pattern));
-			memtest(pattern, this_start, this_end - this_start);
-		}
-	}
-}
-
-/* default is disabled */
-static int memtest_pattern __initdata;
-
-static int __init parse_memtest(char *arg)
-{
-	if (arg)
-		memtest_pattern = simple_strtoul(arg, NULL, 0);
-	else
-		memtest_pattern = ARRAY_SIZE(patterns);
-
-	return 0;
-}
-
-early_param("memtest", parse_memtest);
-
-void __init early_memtest(unsigned long start, unsigned long end)
-{
-	unsigned int i;
-	unsigned int idx = 0;
-
-	if (!memtest_pattern)
-		return;
-
-	printk(KERN_INFO "early_memtest: # of tests: %d\n", memtest_pattern);
-	for (i = memtest_pattern-1; i < UINT_MAX; --i) {
-		idx = i % ARRAY_SIZE(patterns);
-		do_one_pass(patterns[idx], start, end);
-	}
-}
diff --git a/include/linux/memblock.h b/include/linux/memblock.h
index e8cc45307f8f..6724cb020f5e 100644
--- a/include/linux/memblock.h
+++ b/include/linux/memblock.h
@@ -365,6 +365,14 @@ static inline unsigned long memblock_region_reserved_end_pfn(const struct memblo
 #define __initdata_memblock
 #endif
 
+#ifdef CONFIG_MEMTEST
+extern void early_memtest(unsigned long start, unsigned long end);
+#else
+static inline void early_memtest(unsigned long start, unsigned long end)
+{
+}
+#endif
+
 #else
 static inline phys_addr_t memblock_alloc(phys_addr_t size, phys_addr_t align)
 {
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 36b6fa88ce5b..5c7a3183423b 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1745,6 +1745,18 @@ config TEST_UDELAY
 
 	  If unsure, say N.
 
+config MEMTEST
+	bool "Memtest"
+	depends on HAVE_MEMBLOCK
+	---help---
+	  This option adds a kernel parameter 'memtest', which allows memtest
+	  to be set.
+	        memtest=0, mean disabled; -- default
+	        memtest=1, mean do 1 test pattern;
+	        ...
+	        memtest=4, mean do 4 test patterns.
+	  If you are unsure how to answer this question, answer N.
+
 source "samples/Kconfig"
 
 source "lib/Kconfig.kgdb"
diff --git a/mm/Makefile b/mm/Makefile
index 668a9bb82be4..98c4eaeabdcb 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -55,6 +55,7 @@ obj-$(CONFIG_KMEMCHECK) += kmemcheck.o
 obj-$(CONFIG_KASAN)	+= kasan/
 obj-$(CONFIG_FAILSLAB) += failslab.o
 obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
+obj-$(CONFIG_MEMTEST)		+= memtest.o
 obj-$(CONFIG_MIGRATION) += migrate.o
 obj-$(CONFIG_QUICKLIST) += quicklist.o
 obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o
diff --git a/mm/memtest.c b/mm/memtest.c
new file mode 100644
index 000000000000..1e9da795767a
--- /dev/null
+++ b/mm/memtest.c
@@ -0,0 +1,118 @@
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/mm.h>
+#include <linux/smp.h>
+#include <linux/init.h>
+#include <linux/pfn.h>
+#include <linux/memblock.h>
+
+static u64 patterns[] __initdata = {
+	/* The first entry has to be 0 to leave memtest with zeroed memory */
+	0,
+	0xffffffffffffffffULL,
+	0x5555555555555555ULL,
+	0xaaaaaaaaaaaaaaaaULL,
+	0x1111111111111111ULL,
+	0x2222222222222222ULL,
+	0x4444444444444444ULL,
+	0x8888888888888888ULL,
+	0x3333333333333333ULL,
+	0x6666666666666666ULL,
+	0x9999999999999999ULL,
+	0xccccccccccccccccULL,
+	0x7777777777777777ULL,
+	0xbbbbbbbbbbbbbbbbULL,
+	0xddddddddddddddddULL,
+	0xeeeeeeeeeeeeeeeeULL,
+	0x7a6c7258554e494cULL, /* yeah ;-) */
+};
+
+static void __init reserve_bad_mem(u64 pattern, u64 start_bad, u64 end_bad)
+{
+	printk(KERN_INFO "  %016llx bad mem addr %010llx - %010llx reserved\n",
+	       (unsigned long long) pattern,
+	       (unsigned long long) start_bad,
+	       (unsigned long long) end_bad);
+	memblock_reserve(start_bad, end_bad - start_bad);
+}
+
+static void __init memtest(u64 pattern, u64 start_phys, u64 size)
+{
+	u64 *p, *start, *end;
+	u64 start_bad, last_bad;
+	u64 start_phys_aligned;
+	const size_t incr = sizeof(pattern);
+
+	start_phys_aligned = ALIGN(start_phys, incr);
+	start = __va(start_phys_aligned);
+	end = start + (size - (start_phys_aligned - start_phys)) / incr;
+	start_bad = 0;
+	last_bad = 0;
+
+	for (p = start; p < end; p++)
+		*p = pattern;
+
+	for (p = start; p < end; p++, start_phys_aligned += incr) {
+		if (*p == pattern)
+			continue;
+		if (start_phys_aligned == last_bad + incr) {
+			last_bad += incr;
+			continue;
+		}
+		if (start_bad)
+			reserve_bad_mem(pattern, start_bad, last_bad + incr);
+		start_bad = last_bad = start_phys_aligned;
+	}
+	if (start_bad)
+		reserve_bad_mem(pattern, start_bad, last_bad + incr);
+}
+
+static void __init do_one_pass(u64 pattern, u64 start, u64 end)
+{
+	u64 i;
+	phys_addr_t this_start, this_end;
+
+	for_each_free_mem_range(i, NUMA_NO_NODE, &this_start, &this_end, NULL) {
+		this_start = clamp_t(phys_addr_t, this_start, start, end);
+		this_end = clamp_t(phys_addr_t, this_end, start, end);
+		if (this_start < this_end) {
+			printk(KERN_INFO "  %010llx - %010llx pattern %016llx\n",
+			       (unsigned long long)this_start,
+			       (unsigned long long)this_end,
+			       (unsigned long long)cpu_to_be64(pattern));
+			memtest(pattern, this_start, this_end - this_start);
+		}
+	}
+}
+
+/* default is disabled */
+static int memtest_pattern __initdata;
+
+static int __init parse_memtest(char *arg)
+{
+	if (arg)
+		memtest_pattern = simple_strtoul(arg, NULL, 0);
+	else
+		memtest_pattern = ARRAY_SIZE(patterns);
+
+	return 0;
+}
+
+early_param("memtest", parse_memtest);
+
+void __init early_memtest(unsigned long start, unsigned long end)
+{
+	unsigned int i;
+	unsigned int idx = 0;
+
+	if (!memtest_pattern)
+		return;
+
+	printk(KERN_INFO "early_memtest: # of tests: %d\n", memtest_pattern);
+	for (i = memtest_pattern-1; i < UINT_MAX; --i) {
+		idx = i % ARRAY_SIZE(patterns);
+		do_one_pass(patterns[idx], start, end);
+	}
+}
-- 
cgit v1.2.3


From 3ac62bc0602794dc36aad77a7ef5772e989b2e22 Mon Sep 17 00:00:00 2001
From: Joe Perches
Date: Wed, 15 Apr 2015 16:17:45 -0700
Subject: x86: mtrr: if: remove use of seq_printf return value

The seq_printf return value, because it's frequently misused,
will eventually be converted to void.

See: commit 1f33c41c03da ("seq_file: Rename seq_overflow() to
     seq_has_overflowed() and make public")

Signed-off-by: Joe Perches <joe@perches.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/kernel/cpu/mtrr/if.c | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/mtrr/if.c b/arch/x86/kernel/cpu/mtrr/if.c
index a041e094b8b9..d76f13d6d8d6 100644
--- a/arch/x86/kernel/cpu/mtrr/if.c
+++ b/arch/x86/kernel/cpu/mtrr/if.c
@@ -404,11 +404,10 @@ static const struct file_operations mtrr_fops = {
 static int mtrr_seq_show(struct seq_file *seq, void *offset)
 {
 	char factor;
-	int i, max, len;
+	int i, max;
 	mtrr_type type;
 	unsigned long base, size;
 
-	len = 0;
 	max = num_var_ranges;
 	for (i = 0; i < max; i++) {
 		mtrr_if->get(i, &base, &size, &type);
@@ -425,11 +424,10 @@ static int mtrr_seq_show(struct seq_file *seq, void *offset)
 			size >>= 20 - PAGE_SHIFT;
 		}
 		/* Base can be > 32bit */
-		len += seq_printf(seq, "reg%02i: base=0x%06lx000 "
-			"(%5luMB), size=%5lu%cB, count=%d: %s\n",
-			i, base, base >> (20 - PAGE_SHIFT), size,
-			factor, mtrr_usage_table[i],
-			mtrr_attrib_to_str(type));
+		seq_printf(seq, "reg%02i: base=0x%06lx000 (%5luMB), size=%5lu%cB, count=%d: %s\n",
+			   i, base, base >> (20 - PAGE_SHIFT),
+			   size, factor,
+			   mtrr_usage_table[i], mtrr_attrib_to_str(type));
 	}
 	return 0;
 }
-- 
cgit v1.2.3


From fd0f86b66425bd8c6af8985881e82b28c30fd450 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov
Date: Thu, 16 Apr 2015 00:40:25 -0700
Subject: x86/ptrace: Fix the TIF_FORCED_TF logic in handle_signal()

When the TIF_SINGLESTEP tracee dequeues a signal,
handle_signal() clears TIF_FORCED_TF and X86_EFLAGS_TF but
leaves TIF_SINGLESTEP set.

If the tracer does PTRACE_SINGLESTEP again, enable_single_step()
sets X86_EFLAGS_TF but not TIF_FORCED_TF.  This means that the
subsequent PTRACE_CONT doesn't not clear X86_EFLAGS_TF, and the
tracee gets the wrong SIGTRAP.

Test-case (needs -O2 to avoid prologue insns in signal handler):

	#include <unistd.h>
	#include <stdio.h>
	#include <sys/ptrace.h>
	#include <sys/wait.h>
	#include <sys/user.h>
	#include <assert.h>
	#include <stddef.h>

	void handler(int n)
	{
		asm("nop");
	}

	int child(void)
	{
		assert(ptrace(PTRACE_TRACEME, 0,0,0) == 0);
		signal(SIGALRM, handler);
		kill(getpid(), SIGALRM);
		return 0x23;
	}

	void *getip(int pid)
	{
		return (void*)ptrace(PTRACE_PEEKUSER, pid,
					offsetof(struct user, regs.rip), 0);
	}

	int main(void)
	{
		int pid, status;

		pid = fork();
		if (!pid)
			return child();

		assert(wait(&status) == pid);
		assert(WIFSTOPPED(status) && WSTOPSIG(status) == SIGALRM);

		assert(ptrace(PTRACE_SINGLESTEP, pid, 0, SIGALRM) == 0);
		assert(wait(&status) == pid);
		assert(WIFSTOPPED(status) && WSTOPSIG(status) == SIGTRAP);
		assert((getip(pid) - (void*)handler) == 0);

		assert(ptrace(PTRACE_SINGLESTEP, pid, 0, SIGALRM) == 0);
		assert(wait(&status) == pid);
		assert(WIFSTOPPED(status) && WSTOPSIG(status) == SIGTRAP);
		assert((getip(pid) - (void*)handler) == 1);

		assert(ptrace(PTRACE_CONT, pid, 0,0) == 0);
		assert(wait(&status) == pid);
		assert(WIFEXITED(status) && WEXITSTATUS(status) == 0x23);

		return 0;
	}

The last assert() fails because PTRACE_CONT wrongly triggers
another single-step and X86_EFLAGS_TF can't be cleared by
debugger until the tracee does sys_rt_sigreturn().

Change handle_signal() to do user_disable_single_step() if
stepping, we do not need to preserve TIF_SINGLESTEP because we
are going to do ptrace_notify(), and it is simply wrong to leak
this bit.

While at it, change the comment to explain why we also need to
clear TF unconditionally after setup_rt_frame().

Note: in the longer term we should probably change
setup_sigcontext() to use get_flags() and then just remove this
user_disable_single_step().  And, the state of TIF_FORCED_TF can
be wrong after restore_sigcontext() which can set/clear TF, this
needs another fix.

This fix fixes the 'single_step_syscall_32' testcase in
the x86 testsuite:

Before:

	~/linux/tools/testing/selftests/x86> ./single_step_syscall_32
	[RUN]   Set TF and check nop
	[OK]    Survived with TF set and 9 traps
	[RUN]   Set TF and check int80
	[OK]    Survived with TF set and 9 traps
	[RUN]   Set TF and check a fast syscall
	[WARN]  Hit 10000 SIGTRAPs with si_addr 0xf7789cc0, ip 0xf7789cc0
	Trace/breakpoint trap (core dumped)

After:

	~/linux/linux/tools/testing/selftests/x86> ./single_step_syscall_32
	[RUN]   Set TF and check nop
	[OK]    Survived with TF set and 9 traps
	[RUN]   Set TF and check int80
	[OK]    Survived with TF set and 9 traps
	[RUN]   Set TF and check a fast syscall
	[OK]    Survived with TF set and 39 traps
	[RUN]   Fast syscall with TF cleared
	[OK]    Nothing unexpected happened

Reported-by: Evan Teran <eteran@alum.rit.edu>
Reported-by: Pedro Alves <palves@redhat.com>
Tested-by: Andres Freund <andres@anarazel.de>
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
[ Added x86 self-test info. ]
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/signal.c | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 3e581865c8e2..d185bdd95a4b 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -630,7 +630,8 @@ setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs)
 static void
 handle_signal(struct ksignal *ksig, struct pt_regs *regs)
 {
-	bool failed;
+	bool stepping, failed;
+
 	/* Are we from a system call? */
 	if (syscall_get_nr(current, regs) >= 0) {
 		/* If so, check system call restarting.. */
@@ -654,12 +655,13 @@ handle_signal(struct ksignal *ksig, struct pt_regs *regs)
 	}
 
 	/*
-	 * If TF is set due to a debugger (TIF_FORCED_TF), clear the TF
-	 * flag so that register information in the sigcontext is correct.
+	 * If TF is set due to a debugger (TIF_FORCED_TF), clear TF now
+	 * so that register information in the sigcontext is correct and
+	 * then notify the tracer before entering the signal handler.
 	 */
-	if (unlikely(regs->flags & X86_EFLAGS_TF) &&
-	    likely(test_and_clear_thread_flag(TIF_FORCED_TF)))
-		regs->flags &= ~X86_EFLAGS_TF;
+	stepping = test_thread_flag(TIF_SINGLESTEP);
+	if (stepping)
+		user_disable_single_step(current);
 
 	failed = (setup_rt_frame(ksig, regs) < 0);
 	if (!failed) {
@@ -670,10 +672,8 @@ handle_signal(struct ksignal *ksig, struct pt_regs *regs)
 		 * it might disable possible debug exception from the
 		 * signal handler.
 		 *
-		 * Clear TF when entering the signal handler, but
-		 * notify any tracer that was single-stepping it.
-		 * The tracer may want to single-step inside the
-		 * handler too.
+		 * Clear TF for the case when it wasn't set by debugger to
+		 * avoid the recursive send_sigtrap() in SIGTRAP handler.
 		 */
 		regs->flags &= ~(X86_EFLAGS_DF|X86_EFLAGS_RF|X86_EFLAGS_TF);
 		/*
@@ -682,7 +682,7 @@ handle_signal(struct ksignal *ksig, struct pt_regs *regs)
 		if (used_math())
 			fpu_reset_state(current);
 	}
-	signal_setup_done(failed, ksig, test_thread_flag(TIF_SINGLESTEP));
+	signal_setup_done(failed, ksig, stepping);
 }
 
 #ifdef CONFIG_X86_32
-- 
cgit v1.2.3


From c857eb56e6e8e53dccd8d1e7ea90bcaf3311996d Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Wed, 15 Apr 2015 20:14:53 +0200
Subject: perf/x86: Fix hw_perf_event::flags collision

Somehow we ended up with overlapping flags when merging the
RDPMC control flag - this is bad, fix it.

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event.h | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h
index 329f0356ad4a..6ac5cb7a9e14 100644
--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -65,15 +65,15 @@ struct event_constraint {
 /*
  * struct hw_perf_event.flags flags
  */
-#define PERF_X86_EVENT_PEBS_LDLAT	0x1 /* ld+ldlat data address sampling */
-#define PERF_X86_EVENT_PEBS_ST		0x2 /* st data address sampling */
-#define PERF_X86_EVENT_PEBS_ST_HSW	0x4 /* haswell style datala, store */
-#define PERF_X86_EVENT_COMMITTED	0x8 /* event passed commit_txn */
-#define PERF_X86_EVENT_PEBS_LD_HSW	0x10 /* haswell style datala, load */
-#define PERF_X86_EVENT_PEBS_NA_HSW	0x20 /* haswell style datala, unknown */
-#define PERF_X86_EVENT_EXCL		0x40 /* HT exclusivity on counter */
-#define PERF_X86_EVENT_DYNAMIC		0x80 /* dynamic alloc'd constraint */
-#define PERF_X86_EVENT_RDPMC_ALLOWED	0x40 /* grant rdpmc permission */
+#define PERF_X86_EVENT_PEBS_LDLAT	0x0001 /* ld+ldlat data address sampling */
+#define PERF_X86_EVENT_PEBS_ST		0x0002 /* st data address sampling */
+#define PERF_X86_EVENT_PEBS_ST_HSW	0x0004 /* haswell style datala, store */
+#define PERF_X86_EVENT_COMMITTED	0x0008 /* event passed commit_txn */
+#define PERF_X86_EVENT_PEBS_LD_HSW	0x0010 /* haswell style datala, load */
+#define PERF_X86_EVENT_PEBS_NA_HSW	0x0020 /* haswell style datala, unknown */
+#define PERF_X86_EVENT_EXCL		0x0040 /* HT exclusivity on counter */
+#define PERF_X86_EVENT_DYNAMIC		0x0080 /* dynamic alloc'd constraint */
+#define PERF_X86_EVENT_RDPMC_ALLOWED	0x0100 /* grant rdpmc permission */
 
 
 struct amd_nb {
-- 
cgit v1.2.3


From 517e6341fa123ec3a2f9ea78ad547be910529881 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Sat, 11 Apr 2015 12:16:22 +0200
Subject: perf/x86/intel: Fix Core2,Atom,NHM,WSM cycles:pp events

Ingo reported that cycles:pp didn't work for him on some machines.

It turns out that in this commit:

  af4bdcf675cf perf/x86/intel: Disallow flags for most Core2/Atom/Nehalem/Westmere events

Andi forgot to explicitly allow that event when he
disabled event flags for PEBS on those uarchs.

Reported-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Fixes: af4bdcf675cf ("perf/x86/intel: Disallow flags for most Core2/Atom/Nehalem/Westmere events")
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event_intel_ds.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index ca69ea56c712..813f75d71175 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -558,6 +558,8 @@ struct event_constraint intel_core2_pebs_event_constraints[] = {
 	INTEL_FLAGS_UEVENT_CONSTRAINT(0x00c5, 0x1), /* BR_INST_RETIRED.MISPRED */
 	INTEL_FLAGS_UEVENT_CONSTRAINT(0x1fc7, 0x1), /* SIMD_INST_RETURED.ANY */
 	INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0x1),    /* MEM_LOAD_RETIRED.* */
+	/* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). */
+	INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x01),
 	EVENT_CONSTRAINT_END
 };
 
@@ -565,6 +567,8 @@ struct event_constraint intel_atom_pebs_event_constraints[] = {
 	INTEL_FLAGS_UEVENT_CONSTRAINT(0x00c0, 0x1), /* INST_RETIRED.ANY */
 	INTEL_FLAGS_UEVENT_CONSTRAINT(0x00c5, 0x1), /* MISPREDICTED_BRANCH_RETIRED */
 	INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0x1),    /* MEM_LOAD_RETIRED.* */
+	/* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). */
+	INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x01),
 	EVENT_CONSTRAINT_END
 };
 
@@ -588,6 +592,8 @@ struct event_constraint intel_nehalem_pebs_event_constraints[] = {
 	INTEL_FLAGS_UEVENT_CONSTRAINT(0x20c8, 0xf), /* ITLB_MISS_RETIRED */
 	INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0xf),    /* MEM_LOAD_RETIRED.* */
 	INTEL_FLAGS_EVENT_CONSTRAINT(0xf7, 0xf),    /* FP_ASSIST.* */
+	/* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). */
+	INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x0f),
 	EVENT_CONSTRAINT_END
 };
 
@@ -603,6 +609,8 @@ struct event_constraint intel_westmere_pebs_event_constraints[] = {
 	INTEL_FLAGS_UEVENT_CONSTRAINT(0x20c8, 0xf), /* ITLB_MISS_RETIRED */
 	INTEL_FLAGS_EVENT_CONSTRAINT(0xcb, 0xf),    /* MEM_LOAD_RETIRED.* */
 	INTEL_FLAGS_EVENT_CONSTRAINT(0xf7, 0xf),    /* FP_ASSIST.* */
+	/* INST_RETIRED.ANY_P, inv=1, cmask=16 (cycles:p). */
+	INTEL_FLAGS_EVENT_CONSTRAINT(0x108000c0, 0x0f),
 	EVENT_CONSTRAINT_END
 };
 
-- 
cgit v1.2.3


From 645523960102fa0ac0578d070630e49ab05f06d1 Mon Sep 17 00:00:00 2001
From: Jacob Pan
Date: Thu, 26 Mar 2015 14:28:45 -0700
Subject: perf/x86/intel/rapl: Fix energy counter measurements but supporing
 per domain energy units

RAPL energy hardware unit can vary within a single CPU package, e.g.
HSW server DRAM has a fixed energy unit of 15.3 uJ (2^-16) whereas
the unit on other domains can be enumerated from power unit MSR.

There might be other variations in the future, this patch adds
per cpu model quirk to allow special handling of certain cpus.

hw_unit is also removed from per cpu data since it is not per cpu
and the sampling rate for energy counter is typically not high.

Without this patch, DRAM domain on HSW servers will be counted
4x higher than the real energy counter.

Signed-off-by: Jacob Pan <jacob.jun.pan@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Stephane Eranian <eranian@google.com>
Cc: Andi Kleen <andi.kleen@intel.com>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Link: http://lkml.kernel.org/r/1427405325-780-1-git-send-email-jacob.jun.pan@linux.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event_intel_rapl.c | 94 ++++++++++++++++++++++-------
 1 file changed, 73 insertions(+), 21 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event_intel_rapl.c b/arch/x86/kernel/cpu/perf_event_intel_rapl.c
index c4bb8b8e5017..999289b94025 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_rapl.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_rapl.c
@@ -62,6 +62,14 @@
 #define RAPL_IDX_PP1_NRG_STAT	3	/* gpu */
 #define INTEL_RAPL_PP1		0x4	/* pseudo-encoding */
 
+#define NR_RAPL_DOMAINS         0x4
+static const char *rapl_domain_names[NR_RAPL_DOMAINS] __initconst = {
+	"pp0-core",
+	"package",
+	"dram",
+	"pp1-gpu",
+};
+
 /* Clients have PP0, PKG */
 #define RAPL_IDX_CLN	(1<<RAPL_IDX_PP0_NRG_STAT|\
 			 1<<RAPL_IDX_PKG_NRG_STAT|\
@@ -112,7 +120,6 @@ static struct perf_pmu_events_attr event_attr_##v = {			\
 
 struct rapl_pmu {
 	spinlock_t	 lock;
-	int		 hw_unit;  /* 1/2^hw_unit Joule */
 	int		 n_active; /* number of active events */
 	struct list_head active_list;
 	struct pmu	 *pmu; /* pointer to rapl_pmu_class */
@@ -120,6 +127,7 @@ struct rapl_pmu {
 	struct hrtimer   hrtimer;
 };
 
+static int rapl_hw_unit[NR_RAPL_DOMAINS] __read_mostly;  /* 1/2^hw_unit Joule */
 static struct pmu rapl_pmu_class;
 static cpumask_t rapl_cpu_mask;
 static int rapl_cntr_mask;
@@ -127,6 +135,7 @@ static int rapl_cntr_mask;
 static DEFINE_PER_CPU(struct rapl_pmu *, rapl_pmu);
 static DEFINE_PER_CPU(struct rapl_pmu *, rapl_pmu_to_free);
 
+static struct x86_pmu_quirk *rapl_quirks;
 static inline u64 rapl_read_counter(struct perf_event *event)
 {
 	u64 raw;
@@ -134,15 +143,28 @@ static inline u64 rapl_read_counter(struct perf_event *event)
 	return raw;
 }
 
-static inline u64 rapl_scale(u64 v)
+#define rapl_add_quirk(func_)						\
+do {									\
+	static struct x86_pmu_quirk __quirk __initdata = {		\
+		.func = func_,						\
+	};								\
+	__quirk.next = rapl_quirks;					\
+	rapl_quirks = &__quirk;						\
+} while (0)
+
+static inline u64 rapl_scale(u64 v, int cfg)
 {
+	if (cfg > NR_RAPL_DOMAINS) {
+		pr_warn("invalid domain %d, failed to scale data\n", cfg);
+		return v;
+	}
 	/*
 	 * scale delta to smallest unit (1/2^32)
 	 * users must then scale back: count * 1/(1e9*2^32) to get Joules
 	 * or use ldexp(count, -32).
 	 * Watts = Joules/Time delta
 	 */
-	return v << (32 - __this_cpu_read(rapl_pmu)->hw_unit);
+	return v << (32 - rapl_hw_unit[cfg - 1]);
 }
 
 static u64 rapl_event_update(struct perf_event *event)
@@ -173,7 +195,7 @@ again:
 	delta = (new_raw_count << shift) - (prev_raw_count << shift);
 	delta >>= shift;
 
-	sdelta = rapl_scale(delta);
+	sdelta = rapl_scale(delta, event->hw.config);
 
 	local64_add(sdelta, &event->count);
 
@@ -546,12 +568,22 @@ static void rapl_cpu_init(int cpu)
 	cpumask_set_cpu(cpu, &rapl_cpu_mask);
 }
 
+static __init void rapl_hsw_server_quirk(void)
+{
+	/*
+	 * DRAM domain on HSW server has fixed energy unit which can be
+	 * different than the unit from power unit MSR.
+	 * "Intel Xeon Processor E5-1600 and E5-2600 v3 Product Families, V2
+	 * of 2. Datasheet, September 2014, Reference Number: 330784-001 "
+	 */
+	rapl_hw_unit[RAPL_IDX_RAM_NRG_STAT] = 16;
+}
+
 static int rapl_cpu_prepare(int cpu)
 {
 	struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu);
 	int phys_id = topology_physical_package_id(cpu);
 	u64 ms;
-	u64 msr_rapl_power_unit_bits;
 
 	if (pmu)
 		return 0;
@@ -559,24 +591,13 @@ static int rapl_cpu_prepare(int cpu)
 	if (phys_id < 0)
 		return -1;
 
-	/* protect rdmsrl() to handle virtualization */
-	if (rdmsrl_safe(MSR_RAPL_POWER_UNIT, &msr_rapl_power_unit_bits))
-		return -1;
-
 	pmu = kzalloc_node(sizeof(*pmu), GFP_KERNEL, cpu_to_node(cpu));
 	if (!pmu)
 		return -1;
-
 	spin_lock_init(&pmu->lock);
 
 	INIT_LIST_HEAD(&pmu->active_list);
 
-	/*
-	 * grab power unit as: 1/2^unit Joules
-	 *
-	 * we cache in local PMU instance
-	 */
-	pmu->hw_unit = (msr_rapl_power_unit_bits >> 8) & 0x1FULL;
 	pmu->pmu = &rapl_pmu_class;
 
 	/*
@@ -586,8 +607,8 @@ static int rapl_cpu_prepare(int cpu)
 	 * divide interval by 2 to avoid lockstep (2 * 100)
 	 * if hw unit is 32, then we use 2 ms 1/200/2
 	 */
-	if (pmu->hw_unit < 32)
-		ms = (1000 / (2 * 100)) * (1ULL << (32 - pmu->hw_unit - 1));
+	if (rapl_hw_unit[0] < 32)
+		ms = (1000 / (2 * 100)) * (1ULL << (32 - rapl_hw_unit[0] - 1));
 	else
 		ms = 2;
 
@@ -655,6 +676,20 @@ static int rapl_cpu_notifier(struct notifier_block *self,
 	return NOTIFY_OK;
 }
 
+static int rapl_check_hw_unit(void)
+{
+	u64 msr_rapl_power_unit_bits;
+	int i;
+
+	/* protect rdmsrl() to handle virtualization */
+	if (rdmsrl_safe(MSR_RAPL_POWER_UNIT, &msr_rapl_power_unit_bits))
+		return -1;
+	for (i = 0; i < NR_RAPL_DOMAINS; i++)
+		rapl_hw_unit[i] = (msr_rapl_power_unit_bits >> 8) & 0x1FULL;
+
+	return 0;
+}
+
 static const struct x86_cpu_id rapl_cpu_match[] = {
 	[0] = { .vendor = X86_VENDOR_INTEL, .family = 6 },
 	[1] = {},
@@ -664,6 +699,8 @@ static int __init rapl_pmu_init(void)
 {
 	struct rapl_pmu *pmu;
 	int cpu, ret;
+	struct x86_pmu_quirk *quirk;
+	int i;
 
 	/*
 	 * check for Intel processor family 6
@@ -678,6 +715,11 @@ static int __init rapl_pmu_init(void)
 		rapl_cntr_mask = RAPL_IDX_CLN;
 		rapl_pmu_events_group.attrs = rapl_events_cln_attr;
 		break;
+	case 63: /* Haswell-Server */
+		rapl_add_quirk(rapl_hsw_server_quirk);
+		rapl_cntr_mask = RAPL_IDX_SRV;
+		rapl_pmu_events_group.attrs = rapl_events_srv_attr;
+		break;
 	case 60: /* Haswell */
 	case 69: /* Haswell-Celeron */
 		rapl_cntr_mask = RAPL_IDX_HSW;
@@ -693,7 +735,13 @@ static int __init rapl_pmu_init(void)
 		/* unsupported */
 		return 0;
 	}
+	ret = rapl_check_hw_unit();
+	if (ret)
+		return ret;
 
+	/* run cpu model quirks */
+	for (quirk = rapl_quirks; quirk; quirk = quirk->next)
+		quirk->func();
 	cpu_notifier_register_begin();
 
 	for_each_online_cpu(cpu) {
@@ -714,14 +762,18 @@ static int __init rapl_pmu_init(void)
 
 	pmu = __this_cpu_read(rapl_pmu);
 
-	pr_info("RAPL PMU detected, hw unit 2^-%d Joules,"
+	pr_info("RAPL PMU detected,"
 		" API unit is 2^-32 Joules,"
 		" %d fixed counters"
 		" %llu ms ovfl timer\n",
-		pmu->hw_unit,
 		hweight32(rapl_cntr_mask),
 		ktime_to_ms(pmu->timer_interval));
-
+	for (i = 0; i < NR_RAPL_DOMAINS; i++) {
+		if (rapl_cntr_mask & (1 << i)) {
+			pr_info("hw unit of domain %s 2^-%d Joules\n",
+				rapl_domain_names[i], rapl_hw_unit[i]);
+		}
+	}
 out:
 	cpu_notifier_register_done();
 
-- 
cgit v1.2.3


From 78d504bcd769cc496f63b626f507039eab2316b7 Mon Sep 17 00:00:00 2001
From: Kan Liang
Date: Thu, 2 Apr 2015 04:12:57 -0400
Subject: perf/x86/intel: Add Broadwell support for the LBR callstack

Same as Haswell, Broadwell also support the LBR callstack.

Signed-off-by: Kan Liang <kan.liang@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Andi Kleen <ak@linux.intel.com>
Link: http://lkml.kernel.org/r/1427962377-40955-1-git-send-email-kan.liang@intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event_intel.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 9da2400c2ec3..219d3fb423a1 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -3275,7 +3275,7 @@ __init int intel_pmu_init(void)
 		hw_cache_extra_regs[C(NODE)][C(OP_WRITE)][C(RESULT_ACCESS)] = HSW_DEMAND_WRITE|
 									      BDW_L3_MISS_LOCAL|HSW_SNOOP_DRAM;
 
-		intel_pmu_lbr_init_snb();
+		intel_pmu_lbr_init_hsw();
 
 		x86_pmu.event_constraints = intel_bdw_event_constraints;
 		x86_pmu.pebs_constraints = intel_hsw_pebs_event_constraints;
-- 
cgit v1.2.3


From 18ecb3bfa5a9f6fffbb3eeb4369f0b9463438ec0 Mon Sep 17 00:00:00 2001
From: Borislav Petkov
Date: Thu, 16 Apr 2015 20:41:37 +0200
Subject: x86/fpu: Load xsave pointer *after* initialization

So I was playing with gdb today and did this simple thing:

	gdb /bin/ls

	...

	(gdb) run

Box exploded with this splat:

	BUG: unable to handle kernel NULL pointer dereference at 00000000000001d0
	IP: [<ffffffff8100fe5a>] xstateregs_get+0x7a/0x120
	[...]

	Call Trace:
	 ptrace_regset
	 ptrace_request
	 ? wait_task_inactive
	 ? preempt_count_sub
	 arch_ptrace
	 ? ptrace_get_task_struct
	 SyS_ptrace
	 system_call_fastpath

... because we do cache &target->thread.fpu.state->xsave into the
local variable xsave but that pointer is NULL at that time and
it gets initialized later, in init_fpu(), see:

	e7f180dcd8ab ("x86/fpu: Change xstateregs_get()/set() to use ->xsave.i387 rather than ->fxsave")

The fix is simple: load xsave *after* init_fpu() has run.

Also do the same in xstateregs_set(), as suggested by Oleg Nesterov.

Signed-off-by: Borislav Petkov <bp@suse.de>
Acked-by: Oleg Nesterov <oleg@redhat.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Tavis Ormandy <taviso@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1429209697-5902-1-git-send-email-bp@alien8.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/i387.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index 367f39d35e9c..009183276bb7 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -341,7 +341,7 @@ int xstateregs_get(struct task_struct *target, const struct user_regset *regset,
 		unsigned int pos, unsigned int count,
 		void *kbuf, void __user *ubuf)
 {
-	struct xsave_struct *xsave = &target->thread.fpu.state->xsave;
+	struct xsave_struct *xsave;
 	int ret;
 
 	if (!cpu_has_xsave)
@@ -351,6 +351,8 @@ int xstateregs_get(struct task_struct *target, const struct user_regset *regset,
 	if (ret)
 		return ret;
 
+	xsave = &target->thread.fpu.state->xsave;
+
 	/*
 	 * Copy the 48bytes defined by the software first into the xstate
 	 * memory layout in the thread struct, so that we can copy the entire
@@ -369,7 +371,7 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset,
 		  unsigned int pos, unsigned int count,
 		  const void *kbuf, const void __user *ubuf)
 {
-	struct xsave_struct *xsave = &target->thread.fpu.state->xsave;
+	struct xsave_struct *xsave;
 	int ret;
 
 	if (!cpu_has_xsave)
@@ -379,6 +381,8 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset,
 	if (ret)
 		return ret;
 
+	xsave = &target->thread.fpu.state->xsave;
+
 	ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, xsave, 0, -1);
 	/*
 	 * mxcsr reserved bits must be masked to zero for security reasons.
-- 
cgit v1.2.3


From 8eb68bf75eb7ea73bce88cb9d42efd3bbcece3cd Mon Sep 17 00:00:00 2001
From: Kees Cook
Date: Thu, 16 Apr 2015 12:49:04 -0700
Subject: x86: switch to using asm-generic for seccomp.h

Switch to using the newly created asm-generic/seccomp.h for the seccomp
strict mode syscall definitions. The obsolete sigreturn syscall override
is retained in 32-bit mode, and the ia32 syscall overrides are used in
the compat case. Remaining definitions were identical.

Signed-off-by: Kees Cook <keescook@chromium.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/include/asm/seccomp.h    | 21 ++++++++++++++++++---
 arch/x86/include/asm/seccomp_32.h | 11 -----------
 arch/x86/include/asm/seccomp_64.h | 17 -----------------
 3 files changed, 18 insertions(+), 31 deletions(-)
 delete mode 100644 arch/x86/include/asm/seccomp_32.h
 delete mode 100644 arch/x86/include/asm/seccomp_64.h

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/seccomp.h b/arch/x86/include/asm/seccomp.h
index 0f3d7f099224..0c8c7c8861b4 100644
--- a/arch/x86/include/asm/seccomp.h
+++ b/arch/x86/include/asm/seccomp.h
@@ -1,5 +1,20 @@
+#ifndef _ASM_X86_SECCOMP_H
+#define _ASM_X86_SECCOMP_H
+
+#include <asm/unistd.h>
+
 #ifdef CONFIG_X86_32
-# include <asm/seccomp_32.h>
-#else
-# include <asm/seccomp_64.h>
+#define __NR_seccomp_sigreturn		__NR_sigreturn
 #endif
+
+#ifdef CONFIG_COMPAT
+#include <asm/ia32_unistd.h>
+#define __NR_seccomp_read_32		__NR_ia32_read
+#define __NR_seccomp_write_32		__NR_ia32_write
+#define __NR_seccomp_exit_32		__NR_ia32_exit
+#define __NR_seccomp_sigreturn_32	__NR_ia32_sigreturn
+#endif
+
+#include <asm-generic/seccomp.h>
+
+#endif /* _ASM_X86_SECCOMP_H */
diff --git a/arch/x86/include/asm/seccomp_32.h b/arch/x86/include/asm/seccomp_32.h
deleted file mode 100644
index b811d6f5780c..000000000000
--- a/arch/x86/include/asm/seccomp_32.h
+++ /dev/null
@@ -1,11 +0,0 @@
-#ifndef _ASM_X86_SECCOMP_32_H
-#define _ASM_X86_SECCOMP_32_H
-
-#include <linux/unistd.h>
-
-#define __NR_seccomp_read __NR_read
-#define __NR_seccomp_write __NR_write
-#define __NR_seccomp_exit __NR_exit
-#define __NR_seccomp_sigreturn __NR_sigreturn
-
-#endif /* _ASM_X86_SECCOMP_32_H */
diff --git a/arch/x86/include/asm/seccomp_64.h b/arch/x86/include/asm/seccomp_64.h
deleted file mode 100644
index 84ec1bd161a5..000000000000
--- a/arch/x86/include/asm/seccomp_64.h
+++ /dev/null
@@ -1,17 +0,0 @@
-#ifndef _ASM_X86_SECCOMP_64_H
-#define _ASM_X86_SECCOMP_64_H
-
-#include <linux/unistd.h>
-#include <asm/ia32_unistd.h>
-
-#define __NR_seccomp_read __NR_read
-#define __NR_seccomp_write __NR_write
-#define __NR_seccomp_exit __NR_exit
-#define __NR_seccomp_sigreturn __NR_rt_sigreturn
-
-#define __NR_seccomp_read_32 __NR_ia32_read
-#define __NR_seccomp_write_32 __NR_ia32_write
-#define __NR_seccomp_exit_32 __NR_ia32_exit
-#define __NR_seccomp_sigreturn_32 __NR_ia32_sigreturn
-
-#endif /* _ASM_X86_SECCOMP_64_H */
-- 
cgit v1.2.3


From 0c99241c93b8060441f3c8434848e54b5338f922 Mon Sep 17 00:00:00 2001
From: Ingo Molnar
Date: Thu, 16 Apr 2015 12:38:30 +0200
Subject: perf/x86/intel/pt: Fix and clean up error handling in pt_event_add()

Dan Carpenter reported that pt_event_add() has buggy
error handling logic: it returns 0 instead of -EBUSY when
it fails to start a newly added event.

Furthermore, the control flow in this function is messy,
with cleanup labels mixed with direct returns.

Fix the bug and clean up the code by converting it to
a straight fast path for the regular non-failing case,
plus a clear sequence of cascading goto labels to do
all cleanup.

NOTE: I materially changed the existing clean up logic in the
pt_event_start() failure case to use the direct
perf_aux_output_end() path, not pt_event_del(), because
perf_aux_output_end() is enough here.

Reported-by: Dan Carpenter <dan.carpenter@oracle.com>
Acked-by: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Julia Lawall <julia.lawall@lip6.fr>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/20150416103830.GB7847@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/kernel/cpu/perf_event_intel_pt.c | 33 ++++++++++++++-----------------
 1 file changed, 15 insertions(+), 18 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/cpu/perf_event_intel_pt.c b/arch/x86/kernel/cpu/perf_event_intel_pt.c
index f2770641c0fd..ffe666c2c6b5 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_pt.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_pt.c
@@ -988,39 +988,36 @@ static int pt_event_add(struct perf_event *event, int mode)
 	int ret = -EBUSY;
 
 	if (pt->handle.event)
-		goto out;
+		goto fail;
 
 	buf = perf_aux_output_begin(&pt->handle, event);
-	if (!buf) {
-		ret = -EINVAL;
-		goto out;
-	}
+	ret = -EINVAL;
+	if (!buf)
+		goto fail_stop;
 
 	pt_buffer_reset_offsets(buf, pt->handle.head);
 	if (!buf->snapshot) {
 		ret = pt_buffer_reset_markers(buf, &pt->handle);
-		if (ret) {
-			perf_aux_output_end(&pt->handle, 0, true);
-			goto out;
-		}
+		if (ret)
+			goto fail_end_stop;
 	}
 
 	if (mode & PERF_EF_START) {
 		pt_event_start(event, 0);
-		if (hwc->state == PERF_HES_STOPPED) {
-			pt_event_del(event, 0);
-			ret = -EBUSY;
-		}
+		ret = -EBUSY;
+		if (hwc->state == PERF_HES_STOPPED)
+			goto fail_end_stop;
 	} else {
 		hwc->state = PERF_HES_STOPPED;
 	}
 
-	ret = 0;
-out:
-
-	if (ret)
-		hwc->state = PERF_HES_STOPPED;
+	return 0;
 
+fail_end_stop:
+	perf_aux_output_end(&pt->handle, 0, true);
+fail_stop:
+	hwc->state = PERF_HES_STOPPED;
+fail:
 	return ret;
 }
 
-- 
cgit v1.2.3


From a6dfa128ce5c414ab46b1d690f7a1b8decb8526d Mon Sep 17 00:00:00 2001
From: Konrad Rzeszutek Wilk
Date: Fri, 17 Apr 2015 15:04:48 -0400
Subject: config: Enable NEED_DMA_MAP_STATE by default when SWIOTLB is selected

A huge amount of NIC drivers use the DMA API, however if
compiled under 32-bit an very important part of the DMA API can
be ommitted leading to the drivers not working at all
(especially if used with 'swiotlb=force iommu=soft').

As Prashant Sreedharan explains it: "the driver [tg3] uses
DEFINE_DMA_UNMAP_ADDR(), dma_unmap_addr_set() to keep a copy of
the dma "mapping" and dma_unmap_addr() to get the "mapping"
value. On most of the platforms this is a no-op, but ... with
"iommu=soft and swiotlb=force" this house keeping is required,
... otherwise we pass 0 while calling pci_unmap_/pci_dma_sync_
instead of the DMA address."

As such enable this even when using 32-bit kernels.

Reported-by: Ian Jackson <Ian.Jackson@eu.citrix.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Acked-by: David S. Miller <davem@davemloft.net>
Acked-by: Prashant Sreedharan <prashant@broadcom.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Michael Chan <mchan@broadcom.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: boris.ostrovsky@oracle.com
Cc: cascardo@linux.vnet.ibm.com
Cc: david.vrabel@citrix.com
Cc: sanjeevb@broadcom.com
Cc: siva.kallam@broadcom.com
Cc: vyasevich@gmail.com
Cc: xen-devel@lists.xensource.com
Link: http://lkml.kernel.org/r/20150417190448.GA9462@l.oracle.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 arch/x86/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index faff6934c05a..eb79036e3503 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -177,7 +177,7 @@ config SBUS
 
 config NEED_DMA_MAP_STATE
 	def_bool y
-	depends on X86_64 || INTEL_IOMMU || DMA_API_DEBUG
+	depends on X86_64 || INTEL_IOMMU || DMA_API_DEBUG || SWIOTLB
 
 config NEED_SG_DMA_LENGTH
 	def_bool y
-- 
cgit v1.2.3


From 0b2bb6925eb602eae993a4b5c282a8c18ad1c949 Mon Sep 17 00:00:00 2001
From: Len Brown
Date: Thu, 26 Mar 2015 00:50:30 -0400
Subject: tools/power turbostat: Initial Skylake support

Skylake adds some additional residency counters.

Skylake supports a different mix of RAPL registers
from any previous product.

In most other ways, Skylake is like Broadwell.

Signed-off-by: Len Brown <len.brown@intel.com>
---
 arch/x86/include/uapi/asm/msr-index.h |   5 ++
 tools/power/x86/turbostat/turbostat.c | 124 ++++++++++++++++++++++++++++++----
 2 files changed, 116 insertions(+), 13 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/uapi/asm/msr-index.h b/arch/x86/include/uapi/asm/msr-index.h
index c4c75272314a..fe01b0a784e7 100644
--- a/arch/x86/include/uapi/asm/msr-index.h
+++ b/arch/x86/include/uapi/asm/msr-index.h
@@ -150,6 +150,11 @@
 #define MSR_PP1_ENERGY_STATUS		0x00000641
 #define MSR_PP1_POLICY			0x00000642
 
+#define MSR_PKG_WEIGHTED_CORE_C0_RES	0x00000658
+#define MSR_PKG_ANY_CORE_C0_RES		0x00000659
+#define MSR_PKG_ANY_GFXE_C0_RES		0x0000065A
+#define MSR_PKG_BOTH_CORE_GFXE_C0_RES	0x0000065B
+
 #define MSR_CORE_C1_RES			0x00000660
 
 #define MSR_CC6_DEMOTION_POLICY_CONFIG	0x00000668
diff --git a/tools/power/x86/turbostat/turbostat.c b/tools/power/x86/turbostat/turbostat.c
index 50341a322cb8..ad5688914446 100644
--- a/tools/power/x86/turbostat/turbostat.c
+++ b/tools/power/x86/turbostat/turbostat.c
@@ -57,6 +57,7 @@ unsigned int do_pc3;
 unsigned int do_pc6;
 unsigned int do_pc7;
 unsigned int do_c8_c9_c10;
+unsigned int do_skl_residency;
 unsigned int do_slm_cstates;
 unsigned int use_c1_residency_msr;
 unsigned int has_aperf;
@@ -99,18 +100,18 @@ unsigned int do_ring_perf_limit_reasons;
 #define RAPL_DRAM		(1 << 3)
 					/* 0x618 MSR_DRAM_POWER_LIMIT */
 					/* 0x619 MSR_DRAM_ENERGY_STATUS */
-					/* 0x61c MSR_DRAM_POWER_INFO */
 #define RAPL_DRAM_PERF_STATUS	(1 << 4)
 					/* 0x61b MSR_DRAM_PERF_STATUS */
+#define RAPL_DRAM_POWER_INFO	(1 << 5)
+					/* 0x61c MSR_DRAM_POWER_INFO */
 
-#define RAPL_CORES		(1 << 5)
+#define RAPL_CORES		(1 << 6)
 					/* 0x638 MSR_PP0_POWER_LIMIT */
 					/* 0x639 MSR_PP0_ENERGY_STATUS */
-#define RAPL_CORE_POLICY	(1 << 6)
+#define RAPL_CORE_POLICY	(1 << 7)
 					/* 0x63a MSR_PP0_POLICY */
 
-
-#define RAPL_GFX		(1 << 7)
+#define RAPL_GFX		(1 << 8)
 					/* 0x640 MSR_PP1_POWER_LIMIT */
 					/* 0x641 MSR_PP1_ENERGY_STATUS */
 					/* 0x642 MSR_PP1_POLICY */
@@ -157,6 +158,10 @@ struct pkg_data {
 	unsigned long long pc8;
 	unsigned long long pc9;
 	unsigned long long pc10;
+	unsigned long long pkg_wtd_core_c0;
+	unsigned long long pkg_any_core_c0;
+	unsigned long long pkg_any_gfxe_c0;
+	unsigned long long pkg_both_core_gfxe_c0;
 	unsigned int package_id;
 	unsigned int energy_pkg;	/* MSR_PKG_ENERGY_STATUS */
 	unsigned int energy_dram;	/* MSR_DRAM_ENERGY_STATUS */
@@ -320,6 +325,13 @@ void print_header(void)
 	if (do_ptm)
 		outp += sprintf(outp, "  PkgTmp");
 
+	if (do_skl_residency) {
+		outp += sprintf(outp, " Totl%%C0");
+		outp += sprintf(outp, "  Any%%C0");
+		outp += sprintf(outp, "  GFX%%C0");
+		outp += sprintf(outp, " CPUGFX%%");
+	}
+
 	if (do_pc2)
 		outp += sprintf(outp, " Pkg%%pc2");
 	if (do_pc3)
@@ -401,6 +413,12 @@ int dump_counters(struct thread_data *t, struct core_data *c,
 
 	if (p) {
 		outp += sprintf(outp, "package: %d\n", p->package_id);
+
+		outp += sprintf(outp, "Weighted cores: %016llX\n", p->pkg_wtd_core_c0);
+		outp += sprintf(outp, "Any cores: %016llX\n", p->pkg_any_core_c0);
+		outp += sprintf(outp, "Any GFX: %016llX\n", p->pkg_any_gfxe_c0);
+		outp += sprintf(outp, "CPU + GFX: %016llX\n", p->pkg_both_core_gfxe_c0);
+
 		outp += sprintf(outp, "pc2: %016llX\n", p->pc2);
 		if (do_pc3)
 			outp += sprintf(outp, "pc3: %016llX\n", p->pc3);
@@ -539,9 +557,18 @@ int format_counters(struct thread_data *t, struct core_data *c,
 	if (!(t->flags & CPU_IS_FIRST_CORE_IN_PACKAGE))
 		goto done;
 
+	/* PkgTmp */
 	if (do_ptm)
 		outp += sprintf(outp, "%8d", p->pkg_temp_c);
 
+	/* Totl%C0, Any%C0 GFX%C0 CPUGFX% */
+	if (do_skl_residency) {
+		outp += sprintf(outp, "%8.2f", 100.0 * p->pkg_wtd_core_c0/t->tsc);
+		outp += sprintf(outp, "%8.2f", 100.0 * p->pkg_any_core_c0/t->tsc);
+		outp += sprintf(outp, "%8.2f", 100.0 * p->pkg_any_gfxe_c0/t->tsc);
+		outp += sprintf(outp, "%8.2f", 100.0 * p->pkg_both_core_gfxe_c0/t->tsc);
+	}
+
 	if (do_pc2)
 		outp += sprintf(outp, "%8.2f", 100.0 * p->pc2/t->tsc);
 	if (do_pc3)
@@ -644,6 +671,13 @@ void format_all_counters(struct thread_data *t, struct core_data *c, struct pkg_
 void
 delta_package(struct pkg_data *new, struct pkg_data *old)
 {
+
+	if (do_skl_residency) {
+		old->pkg_wtd_core_c0 = new->pkg_wtd_core_c0 - old->pkg_wtd_core_c0;
+		old->pkg_any_core_c0 = new->pkg_any_core_c0 - old->pkg_any_core_c0;
+		old->pkg_any_gfxe_c0 = new->pkg_any_gfxe_c0 - old->pkg_any_gfxe_c0;
+		old->pkg_both_core_gfxe_c0 = new->pkg_both_core_gfxe_c0 - old->pkg_both_core_gfxe_c0;
+	}
 	old->pc2 = new->pc2 - old->pc2;
 	if (do_pc3)
 		old->pc3 = new->pc3 - old->pc3;
@@ -790,6 +824,11 @@ void clear_counters(struct thread_data *t, struct core_data *c, struct pkg_data
 	c->c7 = 0;
 	c->core_temp_c = 0;
 
+	p->pkg_wtd_core_c0 = 0;
+	p->pkg_any_core_c0 = 0;
+	p->pkg_any_gfxe_c0 = 0;
+	p->pkg_both_core_gfxe_c0 = 0;
+
 	p->pc2 = 0;
 	if (do_pc3)
 		p->pc3 = 0;
@@ -834,6 +873,13 @@ int sum_counters(struct thread_data *t, struct core_data *c,
 	if (!(t->flags & CPU_IS_FIRST_CORE_IN_PACKAGE))
 		return 0;
 
+	if (do_skl_residency) {
+		average.packages.pkg_wtd_core_c0 += p->pkg_wtd_core_c0;
+		average.packages.pkg_any_core_c0 += p->pkg_any_core_c0;
+		average.packages.pkg_any_gfxe_c0 += p->pkg_any_gfxe_c0;
+		average.packages.pkg_both_core_gfxe_c0 += p->pkg_both_core_gfxe_c0;
+	}
+
 	average.packages.pc2 += p->pc2;
 	if (do_pc3)
 		average.packages.pc3 += p->pc3;
@@ -881,6 +927,13 @@ void compute_average(struct thread_data *t, struct core_data *c,
 	average.cores.c6 /= topo.num_cores;
 	average.cores.c7 /= topo.num_cores;
 
+	if (do_skl_residency) {
+		average.packages.pkg_wtd_core_c0 /= topo.num_packages;
+		average.packages.pkg_any_core_c0 /= topo.num_packages;
+		average.packages.pkg_any_gfxe_c0 /= topo.num_packages;
+		average.packages.pkg_both_core_gfxe_c0 /= topo.num_packages;
+	}
+
 	average.packages.pc2 /= topo.num_packages;
 	if (do_pc3)
 		average.packages.pc3 /= topo.num_packages;
@@ -987,6 +1040,16 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p)
 	if (!(t->flags & CPU_IS_FIRST_CORE_IN_PACKAGE))
 		return 0;
 
+	if (do_skl_residency) {
+		if (get_msr(cpu, MSR_PKG_WEIGHTED_CORE_C0_RES, &p->pkg_wtd_core_c0))
+			return -10;
+		if (get_msr(cpu, MSR_PKG_ANY_CORE_C0_RES, &p->pkg_any_core_c0))
+			return -11;
+		if (get_msr(cpu, MSR_PKG_ANY_GFXE_C0_RES, &p->pkg_any_gfxe_c0))
+			return -12;
+		if (get_msr(cpu, MSR_PKG_BOTH_CORE_GFXE_C0_RES, &p->pkg_both_core_gfxe_c0))
+			return -13;
+	}
 	if (do_pc3)
 		if (get_msr(cpu, MSR_PKG_C3_RESIDENCY, &p->pc3))
 			return -9;
@@ -1063,15 +1126,18 @@ int get_counters(struct thread_data *t, struct core_data *c, struct pkg_data *p)
 #define PCL_6R 9 /* PC6 Retention */
 #define PCL__7 10 /* PC7 */
 #define PCL_7S 11 /* PC7 Shrink */
-#define PCLUNL 12 /* Unlimited */
+#define PCL__8 12 /* PC8 */
+#define PCL__9 13 /* PC9 */
+#define PCLUNL 14 /* Unlimited */
 
 int pkg_cstate_limit = PCLUKN;
 char *pkg_cstate_limit_strings[] = { "reserved", "unknown", "pc0", "pc1", "pc2",
-	"pc3", "pc4", "pc6", "pc6n", "pc6r", "pc7", "pc7s", "unlimited"};
+	"pc3", "pc4", "pc6", "pc6n", "pc6r", "pc7", "pc7s", "pc8", "pc9", "unlimited"};
 
 int nhm_pkg_cstate_limits[8] = {PCL__0, PCL__1, PCL__3, PCL__6, PCL__7, PCLRSV, PCLRSV, PCLUNL};
 int snb_pkg_cstate_limits[8] = {PCL__0, PCL__2, PCL_6N, PCL_6R, PCL__7, PCL_7S, PCLRSV, PCLUNL};
 int hsw_pkg_cstate_limits[8] = {PCL__0, PCL__2, PCL__3, PCL__6, PCL__7, PCL_7S, PCLRSV, PCLUNL};
+int skl_pkg_cstate_limits[8] = {PCL__0, PCL__2, PCL__3, PCL__6, PCL__7, PCL_7S, PCL__8, PCL__9};
 int slv_pkg_cstate_limits[8] = {PCL__0, PCL__1, PCLRSV, PCLRSV, PCL__4, PCLRSV, PCL__6, PCL__7};
 int amt_pkg_cstate_limits[8] = {PCL__0, PCL__1, PCL__2, PCLRSV, PCLRSV, PCLRSV, PCL__6, PCL__7};
 int phi_pkg_cstate_limits[8] = {PCL__0, PCL__2, PCL_6N, PCL_6R, PCLRSV, PCLRSV, PCLRSV, PCLUNL};
@@ -1619,6 +1685,8 @@ int probe_nhm_msrs(unsigned int family, unsigned int model)
 	case 0x47:	/* BDW */
 	case 0x4F:	/* BDX */
 	case 0x56:	/* BDX-DE */
+	case 0x4E:	/* SKL */
+	case 0x5E:	/* SKL */
 		pkg_cstate_limits = hsw_pkg_cstate_limits;
 		break;
 	case 0x37:	/* BYT */
@@ -1895,14 +1963,18 @@ void rapl_probe(unsigned int family, unsigned int model)
 	case 0x47:	/* BDW */
 		do_rapl = RAPL_PKG | RAPL_CORES | RAPL_CORE_POLICY | RAPL_GFX | RAPL_PKG_POWER_INFO;
 		break;
+	case 0x4E:	/* SKL */
+	case 0x5E:	/* SKL */
+		do_rapl = RAPL_PKG | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_PKG_PERF_STATUS | RAPL_PKG_POWER_INFO;
+		break;
 	case 0x3F:	/* HSX */
 	case 0x4F:	/* BDX */
 	case 0x56:	/* BDX-DE */
-		do_rapl = RAPL_PKG | RAPL_DRAM | RAPL_DRAM_PERF_STATUS | RAPL_PKG_PERF_STATUS | RAPL_PKG_POWER_INFO;
+		do_rapl = RAPL_PKG | RAPL_DRAM | RAPL_DRAM_POWER_INFO | RAPL_DRAM_PERF_STATUS | RAPL_PKG_PERF_STATUS | RAPL_PKG_POWER_INFO;
 		break;
 	case 0x2D:
 	case 0x3E:
-		do_rapl = RAPL_PKG | RAPL_CORES | RAPL_CORE_POLICY | RAPL_DRAM | RAPL_PKG_PERF_STATUS | RAPL_DRAM_PERF_STATUS | RAPL_PKG_POWER_INFO;
+		do_rapl = RAPL_PKG | RAPL_CORES | RAPL_CORE_POLICY | RAPL_DRAM | RAPL_DRAM_POWER_INFO | RAPL_PKG_PERF_STATUS | RAPL_DRAM_PERF_STATUS | RAPL_PKG_POWER_INFO;
 		break;
 	case 0x37:	/* BYT */
 	case 0x4D:	/* AVN */
@@ -2092,19 +2164,18 @@ int print_rapl(struct thread_data *t, struct core_data *c, struct pkg_data *p)
 			((msr >> 48) & 1) ? "EN" : "DIS");
 	}
 
-	if (do_rapl & RAPL_DRAM) {
+	if (do_rapl & RAPL_DRAM_POWER_INFO) {
 		if (get_msr(cpu, MSR_DRAM_POWER_INFO, &msr))
                 	return -6;
 
-
 		fprintf(stderr, "cpu%d: MSR_DRAM_POWER_INFO,: 0x%08llx (%.0f W TDP, RAPL %.0f - %.0f W, %f sec.)\n",
 			cpu, msr,
 			((msr >>  0) & RAPL_POWER_GRANULARITY) * rapl_power_units,
 			((msr >> 16) & RAPL_POWER_GRANULARITY) * rapl_power_units,
 			((msr >> 32) & RAPL_POWER_GRANULARITY) * rapl_power_units,
 			((msr >> 48) & RAPL_TIME_GRANULARITY) * rapl_time_units);
-
-
+	}
+	if (do_rapl & RAPL_DRAM) {
 		if (get_msr(cpu, MSR_DRAM_POWER_LIMIT, &msr))
 			return -9;
 		fprintf(stderr, "cpu%d: MSR_DRAM_POWER_LIMIT: 0x%08llx (%slocked)\n",
@@ -2173,6 +2244,8 @@ int has_snb_msrs(unsigned int family, unsigned int model)
 	case 0x47:	/* BDW */
 	case 0x4F:	/* BDX */
 	case 0x56:	/* BDX-DE */
+	case 0x4E:	/* SKL */
+	case 0x5E:	/* SKL */
 		return 1;
 	}
 	return 0;
@@ -2193,12 +2266,36 @@ int has_hsw_msrs(unsigned int family, unsigned int model)
 	switch (model) {
 	case 0x45:	/* HSW */
 	case 0x3D:	/* BDW */
+	case 0x4E:	/* SKL */
+	case 0x5E:	/* SKL */
+		return 1;
+	}
+	return 0;
+}
+
+/*
+ * SKL adds support for additional MSRS:
+ *
+ * MSR_PKG_WEIGHTED_CORE_C0_RES    0x00000658
+ * MSR_PKG_ANY_CORE_C0_RES         0x00000659
+ * MSR_PKG_ANY_GFXE_C0_RES         0x0000065A
+ * MSR_PKG_BOTH_CORE_GFXE_C0_RES   0x0000065B
+ */
+int has_skl_msrs(unsigned int family, unsigned int model)
+{
+	if (!genuine_intel)
+		return 0;
+
+	switch (model) {
+	case 0x4E:	/* SKL */
+	case 0x5E:	/* SKL */
 		return 1;
 	}
 	return 0;
 }
 
 
+
 int is_slm(unsigned int family, unsigned int model)
 {
 	if (!genuine_intel)
@@ -2384,6 +2481,7 @@ void process_cpuid()
 	do_pc6 = (pkg_cstate_limit >= PCL__6);
 	do_pc7 = do_snb_cstates && (pkg_cstate_limit >= PCL__7);
 	do_c8_c9_c10 = has_hsw_msrs(family, model);
+	do_skl_residency = has_skl_msrs(family, model);
 	do_slm_cstates = is_slm(family, model);
 	bclk = discover_bclk(family, model);
 
-- 
cgit v1.2.3