From 5119e92efc733d730b34f9605a5ae61fdc4bf649 Mon Sep 17 00:00:00 2001 From: Jonathan Corbet Date: Thu, 15 May 2008 09:12:01 -0600 Subject: x86: cdev lock_kernel() pushdown Push the cdev lock_kernel() call down into the x86 msr and cpuid drivers. Signed-off-by: Jonathan Corbet --- arch/x86/kernel/cpuid.c | 25 +++++++++++++++++-------- arch/x86/kernel/msr.c | 16 ++++++++++++---- 2 files changed, 29 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c index daff52a62248..71f1c2654bec 100644 --- a/arch/x86/kernel/cpuid.c +++ b/arch/x86/kernel/cpuid.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include @@ -107,15 +108,23 @@ static ssize_t cpuid_read(struct file *file, char __user *buf, static int cpuid_open(struct inode *inode, struct file *file) { - unsigned int cpu = iminor(file->f_path.dentry->d_inode); - struct cpuinfo_x86 *c = &cpu_data(cpu); - - if (cpu >= NR_CPUS || !cpu_online(cpu)) - return -ENXIO; /* No such CPU */ + unsigned int cpu; + struct cpuinfo_x86 *c; + int ret = 0; + + lock_kernel(); + + cpu = iminor(file->f_path.dentry->d_inode); + if (cpu >= NR_CPUS || !cpu_online(cpu)) { + ret = -ENXIO; /* No such CPU */ + goto out; + } + c = &cpu_data(cpu); if (c->cpuid_level < 0) - return -EIO; /* CPUID not supported */ - - return 0; + ret = -EIO; /* CPUID not supported */ +out: + unlock_kernel(); + return ret; } /* diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c index 1f3abe048e93..a153b3905f60 100644 --- a/arch/x86/kernel/msr.c +++ b/arch/x86/kernel/msr.c @@ -117,12 +117,20 @@ static int msr_open(struct inode *inode, struct file *file) { unsigned int cpu = iminor(file->f_path.dentry->d_inode); struct cpuinfo_x86 *c = &cpu_data(cpu); + int ret = 0; - if (cpu >= NR_CPUS || !cpu_online(cpu)) - return -ENXIO; /* No such CPU */ - if (!cpu_has(c, X86_FEATURE_MSR)) - return -EIO; /* MSR not supported */ + lock_kernel(); + cpu = iminor(file->f_path.dentry->d_inode); + if (cpu >= NR_CPUS || !cpu_online(cpu)) { + ret = -ENXIO; /* No such CPU */ + goto out; + } + c = &cpu_data(cpu); + if (!cpu_has(c, X86_FEATURE_MSR)) + ret = -EIO; /* MSR not supported */ +out: + unlock_kernel(); + return ret; } -- cgit v1.2.3 From 23adec554a7648f99c8acc0caf49c66320cd2b84 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:41 +0200 Subject: x86: add notrace annotations to vsyscall. Add the notrace annotations to the vsyscall functions - there we are not in kernel context yet, so the tracer function cannot (and must not) be called.
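For illustration (not part of either patch): the notrace marker used above boils down to a GCC attribute that stops the compiler from planting the profiling call that -pg inserts, which is what makes it safe for vsyscall/vDSO code running in user context. A minimal sketch, with a hypothetical function name:

	/* sketch.c -- compile with: gcc -pg -c sketch.c */
	#define notrace __attribute__((no_instrument_function))

	/* With -pg, gcc emits a call to mcount() in every function
	 * prologue; the attribute above suppresses it for this
	 * function, so no tracer can ever be entered from here. */
	notrace static long vdso_example(long v)
	{
		return v + 1;
	}
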
Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/vsyscall_64.c | 3 ++- arch/x86/vdso/vclock_gettime.c | 15 ++++++++------- arch/x86/vdso/vgetcpu.c | 3 ++- include/asm-x86/vsyscall.h | 3 ++- 4 files changed, 14 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index 61efa2f7d564..4063dfa2a02d 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -42,7 +42,8 @@ #include #include -#define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr))) +#define __vsyscall(nr) \ + __attribute__ ((unused, __section__(".vsyscall_" #nr))) notrace #define __syscall_clobber "r11","cx","memory" /* diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c index 23476c2ebfc4..5cb8f754c52d 100644 --- a/arch/x86/vdso/vclock_gettime.c +++ b/arch/x86/vdso/vclock_gettime.c @@ -23,7 +23,7 @@ #define gtod vdso_vsyscall_gtod_data -static long vdso_fallback_gettime(long clock, struct timespec *ts) +notrace static long vdso_fallback_gettime(long clock, struct timespec *ts) { long ret; asm("syscall" : "=a" (ret) : @@ -31,7 +31,7 @@ static long vdso_fallback_gettime(long clock, struct timespec *ts) return ret; } -static inline long vgetns(void) +notrace static inline long vgetns(void) { long v; cycles_t (*vread)(void); @@ -40,7 +40,7 @@ static inline long vgetns(void) return (v * gtod->clock.mult) >> gtod->clock.shift; } -static noinline int do_realtime(struct timespec *ts) +notrace static noinline int do_realtime(struct timespec *ts) { unsigned long seq, ns; do { @@ -54,7 +54,8 @@ static noinline int do_realtime(struct timespec *ts) } /* Copy of the version in kernel/time.c which we cannot directly access */ -static void vset_normalized_timespec(struct timespec *ts, long sec, long nsec) +notrace static void +vset_normalized_timespec(struct timespec *ts, long sec, long nsec) { while (nsec >= NSEC_PER_SEC) { nsec -= NSEC_PER_SEC; @@ -68,7 +69,7 @@ static void vset_normalized_timespec(struct timespec *ts, long sec, long nsec) ts->tv_nsec = nsec; } -static noinline int do_monotonic(struct timespec *ts) +notrace static noinline int do_monotonic(struct timespec *ts) { unsigned long seq, ns, secs; do { @@ -82,7 +83,7 @@ static noinline int do_monotonic(struct timespec *ts) return 0; } -int __vdso_clock_gettime(clockid_t clock, struct timespec *ts) +notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts) { if (likely(gtod->sysctl_enabled && gtod->clock.vread)) switch (clock) { @@ -96,7 +97,7 @@ int __vdso_clock_gettime(clockid_t clock, struct timespec *ts) int clock_gettime(clockid_t, struct timespec *) __attribute__((weak, alias("__vdso_clock_gettime"))); -int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz) +notrace int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz) { long ret; if (likely(gtod->sysctl_enabled && gtod->clock.vread)) { diff --git a/arch/x86/vdso/vgetcpu.c b/arch/x86/vdso/vgetcpu.c index c8097f17f8a9..9fbc6b20026b 100644 --- a/arch/x86/vdso/vgetcpu.c +++ b/arch/x86/vdso/vgetcpu.c @@ -13,7 +13,8 @@ #include #include "vextern.h" -long __vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused) +notrace long +__vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused) { unsigned int p; diff --git a/include/asm-x86/vsyscall.h b/include/asm-x86/vsyscall.h index 17b3700949bf..6b66ff905af0 100644 --- a/include/asm-x86/vsyscall.h +++ 
b/include/asm-x86/vsyscall.h @@ -24,7 +24,8 @@ enum vsyscall_num { ((unused, __section__ (".vsyscall_gtod_data"),aligned(16))) #define __section_vsyscall_clock __attribute__ \ ((unused, __section__ (".vsyscall_clock"),aligned(16))) -#define __vsyscall_fn __attribute__ ((unused,__section__(".vsyscall_fn"))) +#define __vsyscall_fn \ + __attribute__ ((unused, __section__(".vsyscall_fn"))) notrace #define VGETCPU_RDTSCP 1 #define VGETCPU_LSL 2 -- cgit v1.2.3 From 16444a8a40d4c7b4f6de34af0cae1f76a4f6c901 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Mon, 12 May 2008 21:20:42 +0200 Subject: ftrace: add basic support for gcc profiler instrumentation If CONFIG_FTRACE is selected and /proc/sys/kernel/ftrace_enabled is set to a non-zero value, the ftrace routine will be called every time we enter a kernel function that is not marked with the "notrace" attribute. The ftrace routine will then call a registered function if a function happens to be registered. [ This code has been highly hacked by Steven Rostedt and Ingo Molnar, so don't blame Arnaldo for all of this ;-) ] Update: It is now possible to register more than one ftrace function. If only one ftrace function is registered, that will be the function that ftrace calls directly. If more than one function is registered, then ftrace will call a function that will loop through the functions to call. Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- Makefile | 4 ++ arch/x86/Kconfig | 1 + arch/x86/kernel/entry_32.S | 27 +++++++++ arch/x86/kernel/entry_64.S | 37 ++++++++++++ include/linux/ftrace.h | 38 +++++++++++++ kernel/Makefile | 1 + kernel/trace/Kconfig | 5 ++ kernel/trace/Makefile | 3 + kernel/trace/ftrace.c | 138 +++++++++++++++++++++++++++++++++++++++++++++ lib/Kconfig.debug | 2 + 10 files changed, 256 insertions(+) create mode 100644 include/linux/ftrace.h create mode 100644 kernel/trace/Kconfig create mode 100644 kernel/trace/Makefile create mode 100644 kernel/trace/ftrace.c (limited to 'arch/x86') diff --git a/Makefile b/Makefile index 20b32351906b..b4a273f19b52 100644 --- a/Makefile +++ b/Makefile @@ -528,6 +528,10 @@ KBUILD_CFLAGS += -g KBUILD_AFLAGS += -gdwarf-2 endif +ifdef CONFIG_FTRACE +KBUILD_CFLAGS += -pg +endif + # We trigger additional mismatches with less inlining ifdef CONFIG_DEBUG_SECTION_MISMATCH KBUILD_CFLAGS += $(call cc-option, -fno-inline-functions-called-once) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index fe361ae7ef2f..c742dfeb0dbe 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -23,6 +23,7 @@ config X86 select HAVE_OPROFILE select HAVE_KPROBES select HAVE_KRETPROBES + select HAVE_FTRACE select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64) select HAVE_ARCH_KGDB if !X86_VOYAGER diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 2a609dc3271c..f47b9b5440d2 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -1109,6 +1109,33 @@ ENDPROC(xen_failsafe_callback) #endif /* CONFIG_XEN */ +#ifdef CONFIG_FTRACE +ENTRY(mcount) + cmpl $ftrace_stub, ftrace_trace_function + jnz trace + +.globl ftrace_stub +ftrace_stub: + ret + + /* taken from glibc */ +trace: + pushl %eax + pushl %ecx + pushl %edx + movl 0xc(%esp), %eax + movl 0x4(%ebp), %edx + + call *ftrace_trace_function + + popl %edx + popl %ecx + popl %eax + + jmp ftrace_stub +END(mcount) +#endif + .section .rodata,"a" #include "syscall_table_32.S" diff --git a/arch/x86/kernel/entry_64.S 
b/arch/x86/kernel/entry_64.S index 556a8df522a7..f046e0c64883 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -54,6 +54,43 @@ .code64 +#ifdef CONFIG_FTRACE +ENTRY(mcount) + cmpq $ftrace_stub, ftrace_trace_function + jnz trace +.globl ftrace_stub +ftrace_stub: + retq + +trace: + /* taken from glibc */ + subq $0x38, %rsp + movq %rax, (%rsp) + movq %rcx, 8(%rsp) + movq %rdx, 16(%rsp) + movq %rsi, 24(%rsp) + movq %rdi, 32(%rsp) + movq %r8, 40(%rsp) + movq %r9, 48(%rsp) + + movq 0x38(%rsp), %rdi + movq 8(%rbp), %rsi + + call *ftrace_trace_function + + movq 48(%rsp), %r9 + movq 40(%rsp), %r8 + movq 32(%rsp), %rdi + movq 24(%rsp), %rsi + movq 16(%rsp), %rdx + movq 8(%rsp), %rcx + movq (%rsp), %rax + addq $0x38, %rsp + + jmp ftrace_stub +END(mcount) +#endif + #ifndef CONFIG_PREEMPT #define retint_kernel retint_restore_args #endif diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h new file mode 100644 index 000000000000..b96ef14c249a --- /dev/null +++ b/include/linux/ftrace.h @@ -0,0 +1,38 @@ +#ifndef _LINUX_FTRACE_H +#define _LINUX_FTRACE_H + +#ifdef CONFIG_FTRACE + +#include + +#define CALLER_ADDR0 ((unsigned long)__builtin_return_address(0)) +#define CALLER_ADDR1 ((unsigned long)__builtin_return_address(1)) +#define CALLER_ADDR2 ((unsigned long)__builtin_return_address(2)) + +typedef void (*ftrace_func_t)(unsigned long ip, unsigned long parent_ip); + +struct ftrace_ops { + ftrace_func_t func; + struct ftrace_ops *next; +}; + +/* + * The ftrace_ops must be a static and should also + * be read_mostly. These functions do modify read_mostly variables + * so use them sparingly. Never free an ftrace_ops or modify the + * next pointer after it has been registered. Even after unregistering + * it, the next pointer may still be used internally. + */ +int register_ftrace_function(struct ftrace_ops *ops); +int unregister_ftrace_function(struct ftrace_ops *ops); +void clear_ftrace_function(void); + +extern void ftrace_stub(unsigned long a0, unsigned long a1); +extern void mcount(void); + +#else /* !CONFIG_FTRACE */ +# define register_ftrace_function(ops) do { } while (0) +# define unregister_ftrace_function(ops) do { } while (0) +# define clear_ftrace_function(ops) do { } while (0) +#endif /* CONFIG_FTRACE */ +#endif /* _LINUX_FTRACE_H */ diff --git a/kernel/Makefile b/kernel/Makefile index 1c9938addb9d..fa05f6d8bdbf 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -69,6 +69,7 @@ obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o obj-$(CONFIG_MARKERS) += marker.o obj-$(CONFIG_LATENCYTOP) += latencytop.o +obj-$(CONFIG_FTRACE) += trace/ ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y) # According to Alan Modra , the -fno-omit-frame-pointer is diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig new file mode 100644 index 000000000000..8185c91417bc --- /dev/null +++ b/kernel/trace/Kconfig @@ -0,0 +1,5 @@ +# +# Architectures that offer an FTRACE implementation should select HAVE_FTRACE: +# +config HAVE_FTRACE + bool diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile new file mode 100644 index 000000000000..bf4fd215a6a9 --- /dev/null +++ b/kernel/trace/Makefile @@ -0,0 +1,3 @@ +obj-$(CONFIG_FTRACE) += libftrace.o + +libftrace-y := ftrace.o diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c new file mode 100644 index 000000000000..b6a80b98a3fb --- /dev/null +++ b/kernel/trace/ftrace.c @@ -0,0 +1,138 @@ +/* + * Infrastructure for profiling code inserted by 'gcc -pg'. 
+ * + * Copyright (C) 2007-2008 Steven Rostedt + * Copyright (C) 2004-2008 Ingo Molnar + * + * Originally ported from the -rt patch by: + * Copyright (C) 2007 Arnaldo Carvalho de Melo + * + * Based on code in the latency_tracer, that is: + * + * Copyright (C) 2004-2006 Ingo Molnar + * Copyright (C) 2004 William Lee Irwin III + */ + +#include +#include + +static DEFINE_SPINLOCK(ftrace_func_lock); +static struct ftrace_ops ftrace_list_end __read_mostly = +{ + .func = ftrace_stub, +}; + +static struct ftrace_ops *ftrace_list __read_mostly = &ftrace_list_end; +ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; + +/* mcount is defined per arch in assembly */ +EXPORT_SYMBOL(mcount); + +notrace void ftrace_list_func(unsigned long ip, unsigned long parent_ip) +{ + struct ftrace_ops *op = ftrace_list; + + /* in case someone actually ports this to alpha! */ + read_barrier_depends(); + + while (op != &ftrace_list_end) { + /* silly alpha */ + read_barrier_depends(); + op->func(ip, parent_ip); + op = op->next; + }; +} + +/** + * register_ftrace_function - register a function for profiling + * @ops - ops structure that holds the function for profiling. + * + * Register a function to be called by all functions in the + * kernel. + * + * Note: @ops->func and all the functions it calls must be labeled + * with "notrace", otherwise it will go into a + * recursive loop. + */ +int register_ftrace_function(struct ftrace_ops *ops) +{ + unsigned long flags; + + spin_lock_irqsave(&ftrace_func_lock, flags); + ops->next = ftrace_list; + /* + * We are entering ops into the ftrace_list but another + * CPU might be walking that list. We need to make sure + * the ops->next pointer is valid before another CPU sees + * the ops pointer included into the ftrace_list. + */ + smp_wmb(); + ftrace_list = ops; + /* + * For one func, simply call it directly. + * For more than one func, call the chain. + */ + if (ops->next == &ftrace_list_end) + ftrace_trace_function = ops->func; + else + ftrace_trace_function = ftrace_list_func; + spin_unlock_irqrestore(&ftrace_func_lock, flags); + + return 0; +} + +/** + * unregister_ftrace_function - unregister a function for profiling. + * @ops - ops structure that holds the function to unregister + * + * Unregister a function that was added to be called by ftrace profiling. + */ +int unregister_ftrace_function(struct ftrace_ops *ops) +{ + unsigned long flags; + struct ftrace_ops **p; + int ret = 0; + + spin_lock_irqsave(&ftrace_func_lock, flags); + + /* + * If we are the only function, then the ftrace pointer is + * pointing directly to that function. + */ + if (ftrace_list == ops && ops->next == &ftrace_list_end) { + ftrace_trace_function = ftrace_stub; + ftrace_list = &ftrace_list_end; + goto out; + } + + for (p = &ftrace_list; *p != &ftrace_list_end; p = &(*p)->next) + if (*p == ops) + break; + + if (*p != ops) { + ret = -1; + goto out; + } + + *p = (*p)->next; + + /* If we only have one func left, then call that directly */ + if (ftrace_list->next == &ftrace_list_end) + ftrace_trace_function = ftrace_list->func; + + out: + spin_unlock_irqrestore(&ftrace_func_lock, flags); + + return ret; +} + +/** + * clear_ftrace_function - reset the ftrace function + * + * This NULLs the ftrace function and in essence stops + * tracing. 
There may be lag + */ +void clear_ftrace_function(void) +{ + ftrace_trace_function = ftrace_stub; +} diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index d2099f41aa1e..d8b6279a9b42 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -634,6 +634,8 @@ config LATENCYTOP Enable this option if you want to use the LatencyTOP tool to find out which userspace is blocking on what kernel operations. +source kernel/trace/Kconfig + config PROVIDE_OHCI1394_DMA_INIT bool "Remote debugging over FireWire early on boot" depends on PCI && X86 -- cgit v1.2.3 From 81d68a96a39844853b37f20cc8282d9b65b78ef3 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:42 +0200 Subject: ftrace: trace irq disabled critical timings This patch adds latency tracing for critical timings (how long interrupts are disabled for). "irqsoff" is added to /debugfs/tracing/available_tracers Note: tracing_max_latency also holds the max latency for irqsoff (in usecs). (default to large number so one must start latency tracing) tracing_thresh threshold (in usecs) to always print out if irqs off is detected to be longer than stated here. If irq_thresh is non-zero, then max_irq_latency is ignored. Here's an example of a trace with ftrace_enabled = 0 ======= preemption latency trace v1.1.5 on 2.6.24-rc7 -------------------------------------------------------------------- latency: 100 us, #3/3, CPU#1 | (M:rt VP:0, KP:0, SP:0 HP:0 #P:2) ----------------- | task: swapper-0 (uid:0 nice:0 policy:0 rt_prio:0) ----------------- => started at: _spin_lock_irqsave+0x2a/0xb7 => ended at: _spin_unlock_irqrestore+0x32/0x5f _------=> CPU# / _-----=> irqs-off | / _----=> need-resched || / _---=> hardirq/softirq ||| / _--=> preempt-depth |||| / ||||| delay cmd pid ||||| time | caller \ / ||||| \ | / swapper-0 1d.s3 0us+: _spin_lock_irqsave+0x2a/0xb7 (e1000_update_stats+0x47/0x64c [e1000]) swapper-0 1d.s3 100us : _spin_unlock_irqrestore+0x32/0x5f (e1000_update_stats+0x641/0x64c [e1000]) swapper-0 1d.s3 100us : trace_hardirqs_on_caller+0x75/0x89 (_spin_unlock_irqrestore+0x32/0x5f) vim:ft=help ======= And this is a trace with ftrace_enabled == 1 ======= preemption latency trace v1.1.5 on 2.6.24-rc7 -------------------------------------------------------------------- latency: 102 us, #12/12, CPU#1 | (M:rt VP:0, KP:0, SP:0 HP:0 #P:2) ----------------- | task: swapper-0 (uid:0 nice:0 policy:0 rt_prio:0) ----------------- => started at: _spin_lock_irqsave+0x2a/0xb7 => ended at: _spin_unlock_irqrestore+0x32/0x5f _------=> CPU# / _-----=> irqs-off | / _----=> need-resched || / _---=> hardirq/softirq ||| / _--=> preempt-depth |||| / ||||| delay cmd pid ||||| time | caller \ / ||||| \ | / swapper-0 1dNs3 0us+: _spin_lock_irqsave+0x2a/0xb7 (e1000_update_stats+0x47/0x64c [e1000]) swapper-0 1dNs3 46us : e1000_read_phy_reg+0x16/0x225 [e1000] (e1000_update_stats+0x5e2/0x64c [e1000]) swapper-0 1dNs3 46us : e1000_swfw_sync_acquire+0x10/0x99 [e1000] (e1000_read_phy_reg+0x49/0x225 [e1000]) swapper-0 1dNs3 46us : e1000_get_hw_eeprom_semaphore+0x12/0xa6 [e1000] (e1000_swfw_sync_acquire+0x36/0x99 [e1000]) swapper-0 1dNs3 47us : __const_udelay+0x9/0x47 (e1000_read_phy_reg+0x116/0x225 [e1000]) swapper-0 1dNs3 47us+: __delay+0x9/0x50 (__const_udelay+0x45/0x47) swapper-0 1dNs3 97us : preempt_schedule+0xc/0x84 (__delay+0x4e/0x50) swapper-0 1dNs3 98us : e1000_swfw_sync_release+0xc/0x55 [e1000] (e1000_read_phy_reg+0x211/0x225 [e1000]) swapper-0 1dNs3 99us+: e1000_put_hw_eeprom_semaphore+0x9/0x35 [e1000] 
(e1000_swfw_sync_release+0x50/0x55 [e1000]) swapper-0 1dNs3 101us : _spin_unlock_irqrestore+0xe/0x5f (e1000_update_stats+0x641/0x64c [e1000]) swapper-0 1dNs3 102us : _spin_unlock_irqrestore+0x32/0x5f (e1000_update_stats+0x641/0x64c [e1000]) swapper-0 1dNs3 102us : trace_hardirqs_on_caller+0x75/0x89 (_spin_unlock_irqrestore+0x32/0x5f) vim:ft=help ======= Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/process_64.c | 3 + arch/x86/lib/Makefile | 1 + arch/x86/lib/thunk_32.S | 47 +++++ arch/x86/lib/thunk_64.S | 19 +- include/asm-x86/irqflags.h | 24 +-- include/linux/ftrace.h | 8 + include/linux/irqflags.h | 12 +- kernel/fork.c | 2 +- kernel/lockdep.c | 23 ++- kernel/printk.c | 2 + kernel/trace/Kconfig | 18 ++ kernel/trace/Makefile | 1 + kernel/trace/trace_irqsoff.c | 402 +++++++++++++++++++++++++++++++++++++++++++ 13 files changed, 531 insertions(+), 31 deletions(-) create mode 100644 arch/x86/lib/thunk_32.S create mode 100644 kernel/trace/trace_irqsoff.c (limited to 'arch/x86') diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index e2319f39988b..dd349c92f051 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -165,7 +165,10 @@ void cpu_idle(void) */ local_irq_disable(); enter_idle(); + /* Don't trace irqs off for idle */ + stop_critical_timings(); idle(); + start_critical_timings(); /* In many cases the interrupt that ended idle has already called exit_idle. But some idle loops can be woken up without interrupt. */ diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index 76f60f52a885..84aa2883fe15 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -5,6 +5,7 @@ obj-$(CONFIG_SMP) := msr-on-cpu.o lib-y := delay_$(BITS).o +lib-y += thunk_$(BITS).o lib-y += usercopy_$(BITS).o getuser_$(BITS).o putuser_$(BITS).o lib-y += memcpy_$(BITS).o diff --git a/arch/x86/lib/thunk_32.S b/arch/x86/lib/thunk_32.S new file mode 100644 index 000000000000..650b11e00ecc --- /dev/null +++ b/arch/x86/lib/thunk_32.S @@ -0,0 +1,47 @@ +/* + * Trampoline to trace irqs off. (otherwise CALLER_ADDR1 might crash) + * Copyright 2008 by Steven Rostedt, Red Hat, Inc + * (inspired by Andi Kleen's thunk_64.S) + * Subject to the GNU public license, v.2. No warranty of any kind. + */ + + #include + +#define ARCH_TRACE_IRQS_ON \ + pushl %eax; \ + pushl %ecx; \ + pushl %edx; \ + call trace_hardirqs_on; \ + popl %edx; \ + popl %ecx; \ + popl %eax; + +#define ARCH_TRACE_IRQS_OFF \ + pushl %eax; \ + pushl %ecx; \ + pushl %edx; \ + call trace_hardirqs_off; \ + popl %edx; \ + popl %ecx; \ + popl %eax; + +#ifdef CONFIG_TRACE_IRQFLAGS + /* put return address in eax (arg1) */ + .macro thunk_ra name,func + .globl \name +\name: + pushl %eax + pushl %ecx + pushl %edx + /* Place EIP in the arg1 */ + movl 3*4(%esp), %eax + call \func + popl %edx + popl %ecx + popl %eax + ret + .endm + + thunk_ra trace_hardirqs_on_thunk,trace_hardirqs_on_caller + thunk_ra trace_hardirqs_off_thunk,trace_hardirqs_off_caller +#endif diff --git a/arch/x86/lib/thunk_64.S b/arch/x86/lib/thunk_64.S index e009251d4e9f..bf9a7d5a5428 100644 --- a/arch/x86/lib/thunk_64.S +++ b/arch/x86/lib/thunk_64.S @@ -2,6 +2,7 @@ * Save registers before calling assembly functions. This avoids * disturbance of register allocation in some inline assembly constructs. * Copyright 2001,2002 by Andi Kleen, SuSE Labs. + * Added trace_hardirqs callers - Copyright 2007 Steven Rostedt, Red Hat, Inc. * Subject to the GNU public license, v.2. No warranty of any kind. 
*/ @@ -42,8 +43,22 @@ #endif #ifdef CONFIG_TRACE_IRQFLAGS - thunk trace_hardirqs_on_thunk,trace_hardirqs_on - thunk trace_hardirqs_off_thunk,trace_hardirqs_off + /* put return address in rdi (arg1) */ + .macro thunk_ra name,func + .globl \name +\name: + CFI_STARTPROC + SAVE_ARGS + /* SAVE_ARGS pushes 9 elements */ + /* the next element would be the rip */ + movq 9*8(%rsp), %rdi + call \func + jmp restore + CFI_ENDPROC + .endm + + thunk_ra trace_hardirqs_on_thunk,trace_hardirqs_on_caller + thunk_ra trace_hardirqs_off_thunk,trace_hardirqs_off_caller #endif #ifdef CONFIG_DEBUG_LOCK_ALLOC diff --git a/include/asm-x86/irqflags.h b/include/asm-x86/irqflags.h index c242527f970e..24d71b1eb189 100644 --- a/include/asm-x86/irqflags.h +++ b/include/asm-x86/irqflags.h @@ -179,8 +179,6 @@ static inline void trace_hardirqs_fixup(void) * have a reliable stack. x86_64 only. */ #define SWAPGS_UNSAFE_STACK swapgs -#define ARCH_TRACE_IRQS_ON call trace_hardirqs_on_thunk -#define ARCH_TRACE_IRQS_OFF call trace_hardirqs_off_thunk #define ARCH_LOCKDEP_SYS_EXIT call lockdep_sys_exit_thunk #define ARCH_LOCKDEP_SYS_EXIT_IRQ \ TRACE_IRQS_ON; \ @@ -192,24 +190,6 @@ static inline void trace_hardirqs_fixup(void) TRACE_IRQS_OFF; #else -#define ARCH_TRACE_IRQS_ON \ - pushl %eax; \ - pushl %ecx; \ - pushl %edx; \ - call trace_hardirqs_on; \ - popl %edx; \ - popl %ecx; \ - popl %eax; - -#define ARCH_TRACE_IRQS_OFF \ - pushl %eax; \ - pushl %ecx; \ - pushl %edx; \ - call trace_hardirqs_off; \ - popl %edx; \ - popl %ecx; \ - popl %eax; - #define ARCH_LOCKDEP_SYS_EXIT \ pushl %eax; \ pushl %ecx; \ @@ -223,8 +203,8 @@ static inline void trace_hardirqs_fixup(void) #endif #ifdef CONFIG_TRACE_IRQFLAGS -# define TRACE_IRQS_ON ARCH_TRACE_IRQS_ON -# define TRACE_IRQS_OFF ARCH_TRACE_IRQS_OFF +# define TRACE_IRQS_ON call trace_hardirqs_on_thunk; +# define TRACE_IRQS_OFF call trace_hardirqs_off_thunk; #else # define TRACE_IRQS_ON # define TRACE_IRQS_OFF diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index db8a5e7abe41..0a20445dcbcc 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -50,4 +50,12 @@ extern void mcount(void); # define CALLER_ADDR5 0UL #endif +#ifdef CONFIG_IRQSOFF_TRACER + extern void notrace time_hardirqs_on(unsigned long a0, unsigned long a1); + extern void notrace time_hardirqs_off(unsigned long a0, unsigned long a1); +#else +# define time_hardirqs_on(a0, a1) do { } while (0) +# define time_hardirqs_off(a0, a1) do { } while (0) +#endif + #endif /* _LINUX_FTRACE_H */ diff --git a/include/linux/irqflags.h b/include/linux/irqflags.h index e600c4e9b8c5..5b711d4e9fd9 100644 --- a/include/linux/irqflags.h +++ b/include/linux/irqflags.h @@ -12,10 +12,10 @@ #define _LINUX_TRACE_IRQFLAGS_H #ifdef CONFIG_TRACE_IRQFLAGS - extern void trace_hardirqs_on(void); - extern void trace_hardirqs_off(void); extern void trace_softirqs_on(unsigned long ip); extern void trace_softirqs_off(unsigned long ip); + extern void trace_hardirqs_on(void); + extern void trace_hardirqs_off(void); # define trace_hardirq_context(p) ((p)->hardirq_context) # define trace_softirq_context(p) ((p)->softirq_context) # define trace_hardirqs_enabled(p) ((p)->hardirqs_enabled) @@ -41,6 +41,14 @@ # define INIT_TRACE_IRQFLAGS #endif +#ifdef CONFIG_IRQSOFF_TRACER + extern void stop_critical_timings(void); + extern void start_critical_timings(void); +#else +# define stop_critical_timings() do { } while (0) +# define start_critical_timings() do { } while (0) +#endif + #ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT #include diff --git a/kernel/fork.c 
b/kernel/fork.c index 19908b26cf80..d66d676dc362 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -909,7 +909,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, rt_mutex_init_task(p); -#ifdef CONFIG_TRACE_IRQFLAGS +#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_LOCKDEP) DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled); DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled); #endif diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 81a4e4a3f087..e21924365ea3 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -39,6 +39,7 @@ #include #include #include +#include #include @@ -982,7 +983,7 @@ check_noncircular(struct lock_class *source, unsigned int depth) return 1; } -#ifdef CONFIG_TRACE_IRQFLAGS +#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) /* * Forwards and backwards subgraph searching, for the purposes of * proving that two subgraphs can be connected by a new dependency @@ -1680,7 +1681,7 @@ valid_state(struct task_struct *curr, struct held_lock *this, static int mark_lock(struct task_struct *curr, struct held_lock *this, enum lock_usage_bit new_bit); -#ifdef CONFIG_TRACE_IRQFLAGS +#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) /* * print irq inversion bug: @@ -2013,11 +2014,13 @@ void early_boot_irqs_on(void) /* * Hardirqs will be enabled: */ -void trace_hardirqs_on(void) +void notrace trace_hardirqs_on_caller(unsigned long a0) { struct task_struct *curr = current; unsigned long ip; + time_hardirqs_on(CALLER_ADDR0, a0); + if (unlikely(!debug_locks || current->lockdep_recursion)) return; @@ -2055,16 +2058,23 @@ void trace_hardirqs_on(void) curr->hardirq_enable_event = ++curr->irq_events; debug_atomic_inc(&hardirqs_on_events); } +EXPORT_SYMBOL(trace_hardirqs_on_caller); +void notrace trace_hardirqs_on(void) +{ + trace_hardirqs_on_caller(CALLER_ADDR0); +} EXPORT_SYMBOL(trace_hardirqs_on); /* * Hardirqs were disabled: */ -void trace_hardirqs_off(void) +void notrace trace_hardirqs_off_caller(unsigned long a0) { struct task_struct *curr = current; + time_hardirqs_off(CALLER_ADDR0, a0); + if (unlikely(!debug_locks || current->lockdep_recursion)) return; @@ -2082,7 +2092,12 @@ void trace_hardirqs_off(void) } else debug_atomic_inc(&redundant_hardirqs_off); } +EXPORT_SYMBOL(trace_hardirqs_off_caller); +void notrace trace_hardirqs_off(void) +{ + trace_hardirqs_off_caller(CALLER_ADDR0); +} EXPORT_SYMBOL(trace_hardirqs_off); /* diff --git a/kernel/printk.c b/kernel/printk.c index 8fb01c32aa3b..ae7d5b9e535d 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -1041,7 +1041,9 @@ void release_console_sem(void) _log_end = log_end; con_start = log_end; /* Flush */ spin_unlock(&logbuf_lock); + stop_critical_timings(); /* don't trace print latency */ call_console_drivers(_con_start, _log_end); + start_critical_timings(); local_irq_restore(flags); } console_locked = 0; diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 892ecc94a82b..896df1cf6adc 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -26,6 +26,24 @@ config FTRACE (the bootup default), then the overhead of the instructions is very small and not measurable even in micro-benchmarks. +config IRQSOFF_TRACER + bool "Interrupts-off Latency Tracer" + default n + depends on TRACE_IRQFLAGS_SUPPORT + depends on GENERIC_TIME + select TRACE_IRQFLAGS + select TRACING + select TRACER_MAX_TRACE + help + This option measures the time spent in irqs-off critical + sections, with microsecond accuracy. 
+ + The default measurement method is a maximum search, which is + disabled by default and can be runtime (re-)started + via: + + echo 0 > /debugfs/tracing/tracing_max_latency + config SCHED_TRACER bool "Scheduling Latency Tracer" depends on DEBUG_KERNEL diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 5508cdb19aea..46be8647fb65 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -3,6 +3,7 @@ obj-$(CONFIG_FTRACE) += libftrace.o obj-$(CONFIG_TRACING) += trace.o obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o obj-$(CONFIG_FTRACE) += trace_functions.o +obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o libftrace-y := ftrace.o diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c new file mode 100644 index 000000000000..a9131b0cf1a5 --- /dev/null +++ b/kernel/trace/trace_irqsoff.c @@ -0,0 +1,402 @@ +/* + * trace irqs off critical timings + * + * Copyright (C) 2007-2008 Steven Rostedt + * Copyright (C) 2008 Ingo Molnar + * + * From code in the latency_tracer, that is: + * + * Copyright (C) 2004-2006 Ingo Molnar + * Copyright (C) 2004 William Lee Irwin III + */ +#include +#include +#include +#include +#include +#include + +#include "trace.h" + +static struct trace_array *irqsoff_trace __read_mostly; +static int tracer_enabled __read_mostly; + +/* + * Sequence count - we record it when starting a measurement and + * skip the latency if the sequence has changed - some other section + * did a maximum and could disturb our measurement with serial console + * printouts, etc. Truly coinciding maximum latencies should be rare + * and what happens together happens separately as well, so this doesn't + * decrease the validity of the maximum found: + */ +static __cacheline_aligned_in_smp unsigned long max_sequence; + +#ifdef CONFIG_FTRACE +/* + * irqsoff uses its own tracer function to keep the overhead down: + */ +static void notrace +irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip) +{ + struct trace_array *tr = irqsoff_trace; + struct trace_array_cpu *data; + unsigned long flags; + long disabled; + int cpu; + + if (likely(!tracer_enabled)) + return; + + local_save_flags(flags); + + if (!irqs_disabled_flags(flags)) + return; + + cpu = raw_smp_processor_id(); + data = tr->data[cpu]; + disabled = atomic_inc_return(&data->disabled); + + if (likely(disabled == 1)) + ftrace(tr, data, ip, parent_ip, flags); + + atomic_dec(&data->disabled); +} + +static struct ftrace_ops trace_ops __read_mostly = +{ + .func = irqsoff_tracer_call, +}; +#endif /* CONFIG_FTRACE */ + +/* + * Should this new latency be reported/recorded? 
+ */ +static int notrace report_latency(cycle_t delta) +{ + if (tracing_thresh) { + if (delta < tracing_thresh) + return 0; + } else { + if (delta <= tracing_max_latency) + return 0; + } + return 1; +} + +static void notrace +check_critical_timing(struct trace_array *tr, + struct trace_array_cpu *data, + unsigned long parent_ip, + int cpu) +{ + unsigned long latency, t0, t1; + cycle_t T0, T1, T2, delta; + unsigned long flags; + + /* + * usecs conversion is slow so we try to delay the conversion + * as long as possible: + */ + T0 = data->preempt_timestamp; + T1 = now(cpu); + delta = T1-T0; + + local_save_flags(flags); + + if (!report_latency(delta)) + goto out; + + ftrace(tr, data, CALLER_ADDR0, parent_ip, flags); + /* + * Update the timestamp, because the trace entry above + * might change it (it can only get larger so the latency + * is fair to be reported): + */ + T2 = now(cpu); + + delta = T2-T0; + + latency = nsecs_to_usecs(delta); + + if (data->critical_sequence != max_sequence) + goto out; + + tracing_max_latency = delta; + t0 = nsecs_to_usecs(T0); + t1 = nsecs_to_usecs(T1); + + data->critical_end = parent_ip; + + update_max_tr_single(tr, current, cpu); + + if (tracing_thresh) + printk(KERN_INFO "(%16s-%-5d|#%d): %lu us critical section " + "violates %lu us threshold.\n" + " => started at timestamp %lu: ", + current->comm, current->pid, + raw_smp_processor_id(), + latency, nsecs_to_usecs(tracing_thresh), t0); + else + printk(KERN_INFO "(%16s-%-5d|#%d):" + " new %lu us maximum-latency " + "critical section.\n => started at timestamp %lu: ", + current->comm, current->pid, + raw_smp_processor_id(), + latency, t0); + + print_symbol(KERN_CONT "<%s>\n", data->critical_start); + printk(KERN_CONT " => ended at timestamp %lu: ", t1); + print_symbol(KERN_CONT "<%s>\n", data->critical_end); + dump_stack(); + t1 = nsecs_to_usecs(now(cpu)); + printk(KERN_CONT " => dump-end timestamp %lu\n\n", t1); + + max_sequence++; + +out: + data->critical_sequence = max_sequence; + data->preempt_timestamp = now(cpu); + tracing_reset(data); + ftrace(tr, data, CALLER_ADDR0, parent_ip, flags); +} + +static inline void notrace +start_critical_timing(unsigned long ip, unsigned long parent_ip) +{ + int cpu; + struct trace_array *tr = irqsoff_trace; + struct trace_array_cpu *data; + unsigned long flags; + + if (likely(!tracer_enabled)) + return; + + cpu = raw_smp_processor_id(); + data = tr->data[cpu]; + + if (unlikely(!data) || unlikely(!data->trace) || + data->critical_start || atomic_read(&data->disabled)) + return; + + atomic_inc(&data->disabled); + + data->critical_sequence = max_sequence; + data->preempt_timestamp = now(cpu); + data->critical_start = parent_ip; + tracing_reset(data); + + local_save_flags(flags); + ftrace(tr, data, ip, parent_ip, flags); + + atomic_dec(&data->disabled); +} + +static inline void notrace +stop_critical_timing(unsigned long ip, unsigned long parent_ip) +{ + int cpu; + struct trace_array *tr = irqsoff_trace; + struct trace_array_cpu *data; + unsigned long flags; + + if (likely(!tracer_enabled)) + return; + + cpu = raw_smp_processor_id(); + data = tr->data[cpu]; + + if (unlikely(!data) || unlikely(!data->trace) || + !data->critical_start || atomic_read(&data->disabled)) + return; + + atomic_inc(&data->disabled); + local_save_flags(flags); + ftrace(tr, data, ip, parent_ip, flags); + check_critical_timing(tr, data, parent_ip, cpu); + data->critical_start = 0; + atomic_dec(&data->disabled); +} + +void notrace start_critical_timings(void) +{ + unsigned long flags; + + 
local_save_flags(flags); + + if (irqs_disabled_flags(flags)) + start_critical_timing(CALLER_ADDR0, CALLER_ADDR1); +} + +void notrace stop_critical_timings(void) +{ + unsigned long flags; + + local_save_flags(flags); + + if (irqs_disabled_flags(flags)) + stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1); +} + +#ifdef CONFIG_PROVE_LOCKING +void notrace time_hardirqs_on(unsigned long a0, unsigned long a1) +{ + unsigned long flags; + + local_save_flags(flags); + + if (irqs_disabled_flags(flags)) + stop_critical_timing(a0, a1); +} + +void notrace time_hardirqs_off(unsigned long a0, unsigned long a1) +{ + unsigned long flags; + + local_save_flags(flags); + + if (irqs_disabled_flags(flags)) + start_critical_timing(a0, a1); +} + +#else /* !CONFIG_PROVE_LOCKING */ + +/* + * Stubs: + */ + +void early_boot_irqs_off(void) +{ +} + +void early_boot_irqs_on(void) +{ +} + +void trace_softirqs_on(unsigned long ip) +{ +} + +void trace_softirqs_off(unsigned long ip) +{ +} + +inline void print_irqtrace_events(struct task_struct *curr) +{ +} + +/* + * We are only interested in hardirq on/off events: + */ +void notrace trace_hardirqs_on(void) +{ + unsigned long flags; + + local_save_flags(flags); + + if (irqs_disabled_flags(flags)) + stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1); +} +EXPORT_SYMBOL(trace_hardirqs_on); + +void notrace trace_hardirqs_off(void) +{ + unsigned long flags; + + local_save_flags(flags); + + if (irqs_disabled_flags(flags)) + start_critical_timing(CALLER_ADDR0, CALLER_ADDR1); +} +EXPORT_SYMBOL(trace_hardirqs_off); + +void notrace trace_hardirqs_on_caller(unsigned long caller_addr) +{ + unsigned long flags; + + local_save_flags(flags); + + if (irqs_disabled_flags(flags)) + stop_critical_timing(CALLER_ADDR0, caller_addr); +} +EXPORT_SYMBOL(trace_hardirqs_on_caller); + +void notrace trace_hardirqs_off_caller(unsigned long caller_addr) +{ + unsigned long flags; + + local_save_flags(flags); + + if (irqs_disabled_flags(flags)) + start_critical_timing(CALLER_ADDR0, caller_addr); +} +EXPORT_SYMBOL(trace_hardirqs_off_caller); + +#endif /* CONFIG_PROVE_LOCKING */ + +static void start_irqsoff_tracer(struct trace_array *tr) +{ + tracer_enabled = 1; + register_ftrace_function(&trace_ops); +} + +static void stop_irqsoff_tracer(struct trace_array *tr) +{ + unregister_ftrace_function(&trace_ops); + tracer_enabled = 0; +} + +static void irqsoff_tracer_init(struct trace_array *tr) +{ + irqsoff_trace = tr; + /* make sure that the tracer is visible */ + smp_wmb(); + + if (tr->ctrl) + start_irqsoff_tracer(tr); +} + +static void irqsoff_tracer_reset(struct trace_array *tr) +{ + if (tr->ctrl) + stop_irqsoff_tracer(tr); +} + +static void irqsoff_tracer_ctrl_update(struct trace_array *tr) +{ + if (tr->ctrl) + start_irqsoff_tracer(tr); + else + stop_irqsoff_tracer(tr); +} + +static void notrace irqsoff_tracer_open(struct trace_iterator *iter) +{ + /* stop the trace while dumping */ + if (iter->tr->ctrl) + stop_irqsoff_tracer(iter->tr); +} + +static void notrace irqsoff_tracer_close(struct trace_iterator *iter) +{ + if (iter->tr->ctrl) + start_irqsoff_tracer(iter->tr); +} + +static struct tracer irqsoff_tracer __read_mostly = +{ + .name = "irqsoff", + .init = irqsoff_tracer_init, + .reset = irqsoff_tracer_reset, + .open = irqsoff_tracer_open, + .close = irqsoff_tracer_close, + .ctrl_update = irqsoff_tracer_ctrl_update, + .print_max = 1, +}; + +__init static int init_irqsoff_tracer(void) +{ + register_tracer(&irqsoff_tracer); + + return 0; +} +device_initcall(init_irqsoff_tracer); -- cgit v1.2.3 From 
6cd8a4bb2f97527a9ceb30bc77ea4e959c6a95e3 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:42 +0200 Subject: ftrace: trace preempt off critical timings Add preempt off timings. A lot of kernel core code is taken from the RT patch latency trace that was written by Ingo Molnar. This adds "preemptoff" and "preemptirqsoff" to /debugfs/tracing/available_tracers Now instead of just tracing irqs off, preemption off can be selected to be recorded. When this is selected, it shares the same files as irqs off timings. One can trace preemption off, irqs off, or both. By echoing "preemptoff" into /debugfs/tracing/current_tracer, recording of preempt off only is performed. "irqsoff" will only record the time irqs are disabled, but "preemptirqsoff" will take the total time irqs or preemption are disabled. Runtime switching of these options is now supported by simply echoing the appropriate trace name into /debugfs/tracing/current_tracer. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/process_32.c | 3 + include/linux/ftrace.h | 8 ++ include/linux/irqflags.h | 3 +- include/linux/preempt.h | 2 +- kernel/sched.c | 24 +++++- kernel/trace/Kconfig | 25 ++++++ kernel/trace/Makefile | 1 + kernel/trace/trace_irqsoff.c | 184 +++++++++++++++++++++++++++++++------------ 8 files changed, 197 insertions(+), 53 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index f8476dfbb60d..a30aa1f2607a 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -185,7 +185,10 @@ void cpu_idle(void) local_irq_disable(); __get_cpu_var(irq_stat).idle_timestamp = jiffies; + /* Don't trace irqs off for idle */ + stop_critical_timings(); idle(); + start_critical_timings(); } tick_nohz_restart_sched_tick(); preempt_enable_no_resched(); diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 0a20445dcbcc..740c97dcf9cb 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -58,4 +58,12 @@ extern void mcount(void); # define time_hardirqs_off(a0, a1) do { } while (0) #endif +#ifdef CONFIG_PREEMPT_TRACER + extern void notrace trace_preempt_on(unsigned long a0, unsigned long a1); + extern void notrace trace_preempt_off(unsigned long a0, unsigned long a1); +#else +# define trace_preempt_on(a0, a1) do { } while (0) +# define trace_preempt_off(a0, a1) do { } while (0) +#endif + #endif /* _LINUX_FTRACE_H */ diff --git a/include/linux/irqflags.h b/include/linux/irqflags.h index 5b711d4e9fd9..2b1c2e58566e 100644 --- a/include/linux/irqflags.h +++ b/include/linux/irqflags.h @@ -41,7 +41,8 @@ # define INIT_TRACE_IRQFLAGS #endif -#ifdef CONFIG_IRQSOFF_TRACER +#if defined(CONFIG_IRQSOFF_TRACER) || \ + defined(CONFIG_PREEMPT_TRACER) extern void stop_critical_timings(void); extern void start_critical_timings(void); #else diff --git a/include/linux/preempt.h b/include/linux/preempt.h index 36b03d50bf40..72b1a10a59b6 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -10,7 +10,7 @@ #include #include -#ifdef CONFIG_DEBUG_PREEMPT +#if defined(CONFIG_DEBUG_PREEMPT) || defined(CONFIG_PREEMPT_TRACER) extern void add_preempt_count(int val); extern void sub_preempt_count(int val); #else diff --git a/kernel/sched.c b/kernel/sched.c index 73e600852365..328494e28df2 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -70,6 +70,7 @@ #include #include #include +#include #include #include @@ -4365,26 +4366,44 @@ void 
scheduler_tick(void) #endif } -#if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT) +#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ + defined(CONFIG_PREEMPT_TRACER)) + +static inline unsigned long get_parent_ip(unsigned long addr) +{ + if (in_lock_functions(addr)) { + addr = CALLER_ADDR2; + if (in_lock_functions(addr)) + addr = CALLER_ADDR3; + } + return addr; +} void __kprobes add_preempt_count(int val) { +#ifdef CONFIG_DEBUG_PREEMPT /* * Underflow? */ if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) return; +#endif preempt_count() += val; +#ifdef CONFIG_DEBUG_PREEMPT /* * Spinlock count overflowing soon? */ DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= PREEMPT_MASK - 10); +#endif + if (preempt_count() == val) + trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); } EXPORT_SYMBOL(add_preempt_count); void __kprobes sub_preempt_count(int val) { +#ifdef CONFIG_DEBUG_PREEMPT /* * Underflow? */ @@ -4396,7 +4415,10 @@ void __kprobes sub_preempt_count(int val) if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) && !(preempt_count() & PREEMPT_MASK))) return; +#endif + if (preempt_count() == val) + trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); preempt_count() -= val; } EXPORT_SYMBOL(sub_preempt_count); diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 896df1cf6adc..6430016b98e8 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -44,6 +44,31 @@ config IRQSOFF_TRACER echo 0 > /debugfs/tracing/tracing_max_latency + (Note that kernel size and overhead increases with this option + enabled. This option and the preempt-off timing option can be + used together or separately.) + +config PREEMPT_TRACER + bool "Preemption-off Latency Tracer" + default n + depends on GENERIC_TIME + depends on PREEMPT + select TRACING + select TRACER_MAX_TRACE + help + This option measures the time spent in preemption off critical + sections, with microsecond accuracy. + + The default measurement method is a maximum search, which is + disabled by default and can be runtime (re-)started + via: + + echo 0 > /debugfs/tracing/tracing_max_latency + + (Note that kernel size and overhead increases with this option + enabled. This option and the irqs-off timing option can be + used together or separately.) 
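A side note on the sched.c hunk above: get_parent_ip() leans on the CALLER_ADDRn macros, which this series defines in include/linux/ftrace.h via __builtin_return_address(). A minimal user-space sketch of the same idea (names are hypothetical; build with gcc -fno-omit-frame-pointer so the deeper frame is walkable):

	#include <stdio.h>

	#define CALLER_ADDR0 ((unsigned long)__builtin_return_address(0))
	#define CALLER_ADDR1 ((unsigned long)__builtin_return_address(1))

	/* The probe reports its direct caller and, one level up, the
	 * caller's caller -- the same trick get_parent_ip() uses to
	 * skip over spinlock helpers via in_lock_functions(). */
	static __attribute__((noinline)) void probe(void)
	{
		printf("direct caller:   %p\n", (void *)CALLER_ADDR0);
		printf("caller's caller: %p\n", (void *)CALLER_ADDR1);
	}

	/* Stand-in for a lock helper we want to skip when reporting. */
	static __attribute__((noinline)) void lock_wrapper(void)
	{
		probe();
	}

	int main(void)
	{
		lock_wrapper();	/* probe sees lock_wrapper, then main */
		return 0;
	}
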
+ config SCHED_TRACER bool "Scheduling Latency Tracer" depends on DEBUG_KERNEL diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 46be8647fb65..3fec653d6533 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -4,6 +4,7 @@ obj-$(CONFIG_TRACING) += trace.o obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o obj-$(CONFIG_FTRACE) += trace_functions.o obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o +obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o libftrace-y := ftrace.o diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index a9131b0cf1a5..8b1231633dc5 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -21,6 +21,36 @@ static struct trace_array *irqsoff_trace __read_mostly; static int tracer_enabled __read_mostly; +static DEFINE_PER_CPU(int, tracing_cpu); + +enum { + TRACER_IRQS_OFF = (1 << 1), + TRACER_PREEMPT_OFF = (1 << 2), +}; + +static int trace_type __read_mostly; + +#ifdef CONFIG_PREEMPT_TRACER +static inline int notrace +preempt_trace(void) +{ + return ((trace_type & TRACER_PREEMPT_OFF) && preempt_count()); +} +#else +# define preempt_trace() (0) +#endif + +#ifdef CONFIG_IRQSOFF_TRACER +static inline int notrace +irq_trace(void) +{ + return ((trace_type & TRACER_IRQS_OFF) && + irqs_disabled()); +} +#else +# define irq_trace() (0) +#endif + /* * Sequence count - we record it when starting a measurement and * skip the latency if the sequence has changed - some other section @@ -44,14 +74,11 @@ irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip) long disabled; int cpu; - if (likely(!tracer_enabled)) + if (likely(!__get_cpu_var(tracing_cpu))) return; local_save_flags(flags); - if (!irqs_disabled_flags(flags)) - return; - cpu = raw_smp_processor_id(); data = tr->data[cpu]; disabled = atomic_inc_return(&data->disabled); @@ -171,23 +198,29 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip) if (likely(!tracer_enabled)) return; + if (__get_cpu_var(tracing_cpu)) + return; + cpu = raw_smp_processor_id(); data = tr->data[cpu]; if (unlikely(!data) || unlikely(!data->trace) || - data->critical_start || atomic_read(&data->disabled)) + atomic_read(&data->disabled)) return; atomic_inc(&data->disabled); data->critical_sequence = max_sequence; data->preempt_timestamp = now(cpu); - data->critical_start = parent_ip; + data->critical_start = parent_ip ? : ip; tracing_reset(data); local_save_flags(flags); + ftrace(tr, data, ip, parent_ip, flags); + __get_cpu_var(tracing_cpu) = 1; + atomic_dec(&data->disabled); } @@ -199,7 +232,13 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip) struct trace_array_cpu *data; unsigned long flags; - if (likely(!tracer_enabled)) + /* Always clear the tracing cpu on stopping the trace */ + if (unlikely(__get_cpu_var(tracing_cpu))) + __get_cpu_var(tracing_cpu) = 0; + else + return; + + if (!tracer_enabled) return; cpu = raw_smp_processor_id(); @@ -212,49 +251,35 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip) atomic_inc(&data->disabled); local_save_flags(flags); ftrace(tr, data, ip, parent_ip, flags); - check_critical_timing(tr, data, parent_ip, cpu); + check_critical_timing(tr, data, parent_ip ? 
: ip, cpu); data->critical_start = 0; atomic_dec(&data->disabled); } +/* start and stop critical timings, used to stop tracing (in idle) */ void notrace start_critical_timings(void) { - unsigned long flags; - - local_save_flags(flags); - - if (irqs_disabled_flags(flags)) + if (preempt_trace() || irq_trace()) start_critical_timing(CALLER_ADDR0, CALLER_ADDR1); } void notrace stop_critical_timings(void) { - unsigned long flags; - - local_save_flags(flags); - - if (irqs_disabled_flags(flags)) + if (preempt_trace() || irq_trace()) stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1); } +#ifdef CONFIG_IRQSOFF_TRACER #ifdef CONFIG_PROVE_LOCKING void notrace time_hardirqs_on(unsigned long a0, unsigned long a1) { - unsigned long flags; - - local_save_flags(flags); - - if (irqs_disabled_flags(flags)) + if (!preempt_trace() && irq_trace()) stop_critical_timing(a0, a1); } void notrace time_hardirqs_off(unsigned long a0, unsigned long a1) { - unsigned long flags; - - local_save_flags(flags); - - if (irqs_disabled_flags(flags)) + if (!preempt_trace() && irq_trace()) start_critical_timing(a0, a1); } @@ -289,49 +314,46 @@ inline void print_irqtrace_events(struct task_struct *curr) */ void notrace trace_hardirqs_on(void) { - unsigned long flags; - - local_save_flags(flags); - - if (irqs_disabled_flags(flags)) + if (!preempt_trace() && irq_trace()) stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1); } EXPORT_SYMBOL(trace_hardirqs_on); void notrace trace_hardirqs_off(void) { - unsigned long flags; - - local_save_flags(flags); - - if (irqs_disabled_flags(flags)) + if (!preempt_trace() && irq_trace()) start_critical_timing(CALLER_ADDR0, CALLER_ADDR1); } EXPORT_SYMBOL(trace_hardirqs_off); void notrace trace_hardirqs_on_caller(unsigned long caller_addr) { - unsigned long flags; - - local_save_flags(flags); - - if (irqs_disabled_flags(flags)) + if (!preempt_trace() && irq_trace()) stop_critical_timing(CALLER_ADDR0, caller_addr); } EXPORT_SYMBOL(trace_hardirqs_on_caller); void notrace trace_hardirqs_off_caller(unsigned long caller_addr) { - unsigned long flags; - - local_save_flags(flags); - - if (irqs_disabled_flags(flags)) + if (!preempt_trace() && irq_trace()) start_critical_timing(CALLER_ADDR0, caller_addr); } EXPORT_SYMBOL(trace_hardirqs_off_caller); #endif /* CONFIG_PROVE_LOCKING */ +#endif /* CONFIG_IRQSOFF_TRACER */ + +#ifdef CONFIG_PREEMPT_TRACER +void notrace trace_preempt_on(unsigned long a0, unsigned long a1) +{ + stop_critical_timing(a0, a1); +} + +void notrace trace_preempt_off(unsigned long a0, unsigned long a1) +{ + start_critical_timing(a0, a1); +} +#endif /* CONFIG_PREEMPT_TRACER */ static void start_irqsoff_tracer(struct trace_array *tr) { @@ -345,7 +367,7 @@ static void stop_irqsoff_tracer(struct trace_array *tr) tracer_enabled = 0; } -static void irqsoff_tracer_init(struct trace_array *tr) +static void __irqsoff_tracer_init(struct trace_array *tr) { irqsoff_trace = tr; /* make sure that the tracer is visibel */ @@ -382,6 +404,13 @@ static void notrace irqsoff_tracer_close(struct trace_iterator *iter) start_irqsoff_tracer(iter->tr); } +#ifdef CONFIG_IRQSOFF_TRACER +static void irqsoff_tracer_init(struct trace_array *tr) +{ + trace_type = TRACER_IRQS_OFF; + + __irqsoff_tracer_init(tr); +} static struct tracer irqsoff_tracer __read_mostly = { .name = "irqsoff", @@ -392,10 +421,65 @@ static struct tracer irqsoff_tracer __read_mostly = .ctrl_update = irqsoff_tracer_ctrl_update, .print_max = 1, }; +# define register_irqsoff(trace) register_tracer(&trace) +#else +# define register_irqsoff(trace) do { } 
while (0) +#endif + +#ifdef CONFIG_PREEMPT_TRACER +static void preemptoff_tracer_init(struct trace_array *tr) +{ + trace_type = TRACER_PREEMPT_OFF; + + __irqsoff_tracer_init(tr); +} + +static struct tracer preemptoff_tracer __read_mostly = +{ + .name = "preemptoff", + .init = preemptoff_tracer_init, + .reset = irqsoff_tracer_reset, + .open = irqsoff_tracer_open, + .close = irqsoff_tracer_close, + .ctrl_update = irqsoff_tracer_ctrl_update, + .print_max = 1, +}; +# define register_preemptoff(trace) register_tracer(&trace) +#else +# define register_preemptoff(trace) do { } while (0) +#endif + +#if defined(CONFIG_IRQSOFF_TRACER) && \ + defined(CONFIG_PREEMPT_TRACER) + +static void preemptirqsoff_tracer_init(struct trace_array *tr) +{ + trace_type = TRACER_IRQS_OFF | TRACER_PREEMPT_OFF; + + __irqsoff_tracer_init(tr); +} + +static struct tracer preemptirqsoff_tracer __read_mostly = +{ + .name = "preemptirqsoff", + .init = preemptirqsoff_tracer_init, + .reset = irqsoff_tracer_reset, + .open = irqsoff_tracer_open, + .close = irqsoff_tracer_close, + .ctrl_update = irqsoff_tracer_ctrl_update, + .print_max = 1, +}; + +# define register_preemptirqsoff(trace) register_tracer(&trace) +#else +# define register_preemptirqsoff(trace) do { } while (0) +#endif __init static int init_irqsoff_tracer(void) { - register_tracer(&irqsoff_tracer); + register_irqsoff(irqsoff_tracer); + register_preemptoff(preemptoff_tracer); + register_preemptirqsoff(preemptirqsoff_tracer); return 0; } -- cgit v1.2.3 From 3d0833953e1b98b79ddf491dd49229eef9baeac1 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:42 +0200 Subject: ftrace: dynamic enabling/disabling of function calls This patch adds a feature to dynamically replace the ftrace code with the jmps to allow a kernel with ftrace configured to run as fast as it can without it configured. The way this works is: on bootup (if ftrace is enabled), an ftrace function is registered to record the instruction pointer of all places that call the function. Later, if there's still any code to patch, a kthread is awoken (rate limited to at most once a second) that performs a stop_machine, and replaces all the code that was called with a jmp over the call to ftrace. It only replaces what was found the previous time. Typically the system reaches equilibrium quickly after bootup and there's no code patching needed at all. e.g. call ftrace /* 5 bytes */ is replaced with jmp 3f /* jmp is 2 bytes and we jump 3 forward */ 3: When we want to enable ftrace for function tracing, the IP recording is removed, and stop_machine is called again to replace all the locations that were recorded back to the call of ftrace. When it is disabled, we replace the code back to the jmp. Allocation is done by the kthread. If the ftrace recording function is called, and we don't have any record slots available, then we simply skip that call. Once a second a new page (if needed) is allocated for recording new ftrace function calls. A large batch is allocated at boot up to get most of the calls there. Because we do this via stop_machine, we don't have to worry about another CPU executing a ftrace call as we modify it. But we do need to worry about NMI's so all functions that might be called via nmi must be annotated with notrace_nmi. When this code is configured in, the NMI code will not call notrace. 
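To make the patching scheme concrete, a small user-space sketch (illustrative only; the helper names are hypothetical) of the two byte patterns involved: the 5-byte relative call that "gcc -pg" emits, and the 2-byte "jmp +3" (the 0x03eb JMPFWD constant in the patch below) that overwrites its first two bytes and hops over the remaining three:

	#include <stdint.h>
	#include <stdio.h>

	/* A near call is 0xe8 plus a 32-bit displacement measured from
	 * the address of the *next* instruction; this mirrors the
	 * ftrace_code_union/ftrace_calc_offset() pair below, where
	 * rec->ip already points just past the call. */
	union code_union {
		unsigned char code[5];
		struct {
			unsigned char e8;
			int32_t offset;
		} __attribute__((packed));
	};

	static void make_call(union code_union *c, unsigned long call_addr,
			      unsigned long target)
	{
		c->e8 = 0xe8;
		c->offset = (int32_t)(target - (call_addr + 5));
	}

	int main(void)
	{
		union code_union c;
		unsigned char jmpfwd[2] = { 0xeb, 0x03 };	/* 0x03eb, little-endian */

		make_call(&c, 0x1000, 0x2000);	/* call at 0x1000, mcount at 0x2000 */
		printf("call: e8=%02x rel32=%d\n", c.e8, c.offset);
		printf("jmp:  %02x %02x (skips the 3 leftover bytes)\n",
		       jmpfwd[0], jmpfwd[1]);
		return 0;
	}
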
Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/Makefile | 1 + arch/x86/kernel/ftrace.c | 237 +++++++++++++++++++++++++++++++ include/linux/ftrace.h | 18 +++ kernel/trace/Kconfig | 17 +++ kernel/trace/ftrace.c | 356 ++++++++++++++++++++++++++++++++++++++++++----- 5 files changed, 597 insertions(+), 32 deletions(-) create mode 100644 arch/x86/kernel/ftrace.c (limited to 'arch/x86') diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 5e618c3b4720..e142091524b0 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -56,6 +56,7 @@ obj-$(CONFIG_X86_MPPARSE) += mpparse.o obj-$(CONFIG_X86_LOCAL_APIC) += apic_$(BITS).o nmi_$(BITS).o obj-$(CONFIG_X86_IO_APIC) += io_apic_$(BITS).o obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o +obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c new file mode 100644 index 000000000000..5dd58136ef02 --- /dev/null +++ b/arch/x86/kernel/ftrace.c @@ -0,0 +1,237 @@ +/* + * Code for replacing ftrace calls with jumps. + * + * Copyright (C) 2007-2008 Steven Rostedt + * + * Thanks goes to Ingo Molnar, for suggesting the idea. + * Mathieu Desnoyers, for suggesting postponing the modifications. + * Arjan van de Ven, for keeping me straight, and explaining to me + * the dangers of modifying code on the run. + */ + +#include +#include +#include +#include +#include +#include + +#define CALL_BACK 5 + +#define JMPFWD 0x03eb + +static unsigned short ftrace_jmp = JMPFWD; + +struct ftrace_record { + struct dyn_ftrace rec; + int failed; +} __attribute__((packed)); + +struct ftrace_page { + struct ftrace_page *next; + int index; + struct ftrace_record records[]; +} __attribute__((packed)); + +#define ENTRIES_PER_PAGE \ + ((PAGE_SIZE - sizeof(struct ftrace_page)) / sizeof(struct ftrace_record)) + +/* estimate from running different kernels */ +#define NR_TO_INIT 10000 + +#define MCOUNT_ADDR ((long)(&mcount)) + +union ftrace_code_union { + char code[5]; + struct { + char e8; + int offset; + } __attribute__((packed)); +}; + +static struct ftrace_page *ftrace_pages_start; +static struct ftrace_page *ftrace_pages; + +notrace struct dyn_ftrace *ftrace_alloc_shutdown_node(unsigned long ip) +{ + struct ftrace_record *rec; + unsigned short save; + + ip -= CALL_BACK; + save = *(short *)ip; + + /* If this was already converted, skip it */ + if (save == JMPFWD) + return NULL; + + if (ftrace_pages->index == ENTRIES_PER_PAGE) { + if (!ftrace_pages->next) + return NULL; + ftrace_pages = ftrace_pages->next; + } + + rec = &ftrace_pages->records[ftrace_pages->index++]; + + return &rec->rec; +} + +static int notrace +ftrace_modify_code(unsigned long ip, unsigned char *old_code, + unsigned char *new_code) +{ + unsigned short old = *(unsigned short *)old_code; + unsigned short new = *(unsigned short *)new_code; + unsigned short replaced; + int faulted = 0; + + /* + * Note: Due to modules and __init, code can + * disappear and change, we need to protect against faulting + * as well as code changing. + * + * No real locking needed, this code is run through + * kstop_machine. 
+ */ + asm volatile ( + "1: lock\n" + " cmpxchg %w3, (%2)\n" + "2:\n" + ".section .fixup, \"ax\"\n" + " movl $1, %0\n" + "3: jmp 2b\n" + ".previous\n" + _ASM_EXTABLE(1b, 3b) + : "=r"(faulted), "=a"(replaced) + : "r"(ip), "r"(new), "0"(faulted), "a"(old) + : "memory"); + sync_core(); + + if (replaced != old) + faulted = 2; + + return faulted; +} + +static int notrace ftrace_calc_offset(long ip) +{ + return (int)(MCOUNT_ADDR - ip); +} + +notrace void ftrace_code_disable(struct dyn_ftrace *rec) +{ + unsigned long ip; + union ftrace_code_union save; + struct ftrace_record *r = + container_of(rec, struct ftrace_record, rec); + + ip = rec->ip; + + save.e8 = 0xe8; + save.offset = ftrace_calc_offset(ip); + + /* move the IP back to the start of the call */ + ip -= CALL_BACK; + + r->failed = ftrace_modify_code(ip, save.code, (char *)&ftrace_jmp); +} + +static void notrace ftrace_replace_code(int saved) +{ + unsigned char *new = NULL, *old = NULL; + struct ftrace_record *rec; + struct ftrace_page *pg; + unsigned long ip; + int i; + + if (saved) + old = (char *)&ftrace_jmp; + else + new = (char *)&ftrace_jmp; + + for (pg = ftrace_pages_start; pg; pg = pg->next) { + for (i = 0; i < pg->index; i++) { + union ftrace_code_union calc; + rec = &pg->records[i]; + + /* don't modify code that has already faulted */ + if (rec->failed) + continue; + + ip = rec->rec.ip; + + calc.e8 = 0xe8; + calc.offset = ftrace_calc_offset(ip); + + if (saved) + new = calc.code; + else + old = calc.code; + + ip -= CALL_BACK; + + rec->failed = ftrace_modify_code(ip, old, new); + } + } + +} + +notrace void ftrace_startup_code(void) +{ + ftrace_replace_code(1); +} + +notrace void ftrace_shutdown_code(void) +{ + ftrace_replace_code(0); +} + +notrace void ftrace_shutdown_replenish(void) +{ + if (ftrace_pages->next) + return; + + /* allocate another page */ + ftrace_pages->next = (void *)get_zeroed_page(GFP_KERNEL); +} + +notrace int ftrace_shutdown_arch_init(void) +{ + struct ftrace_page *pg; + int cnt; + int i; + + /* allocate a few pages */ + ftrace_pages_start = (void *)get_zeroed_page(GFP_KERNEL); + if (!ftrace_pages_start) + return -1; + + /* + * Allocate a few more pages. + * + * TODO: have some parser search vmlinux before + * final linking to find all calls to ftrace. + * Then we can: + * a) know how many pages to allocate. + * and/or + * b) set up the table then. + * + * The dynamic code is still necessary for + * modules. 
+ */ + + pg = ftrace_pages = ftrace_pages_start; + + cnt = NR_TO_INIT / ENTRIES_PER_PAGE; + + for (i = 0; i < cnt; i++) { + pg->next = (void *)get_zeroed_page(GFP_KERNEL); + + /* If we fail, we'll try later anyway */ + if (!pg->next) + break; + + pg = pg->next; + } + + return 0; +} diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 740c97dcf9cb..90dbc0ee2046 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -32,6 +32,24 @@ extern void mcount(void); # define clear_ftrace_function(ops) do { } while (0) #endif /* CONFIG_FTRACE */ +#ifdef CONFIG_DYNAMIC_FTRACE +# define FTRACE_HASHBITS 10 +# define FTRACE_HASHSIZE (1< +#include +#include +#include +#include +#include #include +#include +#include +#include + +#include "trace.h" -static DEFINE_SPINLOCK(ftrace_func_lock); +static DEFINE_SPINLOCK(ftrace_lock); static struct ftrace_ops ftrace_list_end __read_mostly = { .func = ftrace_stub, @@ -44,21 +53,21 @@ notrace void ftrace_list_func(unsigned long ip, unsigned long parent_ip) } /** - * register_ftrace_function - register a function for profiling - * @ops - ops structure that holds the function for profiling. - * - * Register a function to be called by all functions in the - * kernel. + * clear_ftrace_function - reset the ftrace function * - * Note: @ops->func and all the functions it calls must be labeled - * with "notrace", otherwise it will go into a - * recursive loop. + * This NULLs the ftrace function and in essence stops + * tracing. There may be lag */ -int register_ftrace_function(struct ftrace_ops *ops) +void clear_ftrace_function(void) { - unsigned long flags; + ftrace_trace_function = ftrace_stub; +} + +static int notrace __register_ftrace_function(struct ftrace_ops *ops) +{ + /* Should never be called by interrupts */ + spin_lock(&ftrace_lock); - spin_lock_irqsave(&ftrace_func_lock, flags); ops->next = ftrace_list; /* * We are entering ops into the ftrace_list but another @@ -68,6 +77,7 @@ int register_ftrace_function(struct ftrace_ops *ops) */ smp_wmb(); ftrace_list = ops; + /* * For one func, simply call it directly. * For more than one func, call the chain. @@ -76,28 +86,22 @@ int register_ftrace_function(struct ftrace_ops *ops) ftrace_trace_function = ops->func; else ftrace_trace_function = ftrace_list_func; - spin_unlock_irqrestore(&ftrace_func_lock, flags); + + spin_unlock(&ftrace_lock); return 0; } -/** - * unregister_ftrace_function - unresgister a function for profiling. - * @ops - ops structure that holds the function to unregister - * - * Unregister a function that was added to be called by ftrace profiling. - */ -int unregister_ftrace_function(struct ftrace_ops *ops) +static int notrace __unregister_ftrace_function(struct ftrace_ops *ops) { - unsigned long flags; struct ftrace_ops **p; int ret = 0; - spin_lock_irqsave(&ftrace_func_lock, flags); + spin_lock(&ftrace_lock); /* - * If we are the only function, then the ftrace pointer is - * pointing directly to that function. + * If we are removing the last function, then simply point + * to the ftrace_stub. 
*/ if (ftrace_list == ops && ops->next == &ftrace_list_end) { ftrace_trace_function = ftrace_stub; @@ -117,22 +121,310 @@ int unregister_ftrace_function(struct ftrace_ops *ops) *p = (*p)->next; /* If we only have one func left, then call that directly */ - if (ftrace_list->next == &ftrace_list_end) + if (ftrace_list == &ftrace_list_end || + ftrace_list->next == &ftrace_list_end) ftrace_trace_function = ftrace_list->func; out: - spin_unlock_irqrestore(&ftrace_func_lock, flags); + spin_unlock(&ftrace_lock); + + return ret; +} + +#ifdef CONFIG_DYNAMIC_FTRACE + +static struct hlist_head ftrace_hash[FTRACE_HASHSIZE]; + +static DEFINE_PER_CPU(int, ftrace_shutdown_disable_cpu); + +static DEFINE_SPINLOCK(ftrace_shutdown_lock); +static DEFINE_MUTEX(ftraced_lock); + +static int ftraced_trigger; +static int ftraced_suspend; + +static int ftrace_record_suspend; + +static inline int +notrace ftrace_ip_in_hash(unsigned long ip, unsigned long key) +{ + struct dyn_ftrace *p; + struct hlist_node *t; + int found = 0; + + hlist_for_each_entry(p, t, &ftrace_hash[key], node) { + if (p->ip == ip) { + found = 1; + break; + } + } + + return found; +} + +static inline void notrace +ftrace_add_hash(struct dyn_ftrace *node, unsigned long key) +{ + hlist_add_head(&node->node, &ftrace_hash[key]); +} + +static void notrace +ftrace_record_ip(unsigned long ip, unsigned long parent_ip) +{ + struct dyn_ftrace *node; + unsigned long flags; + unsigned long key; + int resched; + int atomic; + + resched = need_resched(); + preempt_disable_notrace(); + + /* We simply need to protect against recursion */ + __get_cpu_var(ftrace_shutdown_disable_cpu)++; + if (__get_cpu_var(ftrace_shutdown_disable_cpu) != 1) + goto out; + + if (unlikely(ftrace_record_suspend)) + goto out; + + key = hash_long(ip, FTRACE_HASHBITS); + + WARN_ON_ONCE(key >= FTRACE_HASHSIZE); + + if (ftrace_ip_in_hash(ip, key)) + goto out; + + atomic = irqs_disabled(); + + spin_lock_irqsave(&ftrace_shutdown_lock, flags); + + /* This ip may have hit the hash before the lock */ + if (ftrace_ip_in_hash(ip, key)) + goto out_unlock; + + /* + * There's a slight race that the ftraced will update the + * hash and reset here. The arch alloc is responsible + * for seeing if the IP has already changed, and if + * it has, the alloc will fail. 
+ */ + node = ftrace_alloc_shutdown_node(ip); + if (!node) + goto out_unlock; + + node->ip = ip; + + ftrace_add_hash(node, key); + + ftraced_trigger = 1; + + out_unlock: + spin_unlock_irqrestore(&ftrace_shutdown_lock, flags); + out: + __get_cpu_var(ftrace_shutdown_disable_cpu)--; + + /* prevent recursion with scheduler */ + if (resched) + preempt_enable_no_resched_notrace(); + else + preempt_enable_notrace(); +} + +static struct ftrace_ops ftrace_shutdown_ops __read_mostly = +{ + .func = ftrace_record_ip, +}; + + +static int notrace __ftrace_modify_code(void *data) +{ + void (*func)(void) = data; + + func(); + return 0; +} + +static void notrace ftrace_run_startup_code(void) +{ + stop_machine_run(__ftrace_modify_code, ftrace_startup_code, NR_CPUS); +} + +static void notrace ftrace_run_shutdown_code(void) +{ + stop_machine_run(__ftrace_modify_code, ftrace_shutdown_code, NR_CPUS); +} + +static void notrace ftrace_startup(void) +{ + mutex_lock(&ftraced_lock); + ftraced_suspend++; + if (ftraced_suspend != 1) + goto out; + __unregister_ftrace_function(&ftrace_shutdown_ops); + + ftrace_run_startup_code(); + out: + mutex_unlock(&ftraced_lock); +} + +static void notrace ftrace_shutdown(void) +{ + mutex_lock(&ftraced_lock); + ftraced_suspend--; + if (ftraced_suspend) + goto out; + + ftrace_run_shutdown_code(); + + __register_ftrace_function(&ftrace_shutdown_ops); + out: + mutex_unlock(&ftraced_lock); +} + +static cycle_t ftrace_update_time; +static unsigned long ftrace_update_cnt; +unsigned long ftrace_update_tot_cnt; + +static int notrace __ftrace_update_code(void *ignore) +{ + struct dyn_ftrace *p; + struct hlist_head head; + struct hlist_node *t; + cycle_t start, stop; + int i; + + /* Don't be calling ftrace ops now */ + __unregister_ftrace_function(&ftrace_shutdown_ops); + + start = now(raw_smp_processor_id()); + ftrace_update_cnt = 0; + + /* No locks needed, the machine is stopped! */ + for (i = 0; i < FTRACE_HASHSIZE; i++) { + if (hlist_empty(&ftrace_hash[i])) + continue; + + head = ftrace_hash[i]; + INIT_HLIST_HEAD(&ftrace_hash[i]); + + /* all CPUS are stopped, we are safe to modify code */ + hlist_for_each_entry(p, t, &head, node) { + ftrace_code_disable(p); + ftrace_update_cnt++; + } + + } + + stop = now(raw_smp_processor_id()); + ftrace_update_time = stop - start; + ftrace_update_tot_cnt += ftrace_update_cnt; + + __register_ftrace_function(&ftrace_shutdown_ops); return 0; } +static void notrace ftrace_update_code(void) +{ + stop_machine_run(__ftrace_update_code, NULL, NR_CPUS); +} + +static int notrace ftraced(void *ignore) +{ + unsigned long usecs; + + set_current_state(TASK_INTERRUPTIBLE); + + while (!kthread_should_stop()) { + + /* check once a second */ + schedule_timeout(HZ); + + mutex_lock(&ftraced_lock); + if (ftraced_trigger && !ftraced_suspend) { + ftrace_record_suspend++; + ftrace_update_code(); + usecs = nsecs_to_usecs(ftrace_update_time); + if (ftrace_update_tot_cnt > 100000) { + ftrace_update_tot_cnt = 0; + pr_info("hm, dftrace overflow: %lu change%s" + " (%lu total) in %lu usec%s\n", + ftrace_update_cnt, + ftrace_update_cnt != 1 ? "s" : "", + ftrace_update_tot_cnt, + usecs, usecs != 1 ? 
"s" : ""); + WARN_ON_ONCE(1); + } + ftraced_trigger = 0; + ftrace_record_suspend--; + } + mutex_unlock(&ftraced_lock); + + ftrace_shutdown_replenish(); + + set_current_state(TASK_INTERRUPTIBLE); + } + __set_current_state(TASK_RUNNING); + return 0; +} + +static int __init notrace ftrace_shutdown_init(void) +{ + struct task_struct *p; + int ret; + + ret = ftrace_shutdown_arch_init(); + if (ret) + return ret; + + p = kthread_run(ftraced, NULL, "ftraced"); + if (IS_ERR(p)) + return -1; + + __register_ftrace_function(&ftrace_shutdown_ops); + + return 0; +} + +core_initcall(ftrace_shutdown_init); +#else +# define ftrace_startup() do { } while (0) +# define ftrace_shutdown() do { } while (0) +#endif /* CONFIG_DYNAMIC_FTRACE */ + /** - * clear_ftrace_function - reset the ftrace function + * register_ftrace_function - register a function for profiling + * @ops - ops structure that holds the function for profiling. * - * This NULLs the ftrace function and in essence stops - * tracing. There may be lag + * Register a function to be called by all functions in the + * kernel. + * + * Note: @ops->func and all the functions it calls must be labeled + * with "notrace", otherwise it will go into a + * recursive loop. */ -void clear_ftrace_function(void) +int register_ftrace_function(struct ftrace_ops *ops) { - ftrace_trace_function = ftrace_stub; + ftrace_startup(); + + return __register_ftrace_function(ops); +} + +/** + * unregister_ftrace_function - unresgister a function for profiling. + * @ops - ops structure that holds the function to unregister + * + * Unregister a function that was added to be called by ftrace profiling. + */ +int unregister_ftrace_function(struct ftrace_ops *ops) +{ + int ret; + + ret = __unregister_ftrace_function(ops); + + if (ftrace_list == &ftrace_list_end) + ftrace_shutdown(); + + return ret; } -- cgit v1.2.3 From dfa60aba04dae7833d75b2e2be124bb7cfb8239f Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:43 +0200 Subject: ftrace: use nops instead of jmp This patch patches the call to mcount with nops instead of a jmp over the mcount call. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/alternative.c | 4 ++-- arch/x86/kernel/ftrace.c | 40 ++++++++++++++++++++++++---------------- include/asm-x86/alternative.h | 2 ++ 3 files changed, 28 insertions(+), 18 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 65c7857a90dd..de240ba2e288 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -143,7 +143,7 @@ static const unsigned char *const p6_nops[ASM_NOP_MAX+1] = { #ifdef CONFIG_X86_64 extern char __vsyscall_0; -static inline const unsigned char*const * find_nop_table(void) +const unsigned char *const *find_nop_table(void) { return boot_cpu_data.x86_vendor != X86_VENDOR_INTEL || boot_cpu_data.x86 < 6 ? 
k8_nops : p6_nops; @@ -162,7 +162,7 @@ static const struct nop { { -1, NULL } }; -static const unsigned char*const * find_nop_table(void) +const unsigned char *const *find_nop_table(void) { const unsigned char *const *noptable = intel_nops; int i; diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 5dd58136ef02..2e060c58b860 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -16,11 +16,12 @@ #include #include -#define CALL_BACK 5 +#include -#define JMPFWD 0x03eb +#define CALL_BACK 5 -static unsigned short ftrace_jmp = JMPFWD; +/* Long is fine, even if it is only 4 bytes ;-) */ +static long *ftrace_nop; struct ftrace_record { struct dyn_ftrace rec; @@ -55,13 +56,13 @@ static struct ftrace_page *ftrace_pages; notrace struct dyn_ftrace *ftrace_alloc_shutdown_node(unsigned long ip) { struct ftrace_record *rec; - unsigned short save; + unsigned long save; ip -= CALL_BACK; - save = *(short *)ip; + save = *(long *)ip; /* If this was already converted, skip it */ - if (save == JMPFWD) + if (save == *ftrace_nop) return NULL; if (ftrace_pages->index == ENTRIES_PER_PAGE) { @@ -79,9 +80,10 @@ static int notrace ftrace_modify_code(unsigned long ip, unsigned char *old_code, unsigned char *new_code) { - unsigned short old = *(unsigned short *)old_code; - unsigned short new = *(unsigned short *)new_code; - unsigned short replaced; + unsigned replaced; + unsigned old = *(unsigned *)old_code; /* 4 bytes */ + unsigned new = *(unsigned *)new_code; /* 4 bytes */ + unsigned char newch = new_code[4]; int faulted = 0; /* @@ -94,7 +96,9 @@ ftrace_modify_code(unsigned long ip, unsigned char *old_code, */ asm volatile ( "1: lock\n" - " cmpxchg %w3, (%2)\n" + " cmpxchg %3, (%2)\n" + " jnz 2f\n" + " movb %b4, 4(%2)\n" "2:\n" ".section .fixup, \"ax\"\n" " movl $1, %0\n" @@ -102,11 +106,12 @@ ftrace_modify_code(unsigned long ip, unsigned char *old_code, ".previous\n" _ASM_EXTABLE(1b, 3b) : "=r"(faulted), "=a"(replaced) - : "r"(ip), "r"(new), "0"(faulted), "a"(old) + : "r"(ip), "r"(new), "r"(newch), + "0"(faulted), "a"(old) : "memory"); sync_core(); - if (replaced != old) + if (replaced != old && replaced != new) faulted = 2; return faulted; @@ -132,7 +137,7 @@ notrace void ftrace_code_disable(struct dyn_ftrace *rec) /* move the IP back to the start of the call */ ip -= CALL_BACK; - r->failed = ftrace_modify_code(ip, save.code, (char *)&ftrace_jmp); + r->failed = ftrace_modify_code(ip, save.code, (char *)ftrace_nop); } static void notrace ftrace_replace_code(int saved) @@ -144,9 +149,9 @@ static void notrace ftrace_replace_code(int saved) int i; if (saved) - old = (char *)&ftrace_jmp; + old = (char *)ftrace_nop; else - new = (char *)&ftrace_jmp; + new = (char *)ftrace_nop; for (pg = ftrace_pages_start; pg; pg = pg->next) { for (i = 0; i < pg->index; i++) { @@ -194,12 +199,15 @@ notrace void ftrace_shutdown_replenish(void) ftrace_pages->next = (void *)get_zeroed_page(GFP_KERNEL); } -notrace int ftrace_shutdown_arch_init(void) +notrace int __init ftrace_shutdown_arch_init(void) { + const unsigned char *const *noptable = find_nop_table(); struct ftrace_page *pg; int cnt; int i; + ftrace_nop = (unsigned long *)noptable[CALL_BACK]; + /* allocate a few pages */ ftrace_pages_start = (void *)get_zeroed_page(GFP_KERNEL); if (!ftrace_pages_start) diff --git a/include/asm-x86/alternative.h b/include/asm-x86/alternative.h index 1f6a9ca10126..f6aa18eadf71 100644 --- a/include/asm-x86/alternative.h +++ b/include/asm-x86/alternative.h @@ -72,6 +72,8 @@ static inline void 
alternatives_smp_module_del(struct module *mod) {} static inline void alternatives_smp_switch(int smp) {} #endif /* CONFIG_SMP */ +const unsigned char *const *find_nop_table(void); + /* * Alternative instructions for different CPU types or capabilities. * -- cgit v1.2.3 From 3c1720f00bb619302ba19d55986ab565e74d06db Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:43 +0200 Subject: ftrace: move memory management out of arch code This patch moves the memory management of the ftrace records out of the arch code and into the generic code making the arch code simpler. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/ftrace.c | 183 ++++++++--------------------------------------- include/linux/ftrace.h | 18 +++-- kernel/trace/ftrace.c | 154 ++++++++++++++++++++++++++++++++++++++- 3 files changed, 192 insertions(+), 163 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 2e060c58b860..b69795efa226 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -23,25 +23,6 @@ /* Long is fine, even if it is only 4 bytes ;-) */ static long *ftrace_nop; -struct ftrace_record { - struct dyn_ftrace rec; - int failed; -} __attribute__((packed)); - -struct ftrace_page { - struct ftrace_page *next; - int index; - struct ftrace_record records[]; -} __attribute__((packed)); - -#define ENTRIES_PER_PAGE \ - ((PAGE_SIZE - sizeof(struct ftrace_page)) / sizeof(struct ftrace_record)) - -/* estimate from running different kernels */ -#define NR_TO_INIT 10000 - -#define MCOUNT_ADDR ((long)(&mcount)) - union ftrace_code_union { char code[5]; struct { @@ -50,33 +31,41 @@ union ftrace_code_union { } __attribute__((packed)); }; -static struct ftrace_page *ftrace_pages_start; -static struct ftrace_page *ftrace_pages; - -notrace struct dyn_ftrace *ftrace_alloc_shutdown_node(unsigned long ip) +notrace int ftrace_ip_converted(unsigned long ip) { - struct ftrace_record *rec; unsigned long save; ip -= CALL_BACK; save = *(long *)ip; - /* If this was already converted, skip it */ - if (save == *ftrace_nop) - return NULL; + return save == *ftrace_nop; +} - if (ftrace_pages->index == ENTRIES_PER_PAGE) { - if (!ftrace_pages->next) - return NULL; - ftrace_pages = ftrace_pages->next; - } +static int notrace ftrace_calc_offset(long ip, long addr) +{ + return (int)(addr - ip); +} - rec = &ftrace_pages->records[ftrace_pages->index++]; +notrace unsigned char *ftrace_nop_replace(void) +{ + return (char *)ftrace_nop; +} + +notrace unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr) +{ + static union ftrace_code_union calc; - return &rec->rec; + calc.e8 = 0xe8; + calc.offset = ftrace_calc_offset(ip, addr); + + /* + * No locking needed, this must be called via kstop_machine + * which in essence is like running on a uniprocessor machine. 
+ */ + return calc.code; } -static int notrace +notrace int ftrace_modify_code(unsigned long ip, unsigned char *old_code, unsigned char *new_code) { @@ -86,6 +75,9 @@ ftrace_modify_code(unsigned long ip, unsigned char *old_code, unsigned char newch = new_code[4]; int faulted = 0; + /* move the IP back to the start of the call */ + ip -= CALL_BACK; + /* * Note: Due to modules and __init, code can * disappear and change, we need to protect against faulting @@ -117,129 +109,12 @@ ftrace_modify_code(unsigned long ip, unsigned char *old_code, return faulted; } -static int notrace ftrace_calc_offset(long ip) -{ - return (int)(MCOUNT_ADDR - ip); -} - -notrace void ftrace_code_disable(struct dyn_ftrace *rec) -{ - unsigned long ip; - union ftrace_code_union save; - struct ftrace_record *r = - container_of(rec, struct ftrace_record, rec); - - ip = rec->ip; - - save.e8 = 0xe8; - save.offset = ftrace_calc_offset(ip); - - /* move the IP back to the start of the call */ - ip -= CALL_BACK; - - r->failed = ftrace_modify_code(ip, save.code, (char *)ftrace_nop); -} - -static void notrace ftrace_replace_code(int saved) -{ - unsigned char *new = NULL, *old = NULL; - struct ftrace_record *rec; - struct ftrace_page *pg; - unsigned long ip; - int i; - - if (saved) - old = (char *)ftrace_nop; - else - new = (char *)ftrace_nop; - - for (pg = ftrace_pages_start; pg; pg = pg->next) { - for (i = 0; i < pg->index; i++) { - union ftrace_code_union calc; - rec = &pg->records[i]; - - /* don't modify code that has already faulted */ - if (rec->failed) - continue; - - ip = rec->rec.ip; - - calc.e8 = 0xe8; - calc.offset = ftrace_calc_offset(ip); - - if (saved) - new = calc.code; - else - old = calc.code; - - ip -= CALL_BACK; - - rec->failed = ftrace_modify_code(ip, old, new); - } - } - -} - -notrace void ftrace_startup_code(void) -{ - ftrace_replace_code(1); -} - -notrace void ftrace_shutdown_code(void) -{ - ftrace_replace_code(0); -} - -notrace void ftrace_shutdown_replenish(void) -{ - if (ftrace_pages->next) - return; - - /* allocate another page */ - ftrace_pages->next = (void *)get_zeroed_page(GFP_KERNEL); -} - -notrace int __init ftrace_shutdown_arch_init(void) +int __init ftrace_dyn_arch_init(void) { const unsigned char *const *noptable = find_nop_table(); - struct ftrace_page *pg; - int cnt; - int i; ftrace_nop = (unsigned long *)noptable[CALL_BACK]; - /* allocate a few pages */ - ftrace_pages_start = (void *)get_zeroed_page(GFP_KERNEL); - if (!ftrace_pages_start) - return -1; - - /* - * Allocate a few more pages. - * - * TODO: have some parser search vmlinux before - * final linking to find all calls to ftrace. - * Then we can: - * a) know how many pages to allocate. - * and/or - * b) set up the table then. - * - * The dynamic code is still necessary for - * modules. 
- */ - - pg = ftrace_pages = ftrace_pages_start; - - cnt = NR_TO_INIT / ENTRIES_PER_PAGE; - - for (i = 0; i < cnt; i++) { - pg->next = (void *)get_zeroed_page(GFP_KERNEL); - - /* If we fail, we'll try later anyway */ - if (!pg->next) - break; - - pg = pg->next; - } - return 0; } + diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index ccd8537dbdb7..d509ad6c9cb8 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -42,19 +42,23 @@ extern void mcount(void); # define FTRACE_HASHBITS 10 # define FTRACE_HASHSIZE (1<node, &ftrace_hash[key]); } +static notrace struct dyn_ftrace *ftrace_alloc_shutdown_node(unsigned long ip) +{ + /* If this was already converted, skip it */ + if (ftrace_ip_converted(ip)) + return NULL; + + if (ftrace_pages->index == ENTRIES_PER_PAGE) { + if (!ftrace_pages->next) + return NULL; + ftrace_pages = ftrace_pages->next; + } + + return &ftrace_pages->records[ftrace_pages->index++]; +} + static void notrace ftrace_record_ip(unsigned long ip, unsigned long parent_ip) { @@ -252,6 +282,62 @@ static struct ftrace_ops ftrace_shutdown_ops __read_mostly = .func = ftrace_record_ip, }; +#define MCOUNT_ADDR ((long)(&mcount)) + +static void notrace ftrace_replace_code(int saved) +{ + unsigned char *new = NULL, *old = NULL; + struct dyn_ftrace *rec; + struct ftrace_page *pg; + unsigned long ip; + int failed; + int i; + + if (saved) + old = ftrace_nop_replace(); + else + new = ftrace_nop_replace(); + + for (pg = ftrace_pages_start; pg; pg = pg->next) { + for (i = 0; i < pg->index; i++) { + rec = &pg->records[i]; + + /* don't modify code that has already faulted */ + if (rec->flags & FTRACE_FL_FAILED) + continue; + + ip = rec->ip; + + if (saved) + new = ftrace_call_replace(ip, MCOUNT_ADDR); + else + old = ftrace_call_replace(ip, MCOUNT_ADDR); + + failed = ftrace_modify_code(ip, old, new); + if (failed) + rec->flags |= FTRACE_FL_FAILED; + } + } +} + +static notrace void ftrace_startup_code(void) +{ + ftrace_replace_code(1); +} + +static notrace void ftrace_shutdown_code(void) +{ + ftrace_replace_code(0); +} + +static notrace void ftrace_shutdown_replenish(void) +{ + if (ftrace_pages->next) + return; + + /* allocate another page */ + ftrace_pages->next = (void *)get_zeroed_page(GFP_KERNEL); +} static int notrace __ftrace_modify_code(void *data) { @@ -261,6 +347,23 @@ static int notrace __ftrace_modify_code(void *data) return 0; } +static notrace void +ftrace_code_disable(struct dyn_ftrace *rec, unsigned long addr) +{ + unsigned long ip; + unsigned char *nop, *call; + int failed; + + ip = rec->ip; + + nop = ftrace_nop_replace(); + call = ftrace_call_replace(ip, addr); + + failed = ftrace_modify_code(ip, call, nop); + if (failed) + rec->flags |= FTRACE_FL_FAILED; +} + static void notrace ftrace_run_startup_code(void) { stop_machine_run(__ftrace_modify_code, ftrace_startup_code, NR_CPUS); @@ -346,7 +449,7 @@ static int notrace __ftrace_update_code(void *ignore) /* all CPUS are stopped, we are safe to modify code */ hlist_for_each_entry(p, t, &head, node) { - ftrace_code_disable(p); + ftrace_code_disable(p, MCOUNT_ADDR); ftrace_update_cnt++; } @@ -407,12 +510,59 @@ static int notrace ftraced(void *ignore) return 0; } +static int __init ftrace_dyn_table_alloc(void) +{ + struct ftrace_page *pg; + int cnt; + int i; + int ret; + + ret = ftrace_dyn_arch_init(); + if (ret) + return ret; + + /* allocate a few pages */ + ftrace_pages_start = (void *)get_zeroed_page(GFP_KERNEL); + if (!ftrace_pages_start) + return -1; + + /* + * Allocate a few more pages. 
+ * + * TODO: have some parser search vmlinux before + * final linking to find all calls to ftrace. + * Then we can: + * a) know how many pages to allocate. + * and/or + * b) set up the table then. + * + * The dynamic code is still necessary for + * modules. + */ + + pg = ftrace_pages = ftrace_pages_start; + + cnt = NR_TO_INIT / ENTRIES_PER_PAGE; + + for (i = 0; i < cnt; i++) { + pg->next = (void *)get_zeroed_page(GFP_KERNEL); + + /* If we fail, we'll try later anyway */ + if (!pg->next) + break; + + pg = pg->next; + } + + return 0; +} + static int __init notrace ftrace_shutdown_init(void) { struct task_struct *p; int ret; - ret = ftrace_shutdown_arch_init(); + ret = ftrace_dyn_table_alloc(); if (ret) return ret; -- cgit v1.2.3 From d61f82d06672f57fca410da6f7fffd15867db622 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:43 +0200 Subject: ftrace: use dynamic patching for updating mcount calls This patch replaces the indirect call to the mcount function pointer with a direct call that will be patched by the dynamic ftrace routines. On boot up, the mcount function calls the ftrace_stub function. When the dynamic ftrace code is initialized, ftrace_stub is replaced with a call to ftrace_record_ip, which records the instruction pointers of the locations that call it. Later, the ftraced daemon will call kstop_machine and patch all the locations to nops. When ftrace is enabled, the original calls to mcount will now be set to call ftrace_caller, which will do a direct call to the registered ftrace function. This direct call is also patched when the function that should be called is updated. All patching is performed by a kstop_machine routine to prevent any type of race condition associated with modifying code on the fly. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/entry_32.S | 47 +++++++++++- arch/x86/kernel/entry_64.S | 67 ++++++++++++++++- arch/x86/kernel/ftrace.c | 41 +++++++++- include/linux/ftrace.h | 7 +- kernel/trace/ftrace.c | 183 ++++++++++++++++++++++++++------------------- 5 files changed, 261 insertions(+), 84 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index f47b9b5440d2..e6517ce0b824 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -1110,10 +1110,50 @@ ENDPROC(xen_failsafe_callback) #endif /* CONFIG_XEN */ #ifdef CONFIG_FTRACE +#ifdef CONFIG_DYNAMIC_FTRACE + +ENTRY(mcount) + pushl %eax + pushl %ecx + pushl %edx + movl 0xc(%esp), %eax + +.globl mcount_call +mcount_call: + call ftrace_stub + + popl %edx + popl %ecx + popl %eax + + ret +END(mcount) + +ENTRY(ftrace_caller) + pushl %eax + pushl %ecx + pushl %edx + movl 0xc(%esp), %eax + movl 0x4(%ebp), %edx + +.globl ftrace_call +ftrace_call: + call ftrace_stub + + popl %edx + popl %ecx + popl %eax + +.globl ftrace_stub +ftrace_stub: + ret +END(ftrace_caller) + +#else /* !
CONFIG_DYNAMIC_FTRACE */ + ENTRY(mcount) cmpl $ftrace_stub, ftrace_trace_function jnz trace - .globl ftrace_stub ftrace_stub: ret @@ -1126,7 +1166,7 @@ trace: movl 0xc(%esp), %eax movl 0x4(%ebp), %edx - call *ftrace_trace_function + call *ftrace_trace_function popl %edx popl %ecx @@ -1134,7 +1174,8 @@ trace: jmp ftrace_stub END(mcount) -#endif +#endif /* CONFIG_DYNAMIC_FTRACE */ +#endif /* CONFIG_FTRACE */ .section .rodata,"a" #include "syscall_table_32.S" diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index f046e0c64883..fe25e5febca3 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -55,6 +55,70 @@ .code64 #ifdef CONFIG_FTRACE +#ifdef CONFIG_DYNAMIC_FTRACE +ENTRY(mcount) + + subq $0x38, %rsp + movq %rax, (%rsp) + movq %rcx, 8(%rsp) + movq %rdx, 16(%rsp) + movq %rsi, 24(%rsp) + movq %rdi, 32(%rsp) + movq %r8, 40(%rsp) + movq %r9, 48(%rsp) + + movq 0x38(%rsp), %rdi + +.globl mcount_call +mcount_call: + call ftrace_stub + + movq 48(%rsp), %r9 + movq 40(%rsp), %r8 + movq 32(%rsp), %rdi + movq 24(%rsp), %rsi + movq 16(%rsp), %rdx + movq 8(%rsp), %rcx + movq (%rsp), %rax + addq $0x38, %rsp + + retq +END(mcount) + +ENTRY(ftrace_caller) + + /* taken from glibc */ + subq $0x38, %rsp + movq %rax, (%rsp) + movq %rcx, 8(%rsp) + movq %rdx, 16(%rsp) + movq %rsi, 24(%rsp) + movq %rdi, 32(%rsp) + movq %r8, 40(%rsp) + movq %r9, 48(%rsp) + + movq 0x38(%rsp), %rdi + movq 8(%rbp), %rsi + +.globl ftrace_call +ftrace_call: + call ftrace_stub + + movq 48(%rsp), %r9 + movq 40(%rsp), %r8 + movq 32(%rsp), %rdi + movq 24(%rsp), %rsi + movq 16(%rsp), %rdx + movq 8(%rsp), %rcx + movq (%rsp), %rax + addq $0x38, %rsp + +.globl ftrace_stub +ftrace_stub: + retq +END(ftrace_caller) + +#else /* ! CONFIG_DYNAMIC_FTRACE */ ENTRY(mcount) cmpq $ftrace_stub, ftrace_trace_function jnz trace @@ -89,7 +153,8 @@ trace: jmp ftrace_stub END(mcount) -#endif +#endif /* CONFIG_DYNAMIC_FTRACE */ +#endif /* CONFIG_FTRACE */ #ifndef CONFIG_PREEMPT #define retint_kernel retint_restore_args diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index b69795efa226..9f44623e0072 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -109,10 +109,49 @@ ftrace_modify_code(unsigned long ip, unsigned char *old_code, return faulted; } -int __init ftrace_dyn_arch_init(void) +notrace int ftrace_update_ftrace_func(ftrace_func_t func) +{ + unsigned long ip = (unsigned long)(&ftrace_call); + unsigned char old[5], *new; + int ret; + + ip += CALL_BACK; + + memcpy(old, &ftrace_call, 5); + new = ftrace_call_replace(ip, (unsigned long)func); + ret = ftrace_modify_code(ip, old, new); + + return ret; +} + +notrace int ftrace_mcount_set(unsigned long *data) +{ + unsigned long ip = (long)(&mcount_call); + unsigned long *addr = data; + unsigned char old[5], *new; + + /* ip is at the location, but modify code will subtact this */ + ip += CALL_BACK; + + /* + * Replace the mcount stub with a pointer to the + * ip recorder function. 
+ */ + memcpy(old, &mcount_call, 5); + new = ftrace_call_replace(ip, *addr); + *addr = ftrace_modify_code(ip, old, new); + + return 0; +} + +int __init ftrace_dyn_arch_init(void *data) { const unsigned char *const *noptable = find_nop_table(); + /* This is running in kstop_machine */ + + ftrace_mcount_set(data); + ftrace_nop = (unsigned long *)noptable[CALL_BACK]; return 0; diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index d509ad6c9cb8..b0dd0093058f 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -56,9 +56,14 @@ struct dyn_ftrace { extern int ftrace_ip_converted(unsigned long ip); extern unsigned char *ftrace_nop_replace(void); extern unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr); -extern int ftrace_dyn_arch_init(void); +extern int ftrace_dyn_arch_init(void *data); +extern int ftrace_mcount_set(unsigned long *data); extern int ftrace_modify_code(unsigned long ip, unsigned char *old_code, unsigned char *new_code); +extern int ftrace_update_ftrace_func(ftrace_func_t func); +extern void ftrace_caller(void); +extern void ftrace_call(void); +extern void mcount_call(void); #endif #ifdef CONFIG_FRAME_POINTER diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index f6d9af3bf66b..88544f9bc0ed 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -26,14 +26,8 @@ #include "trace.h" -#ifdef CONFIG_DYNAMIC_FTRACE -# define FTRACE_ENABLED_INIT 1 -#else -# define FTRACE_ENABLED_INIT 0 -#endif - -int ftrace_enabled = FTRACE_ENABLED_INIT; -static int last_ftrace_enabled = FTRACE_ENABLED_INIT; +int ftrace_enabled; +static int last_ftrace_enabled; static DEFINE_SPINLOCK(ftrace_lock); static DEFINE_MUTEX(ftrace_sysctl_lock); @@ -149,6 +143,14 @@ static int notrace __unregister_ftrace_function(struct ftrace_ops *ops) #ifdef CONFIG_DYNAMIC_FTRACE +enum { + FTRACE_ENABLE_CALLS = (1 << 0), + FTRACE_DISABLE_CALLS = (1 << 1), + FTRACE_UPDATE_TRACE_FUNC = (1 << 2), + FTRACE_ENABLE_MCOUNT = (1 << 3), + FTRACE_DISABLE_MCOUNT = (1 << 4), +}; + static struct hlist_head ftrace_hash[FTRACE_HASHSIZE]; static DEFINE_PER_CPU(int, ftrace_shutdown_disable_cpu); @@ -199,12 +201,8 @@ ftrace_add_hash(struct dyn_ftrace *node, unsigned long key) hlist_add_head(&node->node, &ftrace_hash[key]); } -static notrace struct dyn_ftrace *ftrace_alloc_shutdown_node(unsigned long ip) +static notrace struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip) { - /* If this was already converted, skip it */ - if (ftrace_ip_converted(ip)) - return NULL; - if (ftrace_pages->index == ENTRIES_PER_PAGE) { if (!ftrace_pages->next) return NULL; @@ -215,7 +213,7 @@ static notrace struct dyn_ftrace *ftrace_alloc_shutdown_node(unsigned long ip) } static void notrace -ftrace_record_ip(unsigned long ip, unsigned long parent_ip) +ftrace_record_ip(unsigned long ip) { struct dyn_ftrace *node; unsigned long flags; @@ -223,6 +221,9 @@ ftrace_record_ip(unsigned long ip, unsigned long parent_ip) int resched; int atomic; + if (!ftrace_enabled) + return; + resched = need_resched(); preempt_disable_notrace(); @@ -251,11 +252,12 @@ ftrace_record_ip(unsigned long ip, unsigned long parent_ip) /* * There's a slight race that the ftraced will update the - * hash and reset here. The arch alloc is responsible - * for seeing if the IP has already changed, and if - * it has, the alloc will fail. + * hash and reset here. If it is already converted, skip it. 
*/ - node = ftrace_alloc_shutdown_node(ip); + if (ftrace_ip_converted(ip)) + goto out_unlock; + + node = ftrace_alloc_dyn_node(ip); if (!node) goto out_unlock; @@ -277,11 +279,7 @@ ftrace_record_ip(unsigned long ip, unsigned long parent_ip) preempt_enable_notrace(); } -static struct ftrace_ops ftrace_shutdown_ops __read_mostly = -{ - .func = ftrace_record_ip, -}; - +#define FTRACE_ADDR ((long)(&ftrace_caller)) #define MCOUNT_ADDR ((long)(&mcount)) static void notrace ftrace_replace_code(int saved) @@ -309,9 +307,9 @@ static void notrace ftrace_replace_code(int saved) ip = rec->ip; if (saved) - new = ftrace_call_replace(ip, MCOUNT_ADDR); + new = ftrace_call_replace(ip, FTRACE_ADDR); else - old = ftrace_call_replace(ip, MCOUNT_ADDR); + old = ftrace_call_replace(ip, FTRACE_ADDR); failed = ftrace_modify_code(ip, old, new); if (failed) @@ -320,16 +318,6 @@ static void notrace ftrace_replace_code(int saved) } } -static notrace void ftrace_startup_code(void) -{ - ftrace_replace_code(1); -} - -static notrace void ftrace_shutdown_code(void) -{ - ftrace_replace_code(0); -} - static notrace void ftrace_shutdown_replenish(void) { if (ftrace_pages->next) @@ -339,16 +327,8 @@ static notrace void ftrace_shutdown_replenish(void) ftrace_pages->next = (void *)get_zeroed_page(GFP_KERNEL); } -static int notrace __ftrace_modify_code(void *data) -{ - void (*func)(void) = data; - - func(); - return 0; -} - static notrace void -ftrace_code_disable(struct dyn_ftrace *rec, unsigned long addr) +ftrace_code_disable(struct dyn_ftrace *rec) { unsigned long ip; unsigned char *nop, *call; @@ -357,67 +337,113 @@ ftrace_code_disable(struct dyn_ftrace *rec, unsigned long addr) ip = rec->ip; nop = ftrace_nop_replace(); - call = ftrace_call_replace(ip, addr); + call = ftrace_call_replace(ip, MCOUNT_ADDR); failed = ftrace_modify_code(ip, call, nop); if (failed) rec->flags |= FTRACE_FL_FAILED; } -static void notrace ftrace_run_startup_code(void) +static int notrace __ftrace_modify_code(void *data) { - stop_machine_run(__ftrace_modify_code, ftrace_startup_code, NR_CPUS); + unsigned long addr; + int *command = data; + + if (*command & FTRACE_ENABLE_CALLS) + ftrace_replace_code(1); + else if (*command & FTRACE_DISABLE_CALLS) + ftrace_replace_code(0); + + if (*command & FTRACE_UPDATE_TRACE_FUNC) + ftrace_update_ftrace_func(ftrace_trace_function); + + if (*command & FTRACE_ENABLE_MCOUNT) { + addr = (unsigned long)ftrace_record_ip; + ftrace_mcount_set(&addr); + } else if (*command & FTRACE_DISABLE_MCOUNT) { + addr = (unsigned long)ftrace_stub; + ftrace_mcount_set(&addr); + } + + return 0; } -static void notrace ftrace_run_shutdown_code(void) +static void notrace ftrace_run_update_code(int command) { - stop_machine_run(__ftrace_modify_code, ftrace_shutdown_code, NR_CPUS); + stop_machine_run(__ftrace_modify_code, &command, NR_CPUS); } +static ftrace_func_t saved_ftrace_func; + static void notrace ftrace_startup(void) { + int command = 0; + mutex_lock(&ftraced_lock); ftraced_suspend++; - if (ftraced_suspend != 1) + if (ftraced_suspend == 1) + command |= FTRACE_ENABLE_CALLS; + + if (saved_ftrace_func != ftrace_trace_function) { + saved_ftrace_func = ftrace_trace_function; + command |= FTRACE_UPDATE_TRACE_FUNC; + } + + if (!command || !ftrace_enabled) goto out; - __unregister_ftrace_function(&ftrace_shutdown_ops); - if (ftrace_enabled) - ftrace_run_startup_code(); + ftrace_run_update_code(command); out: mutex_unlock(&ftraced_lock); } static void notrace ftrace_shutdown(void) { + int command = 0; + mutex_lock(&ftraced_lock); 
ftraced_suspend--; - if (ftraced_suspend) - goto out; + if (!ftraced_suspend) + command |= FTRACE_DISABLE_CALLS; - if (ftrace_enabled) - ftrace_run_shutdown_code(); + if (saved_ftrace_func != ftrace_trace_function) { + saved_ftrace_func = ftrace_trace_function; + command |= FTRACE_UPDATE_TRACE_FUNC; + } - __register_ftrace_function(&ftrace_shutdown_ops); + if (!command || !ftrace_enabled) + goto out; + + ftrace_run_update_code(command); out: mutex_unlock(&ftraced_lock); } static void notrace ftrace_startup_sysctl(void) { + int command = FTRACE_ENABLE_MCOUNT; + mutex_lock(&ftraced_lock); + /* Force update next time */ + saved_ftrace_func = NULL; /* ftraced_suspend is true if we want ftrace running */ if (ftraced_suspend) - ftrace_run_startup_code(); + command |= FTRACE_ENABLE_CALLS; + + ftrace_run_update_code(command); mutex_unlock(&ftraced_lock); } static void notrace ftrace_shutdown_sysctl(void) { + int command = FTRACE_DISABLE_MCOUNT; + mutex_lock(&ftraced_lock); /* ftraced_suspend is true if ftrace is running */ if (ftraced_suspend) - ftrace_run_shutdown_code(); + command |= FTRACE_DISABLE_CALLS; + + ftrace_run_update_code(command); mutex_unlock(&ftraced_lock); } @@ -430,11 +456,13 @@ static int notrace __ftrace_update_code(void *ignore) struct dyn_ftrace *p; struct hlist_head head; struct hlist_node *t; + int save_ftrace_enabled; cycle_t start, stop; int i; - /* Don't be calling ftrace ops now */ - __unregister_ftrace_function(&ftrace_shutdown_ops); + /* Don't be recording funcs now */ + save_ftrace_enabled = ftrace_enabled; + ftrace_enabled = 0; start = now(raw_smp_processor_id()); ftrace_update_cnt = 0; @@ -449,7 +477,7 @@ static int notrace __ftrace_update_code(void *ignore) /* all CPUS are stopped, we are safe to modify code */ hlist_for_each_entry(p, t, &head, node) { - ftrace_code_disable(p, MCOUNT_ADDR); + ftrace_code_disable(p); ftrace_update_cnt++; } @@ -459,7 +487,7 @@ static int notrace __ftrace_update_code(void *ignore) ftrace_update_time = stop - start; ftrace_update_tot_cnt += ftrace_update_cnt; - __register_ftrace_function(&ftrace_shutdown_ops); + ftrace_enabled = save_ftrace_enabled; return 0; } @@ -515,11 +543,6 @@ static int __init ftrace_dyn_table_alloc(void) struct ftrace_page *pg; int cnt; int i; - int ret; - - ret = ftrace_dyn_arch_init(); - if (ret) - return ret; /* allocate a few pages */ ftrace_pages_start = (void *)get_zeroed_page(GFP_KERNEL); @@ -557,11 +580,19 @@ static int __init ftrace_dyn_table_alloc(void) return 0; } -static int __init notrace ftrace_shutdown_init(void) +static int __init notrace ftrace_dynamic_init(void) { struct task_struct *p; + unsigned long addr; int ret; + addr = (unsigned long)ftrace_record_ip; + stop_machine_run(ftrace_dyn_arch_init, &addr, NR_CPUS); + + /* ftrace_dyn_arch_init places the return code in addr */ + if (addr) + return addr; + ret = ftrace_dyn_table_alloc(); if (ret) return ret; @@ -570,12 +601,12 @@ static int __init notrace ftrace_shutdown_init(void) if (IS_ERR(p)) return -1; - __register_ftrace_function(&ftrace_shutdown_ops); + last_ftrace_enabled = ftrace_enabled = 1; return 0; } -core_initcall(ftrace_shutdown_init); +core_initcall(ftrace_dynamic_init); #else # define ftrace_startup() do { } while (0) # define ftrace_shutdown() do { } while (0) @@ -599,9 +630,8 @@ int register_ftrace_function(struct ftrace_ops *ops) int ret; mutex_lock(&ftrace_sysctl_lock); - ftrace_startup(); - ret = __register_ftrace_function(ops); + ftrace_startup(); mutex_unlock(&ftrace_sysctl_lock); return ret; @@ -619,10 +649,7 @@ int 
unregister_ftrace_function(struct ftrace_ops *ops) mutex_lock(&ftrace_sysctl_lock); ret = __unregister_ftrace_function(ops); - - if (ftrace_list == &ftrace_list_end) - ftrace_shutdown(); - + ftrace_shutdown(); mutex_unlock(&ftrace_sysctl_lock); return ret; -- cgit v1.2.3 From f43fdad8627fec2d21df92799b254dceb66c9c3c Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:20:43 +0200 Subject: ftrace: fix kexec disable the tracer while kexec pulls the rug from under the old kernel. Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/machine_kexec_32.c | 4 ++++ arch/x86/kernel/machine_kexec_64.c | 4 ++++ include/linux/ftrace.h | 7 +++++++ 3 files changed, 15 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c index d0b234c9fc31..88923fd7a6fc 100644 --- a/arch/x86/kernel/machine_kexec_32.c +++ b/arch/x86/kernel/machine_kexec_32.c @@ -11,6 +11,8 @@ #include #include #include +#include + #include #include #include @@ -107,6 +109,8 @@ NORET_TYPE void machine_kexec(struct kimage *image) unsigned long page_list[PAGES_NR]; void *control_page; + tracer_disable(); + /* Interrupts aren't acceptable while we reboot */ local_irq_disable(); diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 576a03db4511..1558fdc174f9 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -11,6 +11,8 @@ #include #include #include +#include + #include #include #include @@ -184,6 +186,8 @@ NORET_TYPE void machine_kexec(struct kimage *image) unsigned long page_list[PAGES_NR]; void *control_page; + tracer_disable(); + /* Interrupts aren't acceptable while we reboot */ local_irq_disable(); diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index f5911d2d42c3..a42390c1d6e1 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -68,6 +68,13 @@ extern void ftrace_call(void); extern void mcount_call(void); #endif +static inline void tracer_disable(void) +{ +#ifdef CONFIG_FTRACE + ftrace_enabled = 0; +#endif +} + #ifdef CONFIG_FRAME_POINTER /* TODO: need to fix this for ARM */ # define CALLER_ADDR0 ((unsigned long)__builtin_return_address(0)) -- cgit v1.2.3 From a56be3fe2f65f9f776e727bfd382e35db75911d6 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:56 +0200 Subject: ftrace: fix the fault label in updating code The fault label to jump to on fault of updating the code was misplaced preventing the fault from being recorded. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/ftrace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 9f44623e0072..498608c015fb 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -93,8 +93,8 @@ ftrace_modify_code(unsigned long ip, unsigned char *old_code, " movb %b4, 4(%2)\n" "2:\n" ".section .fixup, \"ax\"\n" - " movl $1, %0\n" - "3: jmp 2b\n" + "3: movl $1, %0\n" + " jmp 2b\n" ".previous\n" _ASM_EXTABLE(1b, 3b) : "=r"(faulted), "=a"(replaced) -- cgit v1.2.3 From 8f0f996e80b980fba07d11961d96a5fefb60976a Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 12 May 2008 21:20:56 +0200 Subject: ftrace: dont write protect kernel text Dynamic ftrace cant work when the kernel has its text write protected. This patch keeps the kernel from being write protected when dynamic ftrace is in place. 
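The conflict can be reproduced in miniature from userspace (a sketch, not kernel code: mprotect() stands in for set_pages_ro(), and the commented-out store is where the cmpxchg in ftrace_modify_code() would fault):

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
	size_t sz = 4096;
	unsigned char *text = mmap(NULL, sz, PROT_READ | PROT_WRITE,
				   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	memset(text, 0x90, sz);		/* stand-in for nop-padded kernel text */
	mprotect(text, sz, PROT_READ);	/* the mark_rodata_ro() analogue */

	/* text[0] = 0xe8; -- would SIGSEGV here, which is why dynamic
	 * ftrace can never patch write-protected text */

	mprotect(text, sz, PROT_READ | PROT_WRITE);
	text[0] = 0xe8;			/* with the protection off, patching works */
	printf("patched first byte: %#x\n", text[0]);
	return 0;
}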
Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/mm/init_32.c | 4 ++++ arch/x86/mm/init_64.c | 10 ++++++++-- 2 files changed, 12 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index ec30d10154b6..f96eca21ad8f 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -710,6 +710,8 @@ void mark_rodata_ro(void) unsigned long start = PFN_ALIGN(_text); unsigned long size = PFN_ALIGN(_etext) - start; +#ifndef CONFIG_DYNAMIC_FTRACE + /* Dynamic tracing modifies the kernel text section */ set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); printk(KERN_INFO "Write protecting the kernel text: %luk\n", size >> 10); @@ -722,6 +724,8 @@ void mark_rodata_ro(void) printk(KERN_INFO "Testing CPA: write protecting again\n"); set_pages_ro(virt_to_page(start), size>>PAGE_SHIFT); #endif +#endif /* CONFIG_DYNAMIC_FTRACE */ + start += size; size = (unsigned long)__end_rodata - start; set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 32ba13b0f818..41824e776b6c 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -766,6 +766,13 @@ EXPORT_SYMBOL_GPL(rodata_test_data); void mark_rodata_ro(void) { unsigned long start = PFN_ALIGN(_stext), end = PFN_ALIGN(__end_rodata); + unsigned long rodata_start = + ((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK; + +#ifdef CONFIG_DYNAMIC_FTRACE + /* Dynamic tracing modifies the kernel text section */ + start = rodata_start; +#endif printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n", (end - start) >> 10); @@ -775,8 +782,7 @@ void mark_rodata_ro(void) * The rodata section (but not the kernel text!) should also be * not-executable. */ - start = ((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK; - set_memory_nx(start, (end - start) >> PAGE_SHIFT); + set_memory_nx(rodata_start, (end - start) >> PAGE_SHIFT); rodata_test(); -- cgit v1.2.3 From 86069782d62e731b4835a0cf8eb7d1d0e17cf306 Mon Sep 17 00:00:00 2001 From: Pekka Paalanen Date: Mon, 12 May 2008 21:20:56 +0200 Subject: x86: add a list for custom page fault handlers. Provides kernel modules a way to register custom page fault handlers. On every page fault this will call a list of registered functions. The functions may handle the fault and force do_page_fault() to return immediately. This functionality is similar to the now removed page fault notifiers. Custom page fault handlers are used by debugging and reverse engineering tools. Mmiotrace is one such tool and a patch to add it into the tree will follow. The custom page fault handlers are called earlier in do_page_fault() than the page fault notifiers were. Signed-off-by: Pekka Paalanen Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/Kconfig.debug | 8 +++++++ arch/x86/mm/fault.c | 56 ++++++++++++++++++++++++++++++++++++++++++++++++ include/asm-x86/kdebug.h | 9 ++++++++ 3 files changed, 73 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index ac1e31ba4795..9431a8399844 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -168,6 +168,14 @@ config IOMMU_LEAK Add a simple leak tracer to the IOMMU code. This is useful when you are debugging a buggy device driver that leaks IOMMU mappings. +config PAGE_FAULT_HANDLERS + bool "Custom page fault handlers" + depends on DEBUG_KERNEL + help + Allow the use of custom page fault handlers. 
A kernel module may + register a function that is called on every page fault. Custom + handlers are used by some debugging and reverse engineering tools. + # # IO delay types: # diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index fd7e1798c75a..343f5c1aacc8 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -49,6 +49,60 @@ #define PF_RSVD (1<<3) #define PF_INSTR (1<<4) +#ifdef CONFIG_PAGE_FAULT_HANDLERS +static HLIST_HEAD(pf_handlers); /* protected by RCU */ +static DEFINE_SPINLOCK(pf_handlers_writer); + +void register_page_fault_handler(struct pf_handler *new_pfh) +{ + unsigned long flags; + spin_lock_irqsave(&pf_handlers_writer, flags); + hlist_add_head_rcu(&new_pfh->hlist, &pf_handlers); + spin_unlock_irqrestore(&pf_handlers_writer, flags); +} +EXPORT_SYMBOL_GPL(register_page_fault_handler); + +/** + * unregister_page_fault_handler: + * The caller must ensure @old_pfh is not in use anymore before freeing it. + * This function does not guarantee it. The list of handlers is protected by + * RCU, so you can do this by e.g. calling synchronize_rcu(). + */ +void unregister_page_fault_handler(struct pf_handler *old_pfh) +{ + unsigned long flags; + spin_lock_irqsave(&pf_handlers_writer, flags); + hlist_del_rcu(&old_pfh->hlist); + spin_unlock_irqrestore(&pf_handlers_writer, flags); +} +EXPORT_SYMBOL_GPL(unregister_page_fault_handler); +#endif + +/* returns non-zero if do_page_fault() should return */ +static int handle_custom_pf(struct pt_regs *regs, unsigned long error_code, + unsigned long address) +{ +#ifdef CONFIG_PAGE_FAULT_HANDLERS + int ret = 0; + struct pf_handler *cur; + struct hlist_node *ncur; + + if (hlist_empty(&pf_handlers)) + return 0; + + rcu_read_lock(); + hlist_for_each_entry_rcu(cur, ncur, &pf_handlers, hlist) { + ret = cur->handler(regs, error_code, address); + if (ret) + break; + } + rcu_read_unlock(); + return ret; +#else + return 0; +#endif +} + static inline int notify_page_fault(struct pt_regs *regs) { #ifdef CONFIG_KPROBES @@ -601,6 +655,8 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) if (notify_page_fault(regs)) return; + if (handle_custom_pf(regs, error_code, address)) + return; /* * We fault-in kernel-space virtual memory on-demand. The diff --git a/include/asm-x86/kdebug.h b/include/asm-x86/kdebug.h index 96651bb59ba1..a80f2d6cc737 100644 --- a/include/asm-x86/kdebug.h +++ b/include/asm-x86/kdebug.h @@ -35,4 +35,13 @@ extern void show_regs(struct pt_regs *regs); extern unsigned long oops_begin(void); extern void oops_end(unsigned long, struct pt_regs *, int signr); +struct pf_handler { + struct hlist_node hlist; + int (*handler)(struct pt_regs *regs, unsigned long error_code, + unsigned long address); +}; + +extern void register_page_fault_handler(struct pf_handler *new_pfh); +extern void unregister_page_fault_handler(struct pf_handler *old_pfh); + #endif -- cgit v1.2.3 From 72b59d67f80983f7bb587b086fb4cb1bc95263a4 Mon Sep 17 00:00:00 2001 From: Pekka Paalanen Date: Mon, 12 May 2008 21:21:01 +0200 Subject: x86_64: fix kernel rodata NX setting Without CONFIG_DYNAMIC_FTRACE, mark_rodata_ro() would mark a wrong number of pages as no-execute. The bug was introduced in the patch "ftrace: dont write protect kernel text". The symptom was machine reboot after a CPU hotplug. 
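The overshoot is easiest to see with numbers. Without CONFIG_DYNAMIC_FTRACE, start in mark_rodata_ro() still points at _stext while set_memory_nx() begins at rodata_start, so the old page count ran past __end_rodata. A userspace sketch (the addresses are invented; only the arithmetic mirrors mark_rodata_ro()):

#include <stdio.h>

#define PAGE_SHIFT 12

int main(void)
{
	/* hypothetical layout */
	unsigned long start        = 0xffffffff80200000UL; /* _stext */
	unsigned long rodata_start = 0xffffffff80600000UL; /* __start_rodata */
	unsigned long end          = 0xffffffff80800000UL; /* __end_rodata */

	/* buggy: count measured from start, but NX marking begins at rodata_start */
	unsigned long buggy = (end - start) >> PAGE_SHIFT;
	/* fixed: count measured from the same base the marking starts at */
	unsigned long fixed = (end - rodata_start) >> PAGE_SHIFT;

	printf("NX pages: buggy=%lu fixed=%lu overshoot=%lu\n",
	       buggy, fixed, buggy - fixed);
	return 0;
}

With these numbers the buggy call marks 1536 pages NX starting at rodata_start, against the 512 actually in rodata: 1024 pages past __end_rodata lose execute permission, which is how kernel code (such as the CPU hotplug path) ended up non-executable.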
Signed-off-by: Pekka Paalanen Acked-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/mm/init_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 41824e776b6c..295be1d07b82 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -782,7 +782,7 @@ void mark_rodata_ro(void) * The rodata section (but not the kernel text!) should also be * not-executable. */ - set_memory_nx(rodata_start, (end - start) >> PAGE_SHIFT); + set_memory_nx(rodata_start, (end - rodata_start) >> PAGE_SHIFT); rodata_test(); -- cgit v1.2.3 From 2f1dafe50cc4e58a239fd81bd47f87f32042a1ee Mon Sep 17 00:00:00 2001 From: Pekka Paalanen Date: Mon, 12 May 2008 21:21:01 +0200 Subject: x86: fix SMP alternatives: use mutex instead of spinlock, text_poke is sleepable text_poke is sleepable. The original fix by Mathieu Desnoyers . Signed-off-by: Pekka Paalanen Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/alternative.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index de240ba2e288..2763cb37b553 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -1,6 +1,6 @@ #include #include -#include +#include #include #include #include @@ -279,7 +279,7 @@ struct smp_alt_module { struct list_head next; }; static LIST_HEAD(smp_alt_modules); -static DEFINE_SPINLOCK(smp_alt); +static DEFINE_MUTEX(smp_alt); static int smp_mode = 1; /* protected by smp_alt */ void alternatives_smp_module_add(struct module *mod, char *name, @@ -312,12 +312,12 @@ void alternatives_smp_module_add(struct module *mod, char *name, __func__, smp->locks, smp->locks_end, smp->text, smp->text_end, smp->name); - spin_lock(&smp_alt); + mutex_lock(&smp_alt); list_add_tail(&smp->next, &smp_alt_modules); if (boot_cpu_has(X86_FEATURE_UP)) alternatives_smp_unlock(smp->locks, smp->locks_end, smp->text, smp->text_end); - spin_unlock(&smp_alt); + mutex_unlock(&smp_alt); } void alternatives_smp_module_del(struct module *mod) @@ -327,17 +327,17 @@ void alternatives_smp_module_del(struct module *mod) if (smp_alt_once || noreplace_smp) return; - spin_lock(&smp_alt); + mutex_lock(&smp_alt); list_for_each_entry(item, &smp_alt_modules, next) { if (mod != item->mod) continue; list_del(&item->next); - spin_unlock(&smp_alt); + mutex_unlock(&smp_alt); DPRINTK("%s: %s\n", __func__, item->name); kfree(item); return; } - spin_unlock(&smp_alt); + mutex_unlock(&smp_alt); } void alternatives_smp_switch(int smp) @@ -359,7 +359,7 @@ void alternatives_smp_switch(int smp) return; BUG_ON(!smp && (num_online_cpus() > 1)); - spin_lock(&smp_alt); + mutex_lock(&smp_alt); /* * Avoid unnecessary switches because it forces JIT based VMs to @@ -383,7 +383,7 @@ void alternatives_smp_switch(int smp) mod->text, mod->text_end); } smp_mode = smp; - spin_unlock(&smp_alt); + mutex_unlock(&smp_alt); } #endif -- cgit v1.2.3 From 37135677e653537ffc6e7def679443272a1c03c3 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 14 May 2008 08:10:31 +0200 Subject: ftrace: fix mcount export bug David S. Miller noticed the following bug: the -pg instrumentation function callback is named differently on each platform. On x86 it is mcount, on sparc it is _mcount. So the export does not make sense in kernel/trace/ftrace.c - move it to x86. 
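The per-arch naming is easy to confirm straight from the compiler (a sketch; the exact assembly depends on gcc version and target):

/* pg-demo.c -- compile with: gcc -pg -S pg-demo.c
 *
 * On x86 the generated prologue contains:   call mcount
 * On sparc64 it would contain:              call _mcount
 *
 * Modules built with -pg reference the symbol by its per-arch name,
 * which is why the EXPORT_SYMBOL has to live in the architecture
 * that defines it. */
void pg_demo(void)
{
}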
Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/i386_ksyms_32.c | 9 ++++++++- arch/x86/kernel/x8664_ksyms_64.c | 11 +++++++++-- kernel/trace/ftrace.c | 3 --- 3 files changed, 17 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/i386_ksyms_32.c b/arch/x86/kernel/i386_ksyms_32.c index deb43785e923..29999dbb754c 100644 --- a/arch/x86/kernel/i386_ksyms_32.c +++ b/arch/x86/kernel/i386_ksyms_32.c @@ -1,7 +1,14 @@ +#include #include + #include -#include #include +#include + +#ifdef CONFIG_FTRACE +/* mcount is defined in assembly */ +EXPORT_SYMBOL(mcount); +#endif /* Networking helper routines. */ EXPORT_SYMBOL(csum_partial_copy_generic); diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c index f6c05d0410fb..122885bc5f3b 100644 --- a/arch/x86/kernel/x8664_ksyms_64.c +++ b/arch/x86/kernel/x8664_ksyms_64.c @@ -1,15 +1,22 @@ /* Exports for assembly files. All C exports should go in the respective C files. */ +#include #include -#include #include +#include + #include -#include #include +#include #include +#ifdef CONFIG_FTRACE +/* mcount is defined in assembly */ +EXPORT_SYMBOL(mcount); +#endif + EXPORT_SYMBOL(kernel_thread); EXPORT_SYMBOL(__get_user_1); diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 07b2a14943f8..a3e47f43f8a0 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -50,9 +50,6 @@ static struct ftrace_ops ftrace_list_end __read_mostly = static struct ftrace_ops *ftrace_list __read_mostly = &ftrace_list_end; ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; -/* mcount is defined per arch in assembly */ -EXPORT_SYMBOL(mcount); - void ftrace_list_func(unsigned long ip, unsigned long parent_ip) { struct ftrace_ops *op = ftrace_list; -- cgit v1.2.3 From 7fa09f24b477ad41b821713eba757b3aa7a2864a Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 14 May 2008 21:30:32 -0400 Subject: ftrace: use the new kbuild CFLAGS_REMOVE for x86/kernel directory This patch removes the Makefile turd and uses the nice CFLAGS_REMOVE macro in the x86/kernel directory. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/Makefile | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index e142091524b0..739d49acd2f1 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -6,6 +6,13 @@ extra-y := head_$(BITS).o head$(BITS).o init_task.o vmlinux.lds CPPFLAGS_vmlinux.lds += -U$(UTS_MACHINE) +ifdef CONFIG_FTRACE +# Do not profile debug utilities +CFLAGS_REMOVE_tsc_64.o = -pg +CFLAGS_REMOVE_tsc_32.o = -pg +CFLAGS_REMOVE_rtc.o = -pg +endif + # # vsyscalls (which work on the user stack) should have # no stack-protector checks: -- cgit v1.2.3 From 677aa9f77e8de3791b481a0cec6c8b84d1eec626 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Sat, 17 May 2008 00:01:36 -0400 Subject: ftrace: add have dynamic ftrace config for archs Now that ftrace is being ported to other architectures, it has become apparent that DYNAMIC_FTRACE is dependent on whether or not that architecture implements dynamic ftrace. FTRACE itself may be ported to an architecture without porting dynamic ftrace. This patch adds HAVE_DYNAMIC_FTRACE to allow architectures to port ftrace without having to also port the dynamic aspect as well. 
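A hedged Kconfig sketch of the resulting split, with NEWARCH as a purely hypothetical architecture symbol:

config NEWARCH
	bool
	default y
	select HAVE_FTRACE
	# No 'select HAVE_DYNAMIC_FTRACE' yet: DYNAMIC_FTRACE depends on
	# HAVE_DYNAMIC_FTRACE, so it cannot be enabled for this
	# architecture until the dynamic side is implemented too.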
Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/powerpc/Kconfig | 1 + arch/sparc64/Kconfig | 1 + arch/x86/Kconfig | 1 + kernel/trace/Kconfig | 4 ++++ 4 files changed, 7 insertions(+) (limited to 'arch/x86') diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 62d034adbd43..a5e9912e2d37 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -105,6 +105,7 @@ config ARCH_NO_VIRT_TO_BUS config PPC bool default y + select HAVE_DYNAMIC_FTRACE select HAVE_FTRACE select HAVE_IDE select HAVE_KPROBES diff --git a/arch/sparc64/Kconfig b/arch/sparc64/Kconfig index a480df6e6012..fca9246470b1 100644 --- a/arch/sparc64/Kconfig +++ b/arch/sparc64/Kconfig @@ -11,6 +11,7 @@ config SPARC config SPARC64 bool default y + select HAVE_DYNAMIC_FTRACE select HAVE_FTRACE select HAVE_IDE select HAVE_LMB diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index c742dfeb0dbe..fc86c54e791e 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -23,6 +23,7 @@ config X86 select HAVE_OPROFILE select HAVE_KPROBES select HAVE_KRETPROBES + select HAVE_DYNAMIC_FTRACE select HAVE_FTRACE select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64) select HAVE_ARCH_KGDB if !X86_VOYAGER diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index f3005717bcd0..5c2295b29f2c 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -4,6 +4,9 @@ config HAVE_FTRACE bool +config HAVE_DYNAMIC_FTRACE + bool + config TRACER_MAX_TRACE bool @@ -94,6 +97,7 @@ config CONTEXT_SWITCH_TRACER config DYNAMIC_FTRACE bool "enable/disable ftrace tracepoints dynamically" depends on FTRACE + depends on HAVE_DYNAMIC_FTRACE default y help This option will modify all the calls to ftrace dynamically -- cgit v1.2.3 From 8b7d89d02ef3c6a7c73d6596f28cea7632850af4 Mon Sep 17 00:00:00 2001 From: Pekka Paalanen Date: Mon, 12 May 2008 21:20:56 +0200 Subject: x86: mmiotrace - trace memory mapped IO Mmiotrace is a tool for trapping memory mapped IO (MMIO) accesses within the kernel. It is used for debugging and especially for reverse engineering evil binary drivers. Mmiotrace works by wrapping the ioremap family of kernel functions and marking the returned pages as not present. Access to the IO memory triggers a page fault, which will be handled by mmiotrace's custom page fault handler. This will single-step the faulted instruction with the MMIO page marked as present. Access logs are directed to user space via relay and debug_fs. This page fault approach is necessary, because binary drivers have readl/writel etc. calls inlined and therefore extremely difficult to trap with with e.g. kprobes. This patch depends on the custom page fault handlers patch. 
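For context, a minimal sketch of how a driver under study is instrumented, assuming the wrapper prototypes the module exports; the base address and length here are hypothetical:

#include <linux/io.h>

extern void __iomem *ioremap_nocache_trace(unsigned long offset,
					   unsigned long size);
extern void iounmap_trace(volatile void __iomem *addr);

#define MY_BAR_PHYS 0xfd000000UL	/* hypothetical MMIO base */
#define MY_BAR_LEN  0x4000UL		/* hypothetical mapping size */

static void __iomem *mmio_base;

static int my_driver_map(void)
{
	mmio_base = ioremap_nocache_trace(MY_BAR_PHYS, MY_BAR_LEN);
	if (!mmio_base)
		return -ENOMEM;
	/* readl()/writel() through mmio_base now fault into mmiotrace,
	 * which logs the access and single-steps the instruction. */
	return 0;
}

static void my_driver_unmap(void)
{
	iounmap_trace(mmio_base);
}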
Signed-off-by: Pekka Paalanen Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/Kconfig.debug | 27 ++ arch/x86/kernel/Makefile | 2 + arch/x86/kernel/init_task.c | 1 + arch/x86/kernel/mmiotrace/Makefile | 4 + arch/x86/kernel/mmiotrace/kmmio.c | 391 ++++++++++++++++++++++ arch/x86/kernel/mmiotrace/kmmio.h | 58 ++++ arch/x86/kernel/mmiotrace/mmio-mod.c | 527 ++++++++++++++++++++++++++++++ arch/x86/kernel/mmiotrace/pf_in.c | 489 +++++++++++++++++++++++++++ arch/x86/kernel/mmiotrace/pf_in.h | 39 +++ arch/x86/kernel/mmiotrace/testmmiotrace.c | 77 +++++ include/linux/mmiotrace.h | 62 ++++ 11 files changed, 1677 insertions(+) create mode 100644 arch/x86/kernel/mmiotrace/Makefile create mode 100644 arch/x86/kernel/mmiotrace/kmmio.c create mode 100644 arch/x86/kernel/mmiotrace/kmmio.h create mode 100644 arch/x86/kernel/mmiotrace/mmio-mod.c create mode 100644 arch/x86/kernel/mmiotrace/pf_in.c create mode 100644 arch/x86/kernel/mmiotrace/pf_in.h create mode 100644 arch/x86/kernel/mmiotrace/testmmiotrace.c create mode 100644 include/linux/mmiotrace.h (limited to 'arch/x86') diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 9431a8399844..7c6496e2225e 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -176,6 +176,33 @@ config PAGE_FAULT_HANDLERS register a function that is called on every page fault. Custom handlers are used by some debugging and reverse engineering tools. +config MMIOTRACE + tristate "Memory mapped IO tracing" + depends on DEBUG_KERNEL && PAGE_FAULT_HANDLERS && RELAY && DEBUG_FS + default n + help + This will build a kernel module called mmiotrace. + + Mmiotrace traces Memory Mapped I/O access and is meant for debugging + and reverse engineering. The kernel module offers wrapped + versions of the ioremap family of functions. The driver to be traced + must be modified to call these wrappers. A user space program is + required to collect the MMIO data. + + See http://nouveau.freedesktop.org/wiki/MmioTrace + If you are not helping to develop drivers, say N. + +config MMIOTRACE_TEST + tristate "Test module for mmiotrace" + depends on MMIOTRACE && m + default n + help + This is a dumb module for testing mmiotrace. It is very dangerous + as it will write garbage to IO memory starting at a given address. + However, it should be safe to use on e.g. unused portion of VRAM. + + Say N, unless you absolutely know what you are doing. + # # IO delay types: # diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 739d49acd2f1..a51ac153685e 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -79,6 +79,8 @@ obj-$(CONFIG_KGDB) += kgdb.o obj-$(CONFIG_VM86) += vm86_32.o obj-$(CONFIG_EARLY_PRINTK) += early_printk.o +obj-$(CONFIG_MMIOTRACE) += mmiotrace/ + obj-$(CONFIG_HPET_TIMER) += hpet.o obj-$(CONFIG_K8_NB) += k8.o diff --git a/arch/x86/kernel/init_task.c b/arch/x86/kernel/init_task.c index a4f93b4120c1..027a5b6a12b2 100644 --- a/arch/x86/kernel/init_task.c +++ b/arch/x86/kernel/init_task.c @@ -15,6 +15,7 @@ static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); struct mm_struct init_mm = INIT_MM(init_mm); EXPORT_UNUSED_SYMBOL(init_mm); /* will be removed in 2.6.26 */ +EXPORT_SYMBOL_GPL(init_mm); /* * Initial thread structure. 
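Before the mmiotrace internals below, a minimal sketch of a client of the custom page fault hook from the earlier patch (module names are hypothetical); a non-zero return claims the fault and makes do_page_fault() return early:

#include <linux/module.h>
#include <linux/rcupdate.h>
#include <asm/kdebug.h>

static int my_pf(struct pt_regs *regs, unsigned long error_code,
		 unsigned long address)
{
	return 0;	/* not ours: fall through to normal handling */
}

static struct pf_handler my_hook = {
	.handler = my_pf,
};

static int __init my_init(void)
{
	register_page_fault_handler(&my_hook);
	return 0;
}

static void __exit my_exit(void)
{
	unregister_page_fault_handler(&my_hook);
	synchronize_rcu();	/* the handler list is RCU-protected */
}

module_init(my_init);
module_exit(my_exit);
MODULE_LICENSE("GPL");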
diff --git a/arch/x86/kernel/mmiotrace/Makefile b/arch/x86/kernel/mmiotrace/Makefile new file mode 100644 index 000000000000..d6905f7f981b --- /dev/null +++ b/arch/x86/kernel/mmiotrace/Makefile @@ -0,0 +1,4 @@ +obj-$(CONFIG_MMIOTRACE) += mmiotrace.o +mmiotrace-objs := pf_in.o kmmio.o mmio-mod.o + +obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o diff --git a/arch/x86/kernel/mmiotrace/kmmio.c b/arch/x86/kernel/mmiotrace/kmmio.c new file mode 100644 index 000000000000..8ba48f9c91b4 --- /dev/null +++ b/arch/x86/kernel/mmiotrace/kmmio.c @@ -0,0 +1,391 @@ +/* Support for MMIO probes. + * Benfit many code from kprobes + * (C) 2002 Louis Zhuang . + * 2007 Alexander Eichner + * 2008 Pekka Paalanen + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "kmmio.h" + +#define KMMIO_HASH_BITS 6 +#define KMMIO_TABLE_SIZE (1 << KMMIO_HASH_BITS) +#define KMMIO_PAGE_HASH_BITS 4 +#define KMMIO_PAGE_TABLE_SIZE (1 << KMMIO_PAGE_HASH_BITS) + +struct kmmio_context { + struct kmmio_fault_page *fpage; + struct kmmio_probe *probe; + unsigned long saved_flags; + int active; +}; + +static int kmmio_page_fault(struct pt_regs *regs, unsigned long error_code, + unsigned long address); +static int kmmio_die_notifier(struct notifier_block *nb, unsigned long val, + void *args); + +static DEFINE_SPINLOCK(kmmio_lock); + +/* These are protected by kmmio_lock */ +unsigned int kmmio_count; +static unsigned int handler_registered; +static struct list_head kmmio_page_table[KMMIO_PAGE_TABLE_SIZE]; +static LIST_HEAD(kmmio_probes); + +static struct kmmio_context kmmio_ctx[NR_CPUS]; + +static struct pf_handler kmmio_pf_hook = { + .handler = kmmio_page_fault +}; + +static struct notifier_block nb_die = { + .notifier_call = kmmio_die_notifier +}; + +int init_kmmio(void) +{ + int i; + for (i = 0; i < KMMIO_PAGE_TABLE_SIZE; i++) + INIT_LIST_HEAD(&kmmio_page_table[i]); + + register_die_notifier(&nb_die); + return 0; +} + +void cleanup_kmmio(void) +{ + /* + * Assume the following have been already cleaned by calling + * unregister_kmmio_probe() appropriately: + * kmmio_page_table, kmmio_probes + */ + if (handler_registered) { + unregister_page_fault_handler(&kmmio_pf_hook); + synchronize_rcu(); + } + unregister_die_notifier(&nb_die); +} + +/* + * this is basically a dynamic stabbing problem: + * Could use the existing prio tree code or + * Possible better implementations: + * The Interval Skip List: A Data Structure for Finding All Intervals That + * Overlap a Point (might be simple) + * Space Efficient Dynamic Stabbing with Fast Queries - Mikkel Thorup + */ +/* Get the kmmio at this addr (if any). You must be holding kmmio_lock. 
*/ +static struct kmmio_probe *get_kmmio_probe(unsigned long addr) +{ + struct kmmio_probe *p; + list_for_each_entry(p, &kmmio_probes, list) { + if (addr >= p->addr && addr <= (p->addr + p->len)) + return p; + } + return NULL; +} + +static struct kmmio_fault_page *get_kmmio_fault_page(unsigned long page) +{ + struct list_head *head, *tmp; + + page &= PAGE_MASK; + head = &kmmio_page_table[hash_long(page, KMMIO_PAGE_HASH_BITS)]; + list_for_each(tmp, head) { + struct kmmio_fault_page *p + = list_entry(tmp, struct kmmio_fault_page, list); + if (p->page == page) + return p; + } + + return NULL; +} + +static void arm_kmmio_fault_page(unsigned long page, int *large) +{ + unsigned long address = page & PAGE_MASK; + pgd_t *pgd = pgd_offset_k(address); + pud_t *pud = pud_offset(pgd, address); + pmd_t *pmd = pmd_offset(pud, address); + pte_t *pte = pte_offset_kernel(pmd, address); + + if (pmd_large(*pmd)) { + set_pmd(pmd, __pmd(pmd_val(*pmd) & ~_PAGE_PRESENT)); + if (large) + *large = 1; + } else { + set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_PRESENT)); + } + + __flush_tlb_one(page); +} + +static void disarm_kmmio_fault_page(unsigned long page, int *large) +{ + unsigned long address = page & PAGE_MASK; + pgd_t *pgd = pgd_offset_k(address); + pud_t *pud = pud_offset(pgd, address); + pmd_t *pmd = pmd_offset(pud, address); + pte_t *pte = pte_offset_kernel(pmd, address); + + if (large && *large) { + set_pmd(pmd, __pmd(pmd_val(*pmd) | _PAGE_PRESENT)); + *large = 0; + } else { + set_pte(pte, __pte(pte_val(*pte) | _PAGE_PRESENT)); + } + + __flush_tlb_one(page); +} + +/* + * Interrupts are disabled on entry as trap3 is an interrupt gate + * and they remain disabled thorough out this function. + */ +static int kmmio_handler(struct pt_regs *regs, unsigned long addr) +{ + struct kmmio_context *ctx; + int cpu; + + /* + * Preemption is now disabled to prevent process switch during + * single stepping. We can only handle one active kmmio trace + * per cpu, so ensure that we finish it before something else + * gets to run. + * + * XXX what if an interrupt occurs between returning from + * do_page_fault() and entering the single-step exception handler? + * And that interrupt triggers a kmmio trap? + */ + preempt_disable(); + cpu = smp_processor_id(); + ctx = &kmmio_ctx[cpu]; + + /* interrupts disabled and CPU-local data => atomicity guaranteed. */ + if (ctx->active) { + /* + * This avoids a deadlock with kmmio_lock. + * If this page fault really was due to kmmio trap, + * all hell breaks loose. + */ + printk(KERN_EMERG "mmiotrace: recursive probe hit on CPU %d, " + "for address %lu. Ignoring.\n", + cpu, addr); + goto no_kmmio; + } + ctx->active++; + + /* + * Acquire the kmmio lock to prevent changes affecting + * get_kmmio_fault_page() and get_kmmio_probe(), since we save their + * returned pointers. + * The lock is released in post_kmmio_handler(). + * XXX: could/should get_kmmio_*() be using RCU instead of spinlock? + */ + spin_lock(&kmmio_lock); + + ctx->fpage = get_kmmio_fault_page(addr); + if (!ctx->fpage) { + /* this page fault is not caused by kmmio */ + goto no_kmmio_locked; + } + + ctx->probe = get_kmmio_probe(addr); + ctx->saved_flags = (regs->flags & (TF_MASK|IF_MASK)); + + if (ctx->probe && ctx->probe->pre_handler) + ctx->probe->pre_handler(ctx->probe, regs, addr); + + regs->flags |= TF_MASK; + regs->flags &= ~IF_MASK; + + /* We hold lock, now we set present bit in PTE and single step. 
*/ + disarm_kmmio_fault_page(ctx->fpage->page, NULL); + + return 1; + +no_kmmio_locked: + spin_unlock(&kmmio_lock); + ctx->active--; +no_kmmio: + preempt_enable_no_resched(); + /* page fault not handled by kmmio */ + return 0; +} + +/* + * Interrupts are disabled on entry as trap1 is an interrupt gate + * and they remain disabled thorough out this function. + * And we hold kmmio lock. + */ +static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs) +{ + int cpu = smp_processor_id(); + struct kmmio_context *ctx = &kmmio_ctx[cpu]; + + if (!ctx->active) + return 0; + + if (ctx->probe && ctx->probe->post_handler) + ctx->probe->post_handler(ctx->probe, condition, regs); + + arm_kmmio_fault_page(ctx->fpage->page, NULL); + + regs->flags &= ~TF_MASK; + regs->flags |= ctx->saved_flags; + + /* These were acquired in kmmio_handler(). */ + ctx->active--; + spin_unlock(&kmmio_lock); + preempt_enable_no_resched(); + + /* + * if somebody else is singlestepping across a probe point, flags + * will have TF set, in which case, continue the remaining processing + * of do_debug, as if this is not a probe hit. + */ + if (regs->flags & TF_MASK) + return 0; + + return 1; +} + +static int add_kmmio_fault_page(unsigned long page) +{ + struct kmmio_fault_page *f; + + page &= PAGE_MASK; + f = get_kmmio_fault_page(page); + if (f) { + f->count++; + return 0; + } + + f = kmalloc(sizeof(*f), GFP_ATOMIC); + if (!f) + return -1; + + f->count = 1; + f->page = page; + list_add(&f->list, + &kmmio_page_table[hash_long(f->page, KMMIO_PAGE_HASH_BITS)]); + + arm_kmmio_fault_page(f->page, NULL); + + return 0; +} + +static void release_kmmio_fault_page(unsigned long page) +{ + struct kmmio_fault_page *f; + + page &= PAGE_MASK; + f = get_kmmio_fault_page(page); + if (!f) + return; + + f->count--; + if (!f->count) { + disarm_kmmio_fault_page(f->page, NULL); + list_del(&f->list); + } +} + +int register_kmmio_probe(struct kmmio_probe *p) +{ + int ret = 0; + unsigned long size = 0; + + spin_lock_irq(&kmmio_lock); + kmmio_count++; + if (get_kmmio_probe(p->addr)) { + ret = -EEXIST; + goto out; + } + list_add(&p->list, &kmmio_probes); + /*printk("adding fault pages...\n");*/ + while (size < p->len) { + if (add_kmmio_fault_page(p->addr + size)) + printk(KERN_ERR "mmio: Unable to set page fault.\n"); + size += PAGE_SIZE; + } + + if (!handler_registered) { + register_page_fault_handler(&kmmio_pf_hook); + handler_registered++; + } + +out: + spin_unlock_irq(&kmmio_lock); + /* + * XXX: What should I do here? + * Here was a call to global_flush_tlb(), but it does not exist + * anymore. + */ + return ret; +} + +void unregister_kmmio_probe(struct kmmio_probe *p) +{ + unsigned long size = 0; + + spin_lock_irq(&kmmio_lock); + while (size < p->len) { + release_kmmio_fault_page(p->addr + size); + size += PAGE_SIZE; + } + list_del(&p->list); + kmmio_count--; + spin_unlock_irq(&kmmio_lock); +} + +/* + * According to 2.6.20, mainly x86_64 arch: + * This is being called from do_page_fault(), via the page fault notifier + * chain. The chain is called for both user space faults and kernel space + * faults (address >= TASK_SIZE64), except not on faults serviced by + * vmalloc_fault(). + * + * We may be in an interrupt or a critical section. Also prefecthing may + * trigger a page fault. We may be in the middle of process switch. + * The page fault hook functionality has put us inside RCU read lock. + * + * Local interrupts are disabled, so preemption cannot happen. + * Do not enable interrupts, do not sleep, and watch out for other CPUs. 
+ */ +static int kmmio_page_fault(struct pt_regs *regs, unsigned long error_code, + unsigned long address) +{ + if (is_kmmio_active()) + if (kmmio_handler(regs, address) == 1) + return -1; + return 0; +} + +static int kmmio_die_notifier(struct notifier_block *nb, unsigned long val, + void *args) +{ + struct die_args *arg = args; + + if (val == DIE_DEBUG) + if (post_kmmio_handler(arg->err, arg->regs) == 1) + return NOTIFY_STOP; + + return NOTIFY_DONE; +} diff --git a/arch/x86/kernel/mmiotrace/kmmio.h b/arch/x86/kernel/mmiotrace/kmmio.h new file mode 100644 index 000000000000..85b7f68a3b8a --- /dev/null +++ b/arch/x86/kernel/mmiotrace/kmmio.h @@ -0,0 +1,58 @@ +#ifndef _LINUX_KMMIO_H +#define _LINUX_KMMIO_H + +#include +#include +#include +#include +#include +#include +#include + +struct kmmio_probe; +struct kmmio_fault_page; +struct pt_regs; + +typedef void (*kmmio_pre_handler_t)(struct kmmio_probe *, + struct pt_regs *, unsigned long addr); +typedef void (*kmmio_post_handler_t)(struct kmmio_probe *, + unsigned long condition, struct pt_regs *); + +struct kmmio_probe { + struct list_head list; + + /* start location of the probe point */ + unsigned long addr; + + /* length of the probe region */ + unsigned long len; + + /* Called before addr is executed. */ + kmmio_pre_handler_t pre_handler; + + /* Called after addr is executed, unless... */ + kmmio_post_handler_t post_handler; +}; + +struct kmmio_fault_page { + struct list_head list; + + /* location of the fault page */ + unsigned long page; + + int count; +}; + +/* kmmio is active by some kmmio_probes? */ +static inline int is_kmmio_active(void) +{ + extern unsigned int kmmio_count; + return kmmio_count; +} + +int init_kmmio(void); +void cleanup_kmmio(void); +int register_kmmio_probe(struct kmmio_probe *p); +void unregister_kmmio_probe(struct kmmio_probe *p); + +#endif /* _LINUX_KMMIO_H */ diff --git a/arch/x86/kernel/mmiotrace/mmio-mod.c b/arch/x86/kernel/mmiotrace/mmio-mod.c new file mode 100644 index 000000000000..73561fe85f03 --- /dev/null +++ b/arch/x86/kernel/mmiotrace/mmio-mod.c @@ -0,0 +1,527 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) IBM Corporation, 2005 + * Jeff Muizelaar, 2006, 2007 + * Pekka Paalanen, 2008 + * + * Derived from the read-mod example from relay-examples by Tom Zanussi. 
+ */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* for ISA_START_ADDRESS */ + +#include "kmmio.h" +#include "pf_in.h" + +/* This app's relay channel files will appear in /debug/mmio-trace */ +#define APP_DIR "mmio-trace" +/* the marker injection file in /proc */ +#define MARKER_FILE "mmio-marker" + +#define MODULE_NAME "mmiotrace" + +struct trap_reason { + unsigned long addr; + unsigned long ip; + enum reason_type type; + int active_traces; +}; + +static struct trap_reason pf_reason[NR_CPUS]; +static struct mm_io_header_rw cpu_trace[NR_CPUS]; + +static struct file_operations mmio_fops = { + .owner = THIS_MODULE, +}; + +static const size_t subbuf_size = 256*1024; +static struct rchan *chan; +static struct dentry *dir; +static int suspended; /* XXX should this be per cpu? */ +static struct proc_dir_entry *proc_marker_file; + +/* module parameters */ +static unsigned int n_subbufs = 32*4; +static unsigned long filter_offset; +static int nommiotrace; +static int ISA_trace; +static int trace_pc; + +module_param(n_subbufs, uint, 0); +module_param(filter_offset, ulong, 0); +module_param(nommiotrace, bool, 0); +module_param(ISA_trace, bool, 0); +module_param(trace_pc, bool, 0); + +MODULE_PARM_DESC(n_subbufs, "Number of 256kB buffers, default 128."); +MODULE_PARM_DESC(filter_offset, "Start address of traced mappings."); +MODULE_PARM_DESC(nommiotrace, "Disable actual MMIO tracing."); +MODULE_PARM_DESC(ISA_trace, "Do not exclude the low ISA range."); +MODULE_PARM_DESC(trace_pc, "Record address of faulting instructions."); + +static void record_timestamp(struct mm_io_header *header) +{ + struct timespec now; + + getnstimeofday(&now); + header->sec = now.tv_sec; + header->nsec = now.tv_nsec; +} + +/* + * Write callback for the /proc entry: + * Read a marker and write it to the mmio trace log + */ +static int write_marker(struct file *file, const char __user *buffer, + unsigned long count, void *data) +{ + char *event = NULL; + struct mm_io_header *headp; + int len = (count > 65535) ? 65535 : count; + + event = kzalloc(sizeof(*headp) + len, GFP_KERNEL); + if (!event) + return -ENOMEM; + + headp = (struct mm_io_header *)event; + headp->type = MMIO_MAGIC | (MMIO_MARKER << MMIO_OPCODE_SHIFT); + headp->data_len = len; + record_timestamp(headp); + + if (copy_from_user(event + sizeof(*headp), buffer, len)) { + kfree(event); + return -EFAULT; + } + + relay_write(chan, event, sizeof(*headp) + len); + kfree(event); + return len; +} + +static void print_pte(unsigned long address) +{ + pgd_t *pgd = pgd_offset_k(address); + pud_t *pud = pud_offset(pgd, address); + pmd_t *pmd = pmd_offset(pud, address); + if (pmd_large(*pmd)) { + printk(KERN_EMERG MODULE_NAME ": 4MB pages are not " + "currently supported: %lx\n", + address); + BUG(); + } + printk(KERN_DEBUG MODULE_NAME ": pte for 0x%lx: 0x%lx 0x%lx\n", + address, + pte_val(*pte_offset_kernel(pmd, address)), + pte_val(*pte_offset_kernel(pmd, address)) & _PAGE_PRESENT); +} + +/* + * For some reason the pre/post pairs have been called in an + * unmatched order. Report and die. 
+ */ +static void die_kmmio_nesting_error(struct pt_regs *regs, unsigned long addr) +{ + const unsigned long cpu = smp_processor_id(); + printk(KERN_EMERG MODULE_NAME ": unexpected fault for address: %lx, " + "last fault for address: %lx\n", + addr, pf_reason[cpu].addr); + print_pte(addr); +#ifdef __i386__ + print_symbol(KERN_EMERG "faulting EIP is at %s\n", regs->ip); + print_symbol(KERN_EMERG "last faulting EIP was at %s\n", + pf_reason[cpu].ip); + printk(KERN_EMERG + "eax: %08lx ebx: %08lx ecx: %08lx edx: %08lx\n", + regs->ax, regs->bx, regs->cx, regs->dx); + printk(KERN_EMERG + "esi: %08lx edi: %08lx ebp: %08lx esp: %08lx\n", + regs->si, regs->di, regs->bp, regs->sp); +#else + print_symbol(KERN_EMERG "faulting RIP is at %s\n", regs->ip); + print_symbol(KERN_EMERG "last faulting RIP was at %s\n", + pf_reason[cpu].ip); + printk(KERN_EMERG "rax: %016lx rcx: %016lx rdx: %016lx\n", + regs->ax, regs->cx, regs->dx); + printk(KERN_EMERG "rsi: %016lx rdi: %016lx " + "rbp: %016lx rsp: %016lx\n", + regs->si, regs->di, regs->bp, regs->sp); +#endif + BUG(); +} + +static void pre(struct kmmio_probe *p, struct pt_regs *regs, + unsigned long addr) +{ + const unsigned long cpu = smp_processor_id(); + const unsigned long instptr = instruction_pointer(regs); + const enum reason_type type = get_ins_type(instptr); + + /* it doesn't make sense to have more than one active trace per cpu */ + if (pf_reason[cpu].active_traces) + die_kmmio_nesting_error(regs, addr); + else + pf_reason[cpu].active_traces++; + + pf_reason[cpu].type = type; + pf_reason[cpu].addr = addr; + pf_reason[cpu].ip = instptr; + + cpu_trace[cpu].header.type = MMIO_MAGIC; + cpu_trace[cpu].header.pid = 0; + cpu_trace[cpu].header.data_len = sizeof(struct mm_io_rw); + cpu_trace[cpu].rw.address = addr; + + /* + * Only record the program counter when requested. + * It may taint clean-room reverse engineering. + */ + if (trace_pc) + cpu_trace[cpu].rw.pc = instptr; + else + cpu_trace[cpu].rw.pc = 0; + + record_timestamp(&cpu_trace[cpu].header); + + switch (type) { + case REG_READ: + cpu_trace[cpu].header.type |= + (MMIO_READ << MMIO_OPCODE_SHIFT) | + (get_ins_mem_width(instptr) << MMIO_WIDTH_SHIFT); + break; + case REG_WRITE: + cpu_trace[cpu].header.type |= + (MMIO_WRITE << MMIO_OPCODE_SHIFT) | + (get_ins_mem_width(instptr) << MMIO_WIDTH_SHIFT); + cpu_trace[cpu].rw.value = get_ins_reg_val(instptr, regs); + break; + case IMM_WRITE: + cpu_trace[cpu].header.type |= + (MMIO_WRITE << MMIO_OPCODE_SHIFT) | + (get_ins_mem_width(instptr) << MMIO_WIDTH_SHIFT); + cpu_trace[cpu].rw.value = get_ins_imm_val(instptr); + break; + default: + { + unsigned char *ip = (unsigned char *)instptr; + cpu_trace[cpu].header.type |= + (MMIO_UNKNOWN_OP << MMIO_OPCODE_SHIFT); + cpu_trace[cpu].rw.value = (*ip) << 16 | + *(ip + 1) << 8 | + *(ip + 2); + } + } +} + +static void post(struct kmmio_probe *p, unsigned long condition, + struct pt_regs *regs) +{ + const unsigned long cpu = smp_processor_id(); + + /* this should always return the active_trace count to 0 */ + pf_reason[cpu].active_traces--; + if (pf_reason[cpu].active_traces) { + printk(KERN_EMERG MODULE_NAME ": unexpected post handler"); + BUG(); + } + + switch (pf_reason[cpu].type) { + case REG_READ: + cpu_trace[cpu].rw.value = get_ins_reg_val(pf_reason[cpu].ip, + regs); + break; + default: + break; + } + relay_write(chan, &cpu_trace[cpu], sizeof(struct mm_io_header_rw)); +} + +/* + * subbuf_start() relay callback. + * + * Defined so that we know when events are dropped due to the buffer-full + * condition. 
+ */ +static int subbuf_start_handler(struct rchan_buf *buf, void *subbuf, + void *prev_subbuf, size_t prev_padding) +{ + if (relay_buf_full(buf)) { + if (!suspended) { + suspended = 1; + printk(KERN_ERR MODULE_NAME + ": cpu %d buffer full!!!\n", + smp_processor_id()); + } + return 0; + } else if (suspended) { + suspended = 0; + printk(KERN_ERR MODULE_NAME + ": cpu %d buffer no longer full.\n", + smp_processor_id()); + } + + return 1; +} + +/* file_create() callback. Creates relay file in debugfs. */ +static struct dentry *create_buf_file_handler(const char *filename, + struct dentry *parent, + int mode, + struct rchan_buf *buf, + int *is_global) +{ + struct dentry *buf_file; + + mmio_fops.read = relay_file_operations.read; + mmio_fops.open = relay_file_operations.open; + mmio_fops.poll = relay_file_operations.poll; + mmio_fops.mmap = relay_file_operations.mmap; + mmio_fops.release = relay_file_operations.release; + mmio_fops.splice_read = relay_file_operations.splice_read; + + buf_file = debugfs_create_file(filename, mode, parent, buf, + &mmio_fops); + + return buf_file; +} + +/* file_remove() default callback. Removes relay file in debugfs. */ +static int remove_buf_file_handler(struct dentry *dentry) +{ + debugfs_remove(dentry); + return 0; +} + +static struct rchan_callbacks relay_callbacks = { + .subbuf_start = subbuf_start_handler, + .create_buf_file = create_buf_file_handler, + .remove_buf_file = remove_buf_file_handler, +}; + +/* + * create_channel - creates channel /debug/APP_DIR/cpuXXX + * Returns channel on success, NULL otherwise + */ +static struct rchan *create_channel(unsigned size, unsigned n) +{ + return relay_open("cpu", dir, size, n, &relay_callbacks, NULL); +} + +/* destroy_channel - destroys channel /debug/APP_DIR/cpuXXX */ +static void destroy_channel(void) +{ + if (chan) { + relay_close(chan); + chan = NULL; + } +} + +struct remap_trace { + struct list_head list; + struct kmmio_probe probe; +}; +static LIST_HEAD(trace_list); +static DEFINE_SPINLOCK(trace_list_lock); + +static void do_ioremap_trace_core(unsigned long offset, unsigned long size, + void __iomem *addr) +{ + struct remap_trace *trace = kmalloc(sizeof(*trace), GFP_KERNEL); + struct mm_io_header_map event = { + .header = { + .type = MMIO_MAGIC | + (MMIO_PROBE << MMIO_OPCODE_SHIFT), + .sec = 0, + .nsec = 0, + .pid = 0, + .data_len = sizeof(struct mm_io_map) + }, + .map = { + .phys = offset, + .addr = (unsigned long)addr, + .len = size, + .pc = 0 + } + }; + record_timestamp(&event.header); + + *trace = (struct remap_trace) { + .probe = { + .addr = (unsigned long)addr, + .len = size, + .pre_handler = pre, + .post_handler = post, + } + }; + + relay_write(chan, &event, sizeof(event)); + spin_lock(&trace_list_lock); + list_add_tail(&trace->list, &trace_list); + spin_unlock(&trace_list_lock); + if (!nommiotrace) + register_kmmio_probe(&trace->probe); +} + +static void ioremap_trace_core(unsigned long offset, unsigned long size, + void __iomem *addr) +{ + if ((filter_offset) && (offset != filter_offset)) + return; + + /* Don't trace the low PCI/ISA area, it's always mapped.. 
*/ + if (!ISA_trace && (offset < ISA_END_ADDRESS) && + (offset + size > ISA_START_ADDRESS)) { + printk(KERN_NOTICE MODULE_NAME ": Ignoring map of low " + "PCI/ISA area (0x%lx-0x%lx)\n", + offset, offset + size); + return; + } + do_ioremap_trace_core(offset, size, addr); +} + +void __iomem *ioremap_cache_trace(unsigned long offset, unsigned long size) +{ + void __iomem *p = ioremap_cache(offset, size); + printk(KERN_DEBUG MODULE_NAME ": ioremap_cache(0x%lx, 0x%lx) = %p\n", + offset, size, p); + ioremap_trace_core(offset, size, p); + return p; +} +EXPORT_SYMBOL(ioremap_cache_trace); + +void __iomem *ioremap_nocache_trace(unsigned long offset, unsigned long size) +{ + void __iomem *p = ioremap_nocache(offset, size); + printk(KERN_DEBUG MODULE_NAME ": ioremap_nocache(0x%lx, 0x%lx) = %p\n", + offset, size, p); + ioremap_trace_core(offset, size, p); + return p; +} +EXPORT_SYMBOL(ioremap_nocache_trace); + +void iounmap_trace(volatile void __iomem *addr) +{ + struct mm_io_header_map event = { + .header = { + .type = MMIO_MAGIC | + (MMIO_UNPROBE << MMIO_OPCODE_SHIFT), + .sec = 0, + .nsec = 0, + .pid = 0, + .data_len = sizeof(struct mm_io_map) + }, + .map = { + .phys = 0, + .addr = (unsigned long)addr, + .len = 0, + .pc = 0 + } + }; + struct remap_trace *trace; + struct remap_trace *tmp; + printk(KERN_DEBUG MODULE_NAME ": Unmapping %p.\n", addr); + record_timestamp(&event.header); + + spin_lock(&trace_list_lock); + list_for_each_entry_safe(trace, tmp, &trace_list, list) { + if ((unsigned long)addr == trace->probe.addr) { + if (!nommiotrace) + unregister_kmmio_probe(&trace->probe); + list_del(&trace->list); + kfree(trace); + break; + } + } + spin_unlock(&trace_list_lock); + relay_write(chan, &event, sizeof(event)); + iounmap(addr); +} +EXPORT_SYMBOL(iounmap_trace); + +static void clear_trace_list(void) +{ + struct remap_trace *trace; + struct remap_trace *tmp; + + spin_lock(&trace_list_lock); + list_for_each_entry_safe(trace, tmp, &trace_list, list) { + printk(KERN_WARNING MODULE_NAME ": purging non-iounmapped " + "trace @0x%08lx, size 0x%lx.\n", + trace->probe.addr, trace->probe.len); + if (!nommiotrace) + unregister_kmmio_probe(&trace->probe); + list_del(&trace->list); + kfree(trace); + break; + } + spin_unlock(&trace_list_lock); +} + +static int __init init(void) +{ + if (n_subbufs < 2) + return -EINVAL; + + dir = debugfs_create_dir(APP_DIR, NULL); + if (!dir) { + printk(KERN_ERR MODULE_NAME + ": Couldn't create relay app directory.\n"); + return -ENOMEM; + } + + chan = create_channel(subbuf_size, n_subbufs); + if (!chan) { + debugfs_remove(dir); + printk(KERN_ERR MODULE_NAME + ": relay app channel creation failed\n"); + return -ENOMEM; + } + + init_kmmio(); + + proc_marker_file = create_proc_entry(MARKER_FILE, 0, NULL); + if (proc_marker_file) + proc_marker_file->write_proc = write_marker; + + printk(KERN_DEBUG MODULE_NAME ": loaded.\n"); + if (nommiotrace) + printk(KERN_DEBUG MODULE_NAME ": MMIO tracing disabled.\n"); + if (ISA_trace) + printk(KERN_WARNING MODULE_NAME + ": Warning! 
low ISA range will be traced.\n"); + return 0; +} + +static void __exit cleanup(void) +{ + printk(KERN_DEBUG MODULE_NAME ": unload...\n"); + clear_trace_list(); + cleanup_kmmio(); + remove_proc_entry(MARKER_FILE, NULL); + destroy_channel(); + if (dir) + debugfs_remove(dir); +} + +module_init(init); +module_exit(cleanup); +MODULE_LICENSE("GPL"); diff --git a/arch/x86/kernel/mmiotrace/pf_in.c b/arch/x86/kernel/mmiotrace/pf_in.c new file mode 100644 index 000000000000..67ea520dde62 --- /dev/null +++ b/arch/x86/kernel/mmiotrace/pf_in.c @@ -0,0 +1,489 @@ +/* + * Fault Injection Test harness (FI) + * Copyright (C) Intel Crop. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, + * USA. + * + */ + +/* $Id: pf_in.c,v 1.1.1.1 2002/11/12 05:56:32 brlock Exp $ + * Copyright by Intel Crop., 2002 + * Louis Zhuang (louis.zhuang@intel.com) + * + * Bjorn Steinbrink (B.Steinbrink@gmx.de), 2007 + */ + +#include +#include /* struct pt_regs */ +#include "pf_in.h" + +#ifdef __i386__ +/* IA32 Manual 3, 2-1 */ +static unsigned char prefix_codes[] = { + 0xF0, 0xF2, 0xF3, 0x2E, 0x36, 0x3E, 0x26, 0x64, + 0x65, 0x2E, 0x3E, 0x66, 0x67 +}; +/* IA32 Manual 3, 3-432*/ +static unsigned int reg_rop[] = { + 0x8A, 0x8B, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F +}; +static unsigned int reg_wop[] = { 0x88, 0x89 }; +static unsigned int imm_wop[] = { 0xC6, 0xC7 }; +/* IA32 Manual 3, 3-432*/ +static unsigned int rw8[] = { 0x88, 0x8A, 0xC6 }; +static unsigned int rw32[] = { + 0x89, 0x8B, 0xC7, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F +}; +static unsigned int mw8[] = { 0x88, 0x8A, 0xC6, 0xB60F, 0xBE0F }; +static unsigned int mw16[] = { 0xB70F, 0xBF0F }; +static unsigned int mw32[] = { 0x89, 0x8B, 0xC7 }; +static unsigned int mw64[] = {}; +#else /* not __i386__ */ +static unsigned char prefix_codes[] = { + 0x66, 0x67, 0x2E, 0x3E, 0x26, 0x64, 0x65, 0x36, + 0xF0, 0xF3, 0xF2, + /* REX Prefixes */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f +}; +/* AMD64 Manual 3, Appendix A*/ +static unsigned int reg_rop[] = { + 0x8A, 0x8B, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F +}; +static unsigned int reg_wop[] = { 0x88, 0x89 }; +static unsigned int imm_wop[] = { 0xC6, 0xC7 }; +static unsigned int rw8[] = { 0xC6, 0x88, 0x8A }; +static unsigned int rw32[] = { + 0xC7, 0x89, 0x8B, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F +}; +/* 8 bit only */ +static unsigned int mw8[] = { 0xC6, 0x88, 0x8A, 0xB60F, 0xBE0F }; +/* 16 bit only */ +static unsigned int mw16[] = { 0xB70F, 0xBF0F }; +/* 16 or 32 bit */ +static unsigned int mw32[] = { 0xC7 }; +/* 16, 32 or 64 bit */ +static unsigned int mw64[] = { 0x89, 0x8B }; +#endif /* not __i386__ */ + +static int skip_prefix(unsigned char *addr, int *shorted, int *enlarged, + int *rexr) +{ + int i; + unsigned char *p = addr; + *shorted = 0; + *enlarged = 0; + *rexr = 0; + +restart: + for (i = 0; i < ARRAY_SIZE(prefix_codes); i++) { + if (*p == 
prefix_codes[i]) { + if (*p == 0x66) + *shorted = 1; +#ifdef __amd64__ + if ((*p & 0xf8) == 0x48) + *enlarged = 1; + if ((*p & 0xf4) == 0x44) + *rexr = 1; +#endif + p++; + goto restart; + } + } + + return (p - addr); +} + +static int get_opcode(unsigned char *addr, unsigned int *opcode) +{ + int len; + + if (*addr == 0x0F) { + /* 0x0F is extension instruction */ + *opcode = *(unsigned short *)addr; + len = 2; + } else { + *opcode = *addr; + len = 1; + } + + return len; +} + +#define CHECK_OP_TYPE(opcode, array, type) \ + for (i = 0; i < ARRAY_SIZE(array); i++) { \ + if (array[i] == opcode) { \ + rv = type; \ + goto exit; \ + } \ + } + +enum reason_type get_ins_type(unsigned long ins_addr) +{ + unsigned int opcode; + unsigned char *p; + int shorted, enlarged, rexr; + int i; + enum reason_type rv = OTHERS; + + p = (unsigned char *)ins_addr; + p += skip_prefix(p, &shorted, &enlarged, &rexr); + p += get_opcode(p, &opcode); + + CHECK_OP_TYPE(opcode, reg_rop, REG_READ); + CHECK_OP_TYPE(opcode, reg_wop, REG_WRITE); + CHECK_OP_TYPE(opcode, imm_wop, IMM_WRITE); + +exit: + return rv; +} +#undef CHECK_OP_TYPE + +static unsigned int get_ins_reg_width(unsigned long ins_addr) +{ + unsigned int opcode; + unsigned char *p; + int i, shorted, enlarged, rexr; + + p = (unsigned char *)ins_addr; + p += skip_prefix(p, &shorted, &enlarged, &rexr); + p += get_opcode(p, &opcode); + + for (i = 0; i < ARRAY_SIZE(rw8); i++) + if (rw8[i] == opcode) + return 1; + + for (i = 0; i < ARRAY_SIZE(rw32); i++) + if (rw32[i] == opcode) + return (shorted ? 2 : (enlarged ? 8 : 4)); + + printk(KERN_ERR "mmiotrace: Unknown opcode 0x%02x\n", opcode); + return 0; +} + +unsigned int get_ins_mem_width(unsigned long ins_addr) +{ + unsigned int opcode; + unsigned char *p; + int i, shorted, enlarged, rexr; + + p = (unsigned char *)ins_addr; + p += skip_prefix(p, &shorted, &enlarged, &rexr); + p += get_opcode(p, &opcode); + + for (i = 0; i < ARRAY_SIZE(mw8); i++) + if (mw8[i] == opcode) + return 1; + + for (i = 0; i < ARRAY_SIZE(mw16); i++) + if (mw16[i] == opcode) + return 2; + + for (i = 0; i < ARRAY_SIZE(mw32); i++) + if (mw32[i] == opcode) + return shorted ? 2 : 4; + + for (i = 0; i < ARRAY_SIZE(mw64); i++) + if (mw64[i] == opcode) + return shorted ? 2 : (enlarged ? 8 : 4); + + printk(KERN_ERR "mmiotrace: Unknown opcode 0x%02x\n", opcode); + return 0; +} + +/* + * Define register ident in mod/rm byte. + * Note: these are NOT the same as in ptrace-abi.h. 
+ */ +enum { + arg_AL = 0, + arg_CL = 1, + arg_DL = 2, + arg_BL = 3, + arg_AH = 4, + arg_CH = 5, + arg_DH = 6, + arg_BH = 7, + + arg_AX = 0, + arg_CX = 1, + arg_DX = 2, + arg_BX = 3, + arg_SP = 4, + arg_BP = 5, + arg_SI = 6, + arg_DI = 7, +#ifdef __amd64__ + arg_R8 = 8, + arg_R9 = 9, + arg_R10 = 10, + arg_R11 = 11, + arg_R12 = 12, + arg_R13 = 13, + arg_R14 = 14, + arg_R15 = 15 +#endif +}; + +static unsigned char *get_reg_w8(int no, struct pt_regs *regs) +{ + unsigned char *rv = NULL; + + switch (no) { + case arg_AL: + rv = (unsigned char *)®s->ax; + break; + case arg_BL: + rv = (unsigned char *)®s->bx; + break; + case arg_CL: + rv = (unsigned char *)®s->cx; + break; + case arg_DL: + rv = (unsigned char *)®s->dx; + break; + case arg_AH: + rv = 1 + (unsigned char *)®s->ax; + break; + case arg_BH: + rv = 1 + (unsigned char *)®s->bx; + break; + case arg_CH: + rv = 1 + (unsigned char *)®s->cx; + break; + case arg_DH: + rv = 1 + (unsigned char *)®s->dx; + break; +#ifdef __amd64__ + case arg_R8: + rv = (unsigned char *)®s->r8; + break; + case arg_R9: + rv = (unsigned char *)®s->r9; + break; + case arg_R10: + rv = (unsigned char *)®s->r10; + break; + case arg_R11: + rv = (unsigned char *)®s->r11; + break; + case arg_R12: + rv = (unsigned char *)®s->r12; + break; + case arg_R13: + rv = (unsigned char *)®s->r13; + break; + case arg_R14: + rv = (unsigned char *)®s->r14; + break; + case arg_R15: + rv = (unsigned char *)®s->r15; + break; +#endif + default: + printk(KERN_ERR "mmiotrace: Error reg no# %d\n", no); + break; + } + return rv; +} + +static unsigned long *get_reg_w32(int no, struct pt_regs *regs) +{ + unsigned long *rv = NULL; + + switch (no) { + case arg_AX: + rv = ®s->ax; + break; + case arg_BX: + rv = ®s->bx; + break; + case arg_CX: + rv = ®s->cx; + break; + case arg_DX: + rv = ®s->dx; + break; + case arg_SP: + rv = ®s->sp; + break; + case arg_BP: + rv = ®s->bp; + break; + case arg_SI: + rv = ®s->si; + break; + case arg_DI: + rv = ®s->di; + break; +#ifdef __amd64__ + case arg_R8: + rv = ®s->r8; + break; + case arg_R9: + rv = ®s->r9; + break; + case arg_R10: + rv = ®s->r10; + break; + case arg_R11: + rv = ®s->r11; + break; + case arg_R12: + rv = ®s->r12; + break; + case arg_R13: + rv = ®s->r13; + break; + case arg_R14: + rv = ®s->r14; + break; + case arg_R15: + rv = ®s->r15; + break; +#endif + default: + printk(KERN_ERR "mmiotrace: Error reg no# %d\n", no); + } + + return rv; +} + +unsigned long get_ins_reg_val(unsigned long ins_addr, struct pt_regs *regs) +{ + unsigned int opcode; + unsigned char mod_rm; + int reg; + unsigned char *p; + int i, shorted, enlarged, rexr; + unsigned long rv; + + p = (unsigned char *)ins_addr; + p += skip_prefix(p, &shorted, &enlarged, &rexr); + p += get_opcode(p, &opcode); + for (i = 0; i < ARRAY_SIZE(reg_rop); i++) + if (reg_rop[i] == opcode) { + rv = REG_READ; + goto do_work; + } + + for (i = 0; i < ARRAY_SIZE(reg_wop); i++) + if (reg_wop[i] == opcode) { + rv = REG_WRITE; + goto do_work; + } + + printk(KERN_ERR "mmiotrace: Not a register instruction, opcode " + "0x%02x\n", opcode); + goto err; + +do_work: + mod_rm = *p; + reg = ((mod_rm >> 3) & 0x7) | (rexr << 3); + switch (get_ins_reg_width(ins_addr)) { + case 1: + return *get_reg_w8(reg, regs); + + case 2: + return *(unsigned short *)get_reg_w32(reg, regs); + + case 4: + return *(unsigned int *)get_reg_w32(reg, regs); + +#ifdef __amd64__ + case 8: + return *(unsigned long *)get_reg_w32(reg, regs); +#endif + + default: + printk(KERN_ERR "mmiotrace: Error width# %d\n", reg); + } + +err: + return 0; +} + 
+unsigned long get_ins_imm_val(unsigned long ins_addr) +{ + unsigned int opcode; + unsigned char mod_rm; + unsigned char mod; + unsigned char *p; + int i, shorted, enlarged, rexr; + unsigned long rv; + + p = (unsigned char *)ins_addr; + p += skip_prefix(p, &shorted, &enlarged, &rexr); + p += get_opcode(p, &opcode); + for (i = 0; i < ARRAY_SIZE(imm_wop); i++) + if (imm_wop[i] == opcode) { + rv = IMM_WRITE; + goto do_work; + } + + printk(KERN_ERR "mmiotrace: Not an immediate instruction, opcode " + "0x%02x\n", opcode); + goto err; + +do_work: + mod_rm = *p; + mod = mod_rm >> 6; + p++; + switch (mod) { + case 0: + /* if r/m is 5 we have a 32 disp (IA32 Manual 3, Table 2-2) */ + /* AMD64: XXX Check for address size prefix? */ + if ((mod_rm & 0x7) == 0x5) + p += 4; + break; + + case 1: + p += 1; + break; + + case 2: + p += 4; + break; + + case 3: + default: + printk(KERN_ERR "mmiotrace: not a memory access instruction " + "at 0x%lx, rm_mod=0x%02x\n", + ins_addr, mod_rm); + } + + switch (get_ins_reg_width(ins_addr)) { + case 1: + return *(unsigned char *)p; + + case 2: + return *(unsigned short *)p; + + case 4: + return *(unsigned int *)p; + +#ifdef __amd64__ + case 8: + return *(unsigned long *)p; +#endif + + default: + printk(KERN_ERR "mmiotrace: Error: width.\n"); + } + +err: + return 0; +} diff --git a/arch/x86/kernel/mmiotrace/pf_in.h b/arch/x86/kernel/mmiotrace/pf_in.h new file mode 100644 index 000000000000..e05341a51a27 --- /dev/null +++ b/arch/x86/kernel/mmiotrace/pf_in.h @@ -0,0 +1,39 @@ +/* + * Fault Injection Test harness (FI) + * Copyright (C) Intel Crop. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, + * USA. 
+ * + */ + +#ifndef __PF_H_ +#define __PF_H_ + +enum reason_type { + NOT_ME, /* page fault is not in regions */ + NOTHING, /* access others point in regions */ + REG_READ, /* read from addr to reg */ + REG_WRITE, /* write from reg to addr */ + IMM_WRITE, /* write from imm to addr */ + OTHERS /* Other instructions can not intercept */ +}; + +enum reason_type get_ins_type(unsigned long ins_addr); +unsigned int get_ins_mem_width(unsigned long ins_addr); +unsigned long get_ins_reg_val(unsigned long ins_addr, struct pt_regs *regs); +unsigned long get_ins_imm_val(unsigned long ins_addr); + +#endif /* __PF_H_ */ diff --git a/arch/x86/kernel/mmiotrace/testmmiotrace.c b/arch/x86/kernel/mmiotrace/testmmiotrace.c new file mode 100644 index 000000000000..40e66b0e6480 --- /dev/null +++ b/arch/x86/kernel/mmiotrace/testmmiotrace.c @@ -0,0 +1,77 @@ +/* + * Written by Pekka Paalanen, 2008 + */ +#include +#include + +extern void __iomem *ioremap_nocache_trace(unsigned long offset, + unsigned long size); +extern void iounmap_trace(volatile void __iomem *addr); + +#define MODULE_NAME "testmmiotrace" + +static unsigned long mmio_address; +module_param(mmio_address, ulong, 0); +MODULE_PARM_DESC(mmio_address, "Start address of the mapping of 16 kB."); + +static void do_write_test(void __iomem *p) +{ + unsigned int i; + for (i = 0; i < 256; i++) + iowrite8(i, p + i); + for (i = 1024; i < (5 * 1024); i += 2) + iowrite16(i * 12 + 7, p + i); + for (i = (5 * 1024); i < (16 * 1024); i += 4) + iowrite32(i * 212371 + 13, p + i); +} + +static void do_read_test(void __iomem *p) +{ + unsigned int i; + volatile unsigned int v; + for (i = 0; i < 256; i++) + v = ioread8(p + i); + for (i = 1024; i < (5 * 1024); i += 2) + v = ioread16(p + i); + for (i = (5 * 1024); i < (16 * 1024); i += 4) + v = ioread32(p + i); +} + +static void do_test(void) +{ + void __iomem *p = ioremap_nocache_trace(mmio_address, 0x4000); + if (!p) { + printk(KERN_ERR MODULE_NAME ": could not ioremap IO memory, " + "aborting.\n"); + return; + } + do_write_test(p); + do_read_test(p); + iounmap_trace(p); +} + +static int __init init(void) +{ + if (mmio_address == 0) { + printk(KERN_ERR MODULE_NAME ": you have to use the module " + "argument mmio_address.\n"); + printk(KERN_ERR MODULE_NAME ": DO NOT LOAD THIS MODULE UNLESS" + " YOU REALLY KNOW WHAT YOU ARE DOING!\n"); + return -ENXIO; + } + + printk(KERN_WARNING MODULE_NAME ": WARNING: mapping 16 kB @ 0x%08lx " + "in PCI address space, and writing " + "rubbish in there.\n", mmio_address); + do_test(); + return 0; +} + +static void __exit cleanup(void) +{ + printk(KERN_DEBUG MODULE_NAME ": unloaded.\n"); +} + +module_init(init); +module_exit(cleanup); +MODULE_LICENSE("GPL"); diff --git a/include/linux/mmiotrace.h b/include/linux/mmiotrace.h new file mode 100644 index 000000000000..cb247825f3ec --- /dev/null +++ b/include/linux/mmiotrace.h @@ -0,0 +1,62 @@ +#ifndef MMIOTRACE_H +#define MMIOTRACE_H + +#include + +#define MMIO_VERSION 0x04 + +/* mm_io_header.type */ +#define MMIO_OPCODE_MASK 0xff +#define MMIO_OPCODE_SHIFT 0 +#define MMIO_WIDTH_MASK 0xff00 +#define MMIO_WIDTH_SHIFT 8 +#define MMIO_MAGIC (0x6f000000 | (MMIO_VERSION<<16)) +#define MMIO_MAGIC_MASK 0xffff0000 + +enum mm_io_opcode { /* payload type: */ + MMIO_READ = 0x1, /* struct mm_io_rw */ + MMIO_WRITE = 0x2, /* struct mm_io_rw */ + MMIO_PROBE = 0x3, /* struct mm_io_map */ + MMIO_UNPROBE = 0x4, /* struct mm_io_map */ + MMIO_MARKER = 0x5, /* raw char data */ + MMIO_UNKNOWN_OP = 0x6, /* struct mm_io_rw */ +}; + +struct mm_io_header { + __u32 type; + 
__u32 sec; /* timestamp */ + __u32 nsec; + __u32 pid; /* PID of the process, or 0 for kernel core */ + __u16 data_len; /* length of the following payload */ +}; + +struct mm_io_rw { + __u64 address; /* virtual address of register */ + __u64 value; + __u64 pc; /* optional program counter */ +}; + +struct mm_io_map { + __u64 phys; /* base address in PCI space */ + __u64 addr; /* base virtual address */ + __u64 len; /* mapping size */ + __u64 pc; /* optional program counter */ +}; + + +/* + * These structures are used to allow a single relay_write() + * call to write a full packet. + */ + +struct mm_io_header_rw { + struct mm_io_header header; + struct mm_io_rw rw; +} __attribute__((packed)); + +struct mm_io_header_map { + struct mm_io_header header; + struct mm_io_map map; +} __attribute__((packed)); + +#endif /* MMIOTRACE_H */ -- cgit v1.2.3 From 75bb88350e0501b3cf5ac096a1008757844414a9 Mon Sep 17 00:00:00 2001 From: Pekka Paalanen Date: Mon, 12 May 2008 21:20:56 +0200 Subject: x86 mmiotrace: use lookup_address() Use lookup_address() from pageattr.c instead of doing the same manually. Also had to EXPORT_SYMBOL_GPL(lookup_address) to make this work for modules. This also fixes "undefined symbol 'init_mm'" compile error for x86_32. Signed-off-by: Pekka Paalanen Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/mmiotrace/kmmio.c | 46 ++++++++++++++++++++++++------------ arch/x86/kernel/mmiotrace/mmio-mod.c | 19 +++++++++------ arch/x86/mm/pageattr.c | 1 + 3 files changed, 44 insertions(+), 22 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/mmiotrace/kmmio.c b/arch/x86/kernel/mmiotrace/kmmio.c index 8ba48f9c91b4..28411dadb8b3 100644 --- a/arch/x86/kernel/mmiotrace/kmmio.c +++ b/arch/x86/kernel/mmiotrace/kmmio.c @@ -20,6 +20,7 @@ #include #include #include +#include #include "kmmio.h" @@ -117,40 +118,55 @@ static struct kmmio_fault_page *get_kmmio_fault_page(unsigned long page) return NULL; } -static void arm_kmmio_fault_page(unsigned long page, int *large) +static void arm_kmmio_fault_page(unsigned long page, int *page_level) { unsigned long address = page & PAGE_MASK; - pgd_t *pgd = pgd_offset_k(address); - pud_t *pud = pud_offset(pgd, address); - pmd_t *pmd = pmd_offset(pud, address); - pte_t *pte = pte_offset_kernel(pmd, address); + int level; + pte_t *pte = lookup_address(address, &level); - if (pmd_large(*pmd)) { + if (!pte) { + printk(KERN_ERR "Error in %s: no pte for page 0x%08lx\n", + __FUNCTION__, page); + return; + } + + if (level == PG_LEVEL_2M) { + pmd_t *pmd = (pmd_t *)pte; set_pmd(pmd, __pmd(pmd_val(*pmd) & ~_PAGE_PRESENT)); - if (large) - *large = 1; } else { + /* PG_LEVEL_4K */ set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_PRESENT)); } + if (page_level) + *page_level = level; + __flush_tlb_one(page); } -static void disarm_kmmio_fault_page(unsigned long page, int *large) +static void disarm_kmmio_fault_page(unsigned long page, int *page_level) { unsigned long address = page & PAGE_MASK; - pgd_t *pgd = pgd_offset_k(address); - pud_t *pud = pud_offset(pgd, address); - pmd_t *pmd = pmd_offset(pud, address); - pte_t *pte = pte_offset_kernel(pmd, address); + int level; + pte_t *pte = lookup_address(address, &level); - if (large && *large) { + if (!pte) { + printk(KERN_ERR "Error in %s: no pte for page 0x%08lx\n", + __FUNCTION__, page); + return; + } + + if (level == PG_LEVEL_2M) { + pmd_t *pmd = (pmd_t *)pte; set_pmd(pmd, __pmd(pmd_val(*pmd) | _PAGE_PRESENT)); - *large = 0; } else { + /* PG_LEVEL_4K */ set_pte(pte, __pte(pte_val(*pte) | 
_PAGE_PRESENT)); } + if (page_level) + *page_level = level; + __flush_tlb_one(page); } diff --git a/arch/x86/kernel/mmiotrace/mmio-mod.c b/arch/x86/kernel/mmiotrace/mmio-mod.c index 73561fe85f03..e43947d218a5 100644 --- a/arch/x86/kernel/mmiotrace/mmio-mod.c +++ b/arch/x86/kernel/mmiotrace/mmio-mod.c @@ -120,19 +120,24 @@ static int write_marker(struct file *file, const char __user *buffer, static void print_pte(unsigned long address) { - pgd_t *pgd = pgd_offset_k(address); - pud_t *pud = pud_offset(pgd, address); - pmd_t *pmd = pmd_offset(pud, address); - if (pmd_large(*pmd)) { + int level; + pte_t *pte = lookup_address(address, &level); + + if (!pte) { + printk(KERN_ERR "Error in %s: no pte for page 0x%08lx\n", + __FUNCTION__, address); + return; + } + + if (level == PG_LEVEL_2M) { printk(KERN_EMERG MODULE_NAME ": 4MB pages are not " "currently supported: %lx\n", address); BUG(); } printk(KERN_DEBUG MODULE_NAME ": pte for 0x%lx: 0x%lx 0x%lx\n", - address, - pte_val(*pte_offset_kernel(pmd, address)), - pte_val(*pte_offset_kernel(pmd, address)) & _PAGE_PRESENT); + address, pte_val(*pte), + pte_val(*pte) & _PAGE_PRESENT); } /* diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 60bcb5b6a37e..57970f2935c0 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -227,6 +227,7 @@ pte_t *lookup_address(unsigned long address, unsigned int *level) return pte_offset_kernel(pmd, address); } +EXPORT_SYMBOL_GPL(lookup_address); /* * Set the new pmd in all the pgds we know about: -- cgit v1.2.3 From fe1ffafa80f6673101c6560c2bacfe3df10372ee Mon Sep 17 00:00:00 2001 From: Pekka Paalanen Date: Mon, 12 May 2008 21:20:56 +0200 Subject: x86 mmiotrace: fix relay-buffer-full flag for SMP Relay has per-cpu buffers, but mmiotrace was using only a single flag for detecting buffer full/not-full transitions. The new code makes this per-cpu and actually counts missed events. Signed-off-by: Pekka Paalanen Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/mmiotrace/mmio-mod.c | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/mmiotrace/mmio-mod.c b/arch/x86/kernel/mmiotrace/mmio-mod.c index e43947d218a5..0019dcdf6158 100644 --- a/arch/x86/kernel/mmiotrace/mmio-mod.c +++ b/arch/x86/kernel/mmiotrace/mmio-mod.c @@ -29,6 +29,7 @@ #include #include #include /* for ISA_START_ADDRESS */ +#include #include "kmmio.h" #include "pf_in.h" @@ -47,9 +48,13 @@ struct trap_reason { int active_traces; }; +/* Accessed per-cpu. */ static struct trap_reason pf_reason[NR_CPUS]; static struct mm_io_header_rw cpu_trace[NR_CPUS]; +/* Access to this is not per-cpu. */ +static atomic_t dropped[NR_CPUS]; + static struct file_operations mmio_fops = { .owner = THIS_MODULE, }; @@ -57,7 +62,6 @@ static struct file_operations mmio_fops = { static const size_t subbuf_size = 256*1024; static struct rchan *chan; static struct dentry *dir; -static int suspended; /* XXX should this be per cpu? 
*/ static struct proc_dir_entry *proc_marker_file; /* module parameters */ @@ -269,19 +273,21 @@ static void post(struct kmmio_probe *p, unsigned long condition, static int subbuf_start_handler(struct rchan_buf *buf, void *subbuf, void *prev_subbuf, size_t prev_padding) { + unsigned int cpu = buf->cpu; + atomic_t *drop = &dropped[cpu]; + int count; if (relay_buf_full(buf)) { - if (!suspended) { - suspended = 1; - printk(KERN_ERR MODULE_NAME - ": cpu %d buffer full!!!\n", - smp_processor_id()); + if (atomic_inc_return(drop) == 1) { + printk(KERN_ERR MODULE_NAME ": cpu %d buffer full!\n", + cpu); } return 0; - } else if (suspended) { - suspended = 0; + } else if ((count = atomic_read(drop))) { printk(KERN_ERR MODULE_NAME - ": cpu %d buffer no longer full.\n", - smp_processor_id()); + ": cpu %d buffer no longer full, " + "missed %d events.\n", + cpu, count); + atomic_sub(count, drop); } return 1; -- cgit v1.2.3 From 10c43d2eb50c9a5ad60388b9d3c41c31150049e6 Mon Sep 17 00:00:00 2001 From: Pekka Paalanen Date: Mon, 12 May 2008 21:20:57 +0200 Subject: x86: explicit call to mmiotrace in do_page_fault() The custom page fault handler list is replaced with a single function pointer. All related functions and variables are renamed for mmiotrace. Signed-off-by: Pekka Paalanen Cc: Christoph Hellwig Cc: Arjan van de Ven Cc: pq@iki.fi Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/Kconfig.debug | 14 ++++----- arch/x86/kernel/mmiotrace/kmmio.c | 14 ++++----- arch/x86/mm/fault.c | 66 ++++++++++++++++++++------------------- include/asm-x86/kdebug.h | 12 +++---- 4 files changed, 52 insertions(+), 54 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 7c6496e2225e..9491c0ae03a3 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -168,20 +168,18 @@ config IOMMU_LEAK Add a simple leak tracer to the IOMMU code. This is useful when you are debugging a buggy device driver that leaks IOMMU mappings. -config PAGE_FAULT_HANDLERS - bool "Custom page fault handlers" - depends on DEBUG_KERNEL - help - Allow the use of custom page fault handlers. A kernel module may - register a function that is called on every page fault. Custom - handlers are used by some debugging and reverse engineering tools. +config MMIOTRACE_HOOKS + bool + default n config MMIOTRACE tristate "Memory mapped IO tracing" - depends on DEBUG_KERNEL && PAGE_FAULT_HANDLERS && RELAY && DEBUG_FS + depends on DEBUG_KERNEL && RELAY && DEBUG_FS + select MMIOTRACE_HOOKS default n help This will build a kernel module called mmiotrace. + Making this a built-in is heavily discouraged. Mmiotrace traces Memory Mapped I/O access and is meant for debugging and reverse engineering. 
The kernel module offers wrapped diff --git a/arch/x86/kernel/mmiotrace/kmmio.c b/arch/x86/kernel/mmiotrace/kmmio.c index 28411dadb8b3..e759f7c3878f 100644 --- a/arch/x86/kernel/mmiotrace/kmmio.c +++ b/arch/x86/kernel/mmiotrace/kmmio.c @@ -51,10 +51,6 @@ static LIST_HEAD(kmmio_probes); static struct kmmio_context kmmio_ctx[NR_CPUS]; -static struct pf_handler kmmio_pf_hook = { - .handler = kmmio_page_fault -}; - static struct notifier_block nb_die = { .notifier_call = kmmio_die_notifier }; @@ -77,7 +73,8 @@ void cleanup_kmmio(void) * kmmio_page_table, kmmio_probes */ if (handler_registered) { - unregister_page_fault_handler(&kmmio_pf_hook); + if (mmiotrace_unregister_pf(&kmmio_page_fault)) + BUG(); synchronize_rcu(); } unregister_die_notifier(&nb_die); @@ -343,8 +340,11 @@ int register_kmmio_probe(struct kmmio_probe *p) } if (!handler_registered) { - register_page_fault_handler(&kmmio_pf_hook); - handler_registered++; + if (mmiotrace_register_pf(&kmmio_page_fault)) + printk(KERN_ERR "mmiotrace: Cannot register page " + "fault handler.\n"); + else + handler_registered++; } out: diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 343f5c1aacc8..e9a086a1a9ff 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -49,53 +49,55 @@ #define PF_RSVD (1<<3) #define PF_INSTR (1<<4) -#ifdef CONFIG_PAGE_FAULT_HANDLERS -static HLIST_HEAD(pf_handlers); /* protected by RCU */ -static DEFINE_SPINLOCK(pf_handlers_writer); +#ifdef CONFIG_MMIOTRACE_HOOKS +static pf_handler_func mmiotrace_pf_handler; /* protected by RCU */ +static DEFINE_SPINLOCK(mmiotrace_handler_lock); -void register_page_fault_handler(struct pf_handler *new_pfh) +int mmiotrace_register_pf(pf_handler_func new_pfh) { + int ret = 0; unsigned long flags; - spin_lock_irqsave(&pf_handlers_writer, flags); - hlist_add_head_rcu(&new_pfh->hlist, &pf_handlers); - spin_unlock_irqrestore(&pf_handlers_writer, flags); + spin_lock_irqsave(&mmiotrace_handler_lock, flags); + if (mmiotrace_pf_handler) + ret = -EBUSY; + else + mmiotrace_pf_handler = new_pfh; + spin_unlock_irqrestore(&mmiotrace_handler_lock, flags); + return ret; } -EXPORT_SYMBOL_GPL(register_page_fault_handler); +EXPORT_SYMBOL_GPL(mmiotrace_register_pf); /** - * unregister_page_fault_handler: + * mmiotrace_unregister_pf: * The caller must ensure @old_pfh is not in use anymore before freeing it. - * This function does not guarantee it. The list of handlers is protected by - * RCU, so you can do this by e.g. calling synchronize_rcu(). + * This function does not guarantee it. The handler function pointer is + * protected by RCU, so you can do this by e.g. calling synchronize_rcu(). 
*/ -void unregister_page_fault_handler(struct pf_handler *old_pfh) +int mmiotrace_unregister_pf(pf_handler_func old_pfh) { + int ret = 0; unsigned long flags; - spin_lock_irqsave(&pf_handlers_writer, flags); - hlist_del_rcu(&old_pfh->hlist); - spin_unlock_irqrestore(&pf_handlers_writer, flags); + spin_lock_irqsave(&mmiotrace_handler_lock, flags); + if (mmiotrace_pf_handler != old_pfh) + ret = -EPERM; + else + mmiotrace_pf_handler = NULL; + spin_unlock_irqrestore(&mmiotrace_handler_lock, flags); + return ret; } -EXPORT_SYMBOL_GPL(unregister_page_fault_handler); -#endif +EXPORT_SYMBOL_GPL(mmiotrace_unregister_pf); +#endif /* CONFIG_MMIOTRACE_HOOKS */ /* returns non-zero if do_page_fault() should return */ -static int handle_custom_pf(struct pt_regs *regs, unsigned long error_code, - unsigned long address) +static inline int call_mmiotrace(struct pt_regs *regs, + unsigned long error_code, + unsigned long address) { -#ifdef CONFIG_PAGE_FAULT_HANDLERS +#ifdef CONFIG_MMIOTRACE_HOOKS int ret = 0; - struct pf_handler *cur; - struct hlist_node *ncur; - - if (hlist_empty(&pf_handlers)) - return 0; - rcu_read_lock(); - hlist_for_each_entry_rcu(cur, ncur, &pf_handlers, hlist) { - ret = cur->handler(regs, error_code, address); - if (ret) - break; - } + if (mmiotrace_pf_handler) + ret = mmiotrace_pf_handler(regs, error_code, address); rcu_read_unlock(); return ret; #else @@ -655,7 +657,7 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) if (notify_page_fault(regs)) return; - if (handle_custom_pf(regs, error_code, address)) + if (call_mmiotrace(regs, error_code, address)) return; /* diff --git a/include/asm-x86/kdebug.h b/include/asm-x86/kdebug.h index a80f2d6cc737..7063281040da 100644 --- a/include/asm-x86/kdebug.h +++ b/include/asm-x86/kdebug.h @@ -35,13 +35,11 @@ extern void show_regs(struct pt_regs *regs); extern unsigned long oops_begin(void); extern void oops_end(unsigned long, struct pt_regs *, int signr); -struct pf_handler { - struct hlist_node hlist; - int (*handler)(struct pt_regs *regs, unsigned long error_code, - unsigned long address); -}; +typedef int (*pf_handler_func)(struct pt_regs *regs, + unsigned long error_code, + unsigned long address); -extern void register_page_fault_handler(struct pf_handler *new_pfh); -extern void unregister_page_fault_handler(struct pf_handler *old_pfh); +extern int mmiotrace_register_pf(pf_handler_func new_pfh); +extern int mmiotrace_unregister_pf(pf_handler_func old_pfh); #endif -- cgit v1.2.3 From f513638030ca384b0bace4df64f0b82f6ae1e4c6 Mon Sep 17 00:00:00 2001 From: Pekka Paalanen Date: Mon, 12 May 2008 21:20:57 +0200 Subject: x86 mmiotrace: Use percpu instead of arrays. 
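The conversion follows the standard per-cpu pattern: each NR_CPUS-sized array becomes a DEFINE_PER_CPU variable, and get_cpu_var()/put_cpu_var() replace manual smp_processor_id() indexing. A minimal sketch of the before/after shape (the struct and variable names here are illustrative, not taken from the patch):

	#include <linux/percpu.h>

	struct trap_state {
		int active;
	};

	/* Before: static struct trap_state state[NR_CPUS], indexed by
	 * smp_processor_id() with preemption disabled by the caller.
	 * After: */
	static DEFINE_PER_CPU(struct trap_state, trap_state);

	static void touch_state(void)
	{
		/* get_cpu_var() disables preemption and returns this
		 * cpu's own copy of the variable. */
		struct trap_state *s = &get_cpu_var(trap_state);

		s->active++;
		/* put_cpu_var() re-enables preemption. */
		put_cpu_var(trap_state);
	}

Besides dropping the NR_CPUS-sized footprint, this keeps each copy in the owning cpu's per-cpu area instead of packing all cpus' data into one cache-unfriendly array.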
Signed-off-by: Pekka Paalanen Cc: Eric Dumazet Cc: pq@iki.fi Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/mmiotrace/kmmio.c | 27 ++++++------ arch/x86/kernel/mmiotrace/mmio-mod.c | 80 +++++++++++++++++++----------------- 2 files changed, 58 insertions(+), 49 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/mmiotrace/kmmio.c b/arch/x86/kernel/mmiotrace/kmmio.c index e759f7c3878f..5e239d0b8467 100644 --- a/arch/x86/kernel/mmiotrace/kmmio.c +++ b/arch/x86/kernel/mmiotrace/kmmio.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -49,7 +50,8 @@ static unsigned int handler_registered; static struct list_head kmmio_page_table[KMMIO_PAGE_TABLE_SIZE]; static LIST_HEAD(kmmio_probes); -static struct kmmio_context kmmio_ctx[NR_CPUS]; +/* Accessed per-cpu */ +static DEFINE_PER_CPU(struct kmmio_context, kmmio_ctx); static struct notifier_block nb_die = { .notifier_call = kmmio_die_notifier @@ -173,8 +175,7 @@ static void disarm_kmmio_fault_page(unsigned long page, int *page_level) */ static int kmmio_handler(struct pt_regs *regs, unsigned long addr) { - struct kmmio_context *ctx; - int cpu; + struct kmmio_context *ctx = &get_cpu_var(kmmio_ctx); /* * Preemption is now disabled to prevent process switch during @@ -187,8 +188,6 @@ static int kmmio_handler(struct pt_regs *regs, unsigned long addr) * And that interrupt triggers a kmmio trap? */ preempt_disable(); - cpu = smp_processor_id(); - ctx = &kmmio_ctx[cpu]; /* interrupts disabled and CPU-local data => atomicity guaranteed. */ if (ctx->active) { @@ -199,7 +198,7 @@ static int kmmio_handler(struct pt_regs *regs, unsigned long addr) */ printk(KERN_EMERG "mmiotrace: recursive probe hit on CPU %d, " "for address %lu. Ignoring.\n", - cpu, addr); + smp_processor_id(), addr); goto no_kmmio; } ctx->active++; @@ -231,6 +230,7 @@ static int kmmio_handler(struct pt_regs *regs, unsigned long addr) /* We hold lock, now we set present bit in PTE and single step. */ disarm_kmmio_fault_page(ctx->fpage->page, NULL); + put_cpu_var(kmmio_ctx); return 1; no_kmmio_locked: @@ -238,6 +238,7 @@ no_kmmio_locked: ctx->active--; no_kmmio: preempt_enable_no_resched(); + put_cpu_var(kmmio_ctx); /* page fault not handled by kmmio */ return 0; } @@ -249,11 +250,11 @@ no_kmmio: */ static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs) { - int cpu = smp_processor_id(); - struct kmmio_context *ctx = &kmmio_ctx[cpu]; + int ret = 0; + struct kmmio_context *ctx = &get_cpu_var(kmmio_ctx); if (!ctx->active) - return 0; + goto out; if (ctx->probe && ctx->probe->post_handler) ctx->probe->post_handler(ctx->probe, condition, regs); @@ -273,10 +274,12 @@ static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs) * will have TF set, in which case, continue the remaining processing * of do_debug, as if this is not a probe hit. */ - if (regs->flags & TF_MASK) - return 0; + if (!(regs->flags & TF_MASK)) + ret = 1; - return 1; +out: + put_cpu_var(kmmio_ctx); + return ret; } static int add_kmmio_fault_page(unsigned long page) diff --git a/arch/x86/kernel/mmiotrace/mmio-mod.c b/arch/x86/kernel/mmiotrace/mmio-mod.c index 0019dcdf6158..f9c609266d83 100644 --- a/arch/x86/kernel/mmiotrace/mmio-mod.c +++ b/arch/x86/kernel/mmiotrace/mmio-mod.c @@ -30,6 +30,7 @@ #include #include /* for ISA_START_ADDRESS */ #include +#include #include "kmmio.h" #include "pf_in.h" @@ -49,11 +50,11 @@ struct trap_reason { }; /* Accessed per-cpu. 
*/ -static struct trap_reason pf_reason[NR_CPUS]; -static struct mm_io_header_rw cpu_trace[NR_CPUS]; +static DEFINE_PER_CPU(struct trap_reason, pf_reason); +static DEFINE_PER_CPU(struct mm_io_header_rw, cpu_trace); /* Access to this is not per-cpu. */ -static atomic_t dropped[NR_CPUS]; +static DEFINE_PER_CPU(atomic_t, dropped); static struct file_operations mmio_fops = { .owner = THIS_MODULE, @@ -150,15 +151,15 @@ static void print_pte(unsigned long address) */ static void die_kmmio_nesting_error(struct pt_regs *regs, unsigned long addr) { - const unsigned long cpu = smp_processor_id(); + const struct trap_reason *my_reason = &get_cpu_var(pf_reason); printk(KERN_EMERG MODULE_NAME ": unexpected fault for address: %lx, " "last fault for address: %lx\n", - addr, pf_reason[cpu].addr); + addr, my_reason->addr); print_pte(addr); #ifdef __i386__ print_symbol(KERN_EMERG "faulting EIP is at %s\n", regs->ip); print_symbol(KERN_EMERG "last faulting EIP was at %s\n", - pf_reason[cpu].ip); + my_reason->ip); printk(KERN_EMERG "eax: %08lx ebx: %08lx ecx: %08lx edx: %08lx\n", regs->ax, regs->bx, regs->cx, regs->dx); @@ -168,100 +169,105 @@ static void die_kmmio_nesting_error(struct pt_regs *regs, unsigned long addr) #else print_symbol(KERN_EMERG "faulting RIP is at %s\n", regs->ip); print_symbol(KERN_EMERG "last faulting RIP was at %s\n", - pf_reason[cpu].ip); + my_reason->ip); printk(KERN_EMERG "rax: %016lx rcx: %016lx rdx: %016lx\n", regs->ax, regs->cx, regs->dx); printk(KERN_EMERG "rsi: %016lx rdi: %016lx " "rbp: %016lx rsp: %016lx\n", regs->si, regs->di, regs->bp, regs->sp); #endif + put_cpu_var(pf_reason); BUG(); } static void pre(struct kmmio_probe *p, struct pt_regs *regs, unsigned long addr) { - const unsigned long cpu = smp_processor_id(); + struct trap_reason *my_reason = &get_cpu_var(pf_reason); + struct mm_io_header_rw *my_trace = &get_cpu_var(cpu_trace); const unsigned long instptr = instruction_pointer(regs); const enum reason_type type = get_ins_type(instptr); /* it doesn't make sense to have more than one active trace per cpu */ - if (pf_reason[cpu].active_traces) + if (my_reason->active_traces) die_kmmio_nesting_error(regs, addr); else - pf_reason[cpu].active_traces++; + my_reason->active_traces++; - pf_reason[cpu].type = type; - pf_reason[cpu].addr = addr; - pf_reason[cpu].ip = instptr; + my_reason->type = type; + my_reason->addr = addr; + my_reason->ip = instptr; - cpu_trace[cpu].header.type = MMIO_MAGIC; - cpu_trace[cpu].header.pid = 0; - cpu_trace[cpu].header.data_len = sizeof(struct mm_io_rw); - cpu_trace[cpu].rw.address = addr; + my_trace->header.type = MMIO_MAGIC; + my_trace->header.pid = 0; + my_trace->header.data_len = sizeof(struct mm_io_rw); + my_trace->rw.address = addr; /* * Only record the program counter when requested. * It may taint clean-room reverse engineering. 
*/ if (trace_pc) - cpu_trace[cpu].rw.pc = instptr; + my_trace->rw.pc = instptr; else - cpu_trace[cpu].rw.pc = 0; + my_trace->rw.pc = 0; - record_timestamp(&cpu_trace[cpu].header); + record_timestamp(&my_trace->header); switch (type) { case REG_READ: - cpu_trace[cpu].header.type |= + my_trace->header.type |= (MMIO_READ << MMIO_OPCODE_SHIFT) | (get_ins_mem_width(instptr) << MMIO_WIDTH_SHIFT); break; case REG_WRITE: - cpu_trace[cpu].header.type |= + my_trace->header.type |= (MMIO_WRITE << MMIO_OPCODE_SHIFT) | (get_ins_mem_width(instptr) << MMIO_WIDTH_SHIFT); - cpu_trace[cpu].rw.value = get_ins_reg_val(instptr, regs); + my_trace->rw.value = get_ins_reg_val(instptr, regs); break; case IMM_WRITE: - cpu_trace[cpu].header.type |= + my_trace->header.type |= (MMIO_WRITE << MMIO_OPCODE_SHIFT) | (get_ins_mem_width(instptr) << MMIO_WIDTH_SHIFT); - cpu_trace[cpu].rw.value = get_ins_imm_val(instptr); + my_trace->rw.value = get_ins_imm_val(instptr); break; default: { unsigned char *ip = (unsigned char *)instptr; - cpu_trace[cpu].header.type |= + my_trace->header.type |= (MMIO_UNKNOWN_OP << MMIO_OPCODE_SHIFT); - cpu_trace[cpu].rw.value = (*ip) << 16 | - *(ip + 1) << 8 | - *(ip + 2); + my_trace->rw.value = (*ip) << 16 | *(ip + 1) << 8 | + *(ip + 2); } } + put_cpu_var(cpu_trace); + put_cpu_var(pf_reason); } static void post(struct kmmio_probe *p, unsigned long condition, struct pt_regs *regs) { - const unsigned long cpu = smp_processor_id(); + struct trap_reason *my_reason = &get_cpu_var(pf_reason); + struct mm_io_header_rw *my_trace = &get_cpu_var(cpu_trace); /* this should always return the active_trace count to 0 */ - pf_reason[cpu].active_traces--; - if (pf_reason[cpu].active_traces) { + my_reason->active_traces--; + if (my_reason->active_traces) { printk(KERN_EMERG MODULE_NAME ": unexpected post handler"); BUG(); } - switch (pf_reason[cpu].type) { + switch (my_reason->type) { case REG_READ: - cpu_trace[cpu].rw.value = get_ins_reg_val(pf_reason[cpu].ip, - regs); + my_trace->rw.value = get_ins_reg_val(my_reason->ip, regs); break; default: break; } - relay_write(chan, &cpu_trace[cpu], sizeof(struct mm_io_header_rw)); + relay_write(chan, my_trace, sizeof(*my_trace)); + put_cpu_var(cpu_trace); + put_cpu_var(pf_reason); } /* @@ -274,7 +280,7 @@ static int subbuf_start_handler(struct rchan_buf *buf, void *subbuf, void *prev_subbuf, size_t prev_padding) { unsigned int cpu = buf->cpu; - atomic_t *drop = &dropped[cpu]; + atomic_t *drop = &per_cpu(dropped, cpu); int count; if (relay_buf_full(buf)) { if (atomic_inc_return(drop) == 1) { -- cgit v1.2.3 From 0fd0e3da4557c479b820b9a4a7afa25b4637ddf2 Mon Sep 17 00:00:00 2001 From: Pekka Paalanen Date: Mon, 12 May 2008 21:20:57 +0200 Subject: x86: mmiotrace full patch, preview 1 kmmio.c handles the list of mmio probes with callbacks, list of traced pages, and attaching into the page fault handler and die notifier. It arms, traps and disarms the given pages, this is the core of mmiotrace. mmio-mod.c is a user interface, hooking into ioremap functions and registering the mmio probes. It also decodes the required information from trapped mmio accesses via the pre and post callbacks in each probe. Currently, hooking into ioremap functions works by redefining the symbols of the target (binary) kernel module, so that it calls the traced versions of the functions. 
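To make the probe registration described above concrete, a minimal kmmio client could look roughly like this (a sketch against the interface this patch puts in include/linux/mmiotrace.h; the physical address, the handlers and the module boilerplate are made up for illustration):

	#include <linux/module.h>
	#include <linux/mmiotrace.h>
	#include <linux/io.h>

	static void my_pre(struct kmmio_probe *p, struct pt_regs *regs,
				unsigned long addr)
	{
		/* Runs with the page armed (not present), right before
		 * the faulting instruction is single-stepped. */
	}

	static void my_post(struct kmmio_probe *p, unsigned long condition,
				struct pt_regs *regs)
	{
		/* Runs after the single step; registers now reflect the
		 * completed MMIO access. */
	}

	static struct kmmio_probe my_probe = {
		.len = PAGE_SIZE,
		.pre_handler = my_pre,
		.post_handler = my_post,
	};

	static int __init my_init(void)
	{
		/* The address is made up; in practice it comes from the
		 * BAR of the device being reverse engineered. */
		void __iomem *io = ioremap_nocache(0xfd000000, PAGE_SIZE);

		if (!io)
			return -ENOMEM;
		my_probe.addr = (unsigned long)io;
		reference_kmmio();	/* must precede any registration */
		return register_kmmio_probe(&my_probe);
	}
	module_init(my_init);
	MODULE_LICENSE("GPL");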
The most notable changes done since the last discussion are: - kmmio.c is a built-in, not part of the module - direct call from fault.c to kmmio.c, removing all dynamic hooks - prepare for unregistering probes at any time - make kmmio re-initializable and accessible to more than one user - rewrite kmmio locking to remove all spinlocks from page fault path Can I abuse call_rcu() like I do in kmmio.c:unregister_kmmio_probe() or is there a better way? The function called via call_rcu() itself calls call_rcu() again, will this work or break? There I need a second grace period for RCU after the first grace period for page faults. Mmiotrace itself (mmio-mod.c) is still a module, I am going to attack that next. At some point I will start looking into how to make mmiotrace a tracer component of ftrace (thanks for the hint, Ingo). Ftrace should make the user space part of mmiotracing as simple as 'cat /debug/trace/mmio > dump.txt'. Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/init_task.c | 1 - arch/x86/kernel/mmiotrace/Makefile | 8 +- arch/x86/kernel/mmiotrace/kmmio.c | 349 ++++++++++++++++++++---------- arch/x86/kernel/mmiotrace/kmmio.h | 58 ----- arch/x86/kernel/mmiotrace/mmio-mod.c | 81 ++++--- arch/x86/kernel/mmiotrace/pf_in.c | 2 +- arch/x86/kernel/mmiotrace/testmmiotrace.c | 13 +- arch/x86/mm/fault.c | 59 +---- include/asm-x86/kdebug.h | 7 - include/linux/mmiotrace.h | 38 ++++ 10 files changed, 335 insertions(+), 281 deletions(-) delete mode 100644 arch/x86/kernel/mmiotrace/kmmio.h (limited to 'arch/x86') diff --git a/arch/x86/kernel/init_task.c b/arch/x86/kernel/init_task.c index 027a5b6a12b2..a4f93b4120c1 100644 --- a/arch/x86/kernel/init_task.c +++ b/arch/x86/kernel/init_task.c @@ -15,7 +15,6 @@ static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); struct mm_struct init_mm = INIT_MM(init_mm); EXPORT_UNUSED_SYMBOL(init_mm); /* will be removed in 2.6.26 */ -EXPORT_SYMBOL_GPL(init_mm); /* * Initial thread structure. diff --git a/arch/x86/kernel/mmiotrace/Makefile b/arch/x86/kernel/mmiotrace/Makefile index d6905f7f981b..cf1e747b463e 100644 --- a/arch/x86/kernel/mmiotrace/Makefile +++ b/arch/x86/kernel/mmiotrace/Makefile @@ -1,4 +1,4 @@ -obj-$(CONFIG_MMIOTRACE) += mmiotrace.o -mmiotrace-objs := pf_in.o kmmio.o mmio-mod.o - -obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o +obj-$(CONFIG_MMIOTRACE_HOOKS) += kmmio.o +obj-$(CONFIG_MMIOTRACE) += mmiotrace.o +mmiotrace-objs := pf_in.o mmio-mod.o +obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o diff --git a/arch/x86/kernel/mmiotrace/kmmio.c b/arch/x86/kernel/mmiotrace/kmmio.c index 5e239d0b8467..539a9b19588f 100644 --- a/arch/x86/kernel/mmiotrace/kmmio.c +++ b/arch/x86/kernel/mmiotrace/kmmio.c @@ -6,6 +6,7 @@ */ #include +#include #include #include #include @@ -17,70 +18,119 @@ #include #include #include +#include #include #include #include #include #include -#include "kmmio.h" +#include -#define KMMIO_HASH_BITS 6 -#define KMMIO_TABLE_SIZE (1 << KMMIO_HASH_BITS) #define KMMIO_PAGE_HASH_BITS 4 #define KMMIO_PAGE_TABLE_SIZE (1 << KMMIO_PAGE_HASH_BITS) +struct kmmio_fault_page { + struct list_head list; + struct kmmio_fault_page *release_next; + unsigned long page; /* location of the fault page */ + + /* + * Number of times this page has been registered as a part + * of a probe. If zero, page is disarmed and this may be freed. + * Used only by writers (RCU). 
+ */ + int count; +}; + +struct kmmio_delayed_release { + struct rcu_head rcu; + struct kmmio_fault_page *release_list; +}; + struct kmmio_context { struct kmmio_fault_page *fpage; struct kmmio_probe *probe; unsigned long saved_flags; + unsigned long addr; int active; }; -static int kmmio_page_fault(struct pt_regs *regs, unsigned long error_code, - unsigned long address); static int kmmio_die_notifier(struct notifier_block *nb, unsigned long val, void *args); +static DECLARE_MUTEX(kmmio_init_mutex); static DEFINE_SPINLOCK(kmmio_lock); /* These are protected by kmmio_lock */ +static int kmmio_initialized; unsigned int kmmio_count; -static unsigned int handler_registered; + +/* Read-protected by RCU, write-protected by kmmio_lock. */ static struct list_head kmmio_page_table[KMMIO_PAGE_TABLE_SIZE]; static LIST_HEAD(kmmio_probes); +static struct list_head *kmmio_page_list(unsigned long page) +{ + return &kmmio_page_table[hash_long(page, KMMIO_PAGE_HASH_BITS)]; +} + /* Accessed per-cpu */ static DEFINE_PER_CPU(struct kmmio_context, kmmio_ctx); +/* protected by kmmio_init_mutex */ static struct notifier_block nb_die = { .notifier_call = kmmio_die_notifier }; -int init_kmmio(void) +/** + * Makes sure kmmio is initialized and usable. + * This must be called before any other kmmio function defined here. + * May sleep. + */ +void reference_kmmio(void) { - int i; - for (i = 0; i < KMMIO_PAGE_TABLE_SIZE; i++) - INIT_LIST_HEAD(&kmmio_page_table[i]); - - register_die_notifier(&nb_die); - return 0; + down(&kmmio_init_mutex); + spin_lock_irq(&kmmio_lock); + if (!kmmio_initialized) { + int i; + for (i = 0; i < KMMIO_PAGE_TABLE_SIZE; i++) + INIT_LIST_HEAD(&kmmio_page_table[i]); + if (register_die_notifier(&nb_die)) + BUG(); + } + kmmio_initialized++; + spin_unlock_irq(&kmmio_lock); + up(&kmmio_init_mutex); } +EXPORT_SYMBOL_GPL(reference_kmmio); -void cleanup_kmmio(void) +/** + * Clean up kmmio after use. This must be called for every call to + * reference_kmmio(). All probes registered after the corresponding + * reference_kmmio() must have been unregistered when calling this. + * May sleep. + */ +void unreference_kmmio(void) { - /* - * Assume the following have been already cleaned by calling - * unregister_kmmio_probe() appropriately: - * kmmio_page_table, kmmio_probes - */ - if (handler_registered) { - if (mmiotrace_unregister_pf(&kmmio_page_fault)) - BUG(); - synchronize_rcu(); + bool unreg = false; + + down(&kmmio_init_mutex); + spin_lock_irq(&kmmio_lock); + + if (kmmio_initialized == 1) { + BUG_ON(is_kmmio_active()); + unreg = true; } - unregister_die_notifier(&nb_die); + kmmio_initialized--; + BUG_ON(kmmio_initialized < 0); + spin_unlock_irq(&kmmio_lock); + + if (unreg) + unregister_die_notifier(&nb_die); /* calls sync_rcu() */ + up(&kmmio_init_mutex); } +EXPORT_SYMBOL(unreference_kmmio); /* * this is basically a dynamic stabbing problem: @@ -90,33 +140,33 @@ void cleanup_kmmio(void) * Overlap a Point (might be simple) * Space Efficient Dynamic Stabbing with Fast Queries - Mikkel Thorup */ -/* Get the kmmio at this addr (if any). You must be holding kmmio_lock. */ +/* Get the kmmio at this addr (if any). You must be holding RCU read lock. */ static struct kmmio_probe *get_kmmio_probe(unsigned long addr) { struct kmmio_probe *p; - list_for_each_entry(p, &kmmio_probes, list) { + list_for_each_entry_rcu(p, &kmmio_probes, list) { if (addr >= p->addr && addr <= (p->addr + p->len)) return p; } return NULL; } +/* You must be holding RCU read lock. 
*/ static struct kmmio_fault_page *get_kmmio_fault_page(unsigned long page) { - struct list_head *head, *tmp; + struct list_head *head; + struct kmmio_fault_page *p; page &= PAGE_MASK; - head = &kmmio_page_table[hash_long(page, KMMIO_PAGE_HASH_BITS)]; - list_for_each(tmp, head) { - struct kmmio_fault_page *p - = list_entry(tmp, struct kmmio_fault_page, list); + head = kmmio_page_list(page); + list_for_each_entry_rcu(p, head, list) { if (p->page == page) return p; } - return NULL; } +/** Mark the given page as not present. Access to it will trigger a fault. */ static void arm_kmmio_fault_page(unsigned long page, int *page_level) { unsigned long address = page & PAGE_MASK; @@ -124,8 +174,8 @@ static void arm_kmmio_fault_page(unsigned long page, int *page_level) pte_t *pte = lookup_address(address, &level); if (!pte) { - printk(KERN_ERR "Error in %s: no pte for page 0x%08lx\n", - __FUNCTION__, page); + pr_err("kmmio: Error in %s: no pte for page 0x%08lx\n", + __func__, page); return; } @@ -143,6 +193,7 @@ static void arm_kmmio_fault_page(unsigned long page, int *page_level) __flush_tlb_one(page); } +/** Mark the given page as present. */ static void disarm_kmmio_fault_page(unsigned long page, int *page_level) { unsigned long address = page & PAGE_MASK; @@ -150,8 +201,8 @@ static void disarm_kmmio_fault_page(unsigned long page, int *page_level) pte_t *pte = lookup_address(address, &level); if (!pte) { - printk(KERN_ERR "Error in %s: no pte for page 0x%08lx\n", - __FUNCTION__, page); + pr_err("kmmio: Error in %s: no pte for page 0x%08lx\n", + __func__, page); return; } @@ -169,13 +220,25 @@ static void disarm_kmmio_fault_page(unsigned long page, int *page_level) __flush_tlb_one(page); } +/* + * This is being called from do_page_fault(). + * + * We may be in an interrupt or a critical section. Also prefecthing may + * trigger a page fault. We may be in the middle of process switch. + * We cannot take any locks, because we could be executing especially + * within a kmmio critical section. + * + * Local interrupts are disabled, so preemption cannot happen. + * Do not enable interrupts, do not sleep, and watch out for other CPUs. + */ /* * Interrupts are disabled on entry as trap3 is an interrupt gate * and they remain disabled thorough out this function. */ -static int kmmio_handler(struct pt_regs *regs, unsigned long addr) +int kmmio_handler(struct pt_regs *regs, unsigned long addr) { - struct kmmio_context *ctx = &get_cpu_var(kmmio_ctx); + struct kmmio_context *ctx; + struct kmmio_fault_page *faultpage; /* * Preemption is now disabled to prevent process switch during @@ -186,40 +249,40 @@ static int kmmio_handler(struct pt_regs *regs, unsigned long addr) * XXX what if an interrupt occurs between returning from * do_page_fault() and entering the single-step exception handler? * And that interrupt triggers a kmmio trap? + * XXX If we tracing an interrupt service routine or whatever, is + * this enough to keep it on the current cpu? */ preempt_disable(); - /* interrupts disabled and CPU-local data => atomicity guaranteed. */ + rcu_read_lock(); + faultpage = get_kmmio_fault_page(addr); + if (!faultpage) { + /* + * Either this page fault is not caused by kmmio, or + * another CPU just pulled the kmmio probe from under + * our feet. In the latter case all hell breaks loose. + */ + goto no_kmmio; + } + + ctx = &get_cpu_var(kmmio_ctx); if (ctx->active) { /* - * This avoids a deadlock with kmmio_lock. + * Prevent overwriting already in-flight context. 
* If this page fault really was due to kmmio trap, * all hell breaks loose. */ - printk(KERN_EMERG "mmiotrace: recursive probe hit on CPU %d, " - "for address %lu. Ignoring.\n", + pr_emerg("kmmio: recursive probe hit on CPU %d, " + "for address 0x%08lx. Ignoring.\n", smp_processor_id(), addr); - goto no_kmmio; + goto no_kmmio_ctx; } ctx->active++; - /* - * Acquire the kmmio lock to prevent changes affecting - * get_kmmio_fault_page() and get_kmmio_probe(), since we save their - * returned pointers. - * The lock is released in post_kmmio_handler(). - * XXX: could/should get_kmmio_*() be using RCU instead of spinlock? - */ - spin_lock(&kmmio_lock); - - ctx->fpage = get_kmmio_fault_page(addr); - if (!ctx->fpage) { - /* this page fault is not caused by kmmio */ - goto no_kmmio_locked; - } - + ctx->fpage = faultpage; ctx->probe = get_kmmio_probe(addr); ctx->saved_flags = (regs->flags & (TF_MASK|IF_MASK)); + ctx->addr = addr; if (ctx->probe && ctx->probe->pre_handler) ctx->probe->pre_handler(ctx->probe, regs, addr); @@ -227,46 +290,62 @@ static int kmmio_handler(struct pt_regs *regs, unsigned long addr) regs->flags |= TF_MASK; regs->flags &= ~IF_MASK; - /* We hold lock, now we set present bit in PTE and single step. */ + /* Now we set present bit in PTE and single step. */ disarm_kmmio_fault_page(ctx->fpage->page, NULL); put_cpu_var(kmmio_ctx); + rcu_read_unlock(); return 1; -no_kmmio_locked: - spin_unlock(&kmmio_lock); - ctx->active--; +no_kmmio_ctx: + put_cpu_var(kmmio_ctx); no_kmmio: + rcu_read_unlock(); preempt_enable_no_resched(); - put_cpu_var(kmmio_ctx); - /* page fault not handled by kmmio */ - return 0; + return 0; /* page fault not handled by kmmio */ } /* * Interrupts are disabled on entry as trap1 is an interrupt gate * and they remain disabled thorough out this function. - * And we hold kmmio lock. + * This must always get called as the pair to kmmio_handler(). */ static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs) { int ret = 0; + struct kmmio_probe *probe; + struct kmmio_fault_page *faultpage; struct kmmio_context *ctx = &get_cpu_var(kmmio_ctx); if (!ctx->active) goto out; + rcu_read_lock(); + + faultpage = get_kmmio_fault_page(ctx->addr); + probe = get_kmmio_probe(ctx->addr); + if (faultpage != ctx->fpage || probe != ctx->probe) { + /* + * The trace setup changed after kmmio_handler() and before + * running this respective post handler. User does not want + * the result anymore. + */ + ctx->probe = NULL; + ctx->fpage = NULL; + } + if (ctx->probe && ctx->probe->post_handler) ctx->probe->post_handler(ctx->probe, condition, regs); - arm_kmmio_fault_page(ctx->fpage->page, NULL); + if (ctx->fpage) + arm_kmmio_fault_page(ctx->fpage->page, NULL); regs->flags &= ~TF_MASK; regs->flags |= ctx->saved_flags; /* These were acquired in kmmio_handler(). */ ctx->active--; - spin_unlock(&kmmio_lock); + BUG_ON(ctx->active); preempt_enable_no_resched(); /* @@ -277,11 +356,13 @@ static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs) if (!(regs->flags & TF_MASK)) ret = 1; + rcu_read_unlock(); out: put_cpu_var(kmmio_ctx); return ret; } +/* You must be holding kmmio_lock. 
*/ static int add_kmmio_fault_page(unsigned long page) { struct kmmio_fault_page *f; @@ -289,6 +370,8 @@ static int add_kmmio_fault_page(unsigned long page) page &= PAGE_MASK; f = get_kmmio_fault_page(page); if (f) { + if (!f->count) + arm_kmmio_fault_page(f->page, NULL); f->count++; return 0; } @@ -299,15 +382,16 @@ static int add_kmmio_fault_page(unsigned long page) f->count = 1; f->page = page; - list_add(&f->list, - &kmmio_page_table[hash_long(f->page, KMMIO_PAGE_HASH_BITS)]); + list_add_rcu(&f->list, kmmio_page_list(f->page)); arm_kmmio_fault_page(f->page, NULL); return 0; } -static void release_kmmio_fault_page(unsigned long page) +/* You must be holding kmmio_lock. */ +static void release_kmmio_fault_page(unsigned long page, + struct kmmio_fault_page **release_list) { struct kmmio_fault_page *f; @@ -317,9 +401,11 @@ static void release_kmmio_fault_page(unsigned long page) return; f->count--; + BUG_ON(f->count < 0); if (!f->count) { disarm_kmmio_fault_page(f->page, NULL); - list_del(&f->list); + f->release_next = *release_list; + *release_list = f; } } @@ -334,68 +420,113 @@ int register_kmmio_probe(struct kmmio_probe *p) ret = -EEXIST; goto out; } - list_add(&p->list, &kmmio_probes); - /*printk("adding fault pages...\n");*/ + list_add_rcu(&p->list, &kmmio_probes); while (size < p->len) { if (add_kmmio_fault_page(p->addr + size)) - printk(KERN_ERR "mmio: Unable to set page fault.\n"); + pr_err("kmmio: Unable to set page fault.\n"); size += PAGE_SIZE; } - - if (!handler_registered) { - if (mmiotrace_register_pf(&kmmio_page_fault)) - printk(KERN_ERR "mmiotrace: Cannot register page " - "fault handler.\n"); - else - handler_registered++; - } - out: spin_unlock_irq(&kmmio_lock); /* * XXX: What should I do here? * Here was a call to global_flush_tlb(), but it does not exist - * anymore. + * anymore. It seems it's not needed after all. */ return ret; } +EXPORT_SYMBOL(register_kmmio_probe); +static void rcu_free_kmmio_fault_pages(struct rcu_head *head) +{ + struct kmmio_delayed_release *dr = container_of( + head, + struct kmmio_delayed_release, + rcu); + struct kmmio_fault_page *p = dr->release_list; + while (p) { + struct kmmio_fault_page *next = p->release_next; + BUG_ON(p->count); + kfree(p); + p = next; + } + kfree(dr); +} + +static void remove_kmmio_fault_pages(struct rcu_head *head) +{ + struct kmmio_delayed_release *dr = container_of( + head, + struct kmmio_delayed_release, + rcu); + struct kmmio_fault_page *p = dr->release_list; + struct kmmio_fault_page **prevp = &dr->release_list; + unsigned long flags; + spin_lock_irqsave(&kmmio_lock, flags); + while (p) { + if (!p->count) + list_del_rcu(&p->list); + else + *prevp = p->release_next; + prevp = &p->release_next; + p = p->release_next; + } + spin_unlock_irqrestore(&kmmio_lock, flags); + /* This is the real RCU destroy call. */ + call_rcu(&dr->rcu, rcu_free_kmmio_fault_pages); +} + +/* + * Remove a kmmio probe. You have to synchronize_rcu() before you can be + * sure that the callbacks will not be called anymore. + * + * Unregistering a kmmio fault page has three steps: + * 1. release_kmmio_fault_page() + * Disarm the page, wait a grace period to let all faults finish. + * 2. remove_kmmio_fault_pages() + * Remove the pages from kmmio_page_table. + * 3. rcu_free_kmmio_fault_pages() + * Actally free the kmmio_fault_page structs as with RCU. 
+ */ void unregister_kmmio_probe(struct kmmio_probe *p) { unsigned long size = 0; + struct kmmio_fault_page *release_list = NULL; + struct kmmio_delayed_release *drelease; spin_lock_irq(&kmmio_lock); while (size < p->len) { - release_kmmio_fault_page(p->addr + size); + release_kmmio_fault_page(p->addr + size, &release_list); size += PAGE_SIZE; } - list_del(&p->list); + list_del_rcu(&p->list); kmmio_count--; spin_unlock_irq(&kmmio_lock); -} -/* - * According to 2.6.20, mainly x86_64 arch: - * This is being called from do_page_fault(), via the page fault notifier - * chain. The chain is called for both user space faults and kernel space - * faults (address >= TASK_SIZE64), except not on faults serviced by - * vmalloc_fault(). - * - * We may be in an interrupt or a critical section. Also prefecthing may - * trigger a page fault. We may be in the middle of process switch. - * The page fault hook functionality has put us inside RCU read lock. - * - * Local interrupts are disabled, so preemption cannot happen. - * Do not enable interrupts, do not sleep, and watch out for other CPUs. - */ -static int kmmio_page_fault(struct pt_regs *regs, unsigned long error_code, - unsigned long address) -{ - if (is_kmmio_active()) - if (kmmio_handler(regs, address) == 1) - return -1; - return 0; + drelease = kmalloc(sizeof(*drelease), GFP_ATOMIC); + if (!drelease) { + pr_crit("kmmio: leaking kmmio_fault_page objects.\n"); + return; + } + drelease->release_list = release_list; + + /* + * This is not really RCU here. We have just disarmed a set of + * pages so that they cannot trigger page faults anymore. However, + * we cannot remove the pages from kmmio_page_table, + * because a probe hit might be in flight on another CPU. The + * pages are collected into a list, and they will be removed from + * kmmio_page_table when it is certain that no probe hit related to + * these pages can be in flight. RCU grace period sounds like a + * good choice. + * + * If we removed the pages too early, kmmio page fault handler might + * not find the respective kmmio_fault_page and determine it's not + * a kmmio fault, when it actually is. This would lead to madness. + */ + call_rcu(&drelease->rcu, remove_kmmio_fault_pages); } +EXPORT_SYMBOL(unregister_kmmio_probe); static int kmmio_die_notifier(struct notifier_block *nb, unsigned long val, void *args) diff --git a/arch/x86/kernel/mmiotrace/kmmio.h b/arch/x86/kernel/mmiotrace/kmmio.h deleted file mode 100644 index 85b7f68a3b8a..000000000000 --- a/arch/x86/kernel/mmiotrace/kmmio.h +++ /dev/null @@ -1,58 +0,0 @@ -#ifndef _LINUX_KMMIO_H -#define _LINUX_KMMIO_H - -#include -#include -#include -#include -#include -#include -#include - -struct kmmio_probe; -struct kmmio_fault_page; -struct pt_regs; - -typedef void (*kmmio_pre_handler_t)(struct kmmio_probe *, - struct pt_regs *, unsigned long addr); -typedef void (*kmmio_post_handler_t)(struct kmmio_probe *, - unsigned long condition, struct pt_regs *); - -struct kmmio_probe { - struct list_head list; - - /* start location of the probe point */ - unsigned long addr; - - /* length of the probe region */ - unsigned long len; - - /* Called before addr is executed. */ - kmmio_pre_handler_t pre_handler; - - /* Called after addr is executed, unless... */ - kmmio_post_handler_t post_handler; -}; - -struct kmmio_fault_page { - struct list_head list; - - /* location of the fault page */ - unsigned long page; - - int count; -}; - -/* kmmio is active by some kmmio_probes? 
*/ -static inline int is_kmmio_active(void) -{ - extern unsigned int kmmio_count; - return kmmio_count; -} - -int init_kmmio(void); -void cleanup_kmmio(void); -int register_kmmio_probe(struct kmmio_probe *p); -void unregister_kmmio_probe(struct kmmio_probe *p); - -#endif /* _LINUX_KMMIO_H */ diff --git a/arch/x86/kernel/mmiotrace/mmio-mod.c b/arch/x86/kernel/mmiotrace/mmio-mod.c index f9c609266d83..e1a508588f03 100644 --- a/arch/x86/kernel/mmiotrace/mmio-mod.c +++ b/arch/x86/kernel/mmiotrace/mmio-mod.c @@ -32,7 +32,6 @@ #include #include -#include "kmmio.h" #include "pf_in.h" /* This app's relay channel files will appear in /debug/mmio-trace */ @@ -129,18 +128,17 @@ static void print_pte(unsigned long address) pte_t *pte = lookup_address(address, &level); if (!pte) { - printk(KERN_ERR "Error in %s: no pte for page 0x%08lx\n", - __FUNCTION__, address); + pr_err(MODULE_NAME ": Error in %s: no pte for page 0x%08lx\n", + __func__, address); return; } if (level == PG_LEVEL_2M) { - printk(KERN_EMERG MODULE_NAME ": 4MB pages are not " - "currently supported: %lx\n", - address); + pr_emerg(MODULE_NAME ": 4MB pages are not currently " + "supported: %lx\n", address); BUG(); } - printk(KERN_DEBUG MODULE_NAME ": pte for 0x%lx: 0x%lx 0x%lx\n", + pr_info(MODULE_NAME ": pte for 0x%lx: 0x%lx 0x%lx\n", address, pte_val(*pte), pte_val(*pte) & _PAGE_PRESENT); } @@ -152,7 +150,7 @@ static void print_pte(unsigned long address) static void die_kmmio_nesting_error(struct pt_regs *regs, unsigned long addr) { const struct trap_reason *my_reason = &get_cpu_var(pf_reason); - printk(KERN_EMERG MODULE_NAME ": unexpected fault for address: %lx, " + pr_emerg(MODULE_NAME ": unexpected fault for address: %lx, " "last fault for address: %lx\n", addr, my_reason->addr); print_pte(addr); @@ -160,20 +158,17 @@ static void die_kmmio_nesting_error(struct pt_regs *regs, unsigned long addr) print_symbol(KERN_EMERG "faulting EIP is at %s\n", regs->ip); print_symbol(KERN_EMERG "last faulting EIP was at %s\n", my_reason->ip); - printk(KERN_EMERG - "eax: %08lx ebx: %08lx ecx: %08lx edx: %08lx\n", + pr_emerg("eax: %08lx ebx: %08lx ecx: %08lx edx: %08lx\n", regs->ax, regs->bx, regs->cx, regs->dx); - printk(KERN_EMERG - "esi: %08lx edi: %08lx ebp: %08lx esp: %08lx\n", + pr_emerg("esi: %08lx edi: %08lx ebp: %08lx esp: %08lx\n", regs->si, regs->di, regs->bp, regs->sp); #else print_symbol(KERN_EMERG "faulting RIP is at %s\n", regs->ip); print_symbol(KERN_EMERG "last faulting RIP was at %s\n", my_reason->ip); - printk(KERN_EMERG "rax: %016lx rcx: %016lx rdx: %016lx\n", + pr_emerg("rax: %016lx rcx: %016lx rdx: %016lx\n", regs->ax, regs->cx, regs->dx); - printk(KERN_EMERG "rsi: %016lx rdi: %016lx " - "rbp: %016lx rsp: %016lx\n", + pr_emerg("rsi: %016lx rdi: %016lx rbp: %016lx rsp: %016lx\n", regs->si, regs->di, regs->bp, regs->sp); #endif put_cpu_var(pf_reason); @@ -251,10 +246,15 @@ static void post(struct kmmio_probe *p, unsigned long condition, struct trap_reason *my_reason = &get_cpu_var(pf_reason); struct mm_io_header_rw *my_trace = &get_cpu_var(cpu_trace); + /* + * XXX: This might not get called, if the probe is removed while + * trace hit is on flight. 
+ */ + /* this should always return the active_trace count to 0 */ my_reason->active_traces--; if (my_reason->active_traces) { - printk(KERN_EMERG MODULE_NAME ": unexpected post handler"); + pr_emerg(MODULE_NAME ": unexpected post handler"); BUG(); } @@ -283,16 +283,15 @@ static int subbuf_start_handler(struct rchan_buf *buf, void *subbuf, atomic_t *drop = &per_cpu(dropped, cpu); int count; if (relay_buf_full(buf)) { - if (atomic_inc_return(drop) == 1) { - printk(KERN_ERR MODULE_NAME ": cpu %d buffer full!\n", - cpu); - } + if (atomic_inc_return(drop) == 1) + pr_err(MODULE_NAME ": cpu %d buffer full!\n", cpu); return 0; - } else if ((count = atomic_read(drop))) { - printk(KERN_ERR MODULE_NAME - ": cpu %d buffer no longer full, " - "missed %d events.\n", - cpu, count); + } + count = atomic_read(drop); + if (count) { + pr_err(MODULE_NAME ": cpu %d buffer no longer full, " + "missed %d events.\n", + cpu, count); atomic_sub(count, drop); } @@ -407,8 +406,8 @@ static void ioremap_trace_core(unsigned long offset, unsigned long size, /* Don't trace the low PCI/ISA area, it's always mapped.. */ if (!ISA_trace && (offset < ISA_END_ADDRESS) && (offset + size > ISA_START_ADDRESS)) { - printk(KERN_NOTICE MODULE_NAME ": Ignoring map of low " - "PCI/ISA area (0x%lx-0x%lx)\n", + pr_notice(MODULE_NAME ": Ignoring map of low PCI/ISA area " + "(0x%lx-0x%lx)\n", offset, offset + size); return; } @@ -418,7 +417,7 @@ static void ioremap_trace_core(unsigned long offset, unsigned long size, void __iomem *ioremap_cache_trace(unsigned long offset, unsigned long size) { void __iomem *p = ioremap_cache(offset, size); - printk(KERN_DEBUG MODULE_NAME ": ioremap_cache(0x%lx, 0x%lx) = %p\n", + pr_debug(MODULE_NAME ": ioremap_cache(0x%lx, 0x%lx) = %p\n", offset, size, p); ioremap_trace_core(offset, size, p); return p; @@ -428,7 +427,7 @@ EXPORT_SYMBOL(ioremap_cache_trace); void __iomem *ioremap_nocache_trace(unsigned long offset, unsigned long size) { void __iomem *p = ioremap_nocache(offset, size); - printk(KERN_DEBUG MODULE_NAME ": ioremap_nocache(0x%lx, 0x%lx) = %p\n", + pr_debug(MODULE_NAME ": ioremap_nocache(0x%lx, 0x%lx) = %p\n", offset, size, p); ioremap_trace_core(offset, size, p); return p; @@ -455,7 +454,7 @@ void iounmap_trace(volatile void __iomem *addr) }; struct remap_trace *trace; struct remap_trace *tmp; - printk(KERN_DEBUG MODULE_NAME ": Unmapping %p.\n", addr); + pr_debug(MODULE_NAME ": Unmapping %p.\n", addr); record_timestamp(&event.header); spin_lock(&trace_list_lock); @@ -481,7 +480,7 @@ static void clear_trace_list(void) spin_lock(&trace_list_lock); list_for_each_entry_safe(trace, tmp, &trace_list, list) { - printk(KERN_WARNING MODULE_NAME ": purging non-iounmapped " + pr_warning(MODULE_NAME ": purging non-iounmapped " "trace @0x%08lx, size 0x%lx.\n", trace->probe.addr, trace->probe.len); if (!nommiotrace) @@ -500,39 +499,37 @@ static int __init init(void) dir = debugfs_create_dir(APP_DIR, NULL); if (!dir) { - printk(KERN_ERR MODULE_NAME - ": Couldn't create relay app directory.\n"); + pr_err(MODULE_NAME ": Couldn't create relay app directory.\n"); return -ENOMEM; } chan = create_channel(subbuf_size, n_subbufs); if (!chan) { debugfs_remove(dir); - printk(KERN_ERR MODULE_NAME - ": relay app channel creation failed\n"); + pr_err(MODULE_NAME ": relay app channel creation failed\n"); return -ENOMEM; } - init_kmmio(); + reference_kmmio(); proc_marker_file = create_proc_entry(MARKER_FILE, 0, NULL); if (proc_marker_file) proc_marker_file->write_proc = write_marker; - printk(KERN_DEBUG MODULE_NAME ": 
loaded.\n"); + pr_debug(MODULE_NAME ": loaded.\n"); if (nommiotrace) - printk(KERN_DEBUG MODULE_NAME ": MMIO tracing disabled.\n"); + pr_info(MODULE_NAME ": MMIO tracing disabled.\n"); if (ISA_trace) - printk(KERN_WARNING MODULE_NAME - ": Warning! low ISA range will be traced.\n"); + pr_warning(MODULE_NAME ": Warning! low ISA range will be " + "traced.\n"); return 0; } static void __exit cleanup(void) { - printk(KERN_DEBUG MODULE_NAME ": unload...\n"); + pr_debug(MODULE_NAME ": unload...\n"); clear_trace_list(); - cleanup_kmmio(); + unreference_kmmio(); remove_proc_entry(MARKER_FILE, NULL); destroy_channel(); if (dir) diff --git a/arch/x86/kernel/mmiotrace/pf_in.c b/arch/x86/kernel/mmiotrace/pf_in.c index 67ea520dde62..efa1911e20ca 100644 --- a/arch/x86/kernel/mmiotrace/pf_in.c +++ b/arch/x86/kernel/mmiotrace/pf_in.c @@ -19,7 +19,7 @@ * */ -/* $Id: pf_in.c,v 1.1.1.1 2002/11/12 05:56:32 brlock Exp $ +/* Id: pf_in.c,v 1.1.1.1 2002/11/12 05:56:32 brlock Exp * Copyright by Intel Crop., 2002 * Louis Zhuang (louis.zhuang@intel.com) * diff --git a/arch/x86/kernel/mmiotrace/testmmiotrace.c b/arch/x86/kernel/mmiotrace/testmmiotrace.c index 40e66b0e6480..5ecff578672b 100644 --- a/arch/x86/kernel/mmiotrace/testmmiotrace.c +++ b/arch/x86/kernel/mmiotrace/testmmiotrace.c @@ -41,8 +41,7 @@ static void do_test(void) { void __iomem *p = ioremap_nocache_trace(mmio_address, 0x4000); if (!p) { - printk(KERN_ERR MODULE_NAME ": could not ioremap IO memory, " - "aborting.\n"); + pr_err(MODULE_NAME ": could not ioremap, aborting.\n"); return; } do_write_test(p); @@ -53,14 +52,14 @@ static void do_test(void) static int __init init(void) { if (mmio_address == 0) { - printk(KERN_ERR MODULE_NAME ": you have to use the module " - "argument mmio_address.\n"); - printk(KERN_ERR MODULE_NAME ": DO NOT LOAD THIS MODULE UNLESS" + pr_err(MODULE_NAME ": you have to use the module argument " + "mmio_address.\n"); + pr_err(MODULE_NAME ": DO NOT LOAD THIS MODULE UNLESS" " YOU REALLY KNOW WHAT YOU ARE DOING!\n"); return -ENXIO; } - printk(KERN_WARNING MODULE_NAME ": WARNING: mapping 16 kB @ 0x%08lx " + pr_warning(MODULE_NAME ": WARNING: mapping 16 kB @ 0x%08lx " "in PCI address space, and writing " "rubbish in there.\n", mmio_address); do_test(); @@ -69,7 +68,7 @@ static int __init init(void) static void __exit cleanup(void) { - printk(KERN_DEBUG MODULE_NAME ": unloaded.\n"); + pr_debug(MODULE_NAME ": unloaded.\n"); } module_init(init); diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index e9a086a1a9ff..8c828a68d3b6 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -49,60 +50,14 @@ #define PF_RSVD (1<<3) #define PF_INSTR (1<<4) -#ifdef CONFIG_MMIOTRACE_HOOKS -static pf_handler_func mmiotrace_pf_handler; /* protected by RCU */ -static DEFINE_SPINLOCK(mmiotrace_handler_lock); - -int mmiotrace_register_pf(pf_handler_func new_pfh) -{ - int ret = 0; - unsigned long flags; - spin_lock_irqsave(&mmiotrace_handler_lock, flags); - if (mmiotrace_pf_handler) - ret = -EBUSY; - else - mmiotrace_pf_handler = new_pfh; - spin_unlock_irqrestore(&mmiotrace_handler_lock, flags); - return ret; -} -EXPORT_SYMBOL_GPL(mmiotrace_register_pf); - -/** - * mmiotrace_unregister_pf: - * The caller must ensure @old_pfh is not in use anymore before freeing it. - * This function does not guarantee it. The handler function pointer is - * protected by RCU, so you can do this by e.g. calling synchronize_rcu(). 
- */ -int mmiotrace_unregister_pf(pf_handler_func old_pfh) -{ - int ret = 0; - unsigned long flags; - spin_lock_irqsave(&mmiotrace_handler_lock, flags); - if (mmiotrace_pf_handler != old_pfh) - ret = -EPERM; - else - mmiotrace_pf_handler = NULL; - spin_unlock_irqrestore(&mmiotrace_handler_lock, flags); - return ret; -} -EXPORT_SYMBOL_GPL(mmiotrace_unregister_pf); -#endif /* CONFIG_MMIOTRACE_HOOKS */ - -/* returns non-zero if do_page_fault() should return */ -static inline int call_mmiotrace(struct pt_regs *regs, - unsigned long error_code, - unsigned long address) +static inline int kmmio_fault(struct pt_regs *regs, unsigned long addr) { #ifdef CONFIG_MMIOTRACE_HOOKS - int ret = 0; - rcu_read_lock(); - if (mmiotrace_pf_handler) - ret = mmiotrace_pf_handler(regs, error_code, address); - rcu_read_unlock(); - return ret; -#else - return 0; + if (unlikely(is_kmmio_active())) + if (kmmio_handler(regs, addr) == 1) + return -1; #endif + return 0; } static inline int notify_page_fault(struct pt_regs *regs) @@ -657,7 +612,7 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) if (notify_page_fault(regs)) return; - if (call_mmiotrace(regs, error_code, address)) + if (unlikely(kmmio_fault(regs, address))) return; /* diff --git a/include/asm-x86/kdebug.h b/include/asm-x86/kdebug.h index 7063281040da..96651bb59ba1 100644 --- a/include/asm-x86/kdebug.h +++ b/include/asm-x86/kdebug.h @@ -35,11 +35,4 @@ extern void show_regs(struct pt_regs *regs); extern unsigned long oops_begin(void); extern void oops_end(unsigned long, struct pt_regs *, int signr); -typedef int (*pf_handler_func)(struct pt_regs *regs, - unsigned long error_code, - unsigned long address); - -extern int mmiotrace_register_pf(pf_handler_func new_pfh); -extern int mmiotrace_unregister_pf(pf_handler_func old_pfh); - #endif diff --git a/include/linux/mmiotrace.h b/include/linux/mmiotrace.h index 6ec288f1fe24..d87a6cd8b686 100644 --- a/include/linux/mmiotrace.h +++ b/include/linux/mmiotrace.h @@ -3,6 +3,44 @@ #include +#ifdef __KERNEL__ + +#include + +struct kmmio_probe; +struct pt_regs; + +typedef void (*kmmio_pre_handler_t)(struct kmmio_probe *, + struct pt_regs *, unsigned long addr); +typedef void (*kmmio_post_handler_t)(struct kmmio_probe *, + unsigned long condition, struct pt_regs *); + +struct kmmio_probe { + struct list_head list; + unsigned long addr; /* start location of the probe point */ + unsigned long len; /* length of the probe region */ + kmmio_pre_handler_t pre_handler; /* Called before addr is executed. */ + kmmio_post_handler_t post_handler; /* Called after addr is executed */ +}; + +/* kmmio is active by some kmmio_probes? */ +static inline int is_kmmio_active(void) +{ + extern unsigned int kmmio_count; + return kmmio_count; +} + +extern void reference_kmmio(void); +extern void unreference_kmmio(void); +extern int register_kmmio_probe(struct kmmio_probe *p); +extern void unregister_kmmio_probe(struct kmmio_probe *p); + +/* Called from page fault handler. */ +extern int kmmio_handler(struct pt_regs *regs, unsigned long addr); + +#endif /* __KERNEL__ */ + + /* * If you change anything here, you must bump MMIO_VERSION. * This is the relay data format for user space. -- cgit v1.2.3 From d61fc44853f46fb002228b18aa5f30db21fcd4ac Mon Sep 17 00:00:00 2001 From: Pekka Paalanen Date: Mon, 12 May 2008 21:20:57 +0200 Subject: x86: mmiotrace, preview 2 Kconfig.debug, Makefile and testmmiotrace.c style fixes. Use a real mutex instead of a semaphore. Fix failure path in register probe func.
kmmio: RCU read-locked over single stepping. Generate mapping id's. Make mmio-mod.c built-in and rewrite its locking. Add debugfs file to enable/disable mmiotracing. kmmio: use irqsave spinlocks. Lots of cleanups in mmio-mod.c. Marker file moved from /proc into debugfs. Call mmiotrace entrypoints directly from ioremap.c. Signed-off-by: Pekka Paalanen Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/Kconfig.debug | 20 +- arch/x86/kernel/mmiotrace/Makefile | 2 +- arch/x86/kernel/mmiotrace/kmmio.c | 72 +++--- arch/x86/kernel/mmiotrace/mmio-mod.c | 397 ++++++++++++++++++++---------- arch/x86/kernel/mmiotrace/testmmiotrace.c | 15 +- arch/x86/mm/ioremap.c | 9 +- include/linux/mmiotrace.h | 18 +- 7 files changed, 332 insertions(+), 201 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 9491c0ae03a3..aa0d6462b1fc 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -170,22 +170,19 @@ config IOMMU_LEAK config MMIOTRACE_HOOKS bool - default n config MMIOTRACE - tristate "Memory mapped IO tracing" + bool "Memory mapped IO tracing" depends on DEBUG_KERNEL && RELAY && DEBUG_FS select MMIOTRACE_HOOKS - default n + default y help - This will build a kernel module called mmiotrace. - Making this a built-in is heavily discouraged. - - Mmiotrace traces Memory Mapped I/O access and is meant for debugging - and reverse engineering. The kernel module offers wrapped - versions of the ioremap family of functions. The driver to be traced - must be modified to call these wrappers. A user space program is - required to collect the MMIO data. + Mmiotrace traces Memory Mapped I/O access and is meant for + debugging and reverse engineering. It is called from the ioremap + implementation and works via page faults. A user space program is + required to collect the MMIO data from debugfs files. + Tracing is disabled by default and can be enabled from a debugfs + file. See http://nouveau.freedesktop.org/wiki/MmioTrace If you are not helping to develop drivers, say N. @@ -193,7 +190,6 @@ config MMIOTRACE config MMIOTRACE_TEST tristate "Test module for mmiotrace" depends on MMIOTRACE && m - default n help This is a dumb module for testing mmiotrace. It is very dangerous as it will write garbage to IO memory starting at a given address.
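The debugfs switch mentioned in the new help text can be pictured with a sketch like this (illustrative names; the real mmio-mod.c implements its enabled file with custom file_operations, so that flipping it can take the trace locks and call enable_mmiotrace()/disable_mmiotrace() rather than toggle a bare flag):

	#include <linux/debugfs.h>
	#include <linux/module.h>

	static u32 my_enabled;		/* 0 = tracing off, nonzero = on */
	static struct dentry *my_dir;

	static int __init my_init(void)
	{
		my_dir = debugfs_create_dir("mmio-trace", NULL);
		if (!my_dir)
			return -ENOMEM;
		/* appears as /debug/mmio-trace/enabled, read back as Y/N */
		if (!debugfs_create_bool("enabled", 0600, my_dir,
					 &my_enabled)) {
			debugfs_remove(my_dir);
			return -ENOMEM;
		}
		return 0;
	}
	module_init(my_init);
	MODULE_LICENSE("GPL");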
diff --git a/arch/x86/kernel/mmiotrace/Makefile b/arch/x86/kernel/mmiotrace/Makefile index cf1e747b463e..dbcd8d50fb8d 100644 --- a/arch/x86/kernel/mmiotrace/Makefile +++ b/arch/x86/kernel/mmiotrace/Makefile @@ -1,4 +1,4 @@ obj-$(CONFIG_MMIOTRACE_HOOKS) += kmmio.o obj-$(CONFIG_MMIOTRACE) += mmiotrace.o -mmiotrace-objs := pf_in.o mmio-mod.o +mmiotrace-y := pf_in.o mmio-mod.o obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o diff --git a/arch/x86/kernel/mmiotrace/kmmio.c b/arch/x86/kernel/mmiotrace/kmmio.c index 539a9b19588f..efb467933087 100644 --- a/arch/x86/kernel/mmiotrace/kmmio.c +++ b/arch/x86/kernel/mmiotrace/kmmio.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -59,7 +60,7 @@ struct kmmio_context { static int kmmio_die_notifier(struct notifier_block *nb, unsigned long val, void *args); -static DECLARE_MUTEX(kmmio_init_mutex); +static DEFINE_MUTEX(kmmio_init_mutex); static DEFINE_SPINLOCK(kmmio_lock); /* These are protected by kmmio_lock */ @@ -90,7 +91,7 @@ static struct notifier_block nb_die = { */ void reference_kmmio(void) { - down(&kmmio_init_mutex); + mutex_lock(&kmmio_init_mutex); spin_lock_irq(&kmmio_lock); if (!kmmio_initialized) { int i; @@ -101,7 +102,7 @@ void reference_kmmio(void) } kmmio_initialized++; spin_unlock_irq(&kmmio_lock); - up(&kmmio_init_mutex); + mutex_unlock(&kmmio_init_mutex); } EXPORT_SYMBOL_GPL(reference_kmmio); @@ -115,7 +116,7 @@ void unreference_kmmio(void) { bool unreg = false; - down(&kmmio_init_mutex); + mutex_lock(&kmmio_init_mutex); spin_lock_irq(&kmmio_lock); if (kmmio_initialized == 1) { @@ -128,7 +129,7 @@ void unreference_kmmio(void) if (unreg) unregister_die_notifier(&nb_die); /* calls sync_rcu() */ - up(&kmmio_init_mutex); + mutex_unlock(&kmmio_init_mutex); } EXPORT_SYMBOL(unreference_kmmio); @@ -244,17 +245,13 @@ int kmmio_handler(struct pt_regs *regs, unsigned long addr) * Preemption is now disabled to prevent process switch during * single stepping. We can only handle one active kmmio trace * per cpu, so ensure that we finish it before something else - * gets to run. - * - * XXX what if an interrupt occurs between returning from - * do_page_fault() and entering the single-step exception handler? - * And that interrupt triggers a kmmio trap? - * XXX If we tracing an interrupt service routine or whatever, is - * this enough to keep it on the current cpu? + * gets to run. We also hold the RCU read lock over single + * stepping to avoid looking up the probe and kmmio_fault_page + * again. */ preempt_disable(); - rcu_read_lock(); + faultpage = get_kmmio_fault_page(addr); if (!faultpage) { /* @@ -287,14 +284,24 @@ int kmmio_handler(struct pt_regs *regs, unsigned long addr) if (ctx->probe && ctx->probe->pre_handler) ctx->probe->pre_handler(ctx->probe, regs, addr); + /* + * Enable single-stepping and disable interrupts for the faulting + * context. Local interrupts must not get enabled during stepping. + */ regs->flags |= TF_MASK; regs->flags &= ~IF_MASK; /* Now we set present bit in PTE and single step. */ disarm_kmmio_fault_page(ctx->fpage->page, NULL); + /* + * If another cpu accesses the same page while we are stepping, + * the access will not be caught. It will simply succeed and the + * only downside is we lose the event. If this becomes a problem, + * the user should drop to single cpu before tracing. 
+ */ + put_cpu_var(kmmio_ctx); - rcu_read_unlock(); return 1; no_kmmio_ctx: @@ -313,32 +320,15 @@ no_kmmio: static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs) { int ret = 0; - struct kmmio_probe *probe; - struct kmmio_fault_page *faultpage; struct kmmio_context *ctx = &get_cpu_var(kmmio_ctx); if (!ctx->active) goto out; - rcu_read_lock(); - - faultpage = get_kmmio_fault_page(ctx->addr); - probe = get_kmmio_probe(ctx->addr); - if (faultpage != ctx->fpage || probe != ctx->probe) { - /* - * The trace setup changed after kmmio_handler() and before - * running this respective post handler. User does not want - * the result anymore. - */ - ctx->probe = NULL; - ctx->fpage = NULL; - } - if (ctx->probe && ctx->probe->post_handler) ctx->probe->post_handler(ctx->probe, condition, regs); - if (ctx->fpage) - arm_kmmio_fault_page(ctx->fpage->page, NULL); + arm_kmmio_fault_page(ctx->fpage->page, NULL); regs->flags &= ~TF_MASK; regs->flags |= ctx->saved_flags; @@ -346,6 +336,7 @@ static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs) /* These were acquired in kmmio_handler(). */ ctx->active--; BUG_ON(ctx->active); + rcu_read_unlock(); preempt_enable_no_resched(); /* @@ -355,8 +346,6 @@ static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs) */ if (!(regs->flags & TF_MASK)) ret = 1; - - rcu_read_unlock(); out: put_cpu_var(kmmio_ctx); return ret; @@ -411,15 +400,16 @@ static void release_kmmio_fault_page(unsigned long page, int register_kmmio_probe(struct kmmio_probe *p) { + unsigned long flags; int ret = 0; unsigned long size = 0; - spin_lock_irq(&kmmio_lock); - kmmio_count++; + spin_lock_irqsave(&kmmio_lock, flags); if (get_kmmio_probe(p->addr)) { ret = -EEXIST; goto out; } + kmmio_count++; list_add_rcu(&p->list, &kmmio_probes); while (size < p->len) { if (add_kmmio_fault_page(p->addr + size)) @@ -427,7 +417,7 @@ int register_kmmio_probe(struct kmmio_probe *p) size += PAGE_SIZE; } out: - spin_unlock_irq(&kmmio_lock); + spin_unlock_irqrestore(&kmmio_lock, flags); /* * XXX: What should I do here? * Here was a call to global_flush_tlb(), but it does not exist @@ -478,7 +468,8 @@ static void remove_kmmio_fault_pages(struct rcu_head *head) /* * Remove a kmmio probe. You have to synchronize_rcu() before you can be - * sure that the callbacks will not be called anymore. + * sure that the callbacks will not be called anymore. Only after that + * you may actually release your struct kmmio_probe. * * Unregistering a kmmio fault page has three steps: * 1. release_kmmio_fault_page() @@ -490,18 +481,19 @@ static void remove_kmmio_fault_pages(struct rcu_head *head) */ void unregister_kmmio_probe(struct kmmio_probe *p) { + unsigned long flags; unsigned long size = 0; struct kmmio_fault_page *release_list = NULL; struct kmmio_delayed_release *drelease; - spin_lock_irq(&kmmio_lock); + spin_lock_irqsave(&kmmio_lock, flags); while (size < p->len) { release_kmmio_fault_page(p->addr + size, &release_list); size += PAGE_SIZE; } list_del_rcu(&p->list); kmmio_count--; - spin_unlock_irq(&kmmio_lock); + spin_unlock_irqrestore(&kmmio_lock, flags); drelease = kmalloc(sizeof(*drelease), GFP_ATOMIC); if (!drelease) { diff --git a/arch/x86/kernel/mmiotrace/mmio-mod.c b/arch/x86/kernel/mmiotrace/mmio-mod.c index e1a508588f03..738644061e4e 100644 --- a/arch/x86/kernel/mmiotrace/mmio-mod.c +++ b/arch/x86/kernel/mmiotrace/mmio-mod.c @@ -19,6 +19,8 @@ * * Derived from the read-mod example from relay-examples by Tom Zanussi. 
*/ +#define DEBUG 1 + #include #include #include @@ -34,12 +36,12 @@ #include "pf_in.h" -/* This app's relay channel files will appear in /debug/mmio-trace */ -#define APP_DIR "mmio-trace" -/* the marker injection file in /proc */ -#define MARKER_FILE "mmio-marker" +#define NAME "mmiotrace: " -#define MODULE_NAME "mmiotrace" +/* This app's relay channel files will appear in /debug/mmio-trace */ +static const char APP_DIR[] = "mmio-trace"; +/* the marker injection file in /debug/APP_DIR */ +static const char MARKER_FILE[] = "mmio-marker"; struct trap_reason { unsigned long addr; @@ -48,6 +50,15 @@ struct trap_reason { int active_traces; }; +struct remap_trace { + struct list_head list; + struct kmmio_probe probe; + unsigned long phys; + unsigned long id; +}; + +static const size_t subbuf_size = 256*1024; + /* Accessed per-cpu. */ static DEFINE_PER_CPU(struct trap_reason, pf_reason); static DEFINE_PER_CPU(struct mm_io_header_rw, cpu_trace); @@ -55,33 +66,53 @@ static DEFINE_PER_CPU(struct mm_io_header_rw, cpu_trace); /* Access to this is not per-cpu. */ static DEFINE_PER_CPU(atomic_t, dropped); -static struct file_operations mmio_fops = { - .owner = THIS_MODULE, -}; +static struct dentry *dir; +static struct dentry *enabled_file; +static struct dentry *marker_file; -static const size_t subbuf_size = 256*1024; +static DEFINE_MUTEX(mmiotrace_mutex); +static DEFINE_SPINLOCK(trace_lock); +static atomic_t mmiotrace_enabled; +static LIST_HEAD(trace_list); /* struct remap_trace */ static struct rchan *chan; -static struct dentry *dir; -static struct proc_dir_entry *proc_marker_file; + +/* + * Locking in this file: + * - mmiotrace_mutex enforces enable/disable_mmiotrace() critical sections. + * - mmiotrace_enabled may be modified only when holding mmiotrace_mutex + * and trace_lock. + * - Routines depending on is_enabled() must take trace_lock. + * - trace_list users must hold trace_lock. + * - is_enabled() guarantees that chan is valid. + * - pre/post callbacks assume the effect of is_enabled() being true. 
+ */ /* module parameters */ -static unsigned int n_subbufs = 32*4; -static unsigned long filter_offset; -static int nommiotrace; -static int ISA_trace; -static int trace_pc; +static unsigned int n_subbufs = 32*4; +static unsigned long filter_offset; +static int nommiotrace; +static int ISA_trace; +static int trace_pc; +static int enable_now; module_param(n_subbufs, uint, 0); module_param(filter_offset, ulong, 0); module_param(nommiotrace, bool, 0); module_param(ISA_trace, bool, 0); module_param(trace_pc, bool, 0); +module_param(enable_now, bool, 0); MODULE_PARM_DESC(n_subbufs, "Number of 256kB buffers, default 128."); MODULE_PARM_DESC(filter_offset, "Start address of traced mappings."); MODULE_PARM_DESC(nommiotrace, "Disable actual MMIO tracing."); MODULE_PARM_DESC(ISA_trace, "Do not exclude the low ISA range."); MODULE_PARM_DESC(trace_pc, "Record address of faulting instructions."); +MODULE_PARM_DESC(enable_now, "Start mmiotracing immediately on module load."); + +static bool is_enabled(void) +{ + return atomic_read(&mmiotrace_enabled); +} static void record_timestamp(struct mm_io_header *header) { @@ -93,15 +124,15 @@ static void record_timestamp(struct mm_io_header *header) } /* - * Write callback for the /proc entry: + * Write callback for the debugfs entry: * Read a marker and write it to the mmio trace log */ -static int write_marker(struct file *file, const char __user *buffer, - unsigned long count, void *data) +static ssize_t write_marker(struct file *file, const char __user *buffer, + size_t count, loff_t *ppos) { char *event = NULL; struct mm_io_header *headp; - int len = (count > 65535) ? 65535 : count; + ssize_t len = (count > 65535) ? 65535 : count; event = kzalloc(sizeof(*headp) + len, GFP_KERNEL); if (!event) @@ -117,7 +148,12 @@ static int write_marker(struct file *file, const char __user *buffer, return -EFAULT; } - relay_write(chan, event, sizeof(*headp) + len); + spin_lock_irq(&trace_lock); + if (is_enabled()) + relay_write(chan, event, sizeof(*headp) + len); + else + len = -EINVAL; + spin_unlock_irq(&trace_lock); kfree(event); return len; } @@ -128,19 +164,18 @@ static void print_pte(unsigned long address) pte_t *pte = lookup_address(address, &level); if (!pte) { - pr_err(MODULE_NAME ": Error in %s: no pte for page 0x%08lx\n", + pr_err(NAME "Error in %s: no pte for page 0x%08lx\n", __func__, address); return; } if (level == PG_LEVEL_2M) { - pr_emerg(MODULE_NAME ": 4MB pages are not currently " - "supported: %lx\n", address); + pr_emerg(NAME "4MB pages are not currently supported: " + "0x%08lx\n", address); BUG(); } - pr_info(MODULE_NAME ": pte for 0x%lx: 0x%lx 0x%lx\n", - address, pte_val(*pte), - pte_val(*pte) & _PAGE_PRESENT); + pr_info(NAME "pte for 0x%lx: 0x%lx 0x%lx\n", address, pte_val(*pte), + pte_val(*pte) & _PAGE_PRESENT); } /* @@ -150,22 +185,18 @@ static void print_pte(unsigned long address) static void die_kmmio_nesting_error(struct pt_regs *regs, unsigned long addr) { const struct trap_reason *my_reason = &get_cpu_var(pf_reason); - pr_emerg(MODULE_NAME ": unexpected fault for address: %lx, " - "last fault for address: %lx\n", + pr_emerg(NAME "unexpected fault for address: 0x%08lx, " + "last fault for address: 0x%08lx\n", addr, my_reason->addr); print_pte(addr); + print_symbol(KERN_EMERG "faulting IP is at %s\n", regs->ip); + print_symbol(KERN_EMERG "last faulting IP was at %s\n", my_reason->ip); #ifdef __i386__ - print_symbol(KERN_EMERG "faulting EIP is at %s\n", regs->ip); - print_symbol(KERN_EMERG "last faulting EIP was at %s\n", - my_reason->ip); 
pr_emerg("eax: %08lx ebx: %08lx ecx: %08lx edx: %08lx\n", regs->ax, regs->bx, regs->cx, regs->dx); pr_emerg("esi: %08lx edi: %08lx ebp: %08lx esp: %08lx\n", regs->si, regs->di, regs->bp, regs->sp); #else - print_symbol(KERN_EMERG "faulting RIP is at %s\n", regs->ip); - print_symbol(KERN_EMERG "last faulting RIP was at %s\n", - my_reason->ip); pr_emerg("rax: %016lx rcx: %016lx rdx: %016lx\n", regs->ax, regs->cx, regs->dx); pr_emerg("rsi: %016lx rdi: %016lx rbp: %016lx rsp: %016lx\n", @@ -197,6 +228,10 @@ static void pre(struct kmmio_probe *p, struct pt_regs *regs, my_trace->header.pid = 0; my_trace->header.data_len = sizeof(struct mm_io_rw); my_trace->rw.address = addr; + /* + * struct remap_trace *trace = p->user_data; + * phys = addr - trace->probe.addr + trace->phys; + */ /* * Only record the program counter when requested. @@ -246,15 +281,10 @@ static void post(struct kmmio_probe *p, unsigned long condition, struct trap_reason *my_reason = &get_cpu_var(pf_reason); struct mm_io_header_rw *my_trace = &get_cpu_var(cpu_trace); - /* - * XXX: This might not get called, if the probe is removed while - * trace hit is on flight. - */ - /* this should always return the active_trace count to 0 */ my_reason->active_traces--; if (my_reason->active_traces) { - pr_emerg(MODULE_NAME ": unexpected post handler"); + pr_emerg(NAME "unexpected post handler"); BUG(); } @@ -284,20 +314,23 @@ static int subbuf_start_handler(struct rchan_buf *buf, void *subbuf, int count; if (relay_buf_full(buf)) { if (atomic_inc_return(drop) == 1) - pr_err(MODULE_NAME ": cpu %d buffer full!\n", cpu); + pr_err(NAME "cpu %d buffer full!\n", cpu); return 0; } count = atomic_read(drop); if (count) { - pr_err(MODULE_NAME ": cpu %d buffer no longer full, " - "missed %d events.\n", - cpu, count); + pr_err(NAME "cpu %d buffer no longer full, missed %d events.\n", + cpu, count); atomic_sub(count, drop); } return 1; } +static struct file_operations mmio_fops = { + .owner = THIS_MODULE, +}; + /* file_create() callback. Creates relay file in debugfs. 
*/ static struct dentry *create_buf_file_handler(const char *filename, struct dentry *parent, @@ -333,34 +366,10 @@ static struct rchan_callbacks relay_callbacks = { .remove_buf_file = remove_buf_file_handler, }; -/* - * create_channel - creates channel /debug/APP_DIR/cpuXXX - * Returns channel on success, NULL otherwise - */ -static struct rchan *create_channel(unsigned size, unsigned n) -{ - return relay_open("cpu", dir, size, n, &relay_callbacks, NULL); -} - -/* destroy_channel - destroys channel /debug/APP_DIR/cpuXXX */ -static void destroy_channel(void) -{ - if (chan) { - relay_close(chan); - chan = NULL; - } -} - -struct remap_trace { - struct list_head list; - struct kmmio_probe probe; -}; -static LIST_HEAD(trace_list); -static DEFINE_SPINLOCK(trace_list_lock); - -static void do_ioremap_trace_core(unsigned long offset, unsigned long size, +static void ioremap_trace_core(unsigned long offset, unsigned long size, void __iomem *addr) { + static atomic_t next_id; struct remap_trace *trace = kmalloc(sizeof(*trace), GFP_KERNEL); struct mm_io_header_map event = { .header = { @@ -380,61 +389,49 @@ static void do_ioremap_trace_core(unsigned long offset, unsigned long size, }; record_timestamp(&event.header); + if (!trace) { + pr_err(NAME "kmalloc failed in ioremap\n"); + return; + } + *trace = (struct remap_trace) { .probe = { .addr = (unsigned long)addr, .len = size, .pre_handler = pre, .post_handler = post, - } + .user_data = trace + }, + .phys = offset, + .id = atomic_inc_return(&next_id) }; + spin_lock_irq(&trace_lock); + if (!is_enabled()) + goto not_enabled; + relay_write(chan, &event, sizeof(event)); - spin_lock(&trace_list_lock); list_add_tail(&trace->list, &trace_list); - spin_unlock(&trace_list_lock); if (!nommiotrace) register_kmmio_probe(&trace->probe); + +not_enabled: + spin_unlock_irq(&trace_lock); } -static void ioremap_trace_core(unsigned long offset, unsigned long size, - void __iomem *addr) +void +mmiotrace_ioremap(unsigned long offset, unsigned long size, void __iomem *addr) { - if ((filter_offset) && (offset != filter_offset)) + if (!is_enabled()) /* recheck and proper locking in *_core() */ return; - /* Don't trace the low PCI/ISA area, it's always mapped.. 
*/ - if (!ISA_trace && (offset < ISA_END_ADDRESS) && - (offset + size > ISA_START_ADDRESS)) { - pr_notice(MODULE_NAME ": Ignoring map of low PCI/ISA area " - "(0x%lx-0x%lx)\n", - offset, offset + size); + pr_debug(NAME "ioremap_*(0x%lx, 0x%lx) = %p\n", offset, size, addr); + if ((filter_offset) && (offset != filter_offset)) return; - } - do_ioremap_trace_core(offset, size, addr); -} - -void __iomem *ioremap_cache_trace(unsigned long offset, unsigned long size) -{ - void __iomem *p = ioremap_cache(offset, size); - pr_debug(MODULE_NAME ": ioremap_cache(0x%lx, 0x%lx) = %p\n", - offset, size, p); - ioremap_trace_core(offset, size, p); - return p; + ioremap_trace_core(offset, size, addr); } -EXPORT_SYMBOL(ioremap_cache_trace); -void __iomem *ioremap_nocache_trace(unsigned long offset, unsigned long size) -{ - void __iomem *p = ioremap_nocache(offset, size); - pr_debug(MODULE_NAME ": ioremap_nocache(0x%lx, 0x%lx) = %p\n", - offset, size, p); - ioremap_trace_core(offset, size, p); - return p; -} -EXPORT_SYMBOL(ioremap_nocache_trace); - -void iounmap_trace(volatile void __iomem *addr) +static void iounmap_trace_core(volatile void __iomem *addr) { struct mm_io_header_map event = { .header = { @@ -454,84 +451,212 @@ void iounmap_trace(volatile void __iomem *addr) }; struct remap_trace *trace; struct remap_trace *tmp; - pr_debug(MODULE_NAME ": Unmapping %p.\n", addr); + struct remap_trace *found_trace = NULL; + + pr_debug(NAME "Unmapping %p.\n", addr); record_timestamp(&event.header); - spin_lock(&trace_list_lock); + spin_lock_irq(&trace_lock); + if (!is_enabled()) + goto not_enabled; + list_for_each_entry_safe(trace, tmp, &trace_list, list) { if ((unsigned long)addr == trace->probe.addr) { if (!nommiotrace) unregister_kmmio_probe(&trace->probe); list_del(&trace->list); - kfree(trace); + found_trace = trace; break; } } - spin_unlock(&trace_list_lock); relay_write(chan, &event, sizeof(event)); - iounmap(addr); + +not_enabled: + spin_unlock_irq(&trace_lock); + if (found_trace) { + synchronize_rcu(); /* unregister_kmmio_probe() requirement */ + kfree(found_trace); + } +} + +void mmiotrace_iounmap(volatile void __iomem *addr) +{ + might_sleep(); + if (is_enabled()) /* recheck and proper locking in *_core() */ + iounmap_trace_core(addr); } -EXPORT_SYMBOL(iounmap_trace); static void clear_trace_list(void) { struct remap_trace *trace; struct remap_trace *tmp; - spin_lock(&trace_list_lock); - list_for_each_entry_safe(trace, tmp, &trace_list, list) { - pr_warning(MODULE_NAME ": purging non-iounmapped " + /* + * No locking required, because the caller ensures we are in a + * critical section via mutex, and is_enabled() is false, + * i.e. nothing can traverse or modify this list. + * Caller also ensures is_enabled() cannot change. 
+ */ + list_for_each_entry(trace, &trace_list, list) { + pr_notice(NAME "purging non-iounmapped " "trace @0x%08lx, size 0x%lx.\n", trace->probe.addr, trace->probe.len); if (!nommiotrace) unregister_kmmio_probe(&trace->probe); + } + synchronize_rcu(); /* unregister_kmmio_probe() requirement */ + + list_for_each_entry_safe(trace, tmp, &trace_list, list) { list_del(&trace->list); kfree(trace); + } +} + +static ssize_t read_enabled_file_bool(struct file *file, + char __user *user_buf, size_t count, loff_t *ppos) +{ + char buf[3]; + + if (is_enabled()) + buf[0] = '1'; + else + buf[0] = '0'; + buf[1] = '\n'; + buf[2] = '\0'; + return simple_read_from_buffer(user_buf, count, ppos, buf, 2); +} + +static void enable_mmiotrace(void); +static void disable_mmiotrace(void); + +static ssize_t write_enabled_file_bool(struct file *file, + const char __user *user_buf, size_t count, loff_t *ppos) +{ + char buf[32]; + int buf_size = min(count, (sizeof(buf)-1)); + + if (copy_from_user(buf, user_buf, buf_size)) + return -EFAULT; + + switch (buf[0]) { + case 'y': + case 'Y': + case '1': + enable_mmiotrace(); + break; + case 'n': + case 'N': + case '0': + disable_mmiotrace(); break; } - spin_unlock(&trace_list_lock); + + return count; +} + +/* this ripped from kernel/kprobes.c */ +static struct file_operations fops_enabled = { + .owner = THIS_MODULE, + .read = read_enabled_file_bool, + .write = write_enabled_file_bool +}; + +static struct file_operations fops_marker = { + .owner = THIS_MODULE, + .write = write_marker +}; + +static void enable_mmiotrace(void) +{ + mutex_lock(&mmiotrace_mutex); + if (is_enabled()) + goto out; + + chan = relay_open("cpu", dir, subbuf_size, n_subbufs, + &relay_callbacks, NULL); + if (!chan) { + pr_err(NAME "relay app channel creation failed.\n"); + goto out; + } + + reference_kmmio(); + + marker_file = debugfs_create_file("marker", 0660, dir, NULL, + &fops_marker); + if (!marker_file) + pr_err(NAME "marker file creation failed.\n"); + + if (nommiotrace) + pr_info(NAME "MMIO tracing disabled.\n"); + if (ISA_trace) + pr_warning(NAME "Warning! 
low ISA range will be traced.\n"); + spin_lock_irq(&trace_lock); + atomic_inc(&mmiotrace_enabled); + spin_unlock_irq(&trace_lock); + pr_info(NAME "enabled.\n"); +out: + mutex_unlock(&mmiotrace_mutex); +} + +static void disable_mmiotrace(void) +{ + mutex_lock(&mmiotrace_mutex); + if (!is_enabled()) + goto out; + + spin_lock_irq(&trace_lock); + atomic_dec(&mmiotrace_enabled); + BUG_ON(is_enabled()); + spin_unlock_irq(&trace_lock); + + clear_trace_list(); /* guarantees: no more kmmio callbacks */ + unreference_kmmio(); + if (marker_file) { + debugfs_remove(marker_file); + marker_file = NULL; + } + if (chan) { + relay_close(chan); + chan = NULL; + } + + pr_info(NAME "disabled.\n"); +out: + mutex_unlock(&mmiotrace_mutex); } static int __init init(void) { + pr_debug(NAME "load...\n"); if (n_subbufs < 2) return -EINVAL; dir = debugfs_create_dir(APP_DIR, NULL); if (!dir) { - pr_err(MODULE_NAME ": Couldn't create relay app directory.\n"); + pr_err(NAME "Couldn't create relay app directory.\n"); return -ENOMEM; } - chan = create_channel(subbuf_size, n_subbufs); - if (!chan) { + enabled_file = debugfs_create_file("enabled", 0600, dir, NULL, + &fops_enabled); + if (!enabled_file) { + pr_err(NAME "Couldn't create enabled file.\n"); debugfs_remove(dir); - pr_err(MODULE_NAME ": relay app channel creation failed\n"); return -ENOMEM; } - reference_kmmio(); - - proc_marker_file = create_proc_entry(MARKER_FILE, 0, NULL); - if (proc_marker_file) - proc_marker_file->write_proc = write_marker; + if (enable_now) + enable_mmiotrace(); - pr_debug(MODULE_NAME ": loaded.\n"); - if (nommiotrace) - pr_info(MODULE_NAME ": MMIO tracing disabled.\n"); - if (ISA_trace) - pr_warning(MODULE_NAME ": Warning! low ISA range will be " - "traced.\n"); return 0; } static void __exit cleanup(void) { - pr_debug(MODULE_NAME ": unload...\n"); - clear_trace_list(); - unreference_kmmio(); - remove_proc_entry(MARKER_FILE, NULL); - destroy_channel(); + pr_debug(NAME "unload...\n"); + if (enabled_file) + debugfs_remove(enabled_file); + disable_mmiotrace(); if (dir) debugfs_remove(dir); } diff --git a/arch/x86/kernel/mmiotrace/testmmiotrace.c b/arch/x86/kernel/mmiotrace/testmmiotrace.c index 5ecff578672b..cfa60b227c8d 100644 --- a/arch/x86/kernel/mmiotrace/testmmiotrace.c +++ b/arch/x86/kernel/mmiotrace/testmmiotrace.c @@ -4,10 +4,6 @@ #include #include -extern void __iomem *ioremap_nocache_trace(unsigned long offset, - unsigned long size); -extern void iounmap_trace(volatile void __iomem *addr); - #define MODULE_NAME "testmmiotrace" static unsigned long mmio_address; @@ -28,25 +24,24 @@ static void do_write_test(void __iomem *p) static void do_read_test(void __iomem *p) { unsigned int i; - volatile unsigned int v; for (i = 0; i < 256; i++) - v = ioread8(p + i); + ioread8(p + i); for (i = 1024; i < (5 * 1024); i += 2) - v = ioread16(p + i); + ioread16(p + i); for (i = (5 * 1024); i < (16 * 1024); i += 4) - v = ioread32(p + i); + ioread32(p + i); } static void do_test(void) { - void __iomem *p = ioremap_nocache_trace(mmio_address, 0x4000); + void __iomem *p = ioremap_nocache(mmio_address, 0x4000); if (!p) { pr_err(MODULE_NAME ": could not ioremap, aborting.\n"); return; } do_write_test(p); do_read_test(p); - iounmap_trace(p); + iounmap(p); } static int __init init(void) diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 71bb3159031a..8927c878544d 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include @@ -126,6 +127,7 @@ static void __iomem 
*__ioremap_caller(resource_size_t phys_addr, unsigned long new_prot_val; pgprot_t prot; int retval; + void __iomem *ret_addr; /* Don't allow wraparound or zero size */ last_addr = phys_addr + size - 1; @@ -233,7 +235,10 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr, return NULL; } - return (void __iomem *) (vaddr + offset); + ret_addr = (void __iomem *) (vaddr + offset); + mmiotrace_ioremap(phys_addr, size, ret_addr); + + return ret_addr; } /** @@ -325,6 +330,8 @@ void iounmap(volatile void __iomem *addr) addr = (volatile void __iomem *) (PAGE_MASK & (unsigned long __force)addr); + mmiotrace_iounmap(addr); + /* Use the vm area unlocked, assuming the caller ensures there isn't another iounmap for the same address in parallel. Reuse of the virtual address is prevented by diff --git a/include/linux/mmiotrace.h b/include/linux/mmiotrace.h index d87a6cd8b686..cb5efd0c7f51 100644 --- a/include/linux/mmiotrace.h +++ b/include/linux/mmiotrace.h @@ -16,11 +16,12 @@ typedef void (*kmmio_post_handler_t)(struct kmmio_probe *, unsigned long condition, struct pt_regs *); struct kmmio_probe { - struct list_head list; + struct list_head list; /* kmmio internal list */ unsigned long addr; /* start location of the probe point */ unsigned long len; /* length of the probe region */ kmmio_pre_handler_t pre_handler; /* Called before addr is executed. */ kmmio_post_handler_t post_handler; /* Called after addr is executed */ + void *user_data; }; /* kmmio is active by some kmmio_probes? */ @@ -38,6 +39,21 @@ extern void unregister_kmmio_probe(struct kmmio_probe *p); /* Called from page fault handler. */ extern int kmmio_handler(struct pt_regs *regs, unsigned long addr); +/* Called from ioremap.c */ +#ifdef CONFIG_MMIOTRACE +extern void +mmiotrace_ioremap(unsigned long offset, unsigned long size, void __iomem *addr); +extern void mmiotrace_iounmap(volatile void __iomem *addr); +#else +static inline void +mmiotrace_ioremap(unsigned long offset, unsigned long size, void __iomem *addr) +{ +} +static inline void mmiotrace_iounmap(volatile void __iomem *addr) +{ +} +#endif /* CONFIG_MMIOTRACE_HOOKS */ + #endif /* __KERNEL__ */ -- cgit v1.2.3 From f984b51e0779a6dd30feedc41404013ca54e5d05 Mon Sep 17 00:00:00 2001 From: Pekka Paalanen Date: Mon, 12 May 2008 21:20:57 +0200 Subject: ftrace: add mmiotrace plugin On Sat, 22 Mar 2008 13:07:47 +0100 Ingo Molnar wrote: > > > i'd suggest the following: pull x86.git and sched-devel.git into a > > > single tree [the two will combine without rejects]. Then try to add a > > > kernel/tracing/trace_mmiotrace.c ftrace plugin. The trace_sysprof.c > > > plugin might be a good example. > > > > I did this and now I have mmiotrace enabled/disabled via the tracing > > framework (what do we call this, since ftrace is one of the tracers?). > > cool! could you send the patches for that? (even if they are not fully > functional yet) Patch attached in the end. Nice to see how much code disappeared. I tried to mark all the features I had to break with XXX-comments. 
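As a sketch of how the resulting plugin is driven from user space (the control file paths are assumptions about the tracing framework of this era, not something introduced by this patch): selecting the tracer and toggling tr->ctrl reaches mmio_trace_ctrl_update(), which in turn calls enable_mmiotrace() or disable_mmiotrace():

#include <stdio.h>

/* Hypothetical helper: write a short string into a tracing control file. */
static int write_str(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f)
		return -1;
	fputs(val, f);
	return fclose(f);
}

int main(void)
{
	/* register_tracer(&mmio_tracer) makes "mmiotrace" selectable here. */
	write_str("/debug/tracing/current_tracer", "mmiotrace");
	/* Writing '1' sets tr->ctrl and ends up in enable_mmiotrace(). */
	write_str("/debug/tracing/tracing_enabled", "1");
	/* ... exercise the traced driver, then read /debug/tracing/trace ... */
	return 0;
}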
Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/Kconfig.debug | 3 +- arch/x86/kernel/mmiotrace/mmio-mod.c | 208 +++++------------------------------ include/linux/mmiotrace.h | 6 + kernel/trace/Makefile | 1 + kernel/trace/trace_mmiotrace.c | 84 ++++++++++++++ 5 files changed, 123 insertions(+), 179 deletions(-) create mode 100644 kernel/trace/trace_mmiotrace.c (limited to 'arch/x86') diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index aa0d6462b1fc..7e4b8494078e 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -173,7 +173,8 @@ config MMIOTRACE_HOOKS config MMIOTRACE bool "Memory mapped IO tracing" - depends on DEBUG_KERNEL && RELAY && DEBUG_FS + depends on DEBUG_KERNEL && RELAY + select TRACING select MMIOTRACE_HOOKS default y help diff --git a/arch/x86/kernel/mmiotrace/mmio-mod.c b/arch/x86/kernel/mmiotrace/mmio-mod.c index 738644061e4e..c7a67d7e482b 100644 --- a/arch/x86/kernel/mmiotrace/mmio-mod.c +++ b/arch/x86/kernel/mmiotrace/mmio-mod.c @@ -22,9 +22,8 @@ #define DEBUG 1 #include -#include #include -#include +#include #include #include #include @@ -63,18 +62,18 @@ static const size_t subbuf_size = 256*1024; static DEFINE_PER_CPU(struct trap_reason, pf_reason); static DEFINE_PER_CPU(struct mm_io_header_rw, cpu_trace); +#if 0 /* XXX: no way gather this info anymore */ /* Access to this is not per-cpu. */ static DEFINE_PER_CPU(atomic_t, dropped); +#endif static struct dentry *dir; -static struct dentry *enabled_file; static struct dentry *marker_file; static DEFINE_MUTEX(mmiotrace_mutex); static DEFINE_SPINLOCK(trace_lock); static atomic_t mmiotrace_enabled; static LIST_HEAD(trace_list); /* struct remap_trace */ -static struct rchan *chan; /* * Locking in this file: @@ -93,36 +92,24 @@ static unsigned long filter_offset; static int nommiotrace; static int ISA_trace; static int trace_pc; -static int enable_now; module_param(n_subbufs, uint, 0); module_param(filter_offset, ulong, 0); module_param(nommiotrace, bool, 0); module_param(ISA_trace, bool, 0); module_param(trace_pc, bool, 0); -module_param(enable_now, bool, 0); MODULE_PARM_DESC(n_subbufs, "Number of 256kB buffers, default 128."); MODULE_PARM_DESC(filter_offset, "Start address of traced mappings."); MODULE_PARM_DESC(nommiotrace, "Disable actual MMIO tracing."); MODULE_PARM_DESC(ISA_trace, "Do not exclude the low ISA range."); MODULE_PARM_DESC(trace_pc, "Record address of faulting instructions."); -MODULE_PARM_DESC(enable_now, "Start mmiotracing immediately on module load."); static bool is_enabled(void) { return atomic_read(&mmiotrace_enabled); } -static void record_timestamp(struct mm_io_header *header) -{ - struct timespec now; - - getnstimeofday(&now); - header->sec = now.tv_sec; - header->nsec = now.tv_nsec; -} - /* * Write callback for the debugfs entry: * Read a marker and write it to the mmio trace log @@ -141,7 +128,6 @@ static ssize_t write_marker(struct file *file, const char __user *buffer, headp = (struct mm_io_header *)event; headp->type = MMIO_MAGIC | (MMIO_MARKER << MMIO_OPCODE_SHIFT); headp->data_len = len; - record_timestamp(headp); if (copy_from_user(event + sizeof(*headp), buffer, len)) { kfree(event); @@ -149,9 +135,11 @@ static ssize_t write_marker(struct file *file, const char __user *buffer, } spin_lock_irq(&trace_lock); +#if 0 /* XXX: convert this to use tracing */ if (is_enabled()) relay_write(chan, event, sizeof(*headp) + len); else +#endif len = -EINVAL; spin_unlock_irq(&trace_lock); kfree(event); @@ -242,7 +230,11 @@ static void pre(struct 
kmmio_probe *p, struct pt_regs *regs, else my_trace->rw.pc = 0; - record_timestamp(&my_trace->header); + /* + * XXX: the timestamp recorded will be *after* the tracing has been + * done, not at the time we hit the instruction. SMP implications + * on event ordering? + */ switch (type) { case REG_READ: @@ -295,77 +287,19 @@ static void post(struct kmmio_probe *p, unsigned long condition, default: break; } - relay_write(chan, my_trace, sizeof(*my_trace)); + + /* + * XXX: Several required values are ignored: + * - mapping id + * - program counter + * Also the address should be physical, not virtual. + */ + mmio_trace_record(my_trace->header.type, my_trace->rw.address, + my_trace->rw.value); put_cpu_var(cpu_trace); put_cpu_var(pf_reason); } -/* - * subbuf_start() relay callback. - * - * Defined so that we know when events are dropped due to the buffer-full - * condition. - */ -static int subbuf_start_handler(struct rchan_buf *buf, void *subbuf, - void *prev_subbuf, size_t prev_padding) -{ - unsigned int cpu = buf->cpu; - atomic_t *drop = &per_cpu(dropped, cpu); - int count; - if (relay_buf_full(buf)) { - if (atomic_inc_return(drop) == 1) - pr_err(NAME "cpu %d buffer full!\n", cpu); - return 0; - } - count = atomic_read(drop); - if (count) { - pr_err(NAME "cpu %d buffer no longer full, missed %d events.\n", - cpu, count); - atomic_sub(count, drop); - } - - return 1; -} - -static struct file_operations mmio_fops = { - .owner = THIS_MODULE, -}; - -/* file_create() callback. Creates relay file in debugfs. */ -static struct dentry *create_buf_file_handler(const char *filename, - struct dentry *parent, - int mode, - struct rchan_buf *buf, - int *is_global) -{ - struct dentry *buf_file; - - mmio_fops.read = relay_file_operations.read; - mmio_fops.open = relay_file_operations.open; - mmio_fops.poll = relay_file_operations.poll; - mmio_fops.mmap = relay_file_operations.mmap; - mmio_fops.release = relay_file_operations.release; - mmio_fops.splice_read = relay_file_operations.splice_read; - - buf_file = debugfs_create_file(filename, mode, parent, buf, - &mmio_fops); - - return buf_file; -} - -/* file_remove() default callback. Removes relay file in debugfs. */ -static int remove_buf_file_handler(struct dentry *dentry) -{ - debugfs_remove(dentry); - return 0; -} - -static struct rchan_callbacks relay_callbacks = { - .subbuf_start = subbuf_start_handler, - .create_buf_file = create_buf_file_handler, - .remove_buf_file = remove_buf_file_handler, -}; - static void ioremap_trace_core(unsigned long offset, unsigned long size, void __iomem *addr) { @@ -387,7 +321,6 @@ static void ioremap_trace_core(unsigned long offset, unsigned long size, .pc = 0 } }; - record_timestamp(&event.header); if (!trace) { pr_err(NAME "kmalloc failed in ioremap\n"); @@ -410,7 +343,10 @@ static void ioremap_trace_core(unsigned long offset, unsigned long size, if (!is_enabled()) goto not_enabled; - relay_write(chan, &event, sizeof(event)); + /* + * XXX: Insufficient data recorded! 
+ */ + mmio_trace_record(event.header.type, event.map.addr, event.map.len); list_add_tail(&trace->list, &trace_list); if (!nommiotrace) register_kmmio_probe(&trace->probe); @@ -454,7 +390,6 @@ static void iounmap_trace_core(volatile void __iomem *addr) struct remap_trace *found_trace = NULL; pr_debug(NAME "Unmapping %p.\n", addr); - record_timestamp(&event.header); spin_lock_irq(&trace_lock); if (!is_enabled()) @@ -469,7 +404,8 @@ static void iounmap_trace_core(volatile void __iomem *addr) break; } } - relay_write(chan, &event, sizeof(event)); + mmio_trace_record(event.header.type, event.map.addr, + found_trace ? found_trace->id : -1); not_enabled: spin_unlock_irq(&trace_lock); @@ -512,77 +448,23 @@ static void clear_trace_list(void) } } -static ssize_t read_enabled_file_bool(struct file *file, - char __user *user_buf, size_t count, loff_t *ppos) -{ - char buf[3]; - - if (is_enabled()) - buf[0] = '1'; - else - buf[0] = '0'; - buf[1] = '\n'; - buf[2] = '\0'; - return simple_read_from_buffer(user_buf, count, ppos, buf, 2); -} - -static void enable_mmiotrace(void); -static void disable_mmiotrace(void); - -static ssize_t write_enabled_file_bool(struct file *file, - const char __user *user_buf, size_t count, loff_t *ppos) -{ - char buf[32]; - int buf_size = min(count, (sizeof(buf)-1)); - - if (copy_from_user(buf, user_buf, buf_size)) - return -EFAULT; - - switch (buf[0]) { - case 'y': - case 'Y': - case '1': - enable_mmiotrace(); - break; - case 'n': - case 'N': - case '0': - disable_mmiotrace(); - break; - } - - return count; -} - -/* this ripped from kernel/kprobes.c */ -static struct file_operations fops_enabled = { - .owner = THIS_MODULE, - .read = read_enabled_file_bool, - .write = write_enabled_file_bool -}; - static struct file_operations fops_marker = { .owner = THIS_MODULE, .write = write_marker }; -static void enable_mmiotrace(void) +void enable_mmiotrace(void) { mutex_lock(&mmiotrace_mutex); if (is_enabled()) goto out; - chan = relay_open("cpu", dir, subbuf_size, n_subbufs, - &relay_callbacks, NULL); - if (!chan) { - pr_err(NAME "relay app channel creation failed.\n"); - goto out; - } - reference_kmmio(); +#if 0 /* XXX: tracing does not support text entries */ marker_file = debugfs_create_file("marker", 0660, dir, NULL, &fops_marker); +#endif if (!marker_file) pr_err(NAME "marker file creation failed.\n"); @@ -598,7 +480,7 @@ out: mutex_unlock(&mmiotrace_mutex); } -static void disable_mmiotrace(void) +void disable_mmiotrace(void) { mutex_lock(&mmiotrace_mutex); if (!is_enabled()) @@ -615,17 +497,13 @@ static void disable_mmiotrace(void) debugfs_remove(marker_file); marker_file = NULL; } - if (chan) { - relay_close(chan); - chan = NULL; - } pr_info(NAME "disabled.\n"); out: mutex_unlock(&mmiotrace_mutex); } -static int __init init(void) +int __init init_mmiotrace(void) { pr_debug(NAME "load...\n"); if (n_subbufs < 2) @@ -636,31 +514,5 @@ static int __init init(void) pr_err(NAME "Couldn't create relay app directory.\n"); return -ENOMEM; } - - enabled_file = debugfs_create_file("enabled", 0600, dir, NULL, - &fops_enabled); - if (!enabled_file) { - pr_err(NAME "Couldn't create enabled file.\n"); - debugfs_remove(dir); - return -ENOMEM; - } - - if (enable_now) - enable_mmiotrace(); - return 0; } - -static void __exit cleanup(void) -{ - pr_debug(NAME "unload...\n"); - if (enabled_file) - debugfs_remove(enabled_file); - disable_mmiotrace(); - if (dir) - debugfs_remove(dir); -} - -module_init(init); -module_exit(cleanup); -MODULE_LICENSE("GPL"); diff --git a/include/linux/mmiotrace.h 
b/include/linux/mmiotrace.h index cb5efd0c7f51..579b3b06c90e 100644 --- a/include/linux/mmiotrace.h +++ b/include/linux/mmiotrace.h @@ -54,6 +54,12 @@ static inline void mmiotrace_iounmap(volatile void __iomem *addr) } #endif /* CONFIG_MMIOTRACE_HOOKS */ +/* in kernel/trace/trace_mmiotrace.c */ +extern int __init init_mmiotrace(void); +extern void enable_mmiotrace(void); +extern void disable_mmiotrace(void); +extern void mmio_trace_record(u32 type, unsigned long addr, unsigned long arg); + #endif /* __KERNEL__ */ diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index d9efbbfa2bdf..c44a7dce9086 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -18,5 +18,6 @@ obj-$(CONFIG_FTRACE) += trace_functions.o obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o +obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o libftrace-y := ftrace.o diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c new file mode 100644 index 000000000000..e4dd03cc5aa6 --- /dev/null +++ b/kernel/trace/trace_mmiotrace.c @@ -0,0 +1,84 @@ +/* + * Memory mapped I/O tracing + * + * Copyright (C) 2008 Pekka Paalanen + */ + +#define DEBUG 1 + +#include +#include + +#include "trace.h" + +extern void +__trace_special(void *__tr, void *__data, + unsigned long arg1, unsigned long arg2, unsigned long arg3); + +static struct trace_array *mmio_trace_array; + + +static void mmio_trace_init(struct trace_array *tr) +{ + pr_debug("in %s\n", __func__); + mmio_trace_array = tr; + if (tr->ctrl) + enable_mmiotrace(); +} + +static void mmio_trace_reset(struct trace_array *tr) +{ + pr_debug("in %s\n", __func__); + if (tr->ctrl) + disable_mmiotrace(); +} + +static void mmio_trace_ctrl_update(struct trace_array *tr) +{ + pr_debug("in %s\n", __func__); + if (tr->ctrl) + enable_mmiotrace(); + else + disable_mmiotrace(); +} + +static struct tracer mmio_tracer __read_mostly = +{ + .name = "mmiotrace", + .init = mmio_trace_init, + .reset = mmio_trace_reset, + .ctrl_update = mmio_trace_ctrl_update, +}; + +__init static int init_mmio_trace(void) +{ + int ret = init_mmiotrace(); + if (ret) + return ret; + return register_tracer(&mmio_tracer); +} +device_initcall(init_mmio_trace); + +void mmio_trace_record(u32 type, unsigned long addr, unsigned long arg) +{ + struct trace_array *tr = mmio_trace_array; + struct trace_array_cpu *data = tr->data[smp_processor_id()]; + + if (!current || current->pid == 0) { + /* + * XXX: This is a problem. We need to able to record, no + * matter what. tracing_generic_entry_update() would crash. + */ + static unsigned limit; + if (limit++ < 12) + pr_err("Error in %s: no current.\n", __func__); + return; + } + if (!tr || !data) { + static unsigned limit; + if (limit++ < 12) + pr_err("%s: no tr or data\n", __func__); + return; + } + __trace_special(tr, data, type, addr, arg); +} -- cgit v1.2.3 From bd8ac686c73c7e925fcfe0b02dc4e7b947127864 Mon Sep 17 00:00:00 2001 From: Pekka Paalanen Date: Mon, 12 May 2008 21:20:57 +0200 Subject: ftrace: mmiotrace, updates here is a patch that makes mmiotrace work almost well within the tracing framework. The patch applies on top of my previous patch. I have my own output formatting in place now. 
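With print_line() wired up, each event becomes one text line in the 'trace' file. Purely for illustration, a map/read/unmap sequence reconstructed from the format strings in mmio_print_rw() and mmio_print_map() below (all numeric values invented) would read:

MAP 15.000123 1 0xd8000000 0xffffc20000100000 0x4000 0x0 731
R 4 15.000456 1 0xd8000010 0xdeadbeef 0x0 731
UNMAP 15.001000 1 0x0 731

For a read the columns are width, timestamp, map id, physical address, value, program counter (zero unless trace_pc is set) and pid.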
Summary of changes: - fix the NULL dereference that was due to not calling tracing_reset() - add print_line() callback into struct tracer - implement print_line() for mmiotrace, producing up-to-spec text - add my output header, but that is not really called in the right place - rewrote the main structs in mmiotrace - added two new trace entry types: TRACE_MMIO_RW and TRACE_MMIO_MAP - made some functions in trace.c non-static - check current==NULL in tracing_generic_entry_update() - fix(?) comparison in trace_seq_printf() Things seem to work fine except a few issues. Markers (text lines injected into mmiotrace log) are missing, I did not feel like hacking them in before we have variable length entries. My output header is printed only for 'trace' file, but not 'trace_pipe'. For some reason, despite my quick fix, iter->trace is NULL in print_trace_line() when called from 'trace_pipe' file, which means I don't get proper output formatting. I only tried by loading nouveau.ko, which just detects the card, and that is traced fine. I didn't try further. Map, two reads and unmap. Works perfectly. I am missing the information about overflows; I'd prefer to have a counter for lost events. I didn't try, but I guess currently there is no way of knowing when it overflows? So, not too far from being fully operational, it seems :-) And looking at the diffstat, there are also some 700-900 lines of user space code that just became obsolete. Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/Kconfig.debug | 2 +- arch/x86/kernel/mmiotrace/mmio-mod.c | 140 ++++++++++---------------- include/linux/mmiotrace.h | 85 ++++++-------------- kernel/trace/trace.c | 34 ++++++++ kernel/trace/trace.h | 14 ++++ kernel/trace/trace_mmiotrace.c | 151 ++++++++++++++++++++++++++++------- 6 files changed, 238 insertions(+), 188 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 7e4b8494078e..1d6de0d67f99 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -173,7 +173,7 @@ config MMIOTRACE_HOOKS config MMIOTRACE bool "Memory mapped IO tracing" - depends on DEBUG_KERNEL && RELAY + depends on DEBUG_KERNEL select TRACING select MMIOTRACE_HOOKS default y help diff --git a/arch/x86/kernel/mmiotrace/mmio-mod.c b/arch/x86/kernel/mmiotrace/mmio-mod.c index c7a67d7e482b..62abc281a512 100644 --- a/arch/x86/kernel/mmiotrace/mmio-mod.c +++ b/arch/x86/kernel/mmiotrace/mmio-mod.c @@ -37,11 +37,6 @@ #define NAME "mmiotrace: " -/* This app's relay channel files will appear in /debug/mmio-trace */ -static const char APP_DIR[] = "mmio-trace"; -/* the marker injection file in /debug/APP_DIR */ -static const char MARKER_FILE[] = "mmio-marker"; - struct trap_reason { unsigned long addr; unsigned long ip; @@ -56,18 +51,15 @@ struct remap_trace { unsigned long id; }; -static const size_t subbuf_size = 256*1024; - /* Accessed per-cpu. */ static DEFINE_PER_CPU(struct trap_reason, pf_reason); -static DEFINE_PER_CPU(struct mm_io_header_rw, cpu_trace); +static DEFINE_PER_CPU(struct mmiotrace_rw, cpu_trace); #if 0 /* XXX: no way gather this info anymore */ /* Access to this is not per-cpu. */ static DEFINE_PER_CPU(atomic_t, dropped); #endif -static struct dentry *dir; static struct dentry *marker_file; static DEFINE_MUTEX(mmiotrace_mutex); @@ -82,24 +74,21 @@ static LIST_HEAD(trace_list); /* struct remap_trace */ * and trace_lock. * - Routines depending on is_enabled() must take trace_lock. * - trace_list users must hold trace_lock.
- * - is_enabled() guarantees that chan is valid. + * - is_enabled() guarantees that mmio_trace_record is allowed. * - pre/post callbacks assume the effect of is_enabled() being true. */ /* module parameters */ -static unsigned int n_subbufs = 32*4; static unsigned long filter_offset; static int nommiotrace; static int ISA_trace; static int trace_pc; -module_param(n_subbufs, uint, 0); module_param(filter_offset, ulong, 0); module_param(nommiotrace, bool, 0); module_param(ISA_trace, bool, 0); module_param(trace_pc, bool, 0); -MODULE_PARM_DESC(n_subbufs, "Number of 256kB buffers, default 128."); MODULE_PARM_DESC(filter_offset, "Start address of traced mappings."); MODULE_PARM_DESC(nommiotrace, "Disable actual MMIO tracing."); MODULE_PARM_DESC(ISA_trace, "Do not exclude the low ISA range."); @@ -110,6 +99,7 @@ static bool is_enabled(void) return atomic_read(&mmiotrace_enabled); } +#if 0 /* XXX: needs rewrite */ /* * Write callback for the debugfs entry: * Read a marker and write it to the mmio trace log @@ -145,6 +135,7 @@ static ssize_t write_marker(struct file *file, const char __user *buffer, kfree(event); return len; } +#endif static void print_pte(unsigned long address) { @@ -198,9 +189,10 @@ static void pre(struct kmmio_probe *p, struct pt_regs *regs, unsigned long addr) { struct trap_reason *my_reason = &get_cpu_var(pf_reason); - struct mm_io_header_rw *my_trace = &get_cpu_var(cpu_trace); + struct mmiotrace_rw *my_trace = &get_cpu_var(cpu_trace); const unsigned long instptr = instruction_pointer(regs); const enum reason_type type = get_ins_type(instptr); + struct remap_trace *trace = p->user_data; /* it doesn't make sense to have more than one active trace per cpu */ if (my_reason->active_traces) @@ -212,23 +204,17 @@ static void pre(struct kmmio_probe *p, struct pt_regs *regs, my_reason->addr = addr; my_reason->ip = instptr; - my_trace->header.type = MMIO_MAGIC; - my_trace->header.pid = 0; - my_trace->header.data_len = sizeof(struct mm_io_rw); - my_trace->rw.address = addr; - /* - * struct remap_trace *trace = p->user_data; - * phys = addr - trace->probe.addr + trace->phys; - */ + my_trace->phys = addr - trace->probe.addr + trace->phys; + my_trace->map_id = trace->id; /* * Only record the program counter when requested. * It may taint clean-room reverse engineering. 
*/ if (trace_pc) - my_trace->rw.pc = instptr; + my_trace->pc = instptr; else - my_trace->rw.pc = 0; + my_trace->pc = 0; /* * XXX: the timestamp recorded will be *after* the tracing has been @@ -238,28 +224,25 @@ static void pre(struct kmmio_probe *p, struct pt_regs *regs, switch (type) { case REG_READ: - my_trace->header.type |= - (MMIO_READ << MMIO_OPCODE_SHIFT) | - (get_ins_mem_width(instptr) << MMIO_WIDTH_SHIFT); + my_trace->opcode = MMIO_READ; + my_trace->width = get_ins_mem_width(instptr); break; case REG_WRITE: - my_trace->header.type |= - (MMIO_WRITE << MMIO_OPCODE_SHIFT) | - (get_ins_mem_width(instptr) << MMIO_WIDTH_SHIFT); - my_trace->rw.value = get_ins_reg_val(instptr, regs); + my_trace->opcode = MMIO_WRITE; + my_trace->width = get_ins_mem_width(instptr); + my_trace->value = get_ins_reg_val(instptr, regs); break; case IMM_WRITE: - my_trace->header.type |= - (MMIO_WRITE << MMIO_OPCODE_SHIFT) | - (get_ins_mem_width(instptr) << MMIO_WIDTH_SHIFT); - my_trace->rw.value = get_ins_imm_val(instptr); + my_trace->opcode = MMIO_WRITE; + my_trace->width = get_ins_mem_width(instptr); + my_trace->value = get_ins_imm_val(instptr); break; default: { unsigned char *ip = (unsigned char *)instptr; - my_trace->header.type |= - (MMIO_UNKNOWN_OP << MMIO_OPCODE_SHIFT); - my_trace->rw.value = (*ip) << 16 | *(ip + 1) << 8 | + my_trace->opcode = MMIO_UNKNOWN_OP; + my_trace->width = 0; + my_trace->value = (*ip) << 16 | *(ip + 1) << 8 | *(ip + 2); } } @@ -271,7 +254,7 @@ static void post(struct kmmio_probe *p, unsigned long condition, struct pt_regs *regs) { struct trap_reason *my_reason = &get_cpu_var(pf_reason); - struct mm_io_header_rw *my_trace = &get_cpu_var(cpu_trace); + struct mmiotrace_rw *my_trace = &get_cpu_var(cpu_trace); /* this should always return the active_trace count to 0 */ my_reason->active_traces--; @@ -282,20 +265,13 @@ static void post(struct kmmio_probe *p, unsigned long condition, switch (my_reason->type) { case REG_READ: - my_trace->rw.value = get_ins_reg_val(my_reason->ip, regs); + my_trace->value = get_ins_reg_val(my_reason->ip, regs); break; default: break; } - /* - * XXX: Several required values are ignored: - * - mapping id - * - program counter - * Also the address should be physical, not virtual. - */ - mmio_trace_record(my_trace->header.type, my_trace->rw.address, - my_trace->rw.value); + mmio_trace_rw(my_trace); put_cpu_var(cpu_trace); put_cpu_var(pf_reason); } @@ -305,21 +281,11 @@ static void ioremap_trace_core(unsigned long offset, unsigned long size, { static atomic_t next_id; struct remap_trace *trace = kmalloc(sizeof(*trace), GFP_KERNEL); - struct mm_io_header_map event = { - .header = { - .type = MMIO_MAGIC | - (MMIO_PROBE << MMIO_OPCODE_SHIFT), - .sec = 0, - .nsec = 0, - .pid = 0, - .data_len = sizeof(struct mm_io_map) - }, - .map = { - .phys = offset, - .addr = (unsigned long)addr, - .len = size, - .pc = 0 - } + struct mmiotrace_map map = { + .phys = offset, + .virt = (unsigned long)addr, + .len = size, + .opcode = MMIO_PROBE }; if (!trace) { @@ -338,15 +304,13 @@ static void ioremap_trace_core(unsigned long offset, unsigned long size, .phys = offset, .id = atomic_inc_return(&next_id) }; + map.map_id = trace->id; spin_lock_irq(&trace_lock); if (!is_enabled()) goto not_enabled; - /* - * XXX: Insufficient data recorded! 
- */ - mmio_trace_record(event.header.type, event.map.addr, event.map.len); + mmio_trace_mapping(&map); list_add_tail(&trace->list, &trace_list); if (!nommiotrace) register_kmmio_probe(&trace->probe); @@ -369,21 +333,11 @@ mmiotrace_ioremap(unsigned long offset, unsigned long size, void __iomem *addr) static void iounmap_trace_core(volatile void __iomem *addr) { - struct mm_io_header_map event = { - .header = { - .type = MMIO_MAGIC | - (MMIO_UNPROBE << MMIO_OPCODE_SHIFT), - .sec = 0, - .nsec = 0, - .pid = 0, - .data_len = sizeof(struct mm_io_map) - }, - .map = { - .phys = 0, - .addr = (unsigned long)addr, - .len = 0, - .pc = 0 - } + struct mmiotrace_map map = { + .phys = 0, + .virt = (unsigned long)addr, + .len = 0, + .opcode = MMIO_UNPROBE }; struct remap_trace *trace; struct remap_trace *tmp; @@ -404,8 +358,8 @@ static void iounmap_trace_core(volatile void __iomem *addr) break; } } - mmio_trace_record(event.header.type, event.map.addr, - found_trace ? found_trace->id : -1); + map.map_id = (found_trace) ? found_trace->id : -1; + mmio_trace_mapping(&map); not_enabled: spin_unlock_irq(&trace_lock); @@ -448,10 +402,12 @@ static void clear_trace_list(void) } } +#if 0 /* XXX: out of order */ static struct file_operations fops_marker = { .owner = THIS_MODULE, .write = write_marker }; +#endif void enable_mmiotrace(void) { @@ -464,9 +420,9 @@ void enable_mmiotrace(void) #if 0 /* XXX: tracing does not support text entries */ marker_file = debugfs_create_file("marker", 0660, dir, NULL, &fops_marker); -#endif if (!marker_file) pr_err(NAME "marker file creation failed.\n"); +#endif if (nommiotrace) pr_info(NAME "MMIO tracing disabled.\n"); @@ -502,17 +458,3 @@ void disable_mmiotrace(void) out: mutex_unlock(&mmiotrace_mutex); } - -int __init init_mmiotrace(void) -{ - pr_debug(NAME "load...\n"); - if (n_subbufs < 2) - return -EINVAL; - - dir = debugfs_create_dir(APP_DIR, NULL); - if (!dir) { - pr_err(NAME "Couldn't create relay app directory.\n"); - return -ENOMEM; - } - return 0; -} diff --git a/include/linux/mmiotrace.h b/include/linux/mmiotrace.h index 579b3b06c90e..c88a9c197d22 100644 --- a/include/linux/mmiotrace.h +++ b/include/linux/mmiotrace.h @@ -54,73 +54,38 @@ static inline void mmiotrace_iounmap(volatile void __iomem *addr) } #endif /* CONFIG_MMIOTRACE_HOOKS */ -/* in kernel/trace/trace_mmiotrace.c */ -extern int __init init_mmiotrace(void); -extern void enable_mmiotrace(void); -extern void disable_mmiotrace(void); -extern void mmio_trace_record(u32 type, unsigned long addr, unsigned long arg); - -#endif /* __KERNEL__ */ - - -/* - * If you change anything here, you must bump MMIO_VERSION. - * This is the relay data format for user space. 
- */ -#define MMIO_VERSION 0x04 - -/* mm_io_header.type */ -#define MMIO_OPCODE_MASK 0xff -#define MMIO_OPCODE_SHIFT 0 -#define MMIO_WIDTH_MASK 0xff00 -#define MMIO_WIDTH_SHIFT 8 -#define MMIO_MAGIC (0x6f000000 | (MMIO_VERSION<<16)) -#define MMIO_MAGIC_MASK 0xffff0000 - -enum mm_io_opcode { /* payload type: */ - MMIO_READ = 0x1, /* struct mm_io_rw */ - MMIO_WRITE = 0x2, /* struct mm_io_rw */ - MMIO_PROBE = 0x3, /* struct mm_io_map */ - MMIO_UNPROBE = 0x4, /* struct mm_io_map */ +enum mm_io_opcode { + MMIO_READ = 0x1, /* struct mmiotrace_rw */ + MMIO_WRITE = 0x2, /* struct mmiotrace_rw */ + MMIO_PROBE = 0x3, /* struct mmiotrace_map */ + MMIO_UNPROBE = 0x4, /* struct mmiotrace_map */ MMIO_MARKER = 0x5, /* raw char data */ - MMIO_UNKNOWN_OP = 0x6, /* struct mm_io_rw */ + MMIO_UNKNOWN_OP = 0x6, /* struct mmiotrace_rw */ }; -struct mm_io_header { - __u32 type; /* see MMIO_* macros above */ - __u32 sec; /* timestamp */ - __u32 nsec; - __u32 pid; /* PID of the process, or 0 for kernel core */ - __u16 data_len; /* length of the following payload */ +struct mmiotrace_rw { + unsigned long phys; /* PCI address of register */ + unsigned long value; + unsigned long pc; /* optional program counter */ + int map_id; + unsigned char opcode; /* one of MMIO_{READ,WRITE,UNKNOWN_OP} */ + unsigned char width; /* size of register access in bytes */ }; -struct mm_io_rw { - __u64 address; /* virtual address of register */ - __u64 value; - __u64 pc; /* optional program counter */ +struct mmiotrace_map { + unsigned long phys; /* base address in PCI space */ + unsigned long virt; /* base virtual address */ + unsigned long len; /* mapping size */ + int map_id; + unsigned char opcode; /* MMIO_PROBE or MMIO_UNPROBE */ }; -struct mm_io_map { - __u64 phys; /* base address in PCI space */ - __u64 addr; /* base virtual address */ - __u64 len; /* mapping size */ - __u64 pc; /* optional program counter */ -}; - - -/* - * These structures are used to allow a single relay_write() - * call to write a full packet. 
- */ - -struct mm_io_header_rw { - struct mm_io_header header; - struct mm_io_rw rw; -} __attribute__((packed)); +/* in kernel/trace/trace_mmiotrace.c */ +extern void enable_mmiotrace(void); +extern void disable_mmiotrace(void); +extern void mmio_trace_rw(struct mmiotrace_rw *rw); +extern void mmio_trace_mapping(struct mmiotrace_map *map); -struct mm_io_header_map { - struct mm_io_header header; - struct mm_io_map map; -} __attribute__((packed)); +#endif /* __KERNEL__ */ #endif /* MMIOTRACE_H */ diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 3271916ff033..d14fe49e9638 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -831,6 +831,40 @@ ftrace(struct trace_array *tr, struct trace_array_cpu *data, trace_function(tr, data, ip, parent_ip, flags); } +#ifdef CONFIG_MMIOTRACE +void __trace_mmiotrace_rw(struct trace_array *tr, struct trace_array_cpu *data, + struct mmiotrace_rw *rw) +{ + struct trace_entry *entry; + unsigned long irq_flags; + + spin_lock_irqsave(&data->lock, irq_flags); + entry = tracing_get_trace_entry(tr, data); + tracing_generic_entry_update(entry, 0); + entry->type = TRACE_MMIO_RW; + entry->mmiorw = *rw; + spin_unlock_irqrestore(&data->lock, irq_flags); + + trace_wake_up(); +} + +void __trace_mmiotrace_map(struct trace_array *tr, struct trace_array_cpu *data, + struct mmiotrace_map *map) +{ + struct trace_entry *entry; + unsigned long irq_flags; + + spin_lock_irqsave(&data->lock, irq_flags); + entry = tracing_get_trace_entry(tr, data); + tracing_generic_entry_update(entry, 0); + entry->type = TRACE_MMIO_MAP; + entry->mmiomap = *map; + spin_unlock_irqrestore(&data->lock, irq_flags); + + trace_wake_up(); +} +#endif + void __trace_stack(struct trace_array *tr, struct trace_array_cpu *data, unsigned long flags, diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index c460e85e94ed..0ef9ef74c806 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -5,6 +5,7 @@ #include #include #include +#include enum trace_type { __TRACE_FIRST_TYPE = 0, @@ -14,6 +15,8 @@ enum trace_type { TRACE_WAKE, TRACE_STACK, TRACE_SPECIAL, + TRACE_MMIO_RW, + TRACE_MMIO_MAP, __TRACE_LAST_TYPE }; @@ -75,6 +78,8 @@ struct trace_entry { struct ctx_switch_entry ctx; struct special_entry special; struct stack_entry stack; + struct mmiotrace_rw mmiorw; + struct mmiotrace_map mmiomap; }; }; @@ -255,6 +260,15 @@ extern unsigned long ftrace_update_tot_cnt; extern int DYN_FTRACE_TEST_NAME(void); #endif +#ifdef CONFIG_MMIOTRACE +extern void __trace_mmiotrace_rw(struct trace_array *tr, + struct trace_array_cpu *data, + struct mmiotrace_rw *rw); +extern void __trace_mmiotrace_map(struct trace_array *tr, + struct trace_array_cpu *data, + struct mmiotrace_map *map); +#endif + #ifdef CONFIG_FTRACE_STARTUP_TEST #ifdef CONFIG_FTRACE extern int trace_selftest_startup_function(struct tracer *trace, diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index e4dd03cc5aa6..3a12b1ad0c63 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -11,19 +11,26 @@ #include "trace.h" -extern void -__trace_special(void *__tr, void *__data, - unsigned long arg1, unsigned long arg2, unsigned long arg3); - static struct trace_array *mmio_trace_array; +static void mmio_reset_data(struct trace_array *tr) +{ + int cpu; + + tr->time_start = ftrace_now(tr->cpu); + + for_each_online_cpu(cpu) + tracing_reset(tr->data[cpu]); +} static void mmio_trace_init(struct trace_array *tr) { pr_debug("in %s\n", __func__); mmio_trace_array = tr; - if (tr->ctrl) + if 
(tr->ctrl) { + mmio_reset_data(tr); enable_mmiotrace(); + } } static void mmio_trace_reset(struct trace_array *tr) @@ -31,15 +38,110 @@ static void mmio_trace_reset(struct trace_array *tr) pr_debug("in %s\n", __func__); if (tr->ctrl) disable_mmiotrace(); + mmio_reset_data(tr); + mmio_trace_array = NULL; } static void mmio_trace_ctrl_update(struct trace_array *tr) { pr_debug("in %s\n", __func__); - if (tr->ctrl) + if (tr->ctrl) { + mmio_reset_data(tr); enable_mmiotrace(); - else + } else { disable_mmiotrace(); + } +} + +/* XXX: This is not called for trace_pipe file! */ +void mmio_print_header(struct trace_iterator *iter) +{ + struct trace_seq *s = &iter->seq; + trace_seq_printf(s, "VERSION broken 20070824\n"); + /* TODO: print /proc/bus/pci/devices contents as PCIDEV lines */ +} + +static int mmio_print_rw(struct trace_iterator *iter) +{ + struct trace_entry *entry = iter->ent; + struct mmiotrace_rw *rw = &entry->mmiorw; + struct trace_seq *s = &iter->seq; + unsigned long long t = ns2usecs(entry->t); + unsigned long usec_rem = do_div(t, 1000000ULL); + unsigned secs = (unsigned long)t; + int ret = 1; + + switch (entry->mmiorw.opcode) { + case MMIO_READ: + ret = trace_seq_printf(s, + "R %d %lu.%06lu %d 0x%lx 0x%lx 0x%lx %d\n", + rw->width, secs, usec_rem, rw->map_id, rw->phys, + rw->value, rw->pc, entry->pid); + break; + case MMIO_WRITE: + ret = trace_seq_printf(s, + "W %d %lu.%06lu %d 0x%lx 0x%lx 0x%lx %d\n", + rw->width, secs, usec_rem, rw->map_id, rw->phys, + rw->value, rw->pc, entry->pid); + break; + case MMIO_UNKNOWN_OP: + ret = trace_seq_printf(s, + "UNKNOWN %lu.%06lu %d 0x%lx %02x,%02x,%02x 0x%lx %d\n", + secs, usec_rem, rw->map_id, rw->phys, + (rw->value >> 16) & 0xff, (rw->value >> 8) & 0xff, + (rw->value >> 0) & 0xff, rw->pc, entry->pid); + break; + default: + ret = trace_seq_printf(s, "rw what?\n"); + break; + } + if (ret) + return 1; + return 0; +} + +static int mmio_print_map(struct trace_iterator *iter) +{ + struct trace_entry *entry = iter->ent; + struct mmiotrace_map *m = &entry->mmiomap; + struct trace_seq *s = &iter->seq; + unsigned long long t = ns2usecs(entry->t); + unsigned long usec_rem = do_div(t, 1000000ULL); + unsigned secs = (unsigned long)t; + int ret = 1; + + switch (entry->mmiorw.opcode) { + case MMIO_PROBE: + ret = trace_seq_printf(s, + "MAP %lu.%06lu %d 0x%lx 0x%lx 0x%lx 0x%lx %d\n", + secs, usec_rem, m->map_id, m->phys, m->virt, m->len, + 0UL, entry->pid); + break; + case MMIO_UNPROBE: + ret = trace_seq_printf(s, + "UNMAP %lu.%06lu %d 0x%lx %d\n", + secs, usec_rem, m->map_id, 0UL, entry->pid); + break; + default: + ret = trace_seq_printf(s, "map what?\n"); + break; + } + if (ret) + return 1; + return 0; +} + +/* return 0 to abort printing without consuming current entry in pipe mode */ +static int mmio_print_line(struct trace_iterator *iter) +{ + switch (iter->ent->type) { + case TRACE_MMIO_RW: + return mmio_print_rw(iter); + case TRACE_MMIO_MAP: + return mmio_print_map(iter); + default: + return 1; /* ignore unknown entries */ + } } static struct tracer mmio_tracer __read_mostly = @@ -47,38 +149,31 @@ static struct tracer mmio_tracer __read_mostly = .name = "mmiotrace", .init = mmio_trace_init, .reset = mmio_trace_reset, + .open = mmio_print_header, .ctrl_update = mmio_trace_ctrl_update, + .print_line = mmio_print_line, }; __init static int init_mmio_trace(void) { - int ret = init_mmiotrace(); - if (ret) - return ret; return register_tracer(&mmio_tracer); } device_initcall(init_mmio_trace); -void mmio_trace_record(u32 type, unsigned long addr, unsigned long 
Signed-off-by: Pekka Paalanen
Signed-off-by: Ingo Molnar
Signed-off-by: Thomas Gleixner
---
 arch/x86/kernel/mmiotrace/kmmio.c    | 186 ++++++++++++++----------------
 arch/x86/kernel/mmiotrace/mmio-mod.c |   3 -
 include/linux/mmiotrace.h            |   2 -
 kernel/trace/trace_mmiotrace.c       |  47 ++++++++-
 4 files changed, 120 insertions(+), 118 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/mmiotrace/kmmio.c b/arch/x86/kernel/mmiotrace/kmmio.c
index efb467933087..cd0d95fe4fe6 100644
--- a/arch/x86/kernel/mmiotrace/kmmio.c
+++ b/arch/x86/kernel/mmiotrace/kmmio.c
@@ -5,15 +5,12 @@
  * 2008 Pekka Paalanen
  */

-#include
 #include
 #include
 #include
 #include
 #include
-#include
 #include
-#include
 #include
 #include
 #include
@@ -22,10 +19,9 @@
 #include
 #include
 #include
-#include
 #include
-#include
-
+#include
+#include
 #include

 #define KMMIO_PAGE_HASH_BITS 4
@@ -57,14 +53,9 @@ struct kmmio_context {
 	int active;
 };

-static int kmmio_die_notifier(struct notifier_block *nb, unsigned long val,
-								void *args);
-
-static DEFINE_MUTEX(kmmio_init_mutex);
 static DEFINE_SPINLOCK(kmmio_lock);

-/* These are protected by kmmio_lock */
-static int kmmio_initialized;
+/* Protected by kmmio_lock */
 unsigned int kmmio_count;

 /* Read-protected by RCU, write-protected by kmmio_lock.
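  * (Readers walk the lists under rcu_read_lock(); writers update them
  * with kmmio_lock held.)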
*/ @@ -79,60 +70,6 @@ static struct list_head *kmmio_page_list(unsigned long page) /* Accessed per-cpu */ static DEFINE_PER_CPU(struct kmmio_context, kmmio_ctx); -/* protected by kmmio_init_mutex */ -static struct notifier_block nb_die = { - .notifier_call = kmmio_die_notifier -}; - -/** - * Makes sure kmmio is initialized and usable. - * This must be called before any other kmmio function defined here. - * May sleep. - */ -void reference_kmmio(void) -{ - mutex_lock(&kmmio_init_mutex); - spin_lock_irq(&kmmio_lock); - if (!kmmio_initialized) { - int i; - for (i = 0; i < KMMIO_PAGE_TABLE_SIZE; i++) - INIT_LIST_HEAD(&kmmio_page_table[i]); - if (register_die_notifier(&nb_die)) - BUG(); - } - kmmio_initialized++; - spin_unlock_irq(&kmmio_lock); - mutex_unlock(&kmmio_init_mutex); -} -EXPORT_SYMBOL_GPL(reference_kmmio); - -/** - * Clean up kmmio after use. This must be called for every call to - * reference_kmmio(). All probes registered after the corresponding - * reference_kmmio() must have been unregistered when calling this. - * May sleep. - */ -void unreference_kmmio(void) -{ - bool unreg = false; - - mutex_lock(&kmmio_init_mutex); - spin_lock_irq(&kmmio_lock); - - if (kmmio_initialized == 1) { - BUG_ON(is_kmmio_active()); - unreg = true; - } - kmmio_initialized--; - BUG_ON(kmmio_initialized < 0); - spin_unlock_irq(&kmmio_lock); - - if (unreg) - unregister_die_notifier(&nb_die); /* calls sync_rcu() */ - mutex_unlock(&kmmio_init_mutex); -} -EXPORT_SYMBOL(unreference_kmmio); - /* * this is basically a dynamic stabbing problem: * Could use the existing prio tree code or @@ -167,58 +104,56 @@ static struct kmmio_fault_page *get_kmmio_fault_page(unsigned long page) return NULL; } -/** Mark the given page as not present. Access to it will trigger a fault. */ -static void arm_kmmio_fault_page(unsigned long page, int *page_level) +static void set_page_present(unsigned long addr, bool present, int *pglevel) { - unsigned long address = page & PAGE_MASK; + pteval_t pteval; + pmdval_t pmdval; int level; - pte_t *pte = lookup_address(address, &level); + pmd_t *pmd; + pte_t *pte = lookup_address(addr, &level); if (!pte) { - pr_err("kmmio: Error in %s: no pte for page 0x%08lx\n", - __func__, page); + pr_err("kmmio: no pte for page 0x%08lx\n", addr); return; } - if (level == PG_LEVEL_2M) { - pmd_t *pmd = (pmd_t *)pte; - set_pmd(pmd, __pmd(pmd_val(*pmd) & ~_PAGE_PRESENT)); - } else { - /* PG_LEVEL_4K */ - set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_PRESENT)); + if (pglevel) + *pglevel = level; + + switch (level) { + case PG_LEVEL_2M: + pmd = (pmd_t *)pte; + pmdval = pmd_val(*pmd) & ~_PAGE_PRESENT; + if (present) + pmdval |= _PAGE_PRESENT; + set_pmd(pmd, __pmd(pmdval)); + break; + + case PG_LEVEL_4K: + pteval = pte_val(*pte) & ~_PAGE_PRESENT; + if (present) + pteval |= _PAGE_PRESENT; + set_pte_atomic(pte, __pte(pteval)); + break; + + default: + pr_err("kmmio: unexpected page level 0x%x.\n", level); + return; } - if (page_level) - *page_level = level; + __flush_tlb_one(addr); +} - __flush_tlb_one(page); +/** Mark the given page as not present. Access to it will trigger a fault. */ +static void arm_kmmio_fault_page(unsigned long page, int *page_level) +{ + set_page_present(page & PAGE_MASK, false, page_level); } /** Mark the given page as present. 
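 * Flushes the TLB entry for the address as well.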
*/ static void disarm_kmmio_fault_page(unsigned long page, int *page_level) { - unsigned long address = page & PAGE_MASK; - int level; - pte_t *pte = lookup_address(address, &level); - - if (!pte) { - pr_err("kmmio: Error in %s: no pte for page 0x%08lx\n", - __func__, page); - return; - } - - if (level == PG_LEVEL_2M) { - pmd_t *pmd = (pmd_t *)pte; - set_pmd(pmd, __pmd(pmd_val(*pmd) | _PAGE_PRESENT)); - } else { - /* PG_LEVEL_4K */ - set_pte(pte, __pte(pte_val(*pte) | _PAGE_PRESENT)); - } - - if (page_level) - *page_level = level; - - __flush_tlb_one(page); + set_page_present(page & PAGE_MASK, true, page_level); } /* @@ -240,6 +175,7 @@ int kmmio_handler(struct pt_regs *regs, unsigned long addr) { struct kmmio_context *ctx; struct kmmio_fault_page *faultpage; + int ret = 0; /* default to fault not handled */ /* * Preemption is now disabled to prevent process switch during @@ -257,21 +193,35 @@ int kmmio_handler(struct pt_regs *regs, unsigned long addr) /* * Either this page fault is not caused by kmmio, or * another CPU just pulled the kmmio probe from under - * our feet. In the latter case all hell breaks loose. + * our feet. The latter case should not be possible. */ goto no_kmmio; } ctx = &get_cpu_var(kmmio_ctx); if (ctx->active) { + disarm_kmmio_fault_page(faultpage->page, NULL); + if (addr == ctx->addr) { + /* + * On SMP we sometimes get recursive probe hits on the + * same address. Context is already saved, fall out. + */ + pr_debug("kmmio: duplicate probe hit on CPU %d, for " + "address 0x%08lx.\n", + smp_processor_id(), addr); + ret = 1; + goto no_kmmio_ctx; + } /* * Prevent overwriting already in-flight context. - * If this page fault really was due to kmmio trap, - * all hell breaks loose. + * This should not happen, let's hope disarming at least + * prevents a panic. */ pr_emerg("kmmio: recursive probe hit on CPU %d, " "for address 0x%08lx. 
Ignoring.\n", smp_processor_id(), addr); + pr_emerg("kmmio: previous hit was at 0x%08lx.\n", + ctx->addr); goto no_kmmio_ctx; } ctx->active++; @@ -302,14 +252,14 @@ int kmmio_handler(struct pt_regs *regs, unsigned long addr) */ put_cpu_var(kmmio_ctx); - return 1; + return 1; /* fault handled */ no_kmmio_ctx: put_cpu_var(kmmio_ctx); no_kmmio: rcu_read_unlock(); preempt_enable_no_resched(); - return 0; /* page fault not handled by kmmio */ + return ret; } /* @@ -322,8 +272,11 @@ static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs) int ret = 0; struct kmmio_context *ctx = &get_cpu_var(kmmio_ctx); - if (!ctx->active) + if (!ctx->active) { + pr_debug("kmmio: spurious debug trap on CPU %d.\n", + smp_processor_id()); goto out; + } if (ctx->probe && ctx->probe->post_handler) ctx->probe->post_handler(ctx->probe, condition, regs); @@ -525,9 +478,22 @@ static int kmmio_die_notifier(struct notifier_block *nb, unsigned long val, { struct die_args *arg = args; - if (val == DIE_DEBUG) + if (val == DIE_DEBUG && (arg->err & DR_STEP)) if (post_kmmio_handler(arg->err, arg->regs) == 1) return NOTIFY_STOP; return NOTIFY_DONE; } + +static struct notifier_block nb_die = { + .notifier_call = kmmio_die_notifier +}; + +static int __init init_kmmio(void) +{ + int i; + for (i = 0; i < KMMIO_PAGE_TABLE_SIZE; i++) + INIT_LIST_HEAD(&kmmio_page_table[i]); + return register_die_notifier(&nb_die); +} +fs_initcall(init_kmmio); /* should be before device_initcall() */ diff --git a/arch/x86/kernel/mmiotrace/mmio-mod.c b/arch/x86/kernel/mmiotrace/mmio-mod.c index 62abc281a512..8256546d49bf 100644 --- a/arch/x86/kernel/mmiotrace/mmio-mod.c +++ b/arch/x86/kernel/mmiotrace/mmio-mod.c @@ -415,8 +415,6 @@ void enable_mmiotrace(void) if (is_enabled()) goto out; - reference_kmmio(); - #if 0 /* XXX: tracing does not support text entries */ marker_file = debugfs_create_file("marker", 0660, dir, NULL, &fops_marker); @@ -448,7 +446,6 @@ void disable_mmiotrace(void) spin_unlock_irq(&trace_lock); clear_trace_list(); /* guarantees: no more kmmio callbacks */ - unreference_kmmio(); if (marker_file) { debugfs_remove(marker_file); marker_file = NULL; diff --git a/include/linux/mmiotrace.h b/include/linux/mmiotrace.h index c88a9c197d22..dd6b64b160fc 100644 --- a/include/linux/mmiotrace.h +++ b/include/linux/mmiotrace.h @@ -31,8 +31,6 @@ static inline int is_kmmio_active(void) return kmmio_count; } -extern void reference_kmmio(void); -extern void unreference_kmmio(void); extern int register_kmmio_probe(struct kmmio_probe *p); extern void unregister_kmmio_probe(struct kmmio_probe *p); diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index 3a12b1ad0c63..361472b5788c 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -8,6 +8,7 @@ #include #include +#include #include "trace.h" @@ -53,12 +54,52 @@ static void mmio_trace_ctrl_update(struct trace_array *tr) } } +static int mmio_print_pcidev(struct trace_seq *s, const struct pci_dev *dev) +{ + int ret = 0; + int i; + resource_size_t start, end; + const struct pci_driver *drv = pci_dev_driver(dev); + + /* XXX: incomplete checks for trace_seq_printf() return value */ + ret += trace_seq_printf(s, "PCIDEV %02x%02x %04x%04x %x", + dev->bus->number, dev->devfn, + dev->vendor, dev->device, dev->irq); + /* + * XXX: is pci_resource_to_user() appropriate, since we are + * supposed to interpret the __ioremap() phys_addr argument based on + * these printed values? 
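+	 * If it is not, printing the raw resource[i].start/end values
+	 * might match the ioremap() arguments more closely.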
+ */ + for (i = 0; i < 7; i++) { + pci_resource_to_user(dev, i, &dev->resource[i], &start, &end); + ret += trace_seq_printf(s, " %llx", + (unsigned long long)(start | + (dev->resource[i].flags & PCI_REGION_FLAG_MASK))); + } + for (i = 0; i < 7; i++) { + pci_resource_to_user(dev, i, &dev->resource[i], &start, &end); + ret += trace_seq_printf(s, " %llx", + dev->resource[i].start < dev->resource[i].end ? + (unsigned long long)(end - start) + 1 : 0); + } + if (drv) + ret += trace_seq_printf(s, " %s\n", drv->name); + else + ret += trace_seq_printf(s, " \n"); + return ret; +} + /* XXX: This is not called for trace_pipe file! */ -void mmio_print_header(struct trace_iterator *iter) +static void mmio_print_header(struct trace_iterator *iter) { struct trace_seq *s = &iter->seq; - trace_seq_printf(s, "VERSION broken 20070824\n"); - /* TODO: print /proc/bus/pci/devices contents as PCIDEV lines */ + struct pci_dev *dev = NULL; + + trace_seq_printf(s, "VERSION 20070824\n"); + + for_each_pci_dev(dev) + mmio_print_pcidev(s, dev); + /* XXX: return value? What if header is very long? */ } static int mmio_print_rw(struct trace_iterator *iter) -- cgit v1.2.3 From 49023168261a7f9a2fd4a1ca1adbfea922556015 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 May 2008 21:20:58 +0200 Subject: mmiotrace: cleanup Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/mmiotrace/kmmio.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/mmiotrace/kmmio.c b/arch/x86/kernel/mmiotrace/kmmio.c index cd0d95fe4fe6..3ad27b8504a5 100644 --- a/arch/x86/kernel/mmiotrace/kmmio.c +++ b/arch/x86/kernel/mmiotrace/kmmio.c @@ -228,7 +228,7 @@ int kmmio_handler(struct pt_regs *regs, unsigned long addr) ctx->fpage = faultpage; ctx->probe = get_kmmio_probe(addr); - ctx->saved_flags = (regs->flags & (TF_MASK|IF_MASK)); + ctx->saved_flags = (regs->flags & (X86_EFLAGS_TF | X86_EFLAGS_IF)); ctx->addr = addr; if (ctx->probe && ctx->probe->pre_handler) @@ -238,8 +238,8 @@ int kmmio_handler(struct pt_regs *regs, unsigned long addr) * Enable single-stepping and disable interrupts for the faulting * context. Local interrupts must not get enabled during stepping. */ - regs->flags |= TF_MASK; - regs->flags &= ~IF_MASK; + regs->flags |= X86_EFLAGS_TF; + regs->flags &= ~X86_EFLAGS_IF; /* Now we set present bit in PTE and single step. */ disarm_kmmio_fault_page(ctx->fpage->page, NULL); @@ -283,7 +283,7 @@ static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs) arm_kmmio_fault_page(ctx->fpage->page, NULL); - regs->flags &= ~TF_MASK; + regs->flags &= ~X86_EFLAGS_TF; regs->flags |= ctx->saved_flags; /* These were acquired in kmmio_handler(). */ @@ -297,7 +297,7 @@ static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs) * will have TF set, in which case, continue the remaining processing * of do_debug, as if this is not a probe hit. */ - if (!(regs->flags & TF_MASK)) + if (!(regs->flags & X86_EFLAGS_TF)) ret = 1; out: put_cpu_var(kmmio_ctx); -- cgit v1.2.3 From ff3a3e9ba5e4273a8bc10570adab4a390fb90757 Mon Sep 17 00:00:00 2001 From: Pekka Paalanen Date: Mon, 12 May 2008 21:20:59 +0200 Subject: x86 mmiotrace: move files into arch/x86/mm/. 
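This is a pure code move: the five files under arch/x86/kernel/mmiotrace/
are recreated verbatim under arch/x86/mm/ (the diffstat below shows
identical line counts deleted and created), and the Makefile hooks move
with them. No functional change.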
Signed-off-by: Pekka Paalanen Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/Makefile | 2 - arch/x86/kernel/mmiotrace/Makefile | 4 - arch/x86/kernel/mmiotrace/kmmio.c | 499 ------------------------------ arch/x86/kernel/mmiotrace/mmio-mod.c | 457 --------------------------- arch/x86/kernel/mmiotrace/pf_in.c | 489 ----------------------------- arch/x86/kernel/mmiotrace/pf_in.h | 39 --- arch/x86/kernel/mmiotrace/testmmiotrace.c | 71 ----- arch/x86/mm/Makefile | 5 + arch/x86/mm/kmmio.c | 499 ++++++++++++++++++++++++++++++ arch/x86/mm/mmio-mod.c | 457 +++++++++++++++++++++++++++ arch/x86/mm/pf_in.c | 489 +++++++++++++++++++++++++++++ arch/x86/mm/pf_in.h | 39 +++ arch/x86/mm/testmmiotrace.c | 71 +++++ 13 files changed, 1560 insertions(+), 1561 deletions(-) delete mode 100644 arch/x86/kernel/mmiotrace/Makefile delete mode 100644 arch/x86/kernel/mmiotrace/kmmio.c delete mode 100644 arch/x86/kernel/mmiotrace/mmio-mod.c delete mode 100644 arch/x86/kernel/mmiotrace/pf_in.c delete mode 100644 arch/x86/kernel/mmiotrace/pf_in.h delete mode 100644 arch/x86/kernel/mmiotrace/testmmiotrace.c create mode 100644 arch/x86/mm/kmmio.c create mode 100644 arch/x86/mm/mmio-mod.c create mode 100644 arch/x86/mm/pf_in.c create mode 100644 arch/x86/mm/pf_in.h create mode 100644 arch/x86/mm/testmmiotrace.c (limited to 'arch/x86') diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index a51ac153685e..739d49acd2f1 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -79,8 +79,6 @@ obj-$(CONFIG_KGDB) += kgdb.o obj-$(CONFIG_VM86) += vm86_32.o obj-$(CONFIG_EARLY_PRINTK) += early_printk.o -obj-$(CONFIG_MMIOTRACE) += mmiotrace/ - obj-$(CONFIG_HPET_TIMER) += hpet.o obj-$(CONFIG_K8_NB) += k8.o diff --git a/arch/x86/kernel/mmiotrace/Makefile b/arch/x86/kernel/mmiotrace/Makefile deleted file mode 100644 index dbcd8d50fb8d..000000000000 --- a/arch/x86/kernel/mmiotrace/Makefile +++ /dev/null @@ -1,4 +0,0 @@ -obj-$(CONFIG_MMIOTRACE_HOOKS) += kmmio.o -obj-$(CONFIG_MMIOTRACE) += mmiotrace.o -mmiotrace-y := pf_in.o mmio-mod.o -obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o diff --git a/arch/x86/kernel/mmiotrace/kmmio.c b/arch/x86/kernel/mmiotrace/kmmio.c deleted file mode 100644 index 3ad27b8504a5..000000000000 --- a/arch/x86/kernel/mmiotrace/kmmio.c +++ /dev/null @@ -1,499 +0,0 @@ -/* Support for MMIO probes. - * Benfit many code from kprobes - * (C) 2002 Louis Zhuang . - * 2007 Alexander Eichner - * 2008 Pekka Paalanen - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#define KMMIO_PAGE_HASH_BITS 4 -#define KMMIO_PAGE_TABLE_SIZE (1 << KMMIO_PAGE_HASH_BITS) - -struct kmmio_fault_page { - struct list_head list; - struct kmmio_fault_page *release_next; - unsigned long page; /* location of the fault page */ - - /* - * Number of times this page has been registered as a part - * of a probe. If zero, page is disarmed and this may be freed. - * Used only by writers (RCU). - */ - int count; -}; - -struct kmmio_delayed_release { - struct rcu_head rcu; - struct kmmio_fault_page *release_list; -}; - -struct kmmio_context { - struct kmmio_fault_page *fpage; - struct kmmio_probe *probe; - unsigned long saved_flags; - unsigned long addr; - int active; -}; - -static DEFINE_SPINLOCK(kmmio_lock); - -/* Protected by kmmio_lock */ -unsigned int kmmio_count; - -/* Read-protected by RCU, write-protected by kmmio_lock. 
*/ -static struct list_head kmmio_page_table[KMMIO_PAGE_TABLE_SIZE]; -static LIST_HEAD(kmmio_probes); - -static struct list_head *kmmio_page_list(unsigned long page) -{ - return &kmmio_page_table[hash_long(page, KMMIO_PAGE_HASH_BITS)]; -} - -/* Accessed per-cpu */ -static DEFINE_PER_CPU(struct kmmio_context, kmmio_ctx); - -/* - * this is basically a dynamic stabbing problem: - * Could use the existing prio tree code or - * Possible better implementations: - * The Interval Skip List: A Data Structure for Finding All Intervals That - * Overlap a Point (might be simple) - * Space Efficient Dynamic Stabbing with Fast Queries - Mikkel Thorup - */ -/* Get the kmmio at this addr (if any). You must be holding RCU read lock. */ -static struct kmmio_probe *get_kmmio_probe(unsigned long addr) -{ - struct kmmio_probe *p; - list_for_each_entry_rcu(p, &kmmio_probes, list) { - if (addr >= p->addr && addr <= (p->addr + p->len)) - return p; - } - return NULL; -} - -/* You must be holding RCU read lock. */ -static struct kmmio_fault_page *get_kmmio_fault_page(unsigned long page) -{ - struct list_head *head; - struct kmmio_fault_page *p; - - page &= PAGE_MASK; - head = kmmio_page_list(page); - list_for_each_entry_rcu(p, head, list) { - if (p->page == page) - return p; - } - return NULL; -} - -static void set_page_present(unsigned long addr, bool present, int *pglevel) -{ - pteval_t pteval; - pmdval_t pmdval; - int level; - pmd_t *pmd; - pte_t *pte = lookup_address(addr, &level); - - if (!pte) { - pr_err("kmmio: no pte for page 0x%08lx\n", addr); - return; - } - - if (pglevel) - *pglevel = level; - - switch (level) { - case PG_LEVEL_2M: - pmd = (pmd_t *)pte; - pmdval = pmd_val(*pmd) & ~_PAGE_PRESENT; - if (present) - pmdval |= _PAGE_PRESENT; - set_pmd(pmd, __pmd(pmdval)); - break; - - case PG_LEVEL_4K: - pteval = pte_val(*pte) & ~_PAGE_PRESENT; - if (present) - pteval |= _PAGE_PRESENT; - set_pte_atomic(pte, __pte(pteval)); - break; - - default: - pr_err("kmmio: unexpected page level 0x%x.\n", level); - return; - } - - __flush_tlb_one(addr); -} - -/** Mark the given page as not present. Access to it will trigger a fault. */ -static void arm_kmmio_fault_page(unsigned long page, int *page_level) -{ - set_page_present(page & PAGE_MASK, false, page_level); -} - -/** Mark the given page as present. */ -static void disarm_kmmio_fault_page(unsigned long page, int *page_level) -{ - set_page_present(page & PAGE_MASK, true, page_level); -} - -/* - * This is being called from do_page_fault(). - * - * We may be in an interrupt or a critical section. Also prefecthing may - * trigger a page fault. We may be in the middle of process switch. - * We cannot take any locks, because we could be executing especially - * within a kmmio critical section. - * - * Local interrupts are disabled, so preemption cannot happen. - * Do not enable interrupts, do not sleep, and watch out for other CPUs. - */ -/* - * Interrupts are disabled on entry as trap3 is an interrupt gate - * and they remain disabled thorough out this function. - */ -int kmmio_handler(struct pt_regs *regs, unsigned long addr) -{ - struct kmmio_context *ctx; - struct kmmio_fault_page *faultpage; - int ret = 0; /* default to fault not handled */ - - /* - * Preemption is now disabled to prevent process switch during - * single stepping. We can only handle one active kmmio trace - * per cpu, so ensure that we finish it before something else - * gets to run. We also hold the RCU read lock over single - * stepping to avoid looking up the probe and kmmio_fault_page - * again. 
- */ - preempt_disable(); - rcu_read_lock(); - - faultpage = get_kmmio_fault_page(addr); - if (!faultpage) { - /* - * Either this page fault is not caused by kmmio, or - * another CPU just pulled the kmmio probe from under - * our feet. The latter case should not be possible. - */ - goto no_kmmio; - } - - ctx = &get_cpu_var(kmmio_ctx); - if (ctx->active) { - disarm_kmmio_fault_page(faultpage->page, NULL); - if (addr == ctx->addr) { - /* - * On SMP we sometimes get recursive probe hits on the - * same address. Context is already saved, fall out. - */ - pr_debug("kmmio: duplicate probe hit on CPU %d, for " - "address 0x%08lx.\n", - smp_processor_id(), addr); - ret = 1; - goto no_kmmio_ctx; - } - /* - * Prevent overwriting already in-flight context. - * This should not happen, let's hope disarming at least - * prevents a panic. - */ - pr_emerg("kmmio: recursive probe hit on CPU %d, " - "for address 0x%08lx. Ignoring.\n", - smp_processor_id(), addr); - pr_emerg("kmmio: previous hit was at 0x%08lx.\n", - ctx->addr); - goto no_kmmio_ctx; - } - ctx->active++; - - ctx->fpage = faultpage; - ctx->probe = get_kmmio_probe(addr); - ctx->saved_flags = (regs->flags & (X86_EFLAGS_TF | X86_EFLAGS_IF)); - ctx->addr = addr; - - if (ctx->probe && ctx->probe->pre_handler) - ctx->probe->pre_handler(ctx->probe, regs, addr); - - /* - * Enable single-stepping and disable interrupts for the faulting - * context. Local interrupts must not get enabled during stepping. - */ - regs->flags |= X86_EFLAGS_TF; - regs->flags &= ~X86_EFLAGS_IF; - - /* Now we set present bit in PTE and single step. */ - disarm_kmmio_fault_page(ctx->fpage->page, NULL); - - /* - * If another cpu accesses the same page while we are stepping, - * the access will not be caught. It will simply succeed and the - * only downside is we lose the event. If this becomes a problem, - * the user should drop to single cpu before tracing. - */ - - put_cpu_var(kmmio_ctx); - return 1; /* fault handled */ - -no_kmmio_ctx: - put_cpu_var(kmmio_ctx); -no_kmmio: - rcu_read_unlock(); - preempt_enable_no_resched(); - return ret; -} - -/* - * Interrupts are disabled on entry as trap1 is an interrupt gate - * and they remain disabled thorough out this function. - * This must always get called as the pair to kmmio_handler(). - */ -static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs) -{ - int ret = 0; - struct kmmio_context *ctx = &get_cpu_var(kmmio_ctx); - - if (!ctx->active) { - pr_debug("kmmio: spurious debug trap on CPU %d.\n", - smp_processor_id()); - goto out; - } - - if (ctx->probe && ctx->probe->post_handler) - ctx->probe->post_handler(ctx->probe, condition, regs); - - arm_kmmio_fault_page(ctx->fpage->page, NULL); - - regs->flags &= ~X86_EFLAGS_TF; - regs->flags |= ctx->saved_flags; - - /* These were acquired in kmmio_handler(). */ - ctx->active--; - BUG_ON(ctx->active); - rcu_read_unlock(); - preempt_enable_no_resched(); - - /* - * if somebody else is singlestepping across a probe point, flags - * will have TF set, in which case, continue the remaining processing - * of do_debug, as if this is not a probe hit. - */ - if (!(regs->flags & X86_EFLAGS_TF)) - ret = 1; -out: - put_cpu_var(kmmio_ctx); - return ret; -} - -/* You must be holding kmmio_lock. 
*/ -static int add_kmmio_fault_page(unsigned long page) -{ - struct kmmio_fault_page *f; - - page &= PAGE_MASK; - f = get_kmmio_fault_page(page); - if (f) { - if (!f->count) - arm_kmmio_fault_page(f->page, NULL); - f->count++; - return 0; - } - - f = kmalloc(sizeof(*f), GFP_ATOMIC); - if (!f) - return -1; - - f->count = 1; - f->page = page; - list_add_rcu(&f->list, kmmio_page_list(f->page)); - - arm_kmmio_fault_page(f->page, NULL); - - return 0; -} - -/* You must be holding kmmio_lock. */ -static void release_kmmio_fault_page(unsigned long page, - struct kmmio_fault_page **release_list) -{ - struct kmmio_fault_page *f; - - page &= PAGE_MASK; - f = get_kmmio_fault_page(page); - if (!f) - return; - - f->count--; - BUG_ON(f->count < 0); - if (!f->count) { - disarm_kmmio_fault_page(f->page, NULL); - f->release_next = *release_list; - *release_list = f; - } -} - -int register_kmmio_probe(struct kmmio_probe *p) -{ - unsigned long flags; - int ret = 0; - unsigned long size = 0; - - spin_lock_irqsave(&kmmio_lock, flags); - if (get_kmmio_probe(p->addr)) { - ret = -EEXIST; - goto out; - } - kmmio_count++; - list_add_rcu(&p->list, &kmmio_probes); - while (size < p->len) { - if (add_kmmio_fault_page(p->addr + size)) - pr_err("kmmio: Unable to set page fault.\n"); - size += PAGE_SIZE; - } -out: - spin_unlock_irqrestore(&kmmio_lock, flags); - /* - * XXX: What should I do here? - * Here was a call to global_flush_tlb(), but it does not exist - * anymore. It seems it's not needed after all. - */ - return ret; -} -EXPORT_SYMBOL(register_kmmio_probe); - -static void rcu_free_kmmio_fault_pages(struct rcu_head *head) -{ - struct kmmio_delayed_release *dr = container_of( - head, - struct kmmio_delayed_release, - rcu); - struct kmmio_fault_page *p = dr->release_list; - while (p) { - struct kmmio_fault_page *next = p->release_next; - BUG_ON(p->count); - kfree(p); - p = next; - } - kfree(dr); -} - -static void remove_kmmio_fault_pages(struct rcu_head *head) -{ - struct kmmio_delayed_release *dr = container_of( - head, - struct kmmio_delayed_release, - rcu); - struct kmmio_fault_page *p = dr->release_list; - struct kmmio_fault_page **prevp = &dr->release_list; - unsigned long flags; - spin_lock_irqsave(&kmmio_lock, flags); - while (p) { - if (!p->count) - list_del_rcu(&p->list); - else - *prevp = p->release_next; - prevp = &p->release_next; - p = p->release_next; - } - spin_unlock_irqrestore(&kmmio_lock, flags); - /* This is the real RCU destroy call. */ - call_rcu(&dr->rcu, rcu_free_kmmio_fault_pages); -} - -/* - * Remove a kmmio probe. You have to synchronize_rcu() before you can be - * sure that the callbacks will not be called anymore. Only after that - * you may actually release your struct kmmio_probe. - * - * Unregistering a kmmio fault page has three steps: - * 1. release_kmmio_fault_page() - * Disarm the page, wait a grace period to let all faults finish. - * 2. remove_kmmio_fault_pages() - * Remove the pages from kmmio_page_table. - * 3. rcu_free_kmmio_fault_pages() - * Actally free the kmmio_fault_page structs as with RCU. 
- */ -void unregister_kmmio_probe(struct kmmio_probe *p) -{ - unsigned long flags; - unsigned long size = 0; - struct kmmio_fault_page *release_list = NULL; - struct kmmio_delayed_release *drelease; - - spin_lock_irqsave(&kmmio_lock, flags); - while (size < p->len) { - release_kmmio_fault_page(p->addr + size, &release_list); - size += PAGE_SIZE; - } - list_del_rcu(&p->list); - kmmio_count--; - spin_unlock_irqrestore(&kmmio_lock, flags); - - drelease = kmalloc(sizeof(*drelease), GFP_ATOMIC); - if (!drelease) { - pr_crit("kmmio: leaking kmmio_fault_page objects.\n"); - return; - } - drelease->release_list = release_list; - - /* - * This is not really RCU here. We have just disarmed a set of - * pages so that they cannot trigger page faults anymore. However, - * we cannot remove the pages from kmmio_page_table, - * because a probe hit might be in flight on another CPU. The - * pages are collected into a list, and they will be removed from - * kmmio_page_table when it is certain that no probe hit related to - * these pages can be in flight. RCU grace period sounds like a - * good choice. - * - * If we removed the pages too early, kmmio page fault handler might - * not find the respective kmmio_fault_page and determine it's not - * a kmmio fault, when it actually is. This would lead to madness. - */ - call_rcu(&drelease->rcu, remove_kmmio_fault_pages); -} -EXPORT_SYMBOL(unregister_kmmio_probe); - -static int kmmio_die_notifier(struct notifier_block *nb, unsigned long val, - void *args) -{ - struct die_args *arg = args; - - if (val == DIE_DEBUG && (arg->err & DR_STEP)) - if (post_kmmio_handler(arg->err, arg->regs) == 1) - return NOTIFY_STOP; - - return NOTIFY_DONE; -} - -static struct notifier_block nb_die = { - .notifier_call = kmmio_die_notifier -}; - -static int __init init_kmmio(void) -{ - int i; - for (i = 0; i < KMMIO_PAGE_TABLE_SIZE; i++) - INIT_LIST_HEAD(&kmmio_page_table[i]); - return register_die_notifier(&nb_die); -} -fs_initcall(init_kmmio); /* should be before device_initcall() */ diff --git a/arch/x86/kernel/mmiotrace/mmio-mod.c b/arch/x86/kernel/mmiotrace/mmio-mod.c deleted file mode 100644 index 8256546d49bf..000000000000 --- a/arch/x86/kernel/mmiotrace/mmio-mod.c +++ /dev/null @@ -1,457 +0,0 @@ -/* - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. - * - * Copyright (C) IBM Corporation, 2005 - * Jeff Muizelaar, 2006, 2007 - * Pekka Paalanen, 2008 - * - * Derived from the read-mod example from relay-examples by Tom Zanussi. 
- */ -#define DEBUG 1 - -#include -#include -#include -#include -#include -#include -#include -#include -#include /* for ISA_START_ADDRESS */ -#include -#include - -#include "pf_in.h" - -#define NAME "mmiotrace: " - -struct trap_reason { - unsigned long addr; - unsigned long ip; - enum reason_type type; - int active_traces; -}; - -struct remap_trace { - struct list_head list; - struct kmmio_probe probe; - unsigned long phys; - unsigned long id; -}; - -/* Accessed per-cpu. */ -static DEFINE_PER_CPU(struct trap_reason, pf_reason); -static DEFINE_PER_CPU(struct mmiotrace_rw, cpu_trace); - -#if 0 /* XXX: no way gather this info anymore */ -/* Access to this is not per-cpu. */ -static DEFINE_PER_CPU(atomic_t, dropped); -#endif - -static struct dentry *marker_file; - -static DEFINE_MUTEX(mmiotrace_mutex); -static DEFINE_SPINLOCK(trace_lock); -static atomic_t mmiotrace_enabled; -static LIST_HEAD(trace_list); /* struct remap_trace */ - -/* - * Locking in this file: - * - mmiotrace_mutex enforces enable/disable_mmiotrace() critical sections. - * - mmiotrace_enabled may be modified only when holding mmiotrace_mutex - * and trace_lock. - * - Routines depending on is_enabled() must take trace_lock. - * - trace_list users must hold trace_lock. - * - is_enabled() guarantees that mmio_trace_record is allowed. - * - pre/post callbacks assume the effect of is_enabled() being true. - */ - -/* module parameters */ -static unsigned long filter_offset; -static int nommiotrace; -static int ISA_trace; -static int trace_pc; - -module_param(filter_offset, ulong, 0); -module_param(nommiotrace, bool, 0); -module_param(ISA_trace, bool, 0); -module_param(trace_pc, bool, 0); - -MODULE_PARM_DESC(filter_offset, "Start address of traced mappings."); -MODULE_PARM_DESC(nommiotrace, "Disable actual MMIO tracing."); -MODULE_PARM_DESC(ISA_trace, "Do not exclude the low ISA range."); -MODULE_PARM_DESC(trace_pc, "Record address of faulting instructions."); - -static bool is_enabled(void) -{ - return atomic_read(&mmiotrace_enabled); -} - -#if 0 /* XXX: needs rewrite */ -/* - * Write callback for the debugfs entry: - * Read a marker and write it to the mmio trace log - */ -static ssize_t write_marker(struct file *file, const char __user *buffer, - size_t count, loff_t *ppos) -{ - char *event = NULL; - struct mm_io_header *headp; - ssize_t len = (count > 65535) ? 65535 : count; - - event = kzalloc(sizeof(*headp) + len, GFP_KERNEL); - if (!event) - return -ENOMEM; - - headp = (struct mm_io_header *)event; - headp->type = MMIO_MAGIC | (MMIO_MARKER << MMIO_OPCODE_SHIFT); - headp->data_len = len; - - if (copy_from_user(event + sizeof(*headp), buffer, len)) { - kfree(event); - return -EFAULT; - } - - spin_lock_irq(&trace_lock); -#if 0 /* XXX: convert this to use tracing */ - if (is_enabled()) - relay_write(chan, event, sizeof(*headp) + len); - else -#endif - len = -EINVAL; - spin_unlock_irq(&trace_lock); - kfree(event); - return len; -} -#endif - -static void print_pte(unsigned long address) -{ - int level; - pte_t *pte = lookup_address(address, &level); - - if (!pte) { - pr_err(NAME "Error in %s: no pte for page 0x%08lx\n", - __func__, address); - return; - } - - if (level == PG_LEVEL_2M) { - pr_emerg(NAME "4MB pages are not currently supported: " - "0x%08lx\n", address); - BUG(); - } - pr_info(NAME "pte for 0x%lx: 0x%lx 0x%lx\n", address, pte_val(*pte), - pte_val(*pte) & _PAGE_PRESENT); -} - -/* - * For some reason the pre/post pairs have been called in an - * unmatched order. Report and die. 
- */ -static void die_kmmio_nesting_error(struct pt_regs *regs, unsigned long addr) -{ - const struct trap_reason *my_reason = &get_cpu_var(pf_reason); - pr_emerg(NAME "unexpected fault for address: 0x%08lx, " - "last fault for address: 0x%08lx\n", - addr, my_reason->addr); - print_pte(addr); - print_symbol(KERN_EMERG "faulting IP is at %s\n", regs->ip); - print_symbol(KERN_EMERG "last faulting IP was at %s\n", my_reason->ip); -#ifdef __i386__ - pr_emerg("eax: %08lx ebx: %08lx ecx: %08lx edx: %08lx\n", - regs->ax, regs->bx, regs->cx, regs->dx); - pr_emerg("esi: %08lx edi: %08lx ebp: %08lx esp: %08lx\n", - regs->si, regs->di, regs->bp, regs->sp); -#else - pr_emerg("rax: %016lx rcx: %016lx rdx: %016lx\n", - regs->ax, regs->cx, regs->dx); - pr_emerg("rsi: %016lx rdi: %016lx rbp: %016lx rsp: %016lx\n", - regs->si, regs->di, regs->bp, regs->sp); -#endif - put_cpu_var(pf_reason); - BUG(); -} - -static void pre(struct kmmio_probe *p, struct pt_regs *regs, - unsigned long addr) -{ - struct trap_reason *my_reason = &get_cpu_var(pf_reason); - struct mmiotrace_rw *my_trace = &get_cpu_var(cpu_trace); - const unsigned long instptr = instruction_pointer(regs); - const enum reason_type type = get_ins_type(instptr); - struct remap_trace *trace = p->user_data; - - /* it doesn't make sense to have more than one active trace per cpu */ - if (my_reason->active_traces) - die_kmmio_nesting_error(regs, addr); - else - my_reason->active_traces++; - - my_reason->type = type; - my_reason->addr = addr; - my_reason->ip = instptr; - - my_trace->phys = addr - trace->probe.addr + trace->phys; - my_trace->map_id = trace->id; - - /* - * Only record the program counter when requested. - * It may taint clean-room reverse engineering. - */ - if (trace_pc) - my_trace->pc = instptr; - else - my_trace->pc = 0; - - /* - * XXX: the timestamp recorded will be *after* the tracing has been - * done, not at the time we hit the instruction. SMP implications - * on event ordering? 
- */ - - switch (type) { - case REG_READ: - my_trace->opcode = MMIO_READ; - my_trace->width = get_ins_mem_width(instptr); - break; - case REG_WRITE: - my_trace->opcode = MMIO_WRITE; - my_trace->width = get_ins_mem_width(instptr); - my_trace->value = get_ins_reg_val(instptr, regs); - break; - case IMM_WRITE: - my_trace->opcode = MMIO_WRITE; - my_trace->width = get_ins_mem_width(instptr); - my_trace->value = get_ins_imm_val(instptr); - break; - default: - { - unsigned char *ip = (unsigned char *)instptr; - my_trace->opcode = MMIO_UNKNOWN_OP; - my_trace->width = 0; - my_trace->value = (*ip) << 16 | *(ip + 1) << 8 | - *(ip + 2); - } - } - put_cpu_var(cpu_trace); - put_cpu_var(pf_reason); -} - -static void post(struct kmmio_probe *p, unsigned long condition, - struct pt_regs *regs) -{ - struct trap_reason *my_reason = &get_cpu_var(pf_reason); - struct mmiotrace_rw *my_trace = &get_cpu_var(cpu_trace); - - /* this should always return the active_trace count to 0 */ - my_reason->active_traces--; - if (my_reason->active_traces) { - pr_emerg(NAME "unexpected post handler"); - BUG(); - } - - switch (my_reason->type) { - case REG_READ: - my_trace->value = get_ins_reg_val(my_reason->ip, regs); - break; - default: - break; - } - - mmio_trace_rw(my_trace); - put_cpu_var(cpu_trace); - put_cpu_var(pf_reason); -} - -static void ioremap_trace_core(unsigned long offset, unsigned long size, - void __iomem *addr) -{ - static atomic_t next_id; - struct remap_trace *trace = kmalloc(sizeof(*trace), GFP_KERNEL); - struct mmiotrace_map map = { - .phys = offset, - .virt = (unsigned long)addr, - .len = size, - .opcode = MMIO_PROBE - }; - - if (!trace) { - pr_err(NAME "kmalloc failed in ioremap\n"); - return; - } - - *trace = (struct remap_trace) { - .probe = { - .addr = (unsigned long)addr, - .len = size, - .pre_handler = pre, - .post_handler = post, - .user_data = trace - }, - .phys = offset, - .id = atomic_inc_return(&next_id) - }; - map.map_id = trace->id; - - spin_lock_irq(&trace_lock); - if (!is_enabled()) - goto not_enabled; - - mmio_trace_mapping(&map); - list_add_tail(&trace->list, &trace_list); - if (!nommiotrace) - register_kmmio_probe(&trace->probe); - -not_enabled: - spin_unlock_irq(&trace_lock); -} - -void -mmiotrace_ioremap(unsigned long offset, unsigned long size, void __iomem *addr) -{ - if (!is_enabled()) /* recheck and proper locking in *_core() */ - return; - - pr_debug(NAME "ioremap_*(0x%lx, 0x%lx) = %p\n", offset, size, addr); - if ((filter_offset) && (offset != filter_offset)) - return; - ioremap_trace_core(offset, size, addr); -} - -static void iounmap_trace_core(volatile void __iomem *addr) -{ - struct mmiotrace_map map = { - .phys = 0, - .virt = (unsigned long)addr, - .len = 0, - .opcode = MMIO_UNPROBE - }; - struct remap_trace *trace; - struct remap_trace *tmp; - struct remap_trace *found_trace = NULL; - - pr_debug(NAME "Unmapping %p.\n", addr); - - spin_lock_irq(&trace_lock); - if (!is_enabled()) - goto not_enabled; - - list_for_each_entry_safe(trace, tmp, &trace_list, list) { - if ((unsigned long)addr == trace->probe.addr) { - if (!nommiotrace) - unregister_kmmio_probe(&trace->probe); - list_del(&trace->list); - found_trace = trace; - break; - } - } - map.map_id = (found_trace) ? 
found_trace->id : -1; - mmio_trace_mapping(&map); - -not_enabled: - spin_unlock_irq(&trace_lock); - if (found_trace) { - synchronize_rcu(); /* unregister_kmmio_probe() requirement */ - kfree(found_trace); - } -} - -void mmiotrace_iounmap(volatile void __iomem *addr) -{ - might_sleep(); - if (is_enabled()) /* recheck and proper locking in *_core() */ - iounmap_trace_core(addr); -} - -static void clear_trace_list(void) -{ - struct remap_trace *trace; - struct remap_trace *tmp; - - /* - * No locking required, because the caller ensures we are in a - * critical section via mutex, and is_enabled() is false, - * i.e. nothing can traverse or modify this list. - * Caller also ensures is_enabled() cannot change. - */ - list_for_each_entry(trace, &trace_list, list) { - pr_notice(NAME "purging non-iounmapped " - "trace @0x%08lx, size 0x%lx.\n", - trace->probe.addr, trace->probe.len); - if (!nommiotrace) - unregister_kmmio_probe(&trace->probe); - } - synchronize_rcu(); /* unregister_kmmio_probe() requirement */ - - list_for_each_entry_safe(trace, tmp, &trace_list, list) { - list_del(&trace->list); - kfree(trace); - } -} - -#if 0 /* XXX: out of order */ -static struct file_operations fops_marker = { - .owner = THIS_MODULE, - .write = write_marker -}; -#endif - -void enable_mmiotrace(void) -{ - mutex_lock(&mmiotrace_mutex); - if (is_enabled()) - goto out; - -#if 0 /* XXX: tracing does not support text entries */ - marker_file = debugfs_create_file("marker", 0660, dir, NULL, - &fops_marker); - if (!marker_file) - pr_err(NAME "marker file creation failed.\n"); -#endif - - if (nommiotrace) - pr_info(NAME "MMIO tracing disabled.\n"); - if (ISA_trace) - pr_warning(NAME "Warning! low ISA range will be traced.\n"); - spin_lock_irq(&trace_lock); - atomic_inc(&mmiotrace_enabled); - spin_unlock_irq(&trace_lock); - pr_info(NAME "enabled.\n"); -out: - mutex_unlock(&mmiotrace_mutex); -} - -void disable_mmiotrace(void) -{ - mutex_lock(&mmiotrace_mutex); - if (!is_enabled()) - goto out; - - spin_lock_irq(&trace_lock); - atomic_dec(&mmiotrace_enabled); - BUG_ON(is_enabled()); - spin_unlock_irq(&trace_lock); - - clear_trace_list(); /* guarantees: no more kmmio callbacks */ - if (marker_file) { - debugfs_remove(marker_file); - marker_file = NULL; - } - - pr_info(NAME "disabled.\n"); -out: - mutex_unlock(&mmiotrace_mutex); -} diff --git a/arch/x86/kernel/mmiotrace/pf_in.c b/arch/x86/kernel/mmiotrace/pf_in.c deleted file mode 100644 index efa1911e20ca..000000000000 --- a/arch/x86/kernel/mmiotrace/pf_in.c +++ /dev/null @@ -1,489 +0,0 @@ -/* - * Fault Injection Test harness (FI) - * Copyright (C) Intel Crop. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, - * USA. 
- * - */ - -/* Id: pf_in.c,v 1.1.1.1 2002/11/12 05:56:32 brlock Exp - * Copyright by Intel Crop., 2002 - * Louis Zhuang (louis.zhuang@intel.com) - * - * Bjorn Steinbrink (B.Steinbrink@gmx.de), 2007 - */ - -#include -#include /* struct pt_regs */ -#include "pf_in.h" - -#ifdef __i386__ -/* IA32 Manual 3, 2-1 */ -static unsigned char prefix_codes[] = { - 0xF0, 0xF2, 0xF3, 0x2E, 0x36, 0x3E, 0x26, 0x64, - 0x65, 0x2E, 0x3E, 0x66, 0x67 -}; -/* IA32 Manual 3, 3-432*/ -static unsigned int reg_rop[] = { - 0x8A, 0x8B, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F -}; -static unsigned int reg_wop[] = { 0x88, 0x89 }; -static unsigned int imm_wop[] = { 0xC6, 0xC7 }; -/* IA32 Manual 3, 3-432*/ -static unsigned int rw8[] = { 0x88, 0x8A, 0xC6 }; -static unsigned int rw32[] = { - 0x89, 0x8B, 0xC7, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F -}; -static unsigned int mw8[] = { 0x88, 0x8A, 0xC6, 0xB60F, 0xBE0F }; -static unsigned int mw16[] = { 0xB70F, 0xBF0F }; -static unsigned int mw32[] = { 0x89, 0x8B, 0xC7 }; -static unsigned int mw64[] = {}; -#else /* not __i386__ */ -static unsigned char prefix_codes[] = { - 0x66, 0x67, 0x2E, 0x3E, 0x26, 0x64, 0x65, 0x36, - 0xF0, 0xF3, 0xF2, - /* REX Prefixes */ - 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, - 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f -}; -/* AMD64 Manual 3, Appendix A*/ -static unsigned int reg_rop[] = { - 0x8A, 0x8B, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F -}; -static unsigned int reg_wop[] = { 0x88, 0x89 }; -static unsigned int imm_wop[] = { 0xC6, 0xC7 }; -static unsigned int rw8[] = { 0xC6, 0x88, 0x8A }; -static unsigned int rw32[] = { - 0xC7, 0x89, 0x8B, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F -}; -/* 8 bit only */ -static unsigned int mw8[] = { 0xC6, 0x88, 0x8A, 0xB60F, 0xBE0F }; -/* 16 bit only */ -static unsigned int mw16[] = { 0xB70F, 0xBF0F }; -/* 16 or 32 bit */ -static unsigned int mw32[] = { 0xC7 }; -/* 16, 32 or 64 bit */ -static unsigned int mw64[] = { 0x89, 0x8B }; -#endif /* not __i386__ */ - -static int skip_prefix(unsigned char *addr, int *shorted, int *enlarged, - int *rexr) -{ - int i; - unsigned char *p = addr; - *shorted = 0; - *enlarged = 0; - *rexr = 0; - -restart: - for (i = 0; i < ARRAY_SIZE(prefix_codes); i++) { - if (*p == prefix_codes[i]) { - if (*p == 0x66) - *shorted = 1; -#ifdef __amd64__ - if ((*p & 0xf8) == 0x48) - *enlarged = 1; - if ((*p & 0xf4) == 0x44) - *rexr = 1; -#endif - p++; - goto restart; - } - } - - return (p - addr); -} - -static int get_opcode(unsigned char *addr, unsigned int *opcode) -{ - int len; - - if (*addr == 0x0F) { - /* 0x0F is extension instruction */ - *opcode = *(unsigned short *)addr; - len = 2; - } else { - *opcode = *addr; - len = 1; - } - - return len; -} - -#define CHECK_OP_TYPE(opcode, array, type) \ - for (i = 0; i < ARRAY_SIZE(array); i++) { \ - if (array[i] == opcode) { \ - rv = type; \ - goto exit; \ - } \ - } - -enum reason_type get_ins_type(unsigned long ins_addr) -{ - unsigned int opcode; - unsigned char *p; - int shorted, enlarged, rexr; - int i; - enum reason_type rv = OTHERS; - - p = (unsigned char *)ins_addr; - p += skip_prefix(p, &shorted, &enlarged, &rexr); - p += get_opcode(p, &opcode); - - CHECK_OP_TYPE(opcode, reg_rop, REG_READ); - CHECK_OP_TYPE(opcode, reg_wop, REG_WRITE); - CHECK_OP_TYPE(opcode, imm_wop, IMM_WRITE); - -exit: - return rv; -} -#undef CHECK_OP_TYPE - -static unsigned int get_ins_reg_width(unsigned long ins_addr) -{ - unsigned int opcode; - unsigned char *p; - int i, shorted, enlarged, rexr; - - p = (unsigned char *)ins_addr; - p += skip_prefix(p, &shorted, &enlarged, &rexr); - p += 
get_opcode(p, &opcode); - - for (i = 0; i < ARRAY_SIZE(rw8); i++) - if (rw8[i] == opcode) - return 1; - - for (i = 0; i < ARRAY_SIZE(rw32); i++) - if (rw32[i] == opcode) - return (shorted ? 2 : (enlarged ? 8 : 4)); - - printk(KERN_ERR "mmiotrace: Unknown opcode 0x%02x\n", opcode); - return 0; -} - -unsigned int get_ins_mem_width(unsigned long ins_addr) -{ - unsigned int opcode; - unsigned char *p; - int i, shorted, enlarged, rexr; - - p = (unsigned char *)ins_addr; - p += skip_prefix(p, &shorted, &enlarged, &rexr); - p += get_opcode(p, &opcode); - - for (i = 0; i < ARRAY_SIZE(mw8); i++) - if (mw8[i] == opcode) - return 1; - - for (i = 0; i < ARRAY_SIZE(mw16); i++) - if (mw16[i] == opcode) - return 2; - - for (i = 0; i < ARRAY_SIZE(mw32); i++) - if (mw32[i] == opcode) - return shorted ? 2 : 4; - - for (i = 0; i < ARRAY_SIZE(mw64); i++) - if (mw64[i] == opcode) - return shorted ? 2 : (enlarged ? 8 : 4); - - printk(KERN_ERR "mmiotrace: Unknown opcode 0x%02x\n", opcode); - return 0; -} - -/* - * Define register ident in mod/rm byte. - * Note: these are NOT the same as in ptrace-abi.h. - */ -enum { - arg_AL = 0, - arg_CL = 1, - arg_DL = 2, - arg_BL = 3, - arg_AH = 4, - arg_CH = 5, - arg_DH = 6, - arg_BH = 7, - - arg_AX = 0, - arg_CX = 1, - arg_DX = 2, - arg_BX = 3, - arg_SP = 4, - arg_BP = 5, - arg_SI = 6, - arg_DI = 7, -#ifdef __amd64__ - arg_R8 = 8, - arg_R9 = 9, - arg_R10 = 10, - arg_R11 = 11, - arg_R12 = 12, - arg_R13 = 13, - arg_R14 = 14, - arg_R15 = 15 -#endif -}; - -static unsigned char *get_reg_w8(int no, struct pt_regs *regs) -{ - unsigned char *rv = NULL; - - switch (no) { - case arg_AL: - rv = (unsigned char *)®s->ax; - break; - case arg_BL: - rv = (unsigned char *)®s->bx; - break; - case arg_CL: - rv = (unsigned char *)®s->cx; - break; - case arg_DL: - rv = (unsigned char *)®s->dx; - break; - case arg_AH: - rv = 1 + (unsigned char *)®s->ax; - break; - case arg_BH: - rv = 1 + (unsigned char *)®s->bx; - break; - case arg_CH: - rv = 1 + (unsigned char *)®s->cx; - break; - case arg_DH: - rv = 1 + (unsigned char *)®s->dx; - break; -#ifdef __amd64__ - case arg_R8: - rv = (unsigned char *)®s->r8; - break; - case arg_R9: - rv = (unsigned char *)®s->r9; - break; - case arg_R10: - rv = (unsigned char *)®s->r10; - break; - case arg_R11: - rv = (unsigned char *)®s->r11; - break; - case arg_R12: - rv = (unsigned char *)®s->r12; - break; - case arg_R13: - rv = (unsigned char *)®s->r13; - break; - case arg_R14: - rv = (unsigned char *)®s->r14; - break; - case arg_R15: - rv = (unsigned char *)®s->r15; - break; -#endif - default: - printk(KERN_ERR "mmiotrace: Error reg no# %d\n", no); - break; - } - return rv; -} - -static unsigned long *get_reg_w32(int no, struct pt_regs *regs) -{ - unsigned long *rv = NULL; - - switch (no) { - case arg_AX: - rv = ®s->ax; - break; - case arg_BX: - rv = ®s->bx; - break; - case arg_CX: - rv = ®s->cx; - break; - case arg_DX: - rv = ®s->dx; - break; - case arg_SP: - rv = ®s->sp; - break; - case arg_BP: - rv = ®s->bp; - break; - case arg_SI: - rv = ®s->si; - break; - case arg_DI: - rv = ®s->di; - break; -#ifdef __amd64__ - case arg_R8: - rv = ®s->r8; - break; - case arg_R9: - rv = ®s->r9; - break; - case arg_R10: - rv = ®s->r10; - break; - case arg_R11: - rv = ®s->r11; - break; - case arg_R12: - rv = ®s->r12; - break; - case arg_R13: - rv = ®s->r13; - break; - case arg_R14: - rv = ®s->r14; - break; - case arg_R15: - rv = ®s->r15; - break; -#endif - default: - printk(KERN_ERR "mmiotrace: Error reg no# %d\n", no); - } - - return rv; -} - -unsigned long 
get_ins_reg_val(unsigned long ins_addr, struct pt_regs *regs) -{ - unsigned int opcode; - unsigned char mod_rm; - int reg; - unsigned char *p; - int i, shorted, enlarged, rexr; - unsigned long rv; - - p = (unsigned char *)ins_addr; - p += skip_prefix(p, &shorted, &enlarged, &rexr); - p += get_opcode(p, &opcode); - for (i = 0; i < ARRAY_SIZE(reg_rop); i++) - if (reg_rop[i] == opcode) { - rv = REG_READ; - goto do_work; - } - - for (i = 0; i < ARRAY_SIZE(reg_wop); i++) - if (reg_wop[i] == opcode) { - rv = REG_WRITE; - goto do_work; - } - - printk(KERN_ERR "mmiotrace: Not a register instruction, opcode " - "0x%02x\n", opcode); - goto err; - -do_work: - mod_rm = *p; - reg = ((mod_rm >> 3) & 0x7) | (rexr << 3); - switch (get_ins_reg_width(ins_addr)) { - case 1: - return *get_reg_w8(reg, regs); - - case 2: - return *(unsigned short *)get_reg_w32(reg, regs); - - case 4: - return *(unsigned int *)get_reg_w32(reg, regs); - -#ifdef __amd64__ - case 8: - return *(unsigned long *)get_reg_w32(reg, regs); -#endif - - default: - printk(KERN_ERR "mmiotrace: Error width# %d\n", reg); - } - -err: - return 0; -} - -unsigned long get_ins_imm_val(unsigned long ins_addr) -{ - unsigned int opcode; - unsigned char mod_rm; - unsigned char mod; - unsigned char *p; - int i, shorted, enlarged, rexr; - unsigned long rv; - - p = (unsigned char *)ins_addr; - p += skip_prefix(p, &shorted, &enlarged, &rexr); - p += get_opcode(p, &opcode); - for (i = 0; i < ARRAY_SIZE(imm_wop); i++) - if (imm_wop[i] == opcode) { - rv = IMM_WRITE; - goto do_work; - } - - printk(KERN_ERR "mmiotrace: Not an immediate instruction, opcode " - "0x%02x\n", opcode); - goto err; - -do_work: - mod_rm = *p; - mod = mod_rm >> 6; - p++; - switch (mod) { - case 0: - /* if r/m is 5 we have a 32 disp (IA32 Manual 3, Table 2-2) */ - /* AMD64: XXX Check for address size prefix? */ - if ((mod_rm & 0x7) == 0x5) - p += 4; - break; - - case 1: - p += 1; - break; - - case 2: - p += 4; - break; - - case 3: - default: - printk(KERN_ERR "mmiotrace: not a memory access instruction " - "at 0x%lx, rm_mod=0x%02x\n", - ins_addr, mod_rm); - } - - switch (get_ins_reg_width(ins_addr)) { - case 1: - return *(unsigned char *)p; - - case 2: - return *(unsigned short *)p; - - case 4: - return *(unsigned int *)p; - -#ifdef __amd64__ - case 8: - return *(unsigned long *)p; -#endif - - default: - printk(KERN_ERR "mmiotrace: Error: width.\n"); - } - -err: - return 0; -} diff --git a/arch/x86/kernel/mmiotrace/pf_in.h b/arch/x86/kernel/mmiotrace/pf_in.h deleted file mode 100644 index e05341a51a27..000000000000 --- a/arch/x86/kernel/mmiotrace/pf_in.h +++ /dev/null @@ -1,39 +0,0 @@ -/* - * Fault Injection Test harness (FI) - * Copyright (C) Intel Crop. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version 2 - * of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, - * USA. 
- * - */ - -#ifndef __PF_H_ -#define __PF_H_ - -enum reason_type { - NOT_ME, /* page fault is not in regions */ - NOTHING, /* access others point in regions */ - REG_READ, /* read from addr to reg */ - REG_WRITE, /* write from reg to addr */ - IMM_WRITE, /* write from imm to addr */ - OTHERS /* Other instructions can not intercept */ -}; - -enum reason_type get_ins_type(unsigned long ins_addr); -unsigned int get_ins_mem_width(unsigned long ins_addr); -unsigned long get_ins_reg_val(unsigned long ins_addr, struct pt_regs *regs); -unsigned long get_ins_imm_val(unsigned long ins_addr); - -#endif /* __PF_H_ */ diff --git a/arch/x86/kernel/mmiotrace/testmmiotrace.c b/arch/x86/kernel/mmiotrace/testmmiotrace.c deleted file mode 100644 index cfa60b227c8d..000000000000 --- a/arch/x86/kernel/mmiotrace/testmmiotrace.c +++ /dev/null @@ -1,71 +0,0 @@ -/* - * Written by Pekka Paalanen, 2008 - */ -#include -#include - -#define MODULE_NAME "testmmiotrace" - -static unsigned long mmio_address; -module_param(mmio_address, ulong, 0); -MODULE_PARM_DESC(mmio_address, "Start address of the mapping of 16 kB."); - -static void do_write_test(void __iomem *p) -{ - unsigned int i; - for (i = 0; i < 256; i++) - iowrite8(i, p + i); - for (i = 1024; i < (5 * 1024); i += 2) - iowrite16(i * 12 + 7, p + i); - for (i = (5 * 1024); i < (16 * 1024); i += 4) - iowrite32(i * 212371 + 13, p + i); -} - -static void do_read_test(void __iomem *p) -{ - unsigned int i; - for (i = 0; i < 256; i++) - ioread8(p + i); - for (i = 1024; i < (5 * 1024); i += 2) - ioread16(p + i); - for (i = (5 * 1024); i < (16 * 1024); i += 4) - ioread32(p + i); -} - -static void do_test(void) -{ - void __iomem *p = ioremap_nocache(mmio_address, 0x4000); - if (!p) { - pr_err(MODULE_NAME ": could not ioremap, aborting.\n"); - return; - } - do_write_test(p); - do_read_test(p); - iounmap(p); -} - -static int __init init(void) -{ - if (mmio_address == 0) { - pr_err(MODULE_NAME ": you have to use the module argument " - "mmio_address.\n"); - pr_err(MODULE_NAME ": DO NOT LOAD THIS MODULE UNLESS" - " YOU REALLY KNOW WHAT YOU ARE DOING!\n"); - return -ENXIO; - } - - pr_warning(MODULE_NAME ": WARNING: mapping 16 kB @ 0x%08lx " - "in PCI address space, and writing " - "rubbish in there.\n", mmio_address); - do_test(); - return 0; -} - -static void __exit cleanup(void) -{ - pr_debug(MODULE_NAME ": unloaded.\n"); -} - -module_init(init); -module_exit(cleanup); -MODULE_LICENSE("GPL"); diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index b7b3e4c7cfc9..07dab503c9e3 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -8,6 +8,11 @@ obj-$(CONFIG_X86_PTDUMP) += dump_pagetables.o obj-$(CONFIG_HIGHMEM) += highmem_32.o +obj-$(CONFIG_MMIOTRACE_HOOKS) += kmmio.o +obj-$(CONFIG_MMIOTRACE) += mmiotrace.o +mmiotrace-y := pf_in.o mmio-mod.o +obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o + ifeq ($(CONFIG_X86_32),y) obj-$(CONFIG_NUMA) += discontig_32.o else diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c new file mode 100644 index 000000000000..3ad27b8504a5 --- /dev/null +++ b/arch/x86/mm/kmmio.c @@ -0,0 +1,499 @@ +/* Support for MMIO probes. + * Benfit many code from kprobes + * (C) 2002 Louis Zhuang . 
+ * 2007 Alexander Eichner + * 2008 Pekka Paalanen + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define KMMIO_PAGE_HASH_BITS 4 +#define KMMIO_PAGE_TABLE_SIZE (1 << KMMIO_PAGE_HASH_BITS) + +struct kmmio_fault_page { + struct list_head list; + struct kmmio_fault_page *release_next; + unsigned long page; /* location of the fault page */ + + /* + * Number of times this page has been registered as a part + * of a probe. If zero, page is disarmed and this may be freed. + * Used only by writers (RCU). + */ + int count; +}; + +struct kmmio_delayed_release { + struct rcu_head rcu; + struct kmmio_fault_page *release_list; +}; + +struct kmmio_context { + struct kmmio_fault_page *fpage; + struct kmmio_probe *probe; + unsigned long saved_flags; + unsigned long addr; + int active; +}; + +static DEFINE_SPINLOCK(kmmio_lock); + +/* Protected by kmmio_lock */ +unsigned int kmmio_count; + +/* Read-protected by RCU, write-protected by kmmio_lock. */ +static struct list_head kmmio_page_table[KMMIO_PAGE_TABLE_SIZE]; +static LIST_HEAD(kmmio_probes); + +static struct list_head *kmmio_page_list(unsigned long page) +{ + return &kmmio_page_table[hash_long(page, KMMIO_PAGE_HASH_BITS)]; +} + +/* Accessed per-cpu */ +static DEFINE_PER_CPU(struct kmmio_context, kmmio_ctx); + +/* + * this is basically a dynamic stabbing problem: + * Could use the existing prio tree code or + * Possible better implementations: + * The Interval Skip List: A Data Structure for Finding All Intervals That + * Overlap a Point (might be simple) + * Space Efficient Dynamic Stabbing with Fast Queries - Mikkel Thorup + */ +/* Get the kmmio at this addr (if any). You must be holding RCU read lock. */ +static struct kmmio_probe *get_kmmio_probe(unsigned long addr) +{ + struct kmmio_probe *p; + list_for_each_entry_rcu(p, &kmmio_probes, list) { + if (addr >= p->addr && addr <= (p->addr + p->len)) + return p; + } + return NULL; +} + +/* You must be holding RCU read lock. */ +static struct kmmio_fault_page *get_kmmio_fault_page(unsigned long page) +{ + struct list_head *head; + struct kmmio_fault_page *p; + + page &= PAGE_MASK; + head = kmmio_page_list(page); + list_for_each_entry_rcu(p, head, list) { + if (p->page == page) + return p; + } + return NULL; +} + +static void set_page_present(unsigned long addr, bool present, int *pglevel) +{ + pteval_t pteval; + pmdval_t pmdval; + int level; + pmd_t *pmd; + pte_t *pte = lookup_address(addr, &level); + + if (!pte) { + pr_err("kmmio: no pte for page 0x%08lx\n", addr); + return; + } + + if (pglevel) + *pglevel = level; + + switch (level) { + case PG_LEVEL_2M: + pmd = (pmd_t *)pte; + pmdval = pmd_val(*pmd) & ~_PAGE_PRESENT; + if (present) + pmdval |= _PAGE_PRESENT; + set_pmd(pmd, __pmd(pmdval)); + break; + + case PG_LEVEL_4K: + pteval = pte_val(*pte) & ~_PAGE_PRESENT; + if (present) + pteval |= _PAGE_PRESENT; + set_pte_atomic(pte, __pte(pteval)); + break; + + default: + pr_err("kmmio: unexpected page level 0x%x.\n", level); + return; + } + + __flush_tlb_one(addr); +} + +/** Mark the given page as not present. Access to it will trigger a fault. */ +static void arm_kmmio_fault_page(unsigned long page, int *page_level) +{ + set_page_present(page & PAGE_MASK, false, page_level); +} + +/** Mark the given page as present. 
*/ +static void disarm_kmmio_fault_page(unsigned long page, int *page_level) +{ + set_page_present(page & PAGE_MASK, true, page_level); +} + +/* + * This is being called from do_page_fault(). + * + * We may be in an interrupt or a critical section. Also prefecthing may + * trigger a page fault. We may be in the middle of process switch. + * We cannot take any locks, because we could be executing especially + * within a kmmio critical section. + * + * Local interrupts are disabled, so preemption cannot happen. + * Do not enable interrupts, do not sleep, and watch out for other CPUs. + */ +/* + * Interrupts are disabled on entry as trap3 is an interrupt gate + * and they remain disabled thorough out this function. + */ +int kmmio_handler(struct pt_regs *regs, unsigned long addr) +{ + struct kmmio_context *ctx; + struct kmmio_fault_page *faultpage; + int ret = 0; /* default to fault not handled */ + + /* + * Preemption is now disabled to prevent process switch during + * single stepping. We can only handle one active kmmio trace + * per cpu, so ensure that we finish it before something else + * gets to run. We also hold the RCU read lock over single + * stepping to avoid looking up the probe and kmmio_fault_page + * again. + */ + preempt_disable(); + rcu_read_lock(); + + faultpage = get_kmmio_fault_page(addr); + if (!faultpage) { + /* + * Either this page fault is not caused by kmmio, or + * another CPU just pulled the kmmio probe from under + * our feet. The latter case should not be possible. + */ + goto no_kmmio; + } + + ctx = &get_cpu_var(kmmio_ctx); + if (ctx->active) { + disarm_kmmio_fault_page(faultpage->page, NULL); + if (addr == ctx->addr) { + /* + * On SMP we sometimes get recursive probe hits on the + * same address. Context is already saved, fall out. + */ + pr_debug("kmmio: duplicate probe hit on CPU %d, for " + "address 0x%08lx.\n", + smp_processor_id(), addr); + ret = 1; + goto no_kmmio_ctx; + } + /* + * Prevent overwriting already in-flight context. + * This should not happen, let's hope disarming at least + * prevents a panic. + */ + pr_emerg("kmmio: recursive probe hit on CPU %d, " + "for address 0x%08lx. Ignoring.\n", + smp_processor_id(), addr); + pr_emerg("kmmio: previous hit was at 0x%08lx.\n", + ctx->addr); + goto no_kmmio_ctx; + } + ctx->active++; + + ctx->fpage = faultpage; + ctx->probe = get_kmmio_probe(addr); + ctx->saved_flags = (regs->flags & (X86_EFLAGS_TF | X86_EFLAGS_IF)); + ctx->addr = addr; + + if (ctx->probe && ctx->probe->pre_handler) + ctx->probe->pre_handler(ctx->probe, regs, addr); + + /* + * Enable single-stepping and disable interrupts for the faulting + * context. Local interrupts must not get enabled during stepping. + */ + regs->flags |= X86_EFLAGS_TF; + regs->flags &= ~X86_EFLAGS_IF; + + /* Now we set present bit in PTE and single step. */ + disarm_kmmio_fault_page(ctx->fpage->page, NULL); + + /* + * If another cpu accesses the same page while we are stepping, + * the access will not be caught. It will simply succeed and the + * only downside is we lose the event. If this becomes a problem, + * the user should drop to single cpu before tracing. + */ + + put_cpu_var(kmmio_ctx); + return 1; /* fault handled */ + +no_kmmio_ctx: + put_cpu_var(kmmio_ctx); +no_kmmio: + rcu_read_unlock(); + preempt_enable_no_resched(); + return ret; +} + +/* + * Interrupts are disabled on entry as trap1 is an interrupt gate + * and they remain disabled thorough out this function. + * This must always get called as the pair to kmmio_handler(). 
+ */ +static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs) +{ + int ret = 0; + struct kmmio_context *ctx = &get_cpu_var(kmmio_ctx); + + if (!ctx->active) { + pr_debug("kmmio: spurious debug trap on CPU %d.\n", + smp_processor_id()); + goto out; + } + + if (ctx->probe && ctx->probe->post_handler) + ctx->probe->post_handler(ctx->probe, condition, regs); + + arm_kmmio_fault_page(ctx->fpage->page, NULL); + + regs->flags &= ~X86_EFLAGS_TF; + regs->flags |= ctx->saved_flags; + + /* These were acquired in kmmio_handler(). */ + ctx->active--; + BUG_ON(ctx->active); + rcu_read_unlock(); + preempt_enable_no_resched(); + + /* + * if somebody else is singlestepping across a probe point, flags + * will have TF set, in which case, continue the remaining processing + * of do_debug, as if this is not a probe hit. + */ + if (!(regs->flags & X86_EFLAGS_TF)) + ret = 1; +out: + put_cpu_var(kmmio_ctx); + return ret; +} + +/* You must be holding kmmio_lock. */ +static int add_kmmio_fault_page(unsigned long page) +{ + struct kmmio_fault_page *f; + + page &= PAGE_MASK; + f = get_kmmio_fault_page(page); + if (f) { + if (!f->count) + arm_kmmio_fault_page(f->page, NULL); + f->count++; + return 0; + } + + f = kmalloc(sizeof(*f), GFP_ATOMIC); + if (!f) + return -1; + + f->count = 1; + f->page = page; + list_add_rcu(&f->list, kmmio_page_list(f->page)); + + arm_kmmio_fault_page(f->page, NULL); + + return 0; +} + +/* You must be holding kmmio_lock. */ +static void release_kmmio_fault_page(unsigned long page, + struct kmmio_fault_page **release_list) +{ + struct kmmio_fault_page *f; + + page &= PAGE_MASK; + f = get_kmmio_fault_page(page); + if (!f) + return; + + f->count--; + BUG_ON(f->count < 0); + if (!f->count) { + disarm_kmmio_fault_page(f->page, NULL); + f->release_next = *release_list; + *release_list = f; + } +} + +int register_kmmio_probe(struct kmmio_probe *p) +{ + unsigned long flags; + int ret = 0; + unsigned long size = 0; + + spin_lock_irqsave(&kmmio_lock, flags); + if (get_kmmio_probe(p->addr)) { + ret = -EEXIST; + goto out; + } + kmmio_count++; + list_add_rcu(&p->list, &kmmio_probes); + while (size < p->len) { + if (add_kmmio_fault_page(p->addr + size)) + pr_err("kmmio: Unable to set page fault.\n"); + size += PAGE_SIZE; + } +out: + spin_unlock_irqrestore(&kmmio_lock, flags); + /* + * XXX: What should I do here? + * Here was a call to global_flush_tlb(), but it does not exist + * anymore. It seems it's not needed after all. + */ + return ret; +} +EXPORT_SYMBOL(register_kmmio_probe); + +static void rcu_free_kmmio_fault_pages(struct rcu_head *head) +{ + struct kmmio_delayed_release *dr = container_of( + head, + struct kmmio_delayed_release, + rcu); + struct kmmio_fault_page *p = dr->release_list; + while (p) { + struct kmmio_fault_page *next = p->release_next; + BUG_ON(p->count); + kfree(p); + p = next; + } + kfree(dr); +} + +static void remove_kmmio_fault_pages(struct rcu_head *head) +{ + struct kmmio_delayed_release *dr = container_of( + head, + struct kmmio_delayed_release, + rcu); + struct kmmio_fault_page *p = dr->release_list; + struct kmmio_fault_page **prevp = &dr->release_list; + unsigned long flags; + spin_lock_irqsave(&kmmio_lock, flags); + while (p) { + if (!p->count) + list_del_rcu(&p->list); + else + *prevp = p->release_next; + prevp = &p->release_next; + p = p->release_next; + } + spin_unlock_irqrestore(&kmmio_lock, flags); + /* This is the real RCU destroy call. */ + call_rcu(&dr->rcu, rcu_free_kmmio_fault_pages); +} + +/* + * Remove a kmmio probe. 
You have to synchronize_rcu() before you can be + * sure that the callbacks will not be called anymore. Only after that + * you may actually release your struct kmmio_probe. + * + * Unregistering a kmmio fault page has three steps: + * 1. release_kmmio_fault_page() + * Disarm the page, wait a grace period to let all faults finish. + * 2. remove_kmmio_fault_pages() + * Remove the pages from kmmio_page_table. + * 3. rcu_free_kmmio_fault_pages() + * Actally free the kmmio_fault_page structs as with RCU. + */ +void unregister_kmmio_probe(struct kmmio_probe *p) +{ + unsigned long flags; + unsigned long size = 0; + struct kmmio_fault_page *release_list = NULL; + struct kmmio_delayed_release *drelease; + + spin_lock_irqsave(&kmmio_lock, flags); + while (size < p->len) { + release_kmmio_fault_page(p->addr + size, &release_list); + size += PAGE_SIZE; + } + list_del_rcu(&p->list); + kmmio_count--; + spin_unlock_irqrestore(&kmmio_lock, flags); + + drelease = kmalloc(sizeof(*drelease), GFP_ATOMIC); + if (!drelease) { + pr_crit("kmmio: leaking kmmio_fault_page objects.\n"); + return; + } + drelease->release_list = release_list; + + /* + * This is not really RCU here. We have just disarmed a set of + * pages so that they cannot trigger page faults anymore. However, + * we cannot remove the pages from kmmio_page_table, + * because a probe hit might be in flight on another CPU. The + * pages are collected into a list, and they will be removed from + * kmmio_page_table when it is certain that no probe hit related to + * these pages can be in flight. RCU grace period sounds like a + * good choice. + * + * If we removed the pages too early, kmmio page fault handler might + * not find the respective kmmio_fault_page and determine it's not + * a kmmio fault, when it actually is. This would lead to madness. + */ + call_rcu(&drelease->rcu, remove_kmmio_fault_pages); +} +EXPORT_SYMBOL(unregister_kmmio_probe); + +static int kmmio_die_notifier(struct notifier_block *nb, unsigned long val, + void *args) +{ + struct die_args *arg = args; + + if (val == DIE_DEBUG && (arg->err & DR_STEP)) + if (post_kmmio_handler(arg->err, arg->regs) == 1) + return NOTIFY_STOP; + + return NOTIFY_DONE; +} + +static struct notifier_block nb_die = { + .notifier_call = kmmio_die_notifier +}; + +static int __init init_kmmio(void) +{ + int i; + for (i = 0; i < KMMIO_PAGE_TABLE_SIZE; i++) + INIT_LIST_HEAD(&kmmio_page_table[i]); + return register_die_notifier(&nb_die); +} +fs_initcall(init_kmmio); /* should be before device_initcall() */ diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c new file mode 100644 index 000000000000..8256546d49bf --- /dev/null +++ b/arch/x86/mm/mmio-mod.c @@ -0,0 +1,457 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+ * + * Copyright (C) IBM Corporation, 2005 + * Jeff Muizelaar, 2006, 2007 + * Pekka Paalanen, 2008 + * + * Derived from the read-mod example from relay-examples by Tom Zanussi. + */ +#define DEBUG 1 + +#include +#include +#include +#include +#include +#include +#include +#include +#include /* for ISA_START_ADDRESS */ +#include +#include + +#include "pf_in.h" + +#define NAME "mmiotrace: " + +struct trap_reason { + unsigned long addr; + unsigned long ip; + enum reason_type type; + int active_traces; +}; + +struct remap_trace { + struct list_head list; + struct kmmio_probe probe; + unsigned long phys; + unsigned long id; +}; + +/* Accessed per-cpu. */ +static DEFINE_PER_CPU(struct trap_reason, pf_reason); +static DEFINE_PER_CPU(struct mmiotrace_rw, cpu_trace); + +#if 0 /* XXX: no way gather this info anymore */ +/* Access to this is not per-cpu. */ +static DEFINE_PER_CPU(atomic_t, dropped); +#endif + +static struct dentry *marker_file; + +static DEFINE_MUTEX(mmiotrace_mutex); +static DEFINE_SPINLOCK(trace_lock); +static atomic_t mmiotrace_enabled; +static LIST_HEAD(trace_list); /* struct remap_trace */ + +/* + * Locking in this file: + * - mmiotrace_mutex enforces enable/disable_mmiotrace() critical sections. + * - mmiotrace_enabled may be modified only when holding mmiotrace_mutex + * and trace_lock. + * - Routines depending on is_enabled() must take trace_lock. + * - trace_list users must hold trace_lock. + * - is_enabled() guarantees that mmio_trace_record is allowed. + * - pre/post callbacks assume the effect of is_enabled() being true. + */ + +/* module parameters */ +static unsigned long filter_offset; +static int nommiotrace; +static int ISA_trace; +static int trace_pc; + +module_param(filter_offset, ulong, 0); +module_param(nommiotrace, bool, 0); +module_param(ISA_trace, bool, 0); +module_param(trace_pc, bool, 0); + +MODULE_PARM_DESC(filter_offset, "Start address of traced mappings."); +MODULE_PARM_DESC(nommiotrace, "Disable actual MMIO tracing."); +MODULE_PARM_DESC(ISA_trace, "Do not exclude the low ISA range."); +MODULE_PARM_DESC(trace_pc, "Record address of faulting instructions."); + +static bool is_enabled(void) +{ + return atomic_read(&mmiotrace_enabled); +} + +#if 0 /* XXX: needs rewrite */ +/* + * Write callback for the debugfs entry: + * Read a marker and write it to the mmio trace log + */ +static ssize_t write_marker(struct file *file, const char __user *buffer, + size_t count, loff_t *ppos) +{ + char *event = NULL; + struct mm_io_header *headp; + ssize_t len = (count > 65535) ? 
65535 : count; + + event = kzalloc(sizeof(*headp) + len, GFP_KERNEL); + if (!event) + return -ENOMEM; + + headp = (struct mm_io_header *)event; + headp->type = MMIO_MAGIC | (MMIO_MARKER << MMIO_OPCODE_SHIFT); + headp->data_len = len; + + if (copy_from_user(event + sizeof(*headp), buffer, len)) { + kfree(event); + return -EFAULT; + } + + spin_lock_irq(&trace_lock); +#if 0 /* XXX: convert this to use tracing */ + if (is_enabled()) + relay_write(chan, event, sizeof(*headp) + len); + else +#endif + len = -EINVAL; + spin_unlock_irq(&trace_lock); + kfree(event); + return len; +} +#endif + +static void print_pte(unsigned long address) +{ + int level; + pte_t *pte = lookup_address(address, &level); + + if (!pte) { + pr_err(NAME "Error in %s: no pte for page 0x%08lx\n", + __func__, address); + return; + } + + if (level == PG_LEVEL_2M) { + pr_emerg(NAME "4MB pages are not currently supported: " + "0x%08lx\n", address); + BUG(); + } + pr_info(NAME "pte for 0x%lx: 0x%lx 0x%lx\n", address, pte_val(*pte), + pte_val(*pte) & _PAGE_PRESENT); +} + +/* + * For some reason the pre/post pairs have been called in an + * unmatched order. Report and die. + */ +static void die_kmmio_nesting_error(struct pt_regs *regs, unsigned long addr) +{ + const struct trap_reason *my_reason = &get_cpu_var(pf_reason); + pr_emerg(NAME "unexpected fault for address: 0x%08lx, " + "last fault for address: 0x%08lx\n", + addr, my_reason->addr); + print_pte(addr); + print_symbol(KERN_EMERG "faulting IP is at %s\n", regs->ip); + print_symbol(KERN_EMERG "last faulting IP was at %s\n", my_reason->ip); +#ifdef __i386__ + pr_emerg("eax: %08lx ebx: %08lx ecx: %08lx edx: %08lx\n", + regs->ax, regs->bx, regs->cx, regs->dx); + pr_emerg("esi: %08lx edi: %08lx ebp: %08lx esp: %08lx\n", + regs->si, regs->di, regs->bp, regs->sp); +#else + pr_emerg("rax: %016lx rcx: %016lx rdx: %016lx\n", + regs->ax, regs->cx, regs->dx); + pr_emerg("rsi: %016lx rdi: %016lx rbp: %016lx rsp: %016lx\n", + regs->si, regs->di, regs->bp, regs->sp); +#endif + put_cpu_var(pf_reason); + BUG(); +} + +static void pre(struct kmmio_probe *p, struct pt_regs *regs, + unsigned long addr) +{ + struct trap_reason *my_reason = &get_cpu_var(pf_reason); + struct mmiotrace_rw *my_trace = &get_cpu_var(cpu_trace); + const unsigned long instptr = instruction_pointer(regs); + const enum reason_type type = get_ins_type(instptr); + struct remap_trace *trace = p->user_data; + + /* it doesn't make sense to have more than one active trace per cpu */ + if (my_reason->active_traces) + die_kmmio_nesting_error(regs, addr); + else + my_reason->active_traces++; + + my_reason->type = type; + my_reason->addr = addr; + my_reason->ip = instptr; + + my_trace->phys = addr - trace->probe.addr + trace->phys; + my_trace->map_id = trace->id; + + /* + * Only record the program counter when requested. + * It may taint clean-room reverse engineering. + */ + if (trace_pc) + my_trace->pc = instptr; + else + my_trace->pc = 0; + + /* + * XXX: the timestamp recorded will be *after* the tracing has been + * done, not at the time we hit the instruction. SMP implications + * on event ordering? 
+ */ + + switch (type) { + case REG_READ: + my_trace->opcode = MMIO_READ; + my_trace->width = get_ins_mem_width(instptr); + break; + case REG_WRITE: + my_trace->opcode = MMIO_WRITE; + my_trace->width = get_ins_mem_width(instptr); + my_trace->value = get_ins_reg_val(instptr, regs); + break; + case IMM_WRITE: + my_trace->opcode = MMIO_WRITE; + my_trace->width = get_ins_mem_width(instptr); + my_trace->value = get_ins_imm_val(instptr); + break; + default: + { + unsigned char *ip = (unsigned char *)instptr; + my_trace->opcode = MMIO_UNKNOWN_OP; + my_trace->width = 0; + my_trace->value = (*ip) << 16 | *(ip + 1) << 8 | + *(ip + 2); + } + } + put_cpu_var(cpu_trace); + put_cpu_var(pf_reason); +} + +static void post(struct kmmio_probe *p, unsigned long condition, + struct pt_regs *regs) +{ + struct trap_reason *my_reason = &get_cpu_var(pf_reason); + struct mmiotrace_rw *my_trace = &get_cpu_var(cpu_trace); + + /* this should always return the active_trace count to 0 */ + my_reason->active_traces--; + if (my_reason->active_traces) { + pr_emerg(NAME "unexpected post handler"); + BUG(); + } + + switch (my_reason->type) { + case REG_READ: + my_trace->value = get_ins_reg_val(my_reason->ip, regs); + break; + default: + break; + } + + mmio_trace_rw(my_trace); + put_cpu_var(cpu_trace); + put_cpu_var(pf_reason); +} + +static void ioremap_trace_core(unsigned long offset, unsigned long size, + void __iomem *addr) +{ + static atomic_t next_id; + struct remap_trace *trace = kmalloc(sizeof(*trace), GFP_KERNEL); + struct mmiotrace_map map = { + .phys = offset, + .virt = (unsigned long)addr, + .len = size, + .opcode = MMIO_PROBE + }; + + if (!trace) { + pr_err(NAME "kmalloc failed in ioremap\n"); + return; + } + + *trace = (struct remap_trace) { + .probe = { + .addr = (unsigned long)addr, + .len = size, + .pre_handler = pre, + .post_handler = post, + .user_data = trace + }, + .phys = offset, + .id = atomic_inc_return(&next_id) + }; + map.map_id = trace->id; + + spin_lock_irq(&trace_lock); + if (!is_enabled()) + goto not_enabled; + + mmio_trace_mapping(&map); + list_add_tail(&trace->list, &trace_list); + if (!nommiotrace) + register_kmmio_probe(&trace->probe); + +not_enabled: + spin_unlock_irq(&trace_lock); +} + +void +mmiotrace_ioremap(unsigned long offset, unsigned long size, void __iomem *addr) +{ + if (!is_enabled()) /* recheck and proper locking in *_core() */ + return; + + pr_debug(NAME "ioremap_*(0x%lx, 0x%lx) = %p\n", offset, size, addr); + if ((filter_offset) && (offset != filter_offset)) + return; + ioremap_trace_core(offset, size, addr); +} + +static void iounmap_trace_core(volatile void __iomem *addr) +{ + struct mmiotrace_map map = { + .phys = 0, + .virt = (unsigned long)addr, + .len = 0, + .opcode = MMIO_UNPROBE + }; + struct remap_trace *trace; + struct remap_trace *tmp; + struct remap_trace *found_trace = NULL; + + pr_debug(NAME "Unmapping %p.\n", addr); + + spin_lock_irq(&trace_lock); + if (!is_enabled()) + goto not_enabled; + + list_for_each_entry_safe(trace, tmp, &trace_list, list) { + if ((unsigned long)addr == trace->probe.addr) { + if (!nommiotrace) + unregister_kmmio_probe(&trace->probe); + list_del(&trace->list); + found_trace = trace; + break; + } + } + map.map_id = (found_trace) ? 
found_trace->id : -1; + mmio_trace_mapping(&map); + +not_enabled: + spin_unlock_irq(&trace_lock); + if (found_trace) { + synchronize_rcu(); /* unregister_kmmio_probe() requirement */ + kfree(found_trace); + } +} + +void mmiotrace_iounmap(volatile void __iomem *addr) +{ + might_sleep(); + if (is_enabled()) /* recheck and proper locking in *_core() */ + iounmap_trace_core(addr); +} + +static void clear_trace_list(void) +{ + struct remap_trace *trace; + struct remap_trace *tmp; + + /* + * No locking required, because the caller ensures we are in a + * critical section via mutex, and is_enabled() is false, + * i.e. nothing can traverse or modify this list. + * Caller also ensures is_enabled() cannot change. + */ + list_for_each_entry(trace, &trace_list, list) { + pr_notice(NAME "purging non-iounmapped " + "trace @0x%08lx, size 0x%lx.\n", + trace->probe.addr, trace->probe.len); + if (!nommiotrace) + unregister_kmmio_probe(&trace->probe); + } + synchronize_rcu(); /* unregister_kmmio_probe() requirement */ + + list_for_each_entry_safe(trace, tmp, &trace_list, list) { + list_del(&trace->list); + kfree(trace); + } +} + +#if 0 /* XXX: out of order */ +static struct file_operations fops_marker = { + .owner = THIS_MODULE, + .write = write_marker +}; +#endif + +void enable_mmiotrace(void) +{ + mutex_lock(&mmiotrace_mutex); + if (is_enabled()) + goto out; + +#if 0 /* XXX: tracing does not support text entries */ + marker_file = debugfs_create_file("marker", 0660, dir, NULL, + &fops_marker); + if (!marker_file) + pr_err(NAME "marker file creation failed.\n"); +#endif + + if (nommiotrace) + pr_info(NAME "MMIO tracing disabled.\n"); + if (ISA_trace) + pr_warning(NAME "Warning! low ISA range will be traced.\n"); + spin_lock_irq(&trace_lock); + atomic_inc(&mmiotrace_enabled); + spin_unlock_irq(&trace_lock); + pr_info(NAME "enabled.\n"); +out: + mutex_unlock(&mmiotrace_mutex); +} + +void disable_mmiotrace(void) +{ + mutex_lock(&mmiotrace_mutex); + if (!is_enabled()) + goto out; + + spin_lock_irq(&trace_lock); + atomic_dec(&mmiotrace_enabled); + BUG_ON(is_enabled()); + spin_unlock_irq(&trace_lock); + + clear_trace_list(); /* guarantees: no more kmmio callbacks */ + if (marker_file) { + debugfs_remove(marker_file); + marker_file = NULL; + } + + pr_info(NAME "disabled.\n"); +out: + mutex_unlock(&mmiotrace_mutex); +} diff --git a/arch/x86/mm/pf_in.c b/arch/x86/mm/pf_in.c new file mode 100644 index 000000000000..efa1911e20ca --- /dev/null +++ b/arch/x86/mm/pf_in.c @@ -0,0 +1,489 @@ +/* + * Fault Injection Test harness (FI) + * Copyright (C) Intel Crop. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, + * USA. 
+ * + */ + +/* Id: pf_in.c,v 1.1.1.1 2002/11/12 05:56:32 brlock Exp + * Copyright by Intel Crop., 2002 + * Louis Zhuang (louis.zhuang@intel.com) + * + * Bjorn Steinbrink (B.Steinbrink@gmx.de), 2007 + */ + +#include +#include /* struct pt_regs */ +#include "pf_in.h" + +#ifdef __i386__ +/* IA32 Manual 3, 2-1 */ +static unsigned char prefix_codes[] = { + 0xF0, 0xF2, 0xF3, 0x2E, 0x36, 0x3E, 0x26, 0x64, + 0x65, 0x2E, 0x3E, 0x66, 0x67 +}; +/* IA32 Manual 3, 3-432*/ +static unsigned int reg_rop[] = { + 0x8A, 0x8B, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F +}; +static unsigned int reg_wop[] = { 0x88, 0x89 }; +static unsigned int imm_wop[] = { 0xC6, 0xC7 }; +/* IA32 Manual 3, 3-432*/ +static unsigned int rw8[] = { 0x88, 0x8A, 0xC6 }; +static unsigned int rw32[] = { + 0x89, 0x8B, 0xC7, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F +}; +static unsigned int mw8[] = { 0x88, 0x8A, 0xC6, 0xB60F, 0xBE0F }; +static unsigned int mw16[] = { 0xB70F, 0xBF0F }; +static unsigned int mw32[] = { 0x89, 0x8B, 0xC7 }; +static unsigned int mw64[] = {}; +#else /* not __i386__ */ +static unsigned char prefix_codes[] = { + 0x66, 0x67, 0x2E, 0x3E, 0x26, 0x64, 0x65, 0x36, + 0xF0, 0xF3, 0xF2, + /* REX Prefixes */ + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f +}; +/* AMD64 Manual 3, Appendix A*/ +static unsigned int reg_rop[] = { + 0x8A, 0x8B, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F +}; +static unsigned int reg_wop[] = { 0x88, 0x89 }; +static unsigned int imm_wop[] = { 0xC6, 0xC7 }; +static unsigned int rw8[] = { 0xC6, 0x88, 0x8A }; +static unsigned int rw32[] = { + 0xC7, 0x89, 0x8B, 0xB60F, 0xB70F, 0xBE0F, 0xBF0F +}; +/* 8 bit only */ +static unsigned int mw8[] = { 0xC6, 0x88, 0x8A, 0xB60F, 0xBE0F }; +/* 16 bit only */ +static unsigned int mw16[] = { 0xB70F, 0xBF0F }; +/* 16 or 32 bit */ +static unsigned int mw32[] = { 0xC7 }; +/* 16, 32 or 64 bit */ +static unsigned int mw64[] = { 0x89, 0x8B }; +#endif /* not __i386__ */ + +static int skip_prefix(unsigned char *addr, int *shorted, int *enlarged, + int *rexr) +{ + int i; + unsigned char *p = addr; + *shorted = 0; + *enlarged = 0; + *rexr = 0; + +restart: + for (i = 0; i < ARRAY_SIZE(prefix_codes); i++) { + if (*p == prefix_codes[i]) { + if (*p == 0x66) + *shorted = 1; +#ifdef __amd64__ + if ((*p & 0xf8) == 0x48) + *enlarged = 1; + if ((*p & 0xf4) == 0x44) + *rexr = 1; +#endif + p++; + goto restart; + } + } + + return (p - addr); +} + +static int get_opcode(unsigned char *addr, unsigned int *opcode) +{ + int len; + + if (*addr == 0x0F) { + /* 0x0F is extension instruction */ + *opcode = *(unsigned short *)addr; + len = 2; + } else { + *opcode = *addr; + len = 1; + } + + return len; +} + +#define CHECK_OP_TYPE(opcode, array, type) \ + for (i = 0; i < ARRAY_SIZE(array); i++) { \ + if (array[i] == opcode) { \ + rv = type; \ + goto exit; \ + } \ + } + +enum reason_type get_ins_type(unsigned long ins_addr) +{ + unsigned int opcode; + unsigned char *p; + int shorted, enlarged, rexr; + int i; + enum reason_type rv = OTHERS; + + p = (unsigned char *)ins_addr; + p += skip_prefix(p, &shorted, &enlarged, &rexr); + p += get_opcode(p, &opcode); + + CHECK_OP_TYPE(opcode, reg_rop, REG_READ); + CHECK_OP_TYPE(opcode, reg_wop, REG_WRITE); + CHECK_OP_TYPE(opcode, imm_wop, IMM_WRITE); + +exit: + return rv; +} +#undef CHECK_OP_TYPE + +static unsigned int get_ins_reg_width(unsigned long ins_addr) +{ + unsigned int opcode; + unsigned char *p; + int i, shorted, enlarged, rexr; + + p = (unsigned char *)ins_addr; + p += skip_prefix(p, &shorted, &enlarged, &rexr); + p += 
get_opcode(p, &opcode); + + for (i = 0; i < ARRAY_SIZE(rw8); i++) + if (rw8[i] == opcode) + return 1; + + for (i = 0; i < ARRAY_SIZE(rw32); i++) + if (rw32[i] == opcode) + return (shorted ? 2 : (enlarged ? 8 : 4)); + + printk(KERN_ERR "mmiotrace: Unknown opcode 0x%02x\n", opcode); + return 0; +} + +unsigned int get_ins_mem_width(unsigned long ins_addr) +{ + unsigned int opcode; + unsigned char *p; + int i, shorted, enlarged, rexr; + + p = (unsigned char *)ins_addr; + p += skip_prefix(p, &shorted, &enlarged, &rexr); + p += get_opcode(p, &opcode); + + for (i = 0; i < ARRAY_SIZE(mw8); i++) + if (mw8[i] == opcode) + return 1; + + for (i = 0; i < ARRAY_SIZE(mw16); i++) + if (mw16[i] == opcode) + return 2; + + for (i = 0; i < ARRAY_SIZE(mw32); i++) + if (mw32[i] == opcode) + return shorted ? 2 : 4; + + for (i = 0; i < ARRAY_SIZE(mw64); i++) + if (mw64[i] == opcode) + return shorted ? 2 : (enlarged ? 8 : 4); + + printk(KERN_ERR "mmiotrace: Unknown opcode 0x%02x\n", opcode); + return 0; +} + +/* + * Define register ident in mod/rm byte. + * Note: these are NOT the same as in ptrace-abi.h. + */ +enum { + arg_AL = 0, + arg_CL = 1, + arg_DL = 2, + arg_BL = 3, + arg_AH = 4, + arg_CH = 5, + arg_DH = 6, + arg_BH = 7, + + arg_AX = 0, + arg_CX = 1, + arg_DX = 2, + arg_BX = 3, + arg_SP = 4, + arg_BP = 5, + arg_SI = 6, + arg_DI = 7, +#ifdef __amd64__ + arg_R8 = 8, + arg_R9 = 9, + arg_R10 = 10, + arg_R11 = 11, + arg_R12 = 12, + arg_R13 = 13, + arg_R14 = 14, + arg_R15 = 15 +#endif +}; + +static unsigned char *get_reg_w8(int no, struct pt_regs *regs) +{ + unsigned char *rv = NULL; + + switch (no) { + case arg_AL: + rv = (unsigned char *)®s->ax; + break; + case arg_BL: + rv = (unsigned char *)®s->bx; + break; + case arg_CL: + rv = (unsigned char *)®s->cx; + break; + case arg_DL: + rv = (unsigned char *)®s->dx; + break; + case arg_AH: + rv = 1 + (unsigned char *)®s->ax; + break; + case arg_BH: + rv = 1 + (unsigned char *)®s->bx; + break; + case arg_CH: + rv = 1 + (unsigned char *)®s->cx; + break; + case arg_DH: + rv = 1 + (unsigned char *)®s->dx; + break; +#ifdef __amd64__ + case arg_R8: + rv = (unsigned char *)®s->r8; + break; + case arg_R9: + rv = (unsigned char *)®s->r9; + break; + case arg_R10: + rv = (unsigned char *)®s->r10; + break; + case arg_R11: + rv = (unsigned char *)®s->r11; + break; + case arg_R12: + rv = (unsigned char *)®s->r12; + break; + case arg_R13: + rv = (unsigned char *)®s->r13; + break; + case arg_R14: + rv = (unsigned char *)®s->r14; + break; + case arg_R15: + rv = (unsigned char *)®s->r15; + break; +#endif + default: + printk(KERN_ERR "mmiotrace: Error reg no# %d\n", no); + break; + } + return rv; +} + +static unsigned long *get_reg_w32(int no, struct pt_regs *regs) +{ + unsigned long *rv = NULL; + + switch (no) { + case arg_AX: + rv = ®s->ax; + break; + case arg_BX: + rv = ®s->bx; + break; + case arg_CX: + rv = ®s->cx; + break; + case arg_DX: + rv = ®s->dx; + break; + case arg_SP: + rv = ®s->sp; + break; + case arg_BP: + rv = ®s->bp; + break; + case arg_SI: + rv = ®s->si; + break; + case arg_DI: + rv = ®s->di; + break; +#ifdef __amd64__ + case arg_R8: + rv = ®s->r8; + break; + case arg_R9: + rv = ®s->r9; + break; + case arg_R10: + rv = ®s->r10; + break; + case arg_R11: + rv = ®s->r11; + break; + case arg_R12: + rv = ®s->r12; + break; + case arg_R13: + rv = ®s->r13; + break; + case arg_R14: + rv = ®s->r14; + break; + case arg_R15: + rv = ®s->r15; + break; +#endif + default: + printk(KERN_ERR "mmiotrace: Error reg no# %d\n", no); + } + + return rv; +} + +unsigned long 
get_ins_reg_val(unsigned long ins_addr, struct pt_regs *regs) +{ + unsigned int opcode; + unsigned char mod_rm; + int reg; + unsigned char *p; + int i, shorted, enlarged, rexr; + unsigned long rv; + + p = (unsigned char *)ins_addr; + p += skip_prefix(p, &shorted, &enlarged, &rexr); + p += get_opcode(p, &opcode); + for (i = 0; i < ARRAY_SIZE(reg_rop); i++) + if (reg_rop[i] == opcode) { + rv = REG_READ; + goto do_work; + } + + for (i = 0; i < ARRAY_SIZE(reg_wop); i++) + if (reg_wop[i] == opcode) { + rv = REG_WRITE; + goto do_work; + } + + printk(KERN_ERR "mmiotrace: Not a register instruction, opcode " + "0x%02x\n", opcode); + goto err; + +do_work: + mod_rm = *p; + reg = ((mod_rm >> 3) & 0x7) | (rexr << 3); + switch (get_ins_reg_width(ins_addr)) { + case 1: + return *get_reg_w8(reg, regs); + + case 2: + return *(unsigned short *)get_reg_w32(reg, regs); + + case 4: + return *(unsigned int *)get_reg_w32(reg, regs); + +#ifdef __amd64__ + case 8: + return *(unsigned long *)get_reg_w32(reg, regs); +#endif + + default: + printk(KERN_ERR "mmiotrace: Error width# %d\n", reg); + } + +err: + return 0; +} + +unsigned long get_ins_imm_val(unsigned long ins_addr) +{ + unsigned int opcode; + unsigned char mod_rm; + unsigned char mod; + unsigned char *p; + int i, shorted, enlarged, rexr; + unsigned long rv; + + p = (unsigned char *)ins_addr; + p += skip_prefix(p, &shorted, &enlarged, &rexr); + p += get_opcode(p, &opcode); + for (i = 0; i < ARRAY_SIZE(imm_wop); i++) + if (imm_wop[i] == opcode) { + rv = IMM_WRITE; + goto do_work; + } + + printk(KERN_ERR "mmiotrace: Not an immediate instruction, opcode " + "0x%02x\n", opcode); + goto err; + +do_work: + mod_rm = *p; + mod = mod_rm >> 6; + p++; + switch (mod) { + case 0: + /* if r/m is 5 we have a 32 disp (IA32 Manual 3, Table 2-2) */ + /* AMD64: XXX Check for address size prefix? */ + if ((mod_rm & 0x7) == 0x5) + p += 4; + break; + + case 1: + p += 1; + break; + + case 2: + p += 4; + break; + + case 3: + default: + printk(KERN_ERR "mmiotrace: not a memory access instruction " + "at 0x%lx, rm_mod=0x%02x\n", + ins_addr, mod_rm); + } + + switch (get_ins_reg_width(ins_addr)) { + case 1: + return *(unsigned char *)p; + + case 2: + return *(unsigned short *)p; + + case 4: + return *(unsigned int *)p; + +#ifdef __amd64__ + case 8: + return *(unsigned long *)p; +#endif + + default: + printk(KERN_ERR "mmiotrace: Error: width.\n"); + } + +err: + return 0; +} diff --git a/arch/x86/mm/pf_in.h b/arch/x86/mm/pf_in.h new file mode 100644 index 000000000000..e05341a51a27 --- /dev/null +++ b/arch/x86/mm/pf_in.h @@ -0,0 +1,39 @@ +/* + * Fault Injection Test harness (FI) + * Copyright (C) Intel Crop. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, + * USA. 
+ * + */ + +#ifndef __PF_H_ +#define __PF_H_ + +enum reason_type { + NOT_ME, /* page fault is not in regions */ + NOTHING, /* access others point in regions */ + REG_READ, /* read from addr to reg */ + REG_WRITE, /* write from reg to addr */ + IMM_WRITE, /* write from imm to addr */ + OTHERS /* Other instructions can not intercept */ +}; + +enum reason_type get_ins_type(unsigned long ins_addr); +unsigned int get_ins_mem_width(unsigned long ins_addr); +unsigned long get_ins_reg_val(unsigned long ins_addr, struct pt_regs *regs); +unsigned long get_ins_imm_val(unsigned long ins_addr); + +#endif /* __PF_H_ */ diff --git a/arch/x86/mm/testmmiotrace.c b/arch/x86/mm/testmmiotrace.c new file mode 100644 index 000000000000..cfa60b227c8d --- /dev/null +++ b/arch/x86/mm/testmmiotrace.c @@ -0,0 +1,71 @@ +/* + * Written by Pekka Paalanen, 2008 + */ +#include +#include + +#define MODULE_NAME "testmmiotrace" + +static unsigned long mmio_address; +module_param(mmio_address, ulong, 0); +MODULE_PARM_DESC(mmio_address, "Start address of the mapping of 16 kB."); + +static void do_write_test(void __iomem *p) +{ + unsigned int i; + for (i = 0; i < 256; i++) + iowrite8(i, p + i); + for (i = 1024; i < (5 * 1024); i += 2) + iowrite16(i * 12 + 7, p + i); + for (i = (5 * 1024); i < (16 * 1024); i += 4) + iowrite32(i * 212371 + 13, p + i); +} + +static void do_read_test(void __iomem *p) +{ + unsigned int i; + for (i = 0; i < 256; i++) + ioread8(p + i); + for (i = 1024; i < (5 * 1024); i += 2) + ioread16(p + i); + for (i = (5 * 1024); i < (16 * 1024); i += 4) + ioread32(p + i); +} + +static void do_test(void) +{ + void __iomem *p = ioremap_nocache(mmio_address, 0x4000); + if (!p) { + pr_err(MODULE_NAME ": could not ioremap, aborting.\n"); + return; + } + do_write_test(p); + do_read_test(p); + iounmap(p); +} + +static int __init init(void) +{ + if (mmio_address == 0) { + pr_err(MODULE_NAME ": you have to use the module argument " + "mmio_address.\n"); + pr_err(MODULE_NAME ": DO NOT LOAD THIS MODULE UNLESS" + " YOU REALLY KNOW WHAT YOU ARE DOING!\n"); + return -ENXIO; + } + + pr_warning(MODULE_NAME ": WARNING: mapping 16 kB @ 0x%08lx " + "in PCI address space, and writing " + "rubbish in there.\n", mmio_address); + do_test(); + return 0; +} + +static void __exit cleanup(void) +{ + pr_debug(MODULE_NAME ": unloaded.\n"); +} + +module_init(init); +module_exit(cleanup); +MODULE_LICENSE("GPL"); -- cgit v1.2.3 From e4b37ee68609037ffcaa2fcfae47cd31a605bb9e Mon Sep 17 00:00:00 2001 From: Pekka Paalanen Date: Mon, 12 May 2008 21:20:59 +0200 Subject: x86 mmiotrace: remove ISA_trace parameter. This had become a no-op. 
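The instruction decoder moved into arch/x86/mm/pf_in.c above is plain C and
can be exercised in isolation. The following is a minimal user-space sketch
of its prefix-skip and opcode-fetch steps, assuming the x86_64 tables shown
in that file; REX.R extraction is omitted, the two-byte opcode packing
assumes a little-endian host (matching the kernel's *(unsigned short *)
read), and the sample byte sequences are chosen only for this demo.

/* Standalone sketch of the pf_in.c decode path; not kernel code. */
#include <stdio.h>

#define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))

static const unsigned char prefixes[] = {
	0x66, 0x67, 0x2E, 0x3E, 0x26, 0x64, 0x65, 0x36, 0xF0, 0xF3, 0xF2,
	/* REX prefixes */
	0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,
	0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f
};

/* Skip prefixes, noting 0x66 (operand-size) and REX.W (64-bit operand). */
static int skip_prefix(const unsigned char *addr, int *shorted, int *enlarged)
{
	const unsigned char *p = addr;
	size_t i;

	*shorted = 0;
	*enlarged = 0;
restart:
	for (i = 0; i < ARRAY_SIZE(prefixes); i++) {
		if (*p == prefixes[i]) {
			if (*p == 0x66)
				*shorted = 1;
			if ((*p & 0xf8) == 0x48)
				*enlarged = 1;
			p++;
			goto restart;
		}
	}
	return p - addr;
}

/* 0x0F escapes to a two-byte opcode: 0F B6 is reported as 0xB60F. */
static int get_opcode(const unsigned char *addr, unsigned int *opcode)
{
	if (*addr == 0x0F) {
		*opcode = addr[0] | (addr[1] << 8);
		return 2;
	}
	*opcode = *addr;
	return 1;
}

int main(void)
{
	const unsigned char wr[] = { 0x89, 0x07 };	/* mov %eax,(%rdi) */
	const unsigned char rd[] = { 0x0F, 0xB6, 0x07 };/* movzbl (%rdi),%eax */
	int shorted, enlarged;
	unsigned int op;

	get_opcode(wr + skip_prefix(wr, &shorted, &enlarged), &op);
	printf("write sample: opcode 0x%02x\n", op);	/* 0x89: in reg_wop */
	get_opcode(rd + skip_prefix(rd, &shorted, &enlarged), &op);
	printf("read sample:  opcode 0x%04x\n", op);	/* 0xb60f: in reg_rop */
	return 0;
}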
Signed-off-by: Pekka Paalanen Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/mm/mmio-mod.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c index 8256546d49bf..ab2bb776d310 100644 --- a/arch/x86/mm/mmio-mod.c +++ b/arch/x86/mm/mmio-mod.c @@ -81,17 +81,14 @@ static LIST_HEAD(trace_list); /* struct remap_trace */ /* module parameters */ static unsigned long filter_offset; static int nommiotrace; -static int ISA_trace; static int trace_pc; module_param(filter_offset, ulong, 0); module_param(nommiotrace, bool, 0); -module_param(ISA_trace, bool, 0); module_param(trace_pc, bool, 0); MODULE_PARM_DESC(filter_offset, "Start address of traced mappings."); MODULE_PARM_DESC(nommiotrace, "Disable actual MMIO tracing."); -MODULE_PARM_DESC(ISA_trace, "Do not exclude the low ISA range."); MODULE_PARM_DESC(trace_pc, "Record address of faulting instructions."); static bool is_enabled(void) @@ -424,8 +421,6 @@ void enable_mmiotrace(void) if (nommiotrace) pr_info(NAME "MMIO tracing disabled.\n"); - if (ISA_trace) - pr_warning(NAME "Warning! low ISA range will be traced.\n"); spin_lock_irq(&trace_lock); atomic_inc(&mmiotrace_enabled); spin_unlock_irq(&trace_lock); -- cgit v1.2.3 From c6c67c1afcce71335b18ed8769b1165c468bfb03 Mon Sep 17 00:00:00 2001 From: Pekka Paalanen Date: Mon, 12 May 2008 21:20:59 +0200 Subject: mmiotrace: add user documentation Signed-off-by: Pekka Paalanen Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- Documentation/tracers/mmiotrace.txt | 153 ++++++++++++++++++++++++++++++++++++ arch/x86/Kconfig.debug | 8 +- 2 files changed, 156 insertions(+), 5 deletions(-) create mode 100644 Documentation/tracers/mmiotrace.txt (limited to 'arch/x86') diff --git a/Documentation/tracers/mmiotrace.txt b/Documentation/tracers/mmiotrace.txt new file mode 100644 index 000000000000..84246f703875 --- /dev/null +++ b/Documentation/tracers/mmiotrace.txt @@ -0,0 +1,153 @@ + In-kernel memory-mapped I/O tracing + + +Home page and links to optional user space tools: + + http://nouveau.freedesktop.org/wiki/MmioTrace + +MMIO tracing was originally developed by Intel around 2003 for their Fault +Injection Test Harness. In Dec 2006 - Jan 2007, using the code from Intel, +Jeff Muizelaar created a tool for tracing MMIO accesses with the Nouveau +project in mind. Since then many people have contributed. + +Mmiotrace was built for reverse engineering any memory-mapped IO device with +the Nouveau project as the first real user. Only x86 and x86_64 architectures +are supported. + +Out-of-tree mmiotrace was originally modified for mainline inclusion and +ftrace framework by Pekka Paalanen . + + +Preparation +----------- + +Mmiotrace feature is compiled in by the CONFIG_MMIOTRACE option. Tracing is +disabled by default, so it is safe to have this set to yes. SMP systems are +supported, but tracing is unreliable and may miss events if more than one CPU +is on-line, therefore mmiotrace takes all but one CPU off-line during run-time +activation [not implemented]. + + +Usage Quick Reference +--------------------- + +$ mount -t debugfs debugfs /debug +$ echo mmiotrace > /debug/tracing/current_tracer +$ cat /debug/tracing/trace_pipe > mydump.txt & +Start X or whatever. +$ echo "X is up" > /debug/tracing/marker +$ echo none > /debug/tracing/current_tracer +Check kernel log. + + +Usage +----- + +Make sure debugfs is mounted to /debug. 
If not, (requires root privileges) +$ mount -t debugfs debugfs /debug + +Check that the driver you are about to trace is not loaded. + +Activate mmiotrace (requires root privileges): +$ echo mmiotrace > /debug/tracing/current_tracer + +Start storing the trace: +$ cat /debug/tracing/trace_pipe > mydump.txt & +The 'cat' process should stay running (sleeping) in the background. + +Load the driver you want to trace and use it. Mmiotrace will only catch MMIO +accesses to areas that are ioremapped while mmiotrace is active. + +[Unimplemented feature:] +During tracing you can place comments (markers) into the trace by +$ echo "X is up" > /debug/tracing/marker +This makes it easier to see which part of the (huge) trace corresponds to +which action. It is recommended to place descriptive markers about what you +do. + +Shut down mmiotrace (requires root privileges): +$ echo none > /debug/tracing/current_tracer +The 'cat' process exits. If it does not, kill it by 'fg' and pressing ctrl+c. + +[This feature is not implemented yet!] +Check your kernel log. If there are messages about mmiotrace losing events, +this is due to buffer overrun, and the trace is incomplete. You should enlarge +the buffers and try again. [How?] + +If you are doing a trace for a driver project, e.g. Nouveau, you should also +do the following before sending your results: +$ lspci -vvv > lspci.txt +$ dmesg > dmesg.txt +$ tar zcf pciid-nick-mmiotrace.tar.gz mydump.txt lspci.txt dmesg.txt +and then send the .tar.gz file. The trace compresses considerably. Replace +"pciid" and "nick" with the PCI ID or model name of your piece of hardware +under investigation and your nick name. + + +How Mmiotrace Works +------------------- + +Access to hardware IO-memory is gained by mapping addresses from PCI bus by +calling one of the ioremap_*() functions. Mmiotrace is hooked into the +__ioremap() function and gets called whenever a mapping is created. Mapping is +an event that is recorded into the trace log. Note, that ISA range mappings +are not caught, since the mapping always exists and is returned directly. + +MMIO accesses are recorded via page faults. Just before __ioremap() returns, +the mapped pages are marked as not present. Any access to the pages causes a +fault. The page fault handler calls mmiotrace to handle the fault. Mmiotrace +marks the page present, sets TF flag to achieve single stepping and exits the +fault handler. The instruction that faulted is executed and debug trap is +entered. Here mmiotrace again marks the page as not present. The instruction +is decoded to get the type of operation (read/write), data width and the value +read or written. These are stored to the trace log. + +Setting the page present in the page fault handler has a race condition on SMP +machines. During the single stepping other CPUs may run freely on that page +and events can be missed without a notice. Re-enabling other CPUs during +tracing is discouraged. + + +Trace Log Format +---------------- + +The raw log is text and easily filtered with e.g. grep and awk. One record is +one line in the log. A record starts with a keyword, followed by keyword +dependant arguments. Arguments are separated by a space, or continue until the +end of line. 
The format for version 20070824 is as follows: + +Explanation Keyword Space separated arguments +--------------------------------------------------------------------------- + +read event R width, timestamp, map id, physical, value, PC, PID +write event W width, timestamp, map id, physical, value, PC, PID +ioremap event MAP timestamp, map id, physical, virtual, length, PC, PID +iounmap event UNMAP timestamp, map id, PC, PID +marker MARK timestamp, text +version VERSION the string "20070824" +info for reader LSPCI one line from lspci -v +PCI address map PCIDEV space separated /proc/bus/pci/devices data +unk. opcode UNKNOWN timestamp, map id, physical, data, PC, PID + +Timestamp is in seconds with decimals. Physical is a PCI bus address, virtual +is a kernel virtual address. Width is the data width in bytes and value is the +data value. Map id is an arbitrary id number identifying the mapping that was +used in an operation. PC is the program counter and PID is process id. PC is +zero if it is not recorded. PID is always zero as tracing MMIO accesses +originating in user space memory is not yet supported. + +For instance, the following awk filter will pass all 32-bit writes that target +physical addresses in the range [0xfb73ce40, 0xfb800000[ + +$ awk '/W 4 / { adr=strtonum($5); if (adr >= 0xfb73ce40 && +adr < 0xfb800000) print; }' + + +Tools for Developers +-------------------- + +The user space tools include utilities for: +- replacing numeric addresses and values with hardware register names +- replaying MMIO logs, i.e., re-executing the recorded writes + + diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 1d6de0d67f99..b28ace2be1a3 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -180,12 +180,10 @@ config MMIOTRACE help Mmiotrace traces Memory Mapped I/O access and is meant for debugging and reverse engineering. It is called from the ioremap - implementation and works via page faults. A user space program is - required to collect the MMIO data from debugfs files. - Tracing is disabled by default and can be enabled from a debugfs - file. + implementation and works via page faults. Tracing is disabled by + default and can be enabled run-time. - See http://nouveau.freedesktop.org/wiki/MmioTrace + See Documentation/tracers/mmiotrace.txt. If you are not helping to develop drivers, say N. 
config MMIOTRACE_TEST -- cgit v1.2.3 From 0663bb6cd9a457fbd8ca95c627bb762d07321a39 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 12 May 2008 21:20:59 +0200 Subject: mmiotrace: fix printk format Fix gcc printk format warnings: next-20080415/arch/x86/mm/mmio-mod.c: In function 'print_pte': next-20080415/arch/x86/mm/mmio-mod.c:154: warning: format '%lx' expects type 'long unsigned int', but argument 3 has type 'pteval_t' next-20080415/arch/x86/mm/mmio-mod.c:154: warning: format '%lx' expects type 'long unsigned int', but argument 4 has type 'pteval_t' next-20080415/arch/x86/mm/mmio-mod.c: At top level: next-20080415/arch/x86/mm/mmio-mod.c:403: warning: 'downed_cpus' defined but not used Signed-off-by: Randy Dunlap Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/mm/mmio-mod.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c index ab2bb776d310..6d6cac84c045 100644 --- a/arch/x86/mm/mmio-mod.c +++ b/arch/x86/mm/mmio-mod.c @@ -150,8 +150,9 @@ static void print_pte(unsigned long address) "0x%08lx\n", address); BUG(); } - pr_info(NAME "pte for 0x%lx: 0x%lx 0x%lx\n", address, pte_val(*pte), - pte_val(*pte) & _PAGE_PRESENT); + pr_info(NAME "pte for 0x%lx: 0x%llx 0x%llx\n", address, + (unsigned long long)pte_val(*pte), + (unsigned long long)pte_val(*pte) & _PAGE_PRESENT); } /* -- cgit v1.2.3 From 37b3619257d3190f47f233d7ed626d4b9916462c Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 12 May 2008 21:20:59 +0200 Subject: x86/mmiotrace: uses/depends on PCI Don't try to build mmiotrace when CONFIG_PCI=n. next-20080416/kernel/trace/trace_mmiotrace.c: In function 'mmio_print_pcidev': next-20080416/kernel/trace/trace_mmiotrace.c:62: error: implicit declaration of function 'pci_dev_driver' Signed-off-by: Randy Dunlap Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/Kconfig.debug | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index b28ace2be1a3..1e53df0ba08c 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -173,7 +173,7 @@ config MMIOTRACE_HOOKS config MMIOTRACE bool "Memory mapped IO tracing" - depends on DEBUG_KERNEL + depends on DEBUG_KERNEL && PCI select TRACING select MMIOTRACE_HOOKS default y @@ -181,7 +181,7 @@ config MMIOTRACE Mmiotrace traces Memory Mapped I/O access and is meant for debugging and reverse engineering. It is called from the ioremap implementation and works via page faults. Tracing is disabled by - default and can be enabled run-time. + default and can be enabled at run-time. See Documentation/tracers/mmiotrace.txt. If you are not helping to develop drivers, say N. -- cgit v1.2.3 From 7423d1115f18627666d475fccc7c62394406ff8c Mon Sep 17 00:00:00 2001 From: Pekka Paalanen Date: Mon, 12 May 2008 21:21:02 +0200 Subject: x86 mmiotrace: dynamically disable non-boot CPUs From 8979ee55cb6a429c4edd72ebec2244b849f6a79a Mon Sep 17 00:00:00 2001 From: Pekka Paalanen Date: Sat, 12 Apr 2008 00:18:57 +0300 Mmiotrace is not reliable with multiple CPUs and may miss events. Drop to single CPU when mmiotrace is activated. 
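As background for this and the earlier kmmio patches, here is how the probe
interface is meant to be consumed. This is a minimal sketch of a hypothetical
built-in client, modeled on the pre()/post() handlers and the
register/unregister calls in mmio-mod.c above; the probed address is made up,
the header path is assumed to be linux/mmiotrace.h as in this series, and a
real caller must probe only a live ioremapped range.

/* Hypothetical kmmio client; names and the probed range are illustrative. */
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/mmiotrace.h>	/* assumed home of the kmmio API here */
#include <linux/rcupdate.h>

static void my_pre(struct kmmio_probe *p, struct pt_regs *regs,
		   unsigned long addr)
{
	/* Called from the page fault, before the access is single-stepped. */
	pr_info("kmmio demo: hit at 0x%08lx\n", addr);
}

static void my_post(struct kmmio_probe *p, unsigned long condition,
		    struct pt_regs *regs)
{
	/* Called from the debug trap, after the access has completed. */
}

static struct kmmio_probe my_probe = {
	.addr = 0xffffc20000000000UL,	/* made up; must be ioremapped */
	.len = PAGE_SIZE,
	.pre_handler = my_pre,
	.post_handler = my_post,
};

static int my_attach(void)
{
	return register_kmmio_probe(&my_probe);	/* -EEXIST if already probed */
}

static void my_detach(void)
{
	unregister_kmmio_probe(&my_probe);
	/* Required before my_probe may be reused or freed; see kmmio.c. */
	synchronize_rcu();
}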
Signed-off-by: Pekka Paalanen Signed-off-by: Ingo Molnar --- arch/x86/mm/mmio-mod.c | 61 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c index 6d6cac84c045..1f77d8532037 100644 --- a/arch/x86/mm/mmio-mod.c +++ b/arch/x86/mm/mmio-mod.c @@ -32,6 +32,7 @@ #include /* for ISA_START_ADDRESS */ #include #include +#include #include "pf_in.h" @@ -400,6 +401,64 @@ static void clear_trace_list(void) } } +#ifdef CONFIG_HOTPLUG_CPU +static cpumask_t downed_cpus; + +static void enter_uniprocessor(void) +{ + int cpu; + int err; + + get_online_cpus(); + downed_cpus = cpu_online_map; + cpu_clear(first_cpu(cpu_online_map), downed_cpus); + if (num_online_cpus() > 1) + pr_notice(NAME "Disabling non-boot CPUs...\n"); + put_online_cpus(); + + for_each_cpu_mask(cpu, downed_cpus) { + err = cpu_down(cpu); + if (!err) { + pr_info(NAME "CPU%d is down.\n", cpu); + } else { + pr_err(NAME "Error taking CPU%d down: %d\n", cpu, err); + } + } + if (num_online_cpus() > 1) + pr_warning(NAME "multiple CPUs still online, " + "may miss events.\n"); +} + +static void leave_uniprocessor(void) +{ + int cpu; + int err; + + if (cpus_weight(downed_cpus) == 0) + return; + pr_notice(NAME "Re-enabling CPUs...\n"); + for_each_cpu_mask(cpu, downed_cpus) { + err = cpu_up(cpu); + if (!err) + pr_info(NAME "enabled CPU%d.\n", cpu); + else + pr_err(NAME "cannot re-enable CPU%d: %d\n", cpu, err); + } +} + +#else /* !CONFIG_HOTPLUG_CPU */ +static void enter_uniprocessor(void) +{ + if (num_online_cpus() > 1) + pr_warning(NAME "multiple CPUs are online, may miss events. " + "Suggest booting with maxcpus=1 kernel argument.\n"); +} + +static void leave_uniprocessor(void) +{ +} +#endif + #if 0 /* XXX: out of order */ static struct file_operations fops_marker = { .owner = THIS_MODULE, @@ -422,6 +481,7 @@ void enable_mmiotrace(void) if (nommiotrace) pr_info(NAME "MMIO tracing disabled.\n"); + enter_uniprocessor(); spin_lock_irq(&trace_lock); atomic_inc(&mmiotrace_enabled); spin_unlock_irq(&trace_lock); @@ -442,6 +502,7 @@ void disable_mmiotrace(void) spin_unlock_irq(&trace_lock); clear_trace_list(); /* guarantees: no more kmmio callbacks */ + leave_uniprocessor(); if (marker_file) { debugfs_remove(marker_file); marker_file = NULL; -- cgit v1.2.3 From 970e6fa03885f32cc43e42cb08c73a5f54cd8bd9 Mon Sep 17 00:00:00 2001 From: Pekka Paalanen Date: Mon, 12 May 2008 21:21:03 +0200 Subject: mmiotrace: code style cleanups From c2da03771e29159627c5c7b9509ec70bce9f91ee Mon Sep 17 00:00:00 2001 From: Pekka Paalanen Date: Mon, 28 Apr 2008 21:25:22 +0300 Signed-off-by: Pekka Paalanen Signed-off-by: Ingo Molnar --- arch/x86/mm/kmmio.c | 4 ++-- arch/x86/mm/mmio-mod.c | 7 +++---- arch/x86/mm/testmmiotrace.c | 2 +- include/linux/mmiotrace.h | 6 +----- 4 files changed, 7 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c index 3ad27b8504a5..6a92d9111b64 100644 --- a/arch/x86/mm/kmmio.c +++ b/arch/x86/mm/kmmio.c @@ -17,10 +17,10 @@ #include #include #include -#include +#include #include #include -#include +#include #include #include diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c index 1f77d8532037..a8d2a0019da4 100644 --- a/arch/x86/mm/mmio-mod.c +++ b/arch/x86/mm/mmio-mod.c @@ -24,7 +24,7 @@ #include #include #include -#include +#include #include #include #include @@ -418,11 +418,10 @@ static void enter_uniprocessor(void) for_each_cpu_mask(cpu, downed_cpus) { err = cpu_down(cpu); 
- if (!err) { + if (!err) pr_info(NAME "CPU%d is down.\n", cpu); - } else { + else pr_err(NAME "Error taking CPU%d down: %d\n", cpu, err); - } } if (num_online_cpus() > 1) pr_warning(NAME "multiple CPUs still online, " diff --git a/arch/x86/mm/testmmiotrace.c b/arch/x86/mm/testmmiotrace.c index cfa60b227c8d..d877c5b423ef 100644 --- a/arch/x86/mm/testmmiotrace.c +++ b/arch/x86/mm/testmmiotrace.c @@ -2,7 +2,7 @@ * Written by Pekka Paalanen, 2008 */ #include -#include +#include #define MODULE_NAME "testmmiotrace" diff --git a/include/linux/mmiotrace.h b/include/linux/mmiotrace.h index dd6b64b160fc..de8e91258da7 100644 --- a/include/linux/mmiotrace.h +++ b/include/linux/mmiotrace.h @@ -1,9 +1,7 @@ #ifndef MMIOTRACE_H #define MMIOTRACE_H -#include - -#ifdef __KERNEL__ +#include #include @@ -84,6 +82,4 @@ extern void disable_mmiotrace(void); extern void mmio_trace_rw(struct mmiotrace_rw *rw); extern void mmio_trace_mapping(struct mmiotrace_map *map); -#endif /* __KERNEL__ */ - #endif /* MMIOTRACE_H */ -- cgit v1.2.3 From 87e547fe41a8b57d6d80afc67a0031fbe477eb0d Mon Sep 17 00:00:00 2001 From: Pekka Paalanen Date: Mon, 12 May 2008 21:21:03 +0200 Subject: x86 mmiotrace: fix page-unaligned ioremaps mmiotrace_ioremap() expects to receive the original unaligned map phys address and size. Also fix {un,}register_kmmio_probe() to deal properly with unaligned size. Signed-off-by: Pekka Paalanen Signed-off-by: Ingo Molnar --- arch/x86/mm/ioremap.c | 4 +++- arch/x86/mm/kmmio.c | 13 +++++++++++-- arch/x86/mm/mmio-mod.c | 1 + 3 files changed, 15 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 8927c878544d..a7c80a6e8622 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -123,6 +123,8 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr, { unsigned long pfn, offset, vaddr; resource_size_t last_addr; + const resource_size_t unaligned_phys_addr = phys_addr; + const unsigned long unaligned_size = size; struct vm_struct *area; unsigned long new_prot_val; pgprot_t prot; @@ -236,7 +238,7 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr, } ret_addr = (void __iomem *) (vaddr + offset); - mmiotrace_ioremap(phys_addr, size, ret_addr); + mmiotrace_ioremap(unaligned_phys_addr, unaligned_size, ret_addr); return ret_addr; } diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c index 6a92d9111b64..93b1797666cb 100644 --- a/arch/x86/mm/kmmio.c +++ b/arch/x86/mm/kmmio.c @@ -351,11 +351,19 @@ static void release_kmmio_fault_page(unsigned long page, } } +/* + * With page-unaligned ioremaps, one or two armed pages may contain + * addresses from outside the intended mapping. Events for these addresses + * are currently silently dropped. The events may result only from programming + * mistakes by accessing addresses before the beginning or past the end of a + * mapping. 
+ */ int register_kmmio_probe(struct kmmio_probe *p) { unsigned long flags; int ret = 0; unsigned long size = 0; + const unsigned long size_lim = p->len + (p->addr & ~PAGE_MASK); spin_lock_irqsave(&kmmio_lock, flags); if (get_kmmio_probe(p->addr)) { @@ -364,7 +372,7 @@ int register_kmmio_probe(struct kmmio_probe *p) } kmmio_count++; list_add_rcu(&p->list, &kmmio_probes); - while (size < p->len) { + while (size < size_lim) { if (add_kmmio_fault_page(p->addr + size)) pr_err("kmmio: Unable to set page fault.\n"); size += PAGE_SIZE; @@ -436,11 +444,12 @@ void unregister_kmmio_probe(struct kmmio_probe *p) { unsigned long flags; unsigned long size = 0; + const unsigned long size_lim = p->len + (p->addr & ~PAGE_MASK); struct kmmio_fault_page *release_list = NULL; struct kmmio_delayed_release *drelease; spin_lock_irqsave(&kmmio_lock, flags); - while (size < p->len) { + while (size < size_lim) { release_kmmio_fault_page(p->addr + size, &release_list); size += PAGE_SIZE; } diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c index a8d2a0019da4..278998c1998f 100644 --- a/arch/x86/mm/mmio-mod.c +++ b/arch/x86/mm/mmio-mod.c @@ -280,6 +280,7 @@ static void ioremap_trace_core(unsigned long offset, unsigned long size, { static atomic_t next_id; struct remap_trace *trace = kmalloc(sizeof(*trace), GFP_KERNEL); + /* These are page-unaligned. */ struct mmiotrace_map map = { .phys = offset, .virt = (unsigned long)addr, -- cgit v1.2.3 From dee310d0adf41019aca476052ac3085ff286d9be Mon Sep 17 00:00:00 2001 From: Pekka Paalanen Date: Mon, 12 May 2008 21:21:03 +0200 Subject: x86 mmiotrace: use resource_size_t for phys addresses Signed-off-by: Pekka Paalanen Signed-off-by: Ingo Molnar --- arch/x86/mm/mmio-mod.c | 11 ++++++----- include/linux/mmiotrace.h | 14 +++++++------- kernel/trace/trace_mmiotrace.c | 20 ++++++++++++-------- 3 files changed, 25 insertions(+), 20 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c index 278998c1998f..3b04a0126121 100644 --- a/arch/x86/mm/mmio-mod.c +++ b/arch/x86/mm/mmio-mod.c @@ -48,7 +48,7 @@ struct trap_reason { struct remap_trace { struct list_head list; struct kmmio_probe probe; - unsigned long phys; + resource_size_t phys; unsigned long id; }; @@ -275,7 +275,7 @@ static void post(struct kmmio_probe *p, unsigned long condition, put_cpu_var(pf_reason); } -static void ioremap_trace_core(unsigned long offset, unsigned long size, +static void ioremap_trace_core(resource_size_t offset, unsigned long size, void __iomem *addr) { static atomic_t next_id; @@ -319,13 +319,14 @@ not_enabled: spin_unlock_irq(&trace_lock); } -void -mmiotrace_ioremap(unsigned long offset, unsigned long size, void __iomem *addr) +void mmiotrace_ioremap(resource_size_t offset, unsigned long size, + void __iomem *addr) { if (!is_enabled()) /* recheck and proper locking in *_core() */ return; - pr_debug(NAME "ioremap_*(0x%lx, 0x%lx) = %p\n", offset, size, addr); + pr_debug(NAME "ioremap_*(0x%llx, 0x%lx) = %p\n", + (unsigned long long)offset, size, addr); if ((filter_offset) && (offset != filter_offset)) return; ioremap_trace_core(offset, size, addr); diff --git a/include/linux/mmiotrace.h b/include/linux/mmiotrace.h index de8e91258da7..5cbbc374e945 100644 --- a/include/linux/mmiotrace.h +++ b/include/linux/mmiotrace.h @@ -2,7 +2,6 @@ #define MMIOTRACE_H #include - #include struct kmmio_probe; @@ -37,14 +36,15 @@ extern int kmmio_handler(struct pt_regs *regs, unsigned long addr); /* Called from ioremap.c */ #ifdef CONFIG_MMIOTRACE -extern void 
-mmiotrace_ioremap(unsigned long offset, unsigned long size, void __iomem *addr); +extern void mmiotrace_ioremap(resource_size_t offset, unsigned long size, + void __iomem *addr); extern void mmiotrace_iounmap(volatile void __iomem *addr); #else -static inline void -mmiotrace_ioremap(unsigned long offset, unsigned long size, void __iomem *addr) +static inline void mmiotrace_ioremap(resource_size_t offset, + unsigned long size, void __iomem *addr) { } + static inline void mmiotrace_iounmap(volatile void __iomem *addr) { } @@ -60,7 +60,7 @@ enum mm_io_opcode { }; struct mmiotrace_rw { - unsigned long phys; /* PCI address of register */ + resource_size_t phys; /* PCI address of register */ unsigned long value; unsigned long pc; /* optional program counter */ int map_id; @@ -69,7 +69,7 @@ struct mmiotrace_rw { }; struct mmiotrace_map { - unsigned long phys; /* base address in PCI space */ + resource_size_t phys; /* base address in PCI space */ unsigned long virt; /* base virtual address */ unsigned long len; /* mapping size */ int map_id; diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index 3c1dacdc2d85..b13dc19dcbb4 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -184,20 +184,23 @@ static int mmio_print_rw(struct trace_iterator *iter) switch (entry->mmiorw.opcode) { case MMIO_READ: ret = trace_seq_printf(s, - "R %d %lu.%06lu %d 0x%lx 0x%lx 0x%lx %d\n", - rw->width, secs, usec_rem, rw->map_id, rw->phys, + "R %d %lu.%06lu %d 0x%llx 0x%lx 0x%lx %d\n", + rw->width, secs, usec_rem, rw->map_id, + (unsigned long long)rw->phys, rw->value, rw->pc, 0); break; case MMIO_WRITE: ret = trace_seq_printf(s, - "W %d %lu.%06lu %d 0x%lx 0x%lx 0x%lx %d\n", - rw->width, secs, usec_rem, rw->map_id, rw->phys, + "W %d %lu.%06lu %d 0x%llx 0x%lx 0x%lx %d\n", + rw->width, secs, usec_rem, rw->map_id, + (unsigned long long)rw->phys, rw->value, rw->pc, 0); break; case MMIO_UNKNOWN_OP: ret = trace_seq_printf(s, - "UNKNOWN %lu.%06lu %d 0x%lx %02x,%02x,%02x 0x%lx %d\n", - secs, usec_rem, rw->map_id, rw->phys, + "UNKNOWN %lu.%06lu %d 0x%llx %02x,%02x,%02x 0x%lx %d\n", + secs, usec_rem, rw->map_id, + (unsigned long long)rw->phys, (rw->value >> 16) & 0xff, (rw->value >> 8) & 0xff, (rw->value >> 0) & 0xff, rw->pc, 0); break; @@ -223,8 +226,9 @@ static int mmio_print_map(struct trace_iterator *iter) switch (entry->mmiorw.opcode) { case MMIO_PROBE: ret = trace_seq_printf(s, - "MAP %lu.%06lu %d 0x%lx 0x%lx 0x%lx 0x%lx %d\n", - secs, usec_rem, m->map_id, m->phys, m->virt, m->len, + "MAP %lu.%06lu %d 0x%llx 0x%lx 0x%lx 0x%lx %d\n", + secs, usec_rem, m->map_id, + (unsigned long long)m->phys, m->virt, m->len, 0UL, 0); break; case MMIO_UNPROBE: -- cgit v1.2.3 From a50445d76c22a34ae149704ea5adaef171c8acb7 Mon Sep 17 00:00:00 2001 From: Pekka Paalanen Date: Mon, 12 May 2008 21:21:03 +0200 Subject: mmiotrace: rename kmmio_probe::user_data to ::private.
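No functional change intended. For illustration only, a pre-handler that keeps per-mapping state now reaches it through the renamed member; my_pre and the pr_info() line are hypothetical, not tracer code, and the handler signature follows kmmio_pre_handler_t:

static void my_pre(struct kmmio_probe *p, struct pt_regs *regs,
		   unsigned long addr)
{
	struct remap_trace *trace = p->private;	/* was: p->user_data */

	pr_info(NAME "fault at 0x%lx, map id %lu\n", addr, trace->id);
}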
Signed-off-by: Pekka Paalanen Signed-off-by: Ingo Molnar --- arch/x86/mm/mmio-mod.c | 4 ++-- include/linux/mmiotrace.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c index 3b04a0126121..ed0e0e90b3ef 100644 --- a/arch/x86/mm/mmio-mod.c +++ b/arch/x86/mm/mmio-mod.c @@ -191,7 +191,7 @@ static void pre(struct kmmio_probe *p, struct pt_regs *regs, struct mmiotrace_rw *my_trace = &get_cpu_var(cpu_trace); const unsigned long instptr = instruction_pointer(regs); const enum reason_type type = get_ins_type(instptr); - struct remap_trace *trace = p->user_data; + struct remap_trace *trace = p->private; /* it doesn't make sense to have more than one active trace per cpu */ if (my_reason->active_traces) @@ -299,7 +299,7 @@ static void ioremap_trace_core(resource_size_t offset, unsigned long size, .len = size, .pre_handler = pre, .post_handler = post, - .user_data = trace + .private = trace }, .phys = offset, .id = atomic_inc_return(&next_id) diff --git a/include/linux/mmiotrace.h b/include/linux/mmiotrace.h index 5cbbc374e945..61d19e1b7a0b 100644 --- a/include/linux/mmiotrace.h +++ b/include/linux/mmiotrace.h @@ -18,7 +18,7 @@ struct kmmio_probe { unsigned long len; /* length of the probe region */ kmmio_pre_handler_t pre_handler; /* Called before addr is executed. */ kmmio_post_handler_t post_handler; /* Called after addr is executed */ - void *user_data; + void *private; }; /* kmmio is active by some kmmio_probes? */ -- cgit v1.2.3 From 790e2a290b499b0400254e6870ec27969065d122 Mon Sep 17 00:00:00 2001 From: Pekka Paalanen Date: Mon, 12 May 2008 21:21:14 +0200 Subject: x86 mmiotrace: page level is unsigned Fixes some sparse warnings. Signed-off-by: Pekka Paalanen Signed-off-by: Ingo Molnar --- arch/x86/mm/kmmio.c | 13 +++++++------ arch/x86/mm/mmio-mod.c | 2 +- 2 files changed, 8 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c index 93b1797666cb..b65871e6bba6 100644 --- a/arch/x86/mm/kmmio.c +++ b/arch/x86/mm/kmmio.c @@ -104,11 +104,12 @@ static struct kmmio_fault_page *get_kmmio_fault_page(unsigned long page) return NULL; } -static void set_page_present(unsigned long addr, bool present, int *pglevel) +static void set_page_present(unsigned long addr, bool present, + unsigned int *pglevel) { pteval_t pteval; pmdval_t pmdval; - int level; + unsigned int level; pmd_t *pmd; pte_t *pte = lookup_address(addr, &level); @@ -145,15 +146,15 @@ static void set_page_present(unsigned long addr, bool present, int *pglevel) } /** Mark the given page as not present. Access to it will trigger a fault. */ -static void arm_kmmio_fault_page(unsigned long page, int *page_level) +static void arm_kmmio_fault_page(unsigned long page, unsigned int *pglevel) { - set_page_present(page & PAGE_MASK, false, page_level); + set_page_present(page & PAGE_MASK, false, pglevel); } /** Mark the given page as present. 
*/ -static void disarm_kmmio_fault_page(unsigned long page, int *page_level) +static void disarm_kmmio_fault_page(unsigned long page, unsigned int *pglevel) { - set_page_present(page & PAGE_MASK, true, page_level); + set_page_present(page & PAGE_MASK, true, pglevel); } /* diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c index ed0e0e90b3ef..e7397e108beb 100644 --- a/arch/x86/mm/mmio-mod.c +++ b/arch/x86/mm/mmio-mod.c @@ -137,7 +137,7 @@ static ssize_t write_marker(struct file *file, const char __user *buffer, static void print_pte(unsigned long address) { - int level; + unsigned int level; pte_t *pte = lookup_address(address, &level); if (!pte) { -- cgit v1.2.3 From 6360b1fbb4a939efd34fc770c2ebd927c55506e0 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Mon, 12 May 2008 15:44:41 +0200 Subject: move BUG_TABLE into RODATA Signed-off-by: Jan Beulich Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/avr32/kernel/vmlinux.lds.S | 2 -- arch/parisc/kernel/vmlinux.lds.S | 1 - arch/powerpc/kernel/vmlinux.lds.S | 2 -- arch/s390/kernel/vmlinux.lds.S | 1 - arch/sh/kernel/vmlinux_32.lds.S | 1 - arch/sh/kernel/vmlinux_64.lds.S | 1 - arch/x86/kernel/vmlinux_32.lds.S | 8 +++----- arch/x86/kernel/vmlinux_64.lds.S | 10 ++++------ include/asm-generic/vmlinux.lds.h | 6 ++++++ 9 files changed, 13 insertions(+), 19 deletions(-) (limited to 'arch/x86') diff --git a/arch/avr32/kernel/vmlinux.lds.S b/arch/avr32/kernel/vmlinux.lds.S index 481cfd40c053..bc932c9b4272 100644 --- a/arch/avr32/kernel/vmlinux.lds.S +++ b/arch/avr32/kernel/vmlinux.lds.S @@ -93,8 +93,6 @@ SECTIONS __stop___ex_table = .; } - BUG_TABLE - RODATA . = ALIGN(THREAD_SIZE); diff --git a/arch/parisc/kernel/vmlinux.lds.S b/arch/parisc/kernel/vmlinux.lds.S index 50b4a3a25d0a..ff7d4ff4675a 100644 --- a/arch/parisc/kernel/vmlinux.lds.S +++ b/arch/parisc/kernel/vmlinux.lds.S @@ -66,7 +66,6 @@ SECTIONS _etext = .; RODATA - BUG_TABLE /* writeable */ /* Make sure this is page aligned so diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S index 0c3000bf8d75..53d57d17a894 100644 --- a/arch/powerpc/kernel/vmlinux.lds.S +++ b/arch/powerpc/kernel/vmlinux.lds.S @@ -64,8 +64,6 @@ SECTIONS NOTES - BUG_TABLE - /* * Init sections discarded at runtime */ diff --git a/arch/s390/kernel/vmlinux.lds.S b/arch/s390/kernel/vmlinux.lds.S index b4607155e8d0..76c1e60c92f3 100644 --- a/arch/s390/kernel/vmlinux.lds.S +++ b/arch/s390/kernel/vmlinux.lds.S @@ -40,7 +40,6 @@ SECTIONS _etext = .; /* End of text section */ NOTES :text :note - BUG_TABLE :text RODATA diff --git a/arch/sh/kernel/vmlinux_32.lds.S b/arch/sh/kernel/vmlinux_32.lds.S index c7113786ecd4..7b4b82bd1156 100644 --- a/arch/sh/kernel/vmlinux_32.lds.S +++ b/arch/sh/kernel/vmlinux_32.lds.S @@ -44,7 +44,6 @@ SECTIONS _etext = .; /* End of text section */ - BUG_TABLE NOTES RO_DATA(PAGE_SIZE) diff --git a/arch/sh/kernel/vmlinux_64.lds.S b/arch/sh/kernel/vmlinux_64.lds.S index d1e177009a41..33fa46451406 100644 --- a/arch/sh/kernel/vmlinux_64.lds.S +++ b/arch/sh/kernel/vmlinux_64.lds.S @@ -65,7 +65,6 @@ SECTIONS _etext = .; /* End of text section */ - BUG_TABLE NOTES RO_DATA(PAGE_SIZE) diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S index ce5ed083a1e9..aa0855471c79 100644 --- a/arch/x86/kernel/vmlinux_32.lds.S +++ b/arch/x86/kernel/vmlinux_32.lds.S @@ -49,16 +49,14 @@ SECTIONS _etext = .; /* End of text section */ } :text = 0x9090 + NOTES :text :note + . 
= ALIGN(16); /* Exception table */ __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { __start___ex_table = .; *(__ex_table) __stop___ex_table = .; - } - - NOTES :text :note - - BUG_TABLE :text + } :text = 0x9090 . = ALIGN(4); .tracedata : AT(ADDR(.tracedata) - LOAD_OFFSET) { diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S index fad3674b06a5..d123747af1e4 100644 --- a/arch/x86/kernel/vmlinux_64.lds.S +++ b/arch/x86/kernel/vmlinux_64.lds.S @@ -19,7 +19,7 @@ PHDRS { data PT_LOAD FLAGS(7); /* RWE */ user PT_LOAD FLAGS(7); /* RWE */ data.init PT_LOAD FLAGS(7); /* RWE */ - note PT_NOTE FLAGS(4); /* R__ */ + note PT_NOTE FLAGS(0); /* ___ */ } SECTIONS { @@ -40,16 +40,14 @@ SECTIONS _etext = .; /* End of text section */ } :text = 0x9090 + NOTES :text :note + . = ALIGN(16); /* Exception table */ __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { __start___ex_table = .; *(__ex_table) __stop___ex_table = .; - } - - NOTES :text :note - - BUG_TABLE :text + } :text = 0x9090 RODATA diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index f054778e916c..dd2cc8122ad8 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -67,6 +67,8 @@ *(.rodata1) \ } \ \ + BUG_TABLE \ + \ /* PCI quirks */ \ .pci_fixup : AT(ADDR(.pci_fixup) - LOAD_OFFSET) { \ VMLINUX_SYMBOL(__start_pci_fixups_early) = .; \ @@ -310,6 +312,7 @@ .stab.indexstr 0 : { *(.stab.indexstr) } \ .comment 0 : { *(.comment) } +#ifdef CONFIG_GENERIC_BUG #define BUG_TABLE \ . = ALIGN(8); \ __bug_table : AT(ADDR(__bug_table) - LOAD_OFFSET) { \ *(__bug_table) \ __stop___bug_table = .; \ } +#else +#define BUG_TABLE +#endif #define NOTES \ .notes : AT(ADDR(.notes) - LOAD_OFFSET) { \ -- cgit v1.2.3 From 668a6c3654560aef8741642478973e205a4f02bf Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 19 May 2008 13:35:24 +0200 Subject: - fix mmiotrace + rcu merge interaction Signed-off-by: Thomas Gleixner --- arch/x86/mm/kmmio.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c index b65871e6bba6..93d82038af4b 100644 --- a/arch/x86/mm/kmmio.c +++ b/arch/x86/mm/kmmio.c @@ -6,6 +6,7 @@ */ #include +#include #include #include #include -- cgit v1.2.3 From 1d74f2a0f64b4091e5e91b55ac1b17dff93f4b59 Mon Sep 17 00:00:00 2001 From: Abhishek Sagar Date: Sun, 1 Jun 2008 21:47:42 +0530 Subject: ftrace: remove ftrace_ip_converted() Remove the unneeded function ftrace_ip_converted().
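Every architecture implemented the same idea - read the instruction behind the mcount call site and compare it against the arch nop - but the hash lookup at the top of ftrace_record_ip() already skips call sites that have been recorded (and hence possibly converted). A simplified sketch of the surviving record path; locking, allocation and the hash helper name are illustrative, not the literal kernel/trace/ftrace.c code:

static void record_ip_sketch(unsigned long ip)
{
	unsigned long key = hash_ip(ip);	/* hashed as in ftrace_record_ip() */

	if (ftrace_ip_in_hash(ip, key))
		return;	/* already known; no need to inspect the code */

	/* otherwise allocate a dyn node and add ip to the hash */
}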
Signed-off-by: Abhishek Sagar Signed-off-by: Ingo Molnar --- arch/arm/kernel/ftrace.c | 10 ---------- arch/powerpc/kernel/ftrace.c | 10 ---------- arch/sparc64/kernel/ftrace.c | 7 ------- arch/x86/kernel/ftrace.c | 10 ---------- kernel/trace/ftrace.c | 7 ------- 5 files changed, 44 deletions(-) (limited to 'arch/x86') diff --git a/arch/arm/kernel/ftrace.c b/arch/arm/kernel/ftrace.c index f4cb4cc3fa0c..22f3d6e309f9 100644 --- a/arch/arm/kernel/ftrace.c +++ b/arch/arm/kernel/ftrace.c @@ -22,16 +22,6 @@ static unsigned long bl_insn; static const unsigned long NOP = 0xe1a00000; /* mov r0, r0 */ -/* return true if mcount call site is already patched/no-op'ed */ -int ftrace_ip_converted(unsigned long pc) -{ - unsigned long save; - - pc -= INSN_SIZE; - save = *(unsigned long *)pc; - return save == NOP; -} - unsigned char *ftrace_nop_replace(void) { return (char *)&NOP; diff --git a/arch/powerpc/kernel/ftrace.c b/arch/powerpc/kernel/ftrace.c index 69ed41223468..e12c593ab9ca 100644 --- a/arch/powerpc/kernel/ftrace.c +++ b/arch/powerpc/kernel/ftrace.c @@ -27,16 +27,6 @@ static unsigned int ftrace_nop = 0x60000000; # define GET_ADDR(addr) *(unsigned long *)addr #endif -notrace int ftrace_ip_converted(unsigned long ip) -{ - unsigned int save; - - ip -= CALL_BACK; - save = *(unsigned int *)ip; - - return save == ftrace_nop; -} - static unsigned int notrace ftrace_calc_offset(long ip, long addr) { return (int)((addr + CALL_BACK) - ip); diff --git a/arch/sparc64/kernel/ftrace.c b/arch/sparc64/kernel/ftrace.c index f449e6df6c4a..c17373195b1e 100644 --- a/arch/sparc64/kernel/ftrace.c +++ b/arch/sparc64/kernel/ftrace.c @@ -7,13 +7,6 @@ static const u32 ftrace_nop = 0x01000000; -notrace int ftrace_ip_converted(unsigned long ip) -{ - u32 insn = *(u32 *) ip; - - return (insn == ftrace_nop); -} - notrace unsigned char *ftrace_nop_replace(void) { return (char *)&ftrace_nop; diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 498608c015fb..bc5cf8d46742 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -31,16 +31,6 @@ union ftrace_code_union { } __attribute__((packed)); }; -notrace int ftrace_ip_converted(unsigned long ip) -{ - unsigned long save; - - ip -= CALL_BACK; - save = *(long *)ip; - - return save == *ftrace_nop; -} - static int notrace ftrace_calc_offset(long ip, long addr) { return (int)(addr - ip); diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index ec54cb7d69d6..a8929e4c77c1 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -306,13 +306,6 @@ ftrace_record_ip(unsigned long ip) if (ftrace_ip_in_hash(ip, key)) goto out_unlock; - /* - * There's a slight race that the ftraced will update the - * hash and reset here. If it is already converted, skip it. - */ - if (ftrace_ip_converted(ip)) - goto out_unlock; - node = ftrace_alloc_dyn_node(ip); if (!node) goto out_unlock; -- cgit v1.2.3 From 273c11270d3715c4c06d4df1607a1a60034d887b Mon Sep 17 00:00:00 2001 From: Miklos Vajna Date: Tue, 13 May 2008 18:38:56 +0200 Subject: x86/PCI: janitor work in irq.c Wrapped long lines, removed trailing whitespaces, fixed case indentation inside switch and so on.
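For instance, the router probe switches now keep case labels at the same indentation as the switch keyword, per Documentation/CodingStyle; roughly like this (the device ID and callbacks below are placeholders, not code from this file):

	switch (device) {
	case PCI_DEVICE_ID_EXAMPLE:	/* placeholder ID */
		r->name = "Example";
		r->get = pirq_example_get;
		r->set = pirq_example_set;
		return 1;
	}
	return 0;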
Signed-off-by: Miklos Vajna Signed-off-by: Jesse Barnes --- arch/x86/pci/irq.c | 511 ++++++++++++++++++++++++++++++----------------------- 1 file changed, 294 insertions(+), 217 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c index ca8df9c260bc..c422e10337be 100644 --- a/arch/x86/pci/irq.c +++ b/arch/x86/pci/irq.c @@ -11,8 +11,8 @@ #include #include #include -#include -#include +#include +#include #include #include #include @@ -45,7 +45,8 @@ struct irq_router { char *name; u16 vendor, device; int (*get)(struct pci_dev *router, struct pci_dev *dev, int pirq); - int (*set)(struct pci_dev *router, struct pci_dev *dev, int pirq, int new); + int (*set)(struct pci_dev *router, struct pci_dev *dev, int pirq, + int new); }; struct irq_router_handler { @@ -61,7 +62,7 @@ void (*pcibios_disable_irq)(struct pci_dev *dev) = NULL; * and perform checksum verification. */ -static inline struct irq_routing_table * pirq_check_routing_table(u8 *addr) +static inline struct irq_routing_table *pirq_check_routing_table(u8 *addr) { struct irq_routing_table *rt; int i; @@ -74,10 +75,11 @@ static inline struct irq_routing_table * pirq_check_routing_table(u8 *addr) rt->size < sizeof(struct irq_routing_table)) return NULL; sum = 0; - for (i=0; i < rt->size; i++) + for (i = 0; i < rt->size; i++) sum += addr[i]; if (!sum) { - DBG(KERN_DEBUG "PCI: Interrupt Routing Table found at 0x%p\n", rt); + DBG(KERN_DEBUG "PCI: Interrupt Routing Table found at 0x%p\n", + rt); return rt; } return NULL; @@ -100,7 +102,8 @@ static struct irq_routing_table * __init pirq_find_routing_table(void) return rt; printk(KERN_WARNING "PCI: PIRQ table NOT found at pirqaddr\n"); } - for(addr = (u8 *) __va(0xf0000); addr < (u8 *) __va(0x100000); addr += 16) { + for (addr = (u8 *) __va(0xf0000); addr < (u8 *) __va(0x100000); + addr += 16) { rt = pirq_check_routing_table(addr); if (rt) return rt; @@ -122,20 +125,23 @@ static void __init pirq_peer_trick(void) struct irq_info *e; memset(busmap, 0, sizeof(busmap)); - for(i=0; i < (rt->size - sizeof(struct irq_routing_table)) / sizeof(struct irq_info); i++) { + for (i = 0; i < (rt->size - sizeof(struct irq_routing_table)) / + sizeof(struct irq_info); i++) { e = &rt->slots[i]; #ifdef DEBUG { int j; - DBG(KERN_DEBUG "%02x:%02x slot=%02x", e->bus, e->devfn/8, e->slot); - for(j=0; j<4; j++) - DBG(" %d:%02x/%04x", j, e->irq[j].link, e->irq[j].bitmap); + DBG(KERN_DEBUG "%02x:%02x slot=%02x", e->bus, + e->devfn/8, e->slot); + for (j = 0; j < 4; j++) + DBG(" %d:%02x/%04x", j, e->irq[j].link, + e->irq[j].bitmap); DBG("\n"); } #endif busmap[e->bus] = 1; } - for(i = 1; i < 256; i++) { + for (i = 1; i < 256; i++) { int node; if (!busmap[i] || pci_find_bus(0, i)) continue; @@ -174,7 +180,8 @@ void eisa_set_level_irq(unsigned int irq) * Common IRQ routing practice: nibbles in config space, * offset by some magic constant. */ -static unsigned int read_config_nybble(struct pci_dev *router, unsigned offset, unsigned nr) +static unsigned int +read_config_nybble(struct pci_dev *router, unsigned offset, unsigned nr) { u8 x; unsigned reg = offset + (nr >> 1); @@ -183,7 +190,8 @@ static unsigned int read_config_nybble(struct pci_dev *router, unsigned offset, return (nr & 1) ? 
(x >> 4) : (x & 0xf); } -static void write_config_nybble(struct pci_dev *router, unsigned offset, unsigned nr, unsigned int val) +static void write_config_nybble(struct pci_dev *router, unsigned offset, + unsigned nr, unsigned int val) { u8 x; unsigned reg = offset + (nr >> 1); @@ -200,15 +208,18 @@ static void write_config_nybble(struct pci_dev *router, unsigned offset, unsigne */ static int pirq_ali_get(struct pci_dev *router, struct pci_dev *dev, int pirq) { - static const unsigned char irqmap[16] = { 0, 9, 3, 10, 4, 5, 7, 6, 1, 11, 0, 12, 0, 14, 0, 15 }; + static const unsigned char irqmap[16] = { 0, 9, 3, 10, 4, 5, 7, 6, 1, + 11, 0, 12, 0, 14, 0, 15 }; WARN_ON_ONCE(pirq > 16); return irqmap[read_config_nybble(router, 0x48, pirq-1)]; } -static int pirq_ali_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq) +static int +pirq_ali_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq) { - static const unsigned char irqmap[16] = { 0, 8, 0, 2, 4, 5, 7, 6, 0, 1, 3, 9, 11, 0, 13, 15 }; + static const unsigned char irqmap[16] = { 0, 8, 0, 2, 4, 5, 7, 6, 0, 1, + 3, 9, 11, 0, 13, 15 }; unsigned int val = irqmap[irq]; WARN_ON_ONCE(pirq > 16); @@ -231,7 +242,8 @@ static int pirq_piix_get(struct pci_dev *router, struct pci_dev *dev, int pirq) return (x < 16) ? x : 0; } -static int pirq_piix_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq) +static int +pirq_piix_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq) { pci_write_config_byte(router, pirq, irq); return 1; @@ -247,7 +259,8 @@ static int pirq_via_get(struct pci_dev *router, struct pci_dev *dev, int pirq) return read_config_nybble(router, 0x55, pirq == 4 ? 5 : pirq); } -static int pirq_via_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq) +static int +pirq_via_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq) { write_config_nybble(router, 0x55, pirq == 4 ? 5 : pirq, irq); return 1; @@ -258,7 +271,8 @@ static int pirq_via_set(struct pci_dev *router, struct pci_dev *dev, int pirq, i * but without the ugly irq number munging. * However, for 82C586, nibble map is different . 
*/ -static int pirq_via586_get(struct pci_dev *router, struct pci_dev *dev, int pirq) +static int +pirq_via586_get(struct pci_dev *router, struct pci_dev *dev, int pirq) { static const unsigned int pirqmap[5] = { 3, 2, 5, 1, 1 }; @@ -266,7 +280,8 @@ static int pirq_via586_get(struct pci_dev *router, struct pci_dev *dev, int pirq return read_config_nybble(router, 0x55, pirqmap[pirq-1]); } -static int pirq_via586_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq) +static int +pirq_via586_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq) { static const unsigned int pirqmap[5] = { 3, 2, 5, 1, 1 }; @@ -285,10 +300,11 @@ static int pirq_ite_get(struct pci_dev *router, struct pci_dev *dev, int pirq) static const unsigned char pirqmap[4] = { 1, 0, 2, 3 }; WARN_ON_ONCE(pirq > 4); - return read_config_nybble(router,0x43, pirqmap[pirq-1]); + return read_config_nybble(router, 0x43, pirqmap[pirq-1]); } -static int pirq_ite_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq) +static int +pirq_ite_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq) { static const unsigned char pirqmap[4] = { 1, 0, 2, 3 }; @@ -306,7 +322,8 @@ static int pirq_opti_get(struct pci_dev *router, struct pci_dev *dev, int pirq) return read_config_nybble(router, 0xb8, pirq >> 4); } -static int pirq_opti_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq) +static int +pirq_opti_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq) { write_config_nybble(router, 0xb8, pirq >> 4, irq); return 1; @@ -314,7 +331,7 @@ static int pirq_opti_set(struct pci_dev *router, struct pci_dev *dev, int pirq, /* * Cyrix: nibble offset 0x5C - * 0x5C bits 7:4 is INTB bits 3:0 is INTA + * 0x5C bits 7:4 is INTB bits 3:0 is INTA * 0x5D bits 7:4 is INTD bits 3:0 is INTC */ static int pirq_cyrix_get(struct pci_dev *router, struct pci_dev *dev, int pirq) @@ -322,7 +339,8 @@ static int pirq_cyrix_get(struct pci_dev *router, struct pci_dev *dev, int pirq) return read_config_nybble(router, 0x5C, (pirq-1)^1); } -static int pirq_cyrix_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq) +static int +pirq_cyrix_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq) { write_config_nybble(router, 0x5C, (pirq-1)^1, irq); return 1; @@ -350,7 +368,7 @@ static int pirq_cyrix_set(struct pci_dev *router, struct pci_dev *dev, int pirq, * Apparently there are systems implementing PCI routing table using * link values 0x01-0x04 and others using 0x41-0x44 for PCI INTA..D. * We try our best to handle both link mappings. - * + * * Currently (2003-05-21) it appears most SiS chipsets follow the * definition of routing registers from the SiS-5595 southbridge. * According to the SiS 5595 datasheets the revision id's of the @@ -370,7 +388,7 @@ static int pirq_cyrix_set(struct pci_dev *router, struct pci_dev *dev, int pirq, * * 0x62: USBIRQ: * bit 6 OHCI function disabled (0), enabled (1) - * + * * 0x6a: ACPI/SCI IRQ: bits 4-6 reserved * * 0x7e: Data Acq. Module IRQ - bits 4-6 reserved @@ -405,7 +423,8 @@ static int pirq_sis_get(struct pci_dev *router, struct pci_dev *dev, int pirq) return (x & PIRQ_SIS_IRQ_DISABLE) ? 
0 : (x & PIRQ_SIS_IRQ_MASK); } -static int pirq_sis_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq) +static int +pirq_sis_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq) { u8 x; int reg; @@ -439,7 +458,8 @@ static int pirq_vlsi_get(struct pci_dev *router, struct pci_dev *dev, int pirq) return read_config_nybble(router, 0x74, pirq-1); } -static int pirq_vlsi_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq) +static int +pirq_vlsi_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq) { WARN_ON_ONCE(pirq >= 9); if (pirq > 8) { @@ -461,13 +481,15 @@ static int pirq_vlsi_set(struct pci_dev *router, struct pci_dev *dev, int pirq, * 0x00 for ACPI (SCI), 0x01 for USB, 0x02 for IDE0, 0x04 for IDE1, * and 0x03 for SMBus. */ -static int pirq_serverworks_get(struct pci_dev *router, struct pci_dev *dev, int pirq) +static int +pirq_serverworks_get(struct pci_dev *router, struct pci_dev *dev, int pirq) { outb(pirq, 0xc00); return inb(0xc01) & 0xf; } -static int pirq_serverworks_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq) +static int pirq_serverworks_set(struct pci_dev *router, struct pci_dev *dev, + int pirq, int irq) { outb(pirq, 0xc00); outb(irq, 0xc01); @@ -482,27 +504,27 @@ static int pirq_serverworks_set(struct pci_dev *router, struct pci_dev *dev, int * offset 0x56 0-3 PIRQA 4-7 PIRQB * offset 0x57 0-3 PIRQC 4-7 PIRQD */ -static int pirq_amd756_get(struct pci_dev *router, struct pci_dev *dev, int pirq) +static int +pirq_amd756_get(struct pci_dev *router, struct pci_dev *dev, int pirq) { u8 irq; irq = 0; if (pirq <= 4) - { irq = read_config_nybble(router, 0x56, pirq - 1); - } - printk(KERN_INFO "AMD756: dev %04x:%04x, router pirq : %d get irq : %2d\n", + printk(KERN_INFO + "AMD756: dev %04x:%04x, router pirq : %d get irq : %2d\n", dev->vendor, dev->device, pirq, irq); return irq; } -static int pirq_amd756_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq) +static int +pirq_amd756_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq) { - printk(KERN_INFO "AMD756: dev %04x:%04x, router pirq : %d SET irq : %2d\n", + printk(KERN_INFO + "AMD756: dev %04x:%04x, router pirq : %d SET irq : %2d\n", dev->vendor, dev->device, pirq, irq); if (pirq <= 4) - { write_config_nybble(router, 0x56, pirq - 1, irq); - } return 1; } @@ -528,7 +550,8 @@ static int pirq_pico_set(struct pci_dev *router, struct pci_dev *dev, int pirq, #ifdef CONFIG_PCI_BIOS -static int pirq_bios_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq) +static int +pirq_bios_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq) { struct pci_dev *bridge; int pin = pci_get_interrupt_pin(dev, &bridge); @@ -537,11 +560,14 @@ static int pirq_bios_set(struct pci_dev *router, struct pci_dev *dev, int pirq, #endif -static __init int intel_router_probe(struct irq_router *r, struct pci_dev *router, u16 device) +static __init int +intel_router_probe(struct irq_router *r, struct pci_dev *router, u16 device) { static struct pci_device_id __initdata pirq_440gx[] = { - { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82443GX_0) }, - { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82443GX_2) }, + { PCI_DEVICE(PCI_VENDOR_ID_INTEL, + PCI_DEVICE_ID_INTEL_82443GX_0) }, + { PCI_DEVICE(PCI_VENDOR_ID_INTEL, + PCI_DEVICE_ID_INTEL_82443GX_2) }, { }, }; @@ -549,50 +575,49 @@ static __init int intel_router_probe(struct irq_router *r, struct pci_dev *route if (pci_dev_present(pirq_440gx)) return 0; - 
switch(device) - { - case PCI_DEVICE_ID_INTEL_82371FB_0: - case PCI_DEVICE_ID_INTEL_82371SB_0: - case PCI_DEVICE_ID_INTEL_82371AB_0: - case PCI_DEVICE_ID_INTEL_82371MX: - case PCI_DEVICE_ID_INTEL_82443MX_0: - case PCI_DEVICE_ID_INTEL_82801AA_0: - case PCI_DEVICE_ID_INTEL_82801AB_0: - case PCI_DEVICE_ID_INTEL_82801BA_0: - case PCI_DEVICE_ID_INTEL_82801BA_10: - case PCI_DEVICE_ID_INTEL_82801CA_0: - case PCI_DEVICE_ID_INTEL_82801CA_12: - case PCI_DEVICE_ID_INTEL_82801DB_0: - case PCI_DEVICE_ID_INTEL_82801E_0: - case PCI_DEVICE_ID_INTEL_82801EB_0: - case PCI_DEVICE_ID_INTEL_ESB_1: - case PCI_DEVICE_ID_INTEL_ICH6_0: - case PCI_DEVICE_ID_INTEL_ICH6_1: - case PCI_DEVICE_ID_INTEL_ICH7_0: - case PCI_DEVICE_ID_INTEL_ICH7_1: - case PCI_DEVICE_ID_INTEL_ICH7_30: - case PCI_DEVICE_ID_INTEL_ICH7_31: - case PCI_DEVICE_ID_INTEL_ESB2_0: - case PCI_DEVICE_ID_INTEL_ICH8_0: - case PCI_DEVICE_ID_INTEL_ICH8_1: - case PCI_DEVICE_ID_INTEL_ICH8_2: - case PCI_DEVICE_ID_INTEL_ICH8_3: - case PCI_DEVICE_ID_INTEL_ICH8_4: - case PCI_DEVICE_ID_INTEL_ICH9_0: - case PCI_DEVICE_ID_INTEL_ICH9_1: - case PCI_DEVICE_ID_INTEL_ICH9_2: - case PCI_DEVICE_ID_INTEL_ICH9_3: - case PCI_DEVICE_ID_INTEL_ICH9_4: - case PCI_DEVICE_ID_INTEL_ICH9_5: - case PCI_DEVICE_ID_INTEL_TOLAPAI_0: - case PCI_DEVICE_ID_INTEL_ICH10_0: - case PCI_DEVICE_ID_INTEL_ICH10_1: - case PCI_DEVICE_ID_INTEL_ICH10_2: - case PCI_DEVICE_ID_INTEL_ICH10_3: - r->name = "PIIX/ICH"; - r->get = pirq_piix_get; - r->set = pirq_piix_set; - return 1; + switch (device) { + case PCI_DEVICE_ID_INTEL_82371FB_0: + case PCI_DEVICE_ID_INTEL_82371SB_0: + case PCI_DEVICE_ID_INTEL_82371AB_0: + case PCI_DEVICE_ID_INTEL_82371MX: + case PCI_DEVICE_ID_INTEL_82443MX_0: + case PCI_DEVICE_ID_INTEL_82801AA_0: + case PCI_DEVICE_ID_INTEL_82801AB_0: + case PCI_DEVICE_ID_INTEL_82801BA_0: + case PCI_DEVICE_ID_INTEL_82801BA_10: + case PCI_DEVICE_ID_INTEL_82801CA_0: + case PCI_DEVICE_ID_INTEL_82801CA_12: + case PCI_DEVICE_ID_INTEL_82801DB_0: + case PCI_DEVICE_ID_INTEL_82801E_0: + case PCI_DEVICE_ID_INTEL_82801EB_0: + case PCI_DEVICE_ID_INTEL_ESB_1: + case PCI_DEVICE_ID_INTEL_ICH6_0: + case PCI_DEVICE_ID_INTEL_ICH6_1: + case PCI_DEVICE_ID_INTEL_ICH7_0: + case PCI_DEVICE_ID_INTEL_ICH7_1: + case PCI_DEVICE_ID_INTEL_ICH7_30: + case PCI_DEVICE_ID_INTEL_ICH7_31: + case PCI_DEVICE_ID_INTEL_ESB2_0: + case PCI_DEVICE_ID_INTEL_ICH8_0: + case PCI_DEVICE_ID_INTEL_ICH8_1: + case PCI_DEVICE_ID_INTEL_ICH8_2: + case PCI_DEVICE_ID_INTEL_ICH8_3: + case PCI_DEVICE_ID_INTEL_ICH8_4: + case PCI_DEVICE_ID_INTEL_ICH9_0: + case PCI_DEVICE_ID_INTEL_ICH9_1: + case PCI_DEVICE_ID_INTEL_ICH9_2: + case PCI_DEVICE_ID_INTEL_ICH9_3: + case PCI_DEVICE_ID_INTEL_ICH9_4: + case PCI_DEVICE_ID_INTEL_ICH9_5: + case PCI_DEVICE_ID_INTEL_TOLAPAI_0: + case PCI_DEVICE_ID_INTEL_ICH10_0: + case PCI_DEVICE_ID_INTEL_ICH10_1: + case PCI_DEVICE_ID_INTEL_ICH10_2: + case PCI_DEVICE_ID_INTEL_ICH10_3: + r->name = "PIIX/ICH"; + r->get = pirq_piix_get; + r->set = pirq_piix_set; + return 1; } return 0; } @@ -606,7 +631,7 @@ static __init int via_router_probe(struct irq_router *r, * workarounds for some buggy BIOSes */ if (device == PCI_DEVICE_ID_VIA_82C586_0) { - switch(router->device) { + switch (router->device) { case PCI_DEVICE_ID_VIA_82C686: /* * Asus k7m bios wrongly reports 82C686A @@ -631,7 +656,7 @@ static __init int via_router_probe(struct irq_router *r, } } - switch(device) { + switch (device) { case PCI_DEVICE_ID_VIA_82C586_0: r->name = "VIA"; r->get = pirq_via586_get; @@ -652,88 +677,89 @@ static __init int via_router_probe(struct irq_router *r, 
return 0; } -static __init int vlsi_router_probe(struct irq_router *r, struct pci_dev *router, u16 device) +static __init int +vlsi_router_probe(struct irq_router *r, struct pci_dev *router, u16 device) { - switch(device) - { - case PCI_DEVICE_ID_VLSI_82C534: - r->name = "VLSI 82C534"; - r->get = pirq_vlsi_get; - r->set = pirq_vlsi_set; - return 1; + switch (device) { + case PCI_DEVICE_ID_VLSI_82C534: + r->name = "VLSI 82C534"; + r->get = pirq_vlsi_get; + r->set = pirq_vlsi_set; + return 1; } return 0; } -static __init int serverworks_router_probe(struct irq_router *r, struct pci_dev *router, u16 device) +static __init int serverworks_router_probe(struct irq_router *r, + struct pci_dev *router, u16 device) { - switch(device) - { - case PCI_DEVICE_ID_SERVERWORKS_OSB4: - case PCI_DEVICE_ID_SERVERWORKS_CSB5: - r->name = "ServerWorks"; - r->get = pirq_serverworks_get; - r->set = pirq_serverworks_set; - return 1; + switch (device) { + case PCI_DEVICE_ID_SERVERWORKS_OSB4: + case PCI_DEVICE_ID_SERVERWORKS_CSB5: + r->name = "ServerWorks"; + r->get = pirq_serverworks_get; + r->set = pirq_serverworks_set; + return 1; } return 0; } -static __init int sis_router_probe(struct irq_router *r, struct pci_dev *router, u16 device) +static __init int +sis_router_probe(struct irq_router *r, struct pci_dev *router, u16 device) { if (device != PCI_DEVICE_ID_SI_503) return 0; - + r->name = "SIS"; r->get = pirq_sis_get; r->set = pirq_sis_set; return 1; } -static __init int cyrix_router_probe(struct irq_router *r, struct pci_dev *router, u16 device) +static __init int +cyrix_router_probe(struct irq_router *r, struct pci_dev *router, u16 device) { - switch(device) - { - case PCI_DEVICE_ID_CYRIX_5520: - r->name = "NatSemi"; - r->get = pirq_cyrix_get; - r->set = pirq_cyrix_set; - return 1; + switch (device) { + case PCI_DEVICE_ID_CYRIX_5520: + r->name = "NatSemi"; + r->get = pirq_cyrix_get; + r->set = pirq_cyrix_set; + return 1; } return 0; } -static __init int opti_router_probe(struct irq_router *r, struct pci_dev *router, u16 device) +static __init int +opti_router_probe(struct irq_router *r, struct pci_dev *router, u16 device) { - switch(device) - { - case PCI_DEVICE_ID_OPTI_82C700: - r->name = "OPTI"; - r->get = pirq_opti_get; - r->set = pirq_opti_set; - return 1; + switch (device) { + case PCI_DEVICE_ID_OPTI_82C700: + r->name = "OPTI"; + r->get = pirq_opti_get; + r->set = pirq_opti_set; + return 1; } return 0; } -static __init int ite_router_probe(struct irq_router *r, struct pci_dev *router, u16 device) +static __init int +ite_router_probe(struct irq_router *r, struct pci_dev *router, u16 device) { - switch(device) - { - case PCI_DEVICE_ID_ITE_IT8330G_0: - r->name = "ITE"; - r->get = pirq_ite_get; - r->set = pirq_ite_set; - return 1; + switch (device) { + case PCI_DEVICE_ID_ITE_IT8330G_0: + r->name = "ITE"; + r->get = pirq_ite_get; + r->set = pirq_ite_set; + return 1; } return 0; } -static __init int ali_router_probe(struct irq_router *r, struct pci_dev *router, u16 device) +static __init int +ali_router_probe(struct irq_router *r, struct pci_dev *router, u16 device) { - switch(device) - { + switch (device) { case PCI_DEVICE_ID_AL_M1533: case PCI_DEVICE_ID_AL_M1563: printk(KERN_DEBUG "PCI: Using ALI IRQ Router\n"); @@ -745,28 +771,29 @@ static __init int ali_router_probe(struct irq_router *r, struct pci_dev *router, return 0; } -static __init int amd_router_probe(struct irq_router *r, struct pci_dev *router, u16 device) +static __init int +amd_router_probe(struct irq_router *r, struct pci_dev *router, u16 
device) { - switch(device) - { - case PCI_DEVICE_ID_AMD_VIPER_740B: - r->name = "AMD756"; - break; - case PCI_DEVICE_ID_AMD_VIPER_7413: - r->name = "AMD766"; - break; - case PCI_DEVICE_ID_AMD_VIPER_7443: - r->name = "AMD768"; - break; - default: - return 0; + switch (device) { + case PCI_DEVICE_ID_AMD_VIPER_740B: + r->name = "AMD756"; + break; + case PCI_DEVICE_ID_AMD_VIPER_7413: + r->name = "AMD766"; + break; + case PCI_DEVICE_ID_AMD_VIPER_7443: + r->name = "AMD768"; + break; + default: + return 0; } r->get = pirq_amd756_get; r->set = pirq_amd756_set; return 1; } - -static __init int pico_router_probe(struct irq_router *r, struct pci_dev *router, u16 device) + +static __init int +pico_router_probe(struct irq_router *r, struct pci_dev *router, u16 device) { switch (device) { case PCI_DEVICE_ID_PICOPOWER_PT86C523: @@ -807,7 +834,7 @@ static struct pci_dev *pirq_router_dev; * FIXME: should we have an option to say "generic for * chipset" ? */ - + static void __init pirq_find_router(struct irq_router *r) { struct irq_routing_table *rt = pirq_table; @@ -826,7 +853,7 @@ static void __init pirq_find_router(struct irq_router *r) r->name = "default"; r->get = NULL; r->set = NULL; - + DBG(KERN_DEBUG "PCI: Attempting to find IRQ router for %04x:%04x\n", rt->rtr_vendor, rt->rtr_device); @@ -837,12 +864,14 @@ static void __init pirq_find_router(struct irq_router *r) return; } - for( h = pirq_routers; h->vendor; h++) { + for (h = pirq_routers; h->vendor; h++) { /* First look for a router match */ - if (rt->rtr_vendor == h->vendor && h->probe(r, pirq_router_dev, rt->rtr_device)) + if (rt->rtr_vendor == h->vendor && + h->probe(r, pirq_router_dev, rt->rtr_device)) break; /* Fall back to a device match */ - if (pirq_router_dev->vendor == h->vendor && h->probe(r, pirq_router_dev, pirq_router_dev->device)) + if (pirq_router_dev->vendor == h->vendor && + h->probe(r, pirq_router_dev, pirq_router_dev->device)) break; } printk(KERN_INFO "PCI: Using IRQ router %s [%04x/%04x] at %s\n", @@ -857,11 +886,13 @@ static void __init pirq_find_router(struct irq_router *r) static struct irq_info *pirq_get_info(struct pci_dev *dev) { struct irq_routing_table *rt = pirq_table; - int entries = (rt->size - sizeof(struct irq_routing_table)) / sizeof(struct irq_info); + int entries = (rt->size - sizeof(struct irq_routing_table)) / + sizeof(struct irq_info); struct irq_info *info; for (info = rt->slots; entries--; info++) - if (info->bus == dev->bus->number && PCI_SLOT(info->devfn) == PCI_SLOT(dev->devfn)) + if (info->bus == dev->bus->number && + PCI_SLOT(info->devfn) == PCI_SLOT(dev->devfn)) return info; return NULL; } @@ -889,7 +920,7 @@ static int pcibios_lookup_irq(struct pci_dev *dev, int assign) if (!pirq_table) return 0; - + DBG(KERN_DEBUG "IRQ for %s[%c]", pci_name(dev), 'A' + pin); info = pirq_get_info(dev); if (!info) { @@ -902,7 +933,8 @@ static int pcibios_lookup_irq(struct pci_dev *dev, int assign) DBG(" -> not routed\n" KERN_DEBUG); return 0; } - DBG(" -> PIRQ %02x, mask %04x, excl %04x", pirq, mask, pirq_table->exclusive_irqs); + DBG(" -> PIRQ %02x, mask %04x, excl %04x", pirq, mask, + pirq_table->exclusive_irqs); mask &= pcibios_irq_mask; /* Work around broken HP Pavilion Notebooks which assign USB to @@ -915,7 +947,8 @@ static int pcibios_lookup_irq(struct pci_dev *dev, int assign) } /* same for Acer Travelmate 360, but with CB and irq 11 -> 10 */ - if (acer_tm360_irqrouting && dev->irq == 11 && dev->vendor == PCI_VENDOR_ID_O2) { + if (acer_tm360_irqrouting && dev->irq == 11 && + dev->vendor == PCI_VENDOR_ID_O2) 
{ pirq = 0x68; mask = 0x400; dev->irq = r->get(pirq_router_dev, dev, pirq); @@ -928,17 +961,20 @@ static int pcibios_lookup_irq(struct pci_dev *dev, int assign) */ newirq = dev->irq; if (newirq && !((1 << newirq) & mask)) { - if ( pci_probe & PCI_USE_PIRQ_MASK) newirq = 0; - else printk("\n" KERN_WARNING - "PCI: IRQ %i for device %s doesn't match PIRQ mask " - "- try pci=usepirqmask\n" KERN_DEBUG, newirq, - pci_name(dev)); + if (pci_probe & PCI_USE_PIRQ_MASK) + newirq = 0; + else + printk("\n" KERN_WARNING + "PCI: IRQ %i for device %s doesn't match PIRQ mask - try pci=usepirqmask\n" + KERN_DEBUG, newirq, + pci_name(dev)); } if (!newirq && assign) { for (i = 0; i < 16; i++) { if (!(mask & (1 << i))) continue; - if (pirq_penalty[i] < pirq_penalty[newirq] && can_request_irq(i, IRQF_SHARED)) + if (pirq_penalty[i] < pirq_penalty[newirq] && + can_request_irq(i, IRQF_SHARED)) newirq = i; } } @@ -949,12 +985,13 @@ static int pcibios_lookup_irq(struct pci_dev *dev, int assign) irq = pirq & 0xf; DBG(" -> hardcoded IRQ %d\n", irq); msg = "Hardcoded"; - } else if ( r->get && (irq = r->get(pirq_router_dev, dev, pirq)) && \ - ((!(pci_probe & PCI_USE_PIRQ_MASK)) || ((1 << irq) & mask)) ) { + } else if (r->get && (irq = r->get(pirq_router_dev, dev, pirq)) && \ + ((!(pci_probe & PCI_USE_PIRQ_MASK)) || ((1 << irq) & mask))) { DBG(" -> got IRQ %d\n", irq); msg = "Found"; eisa_set_level_irq(irq); - } else if (newirq && r->set && (dev->class >> 8) != PCI_CLASS_DISPLAY_VGA) { + } else if (newirq && r->set && + (dev->class >> 8) != PCI_CLASS_DISPLAY_VGA) { DBG(" -> assigning IRQ %d", newirq); if (r->set(pirq_router_dev, dev, pirq, newirq)) { eisa_set_level_irq(newirq); @@ -972,7 +1009,8 @@ static int pcibios_lookup_irq(struct pci_dev *dev, int assign) } else return 0; } - printk(KERN_INFO "PCI: %s IRQ %d for device %s\n", msg, irq, pci_name(dev)); + printk(KERN_INFO "PCI: %s IRQ %d for device %s\n", msg, irq, + pci_name(dev)); /* Update IRQ for all devices with the same pirq value */ while ((dev2 = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev2)) != NULL) { @@ -984,20 +1022,26 @@ static int pcibios_lookup_irq(struct pci_dev *dev, int assign) if (!info) continue; if (info->irq[pin].link == pirq) { - /* We refuse to override the dev->irq information. Give a warning! */ - if ( dev2->irq && dev2->irq != irq && \ + /* + * We refuse to override the dev->irq + * information. Give a warning! + */ + if (dev2->irq && dev2->irq != irq && \ (!(pci_probe & PCI_USE_PIRQ_MASK) || \ - ((1 << dev2->irq) & mask)) ) { + ((1 << dev2->irq) & mask))) { #ifndef CONFIG_PCI_MSI - printk(KERN_INFO "IRQ routing conflict for %s, have irq %d, want irq %d\n", - pci_name(dev2), dev2->irq, irq); + printk(KERN_INFO + "IRQ routing conflict for %s, have irq %d, want irq %d\n", + pci_name(dev2), dev2->irq, irq); #endif - continue; - } + continue; + } dev2->irq = irq; pirq_penalty[irq]++; if (dev != dev2) - printk(KERN_INFO "PCI: Sharing IRQ %d with %s\n", irq, pci_name(dev2)); + printk(KERN_INFO + "PCI: Sharing IRQ %d with %s\n", + irq, pci_name(dev2)); } } return 1; @@ -1011,15 +1055,21 @@ static void __init pcibios_fixup_irqs(void) DBG(KERN_DEBUG "PCI: IRQ fixup\n"); while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { /* - * If the BIOS has set an out of range IRQ number, just ignore it. - * Also keep track of which IRQ's are already in use. + * If the BIOS has set an out of range IRQ number, just + * ignore it. Also keep track of which IRQ's are + * already in use. 
*/ if (dev->irq >= 16) { - DBG(KERN_DEBUG "%s: ignoring bogus IRQ %d\n", pci_name(dev), dev->irq); + DBG(KERN_DEBUG "%s: ignoring bogus IRQ %d\n", + pci_name(dev), dev->irq); dev->irq = 0; } - /* If the IRQ is already assigned to a PCI device, ignore its ISA use penalty */ - if (pirq_penalty[dev->irq] >= 100 && pirq_penalty[dev->irq] < 100000) + /* + * If the IRQ is already assigned to a PCI device, + * ignore its ISA use penalty + */ + if (pirq_penalty[dev->irq] >= 100 && + pirq_penalty[dev->irq] < 100000) pirq_penalty[dev->irq] = 0; pirq_penalty[dev->irq]++; } @@ -1031,31 +1081,40 @@ static void __init pcibios_fixup_irqs(void) /* * Recalculate IRQ numbers if we use the I/O APIC. */ - if (io_apic_assign_pci_irqs) - { + if (io_apic_assign_pci_irqs) { int irq; if (pin) { - pin--; /* interrupt pins are numbered starting from 1 */ - irq = IO_APIC_get_PCI_irq_vector(dev->bus->number, PCI_SLOT(dev->devfn), pin); + /* + * interrupt pins are numbered starting + * from 1 + */ + pin--; + irq = IO_APIC_get_PCI_irq_vector(dev->bus->number, + PCI_SLOT(dev->devfn), pin); /* * Busses behind bridges are typically not listed in the MP-table. * In this case we have to look up the IRQ based on the parent bus, * parent slot, and pin number. The SMP code detects such bridged * busses itself so we should get into this branch reliably. */ - if (irq < 0 && dev->bus->parent) { /* go back to the bridge */ - struct pci_dev * bridge = dev->bus->self; + if (irq < 0 && dev->bus->parent) { + /* go back to the bridge */ + struct pci_dev *bridge = dev->bus->self; pin = (pin + PCI_SLOT(dev->devfn)) % 4; - irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number, - PCI_SLOT(bridge->devfn), pin); + irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number, + PCI_SLOT(bridge->devfn), + pin); if (irq >= 0) - printk(KERN_WARNING "PCI: using PPB %s[%c] to get irq %d\n", - pci_name(bridge), 'A' + pin, irq); + printk(KERN_WARNING + "PCI: using PPB %s[%c] to get irq %d\n", + pci_name(bridge), + 'A' + pin, irq); } if (irq >= 0) { - printk(KERN_INFO "PCI->APIC IRQ transform: %s[%c] -> IRQ %d\n", + printk(KERN_INFO + "PCI->APIC IRQ transform: %s[%c] -> IRQ %d\n", pci_name(dev), 'A' + pin, irq); dev->irq = irq; } @@ -1078,7 +1137,8 @@ static int __init fix_broken_hp_bios_irq9(const struct dmi_system_id *d) { if (!broken_hp_bios_irq9) { broken_hp_bios_irq9 = 1; - printk(KERN_INFO "%s detected - fixing broken IRQ routing\n", d->ident); + printk(KERN_INFO "%s detected - fixing broken IRQ routing\n", + d->ident); } return 0; } @@ -1091,7 +1151,8 @@ static int __init fix_acer_tm360_irqrouting(const struct dmi_system_id *d) { if (!acer_tm360_irqrouting) { acer_tm360_irqrouting = 1; - printk(KERN_INFO "%s detected - fixing broken IRQ routing\n", d->ident); + printk(KERN_INFO "%s detected - fixing broken IRQ routing\n", + d->ident); } return 0; } @@ -1103,7 +1164,8 @@ static struct dmi_system_id __initdata pciirq_dmi_table[] = { .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"), DMI_MATCH(DMI_BIOS_VERSION, "GE.M1.03"), - DMI_MATCH(DMI_PRODUCT_VERSION, "HP Pavilion Notebook Model GE"), + DMI_MATCH(DMI_PRODUCT_VERSION, + "HP Pavilion Notebook Model GE"), DMI_MATCH(DMI_BOARD_VERSION, "OmniBook N32N-736"), }, }, @@ -1138,11 +1200,14 @@ static int __init pcibios_irq_init(void) pirq_find_router(&pirq_router); if (pirq_table->exclusive_irqs) { int i; - for (i=0; i<16; i++) + for (i = 0; i < 16; i++) if (!(pirq_table->exclusive_irqs & (1 << i))) pirq_penalty[i] += 100; } - /* If we're using the I/O APIC, avoid using the PCI IRQ routing table */ + /* + 
* If we're using the I/O APIC, avoid using the PCI IRQ + * routing table + */ if (io_apic_assign_pci_irqs) pirq_table = NULL; } @@ -1189,33 +1254,40 @@ static int pirq_enable_irq(struct pci_dev *dev) if (pin && !pcibios_lookup_irq(dev, 1) && !dev->irq) { char *msg = ""; - pin--; /* interrupt pins are numbered starting from 1 */ + pin--; /* interrupt pins are numbered starting from 1 */ if (io_apic_assign_pci_irqs) { int irq; - irq = IO_APIC_get_PCI_irq_vector(dev->bus->number, PCI_SLOT(dev->devfn), pin); + irq = IO_APIC_get_PCI_irq_vector(dev->bus->number, + PCI_SLOT(dev->devfn), pin); /* - * Busses behind bridges are typically not listed in the MP-table. - * In this case we have to look up the IRQ based on the parent bus, - * parent slot, and pin number. The SMP code detects such bridged - * busses itself so we should get into this branch reliably. + * Busses behind bridges are typically not + * listed in the MP-table. In this case we have + * to look up the IRQ based on the parent bus, + * parent slot, and pin number. The SMP code + * detects such bridged busses itself so we + * should get into this branch reliably. */ temp_dev = dev; - while (irq < 0 && dev->bus->parent) { /* go back to the bridge */ - struct pci_dev * bridge = dev->bus->self; + while (irq < 0 && dev->bus->parent) { + /* go back to the bridge */ + struct pci_dev *bridge = dev->bus->self; pin = (pin + PCI_SLOT(dev->devfn)) % 4; - irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number, + irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number, PCI_SLOT(bridge->devfn), pin); if (irq >= 0) - printk(KERN_WARNING "PCI: using PPB %s[%c] to get irq %d\n", - pci_name(bridge), 'A' + pin, irq); + printk(KERN_WARNING + "PCI: using PPB %s[%c] to get irq %d\n", + pci_name(bridge), + 'A' + pin, irq); dev = bridge; } dev = temp_dev; if (irq >= 0) { - printk(KERN_INFO "PCI->APIC IRQ transform: %s[%c] -> IRQ %d\n", + printk(KERN_INFO + "PCI->APIC IRQ transform: %s[%c] -> IRQ %d\n", pci_name(dev), 'A' + pin, irq); dev->irq = irq; return 0; @@ -1226,12 +1298,17 @@ static int pirq_enable_irq(struct pci_dev *dev) else msg = " Please try using pci=biosirq."; - /* With IDE legacy devices the IRQ lookup failure is not a problem.. */ - if (dev->class >> 8 == PCI_CLASS_STORAGE_IDE && !(dev->class & 0x5)) + /* + * With IDE legacy devices the IRQ lookup failure is not + * a problem.. + */ + if (dev->class >> 8 == PCI_CLASS_STORAGE_IDE && + !(dev->class & 0x5)) return 0; - printk(KERN_WARNING "PCI: No IRQ known for interrupt pin %c of device %s.%s\n", - 'A' + pin, pci_name(dev), msg); + printk(KERN_WARNING + "PCI: No IRQ known for interrupt pin %c of device %s.%s\n", + 'A' + pin, pci_name(dev), msg); } return 0; } -- cgit v1.2.3 From bb71ad880204b79d60331d3384103976e086cb9f Mon Sep 17 00:00:00 2001 From: Gary Hade Date: Mon, 12 May 2008 13:57:46 -0700 Subject: PCI: boot parameter to avoid expansion ROM memory allocation Contention for scarce PCI memory resources has been growing due to an increasing number of PCI slots in large multi-node systems. The kernel currently attempts by default to allocate memory for all PCI expansion ROMs so there has also been an increasing number of PCI memory allocation failures seen on these systems. This occurs because the BIOS either (1) provides insufficient PCI memory resource for all the expansion ROMs or (2) provides adequate PCI memory resource for expansion ROMs but provides the space in BIOS-assigned P2P non-prefetch windows that the kernel does not expect.
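As a worked illustration of case (2), with made-up numbers: suppose the BIOS leaves a single 1 MB non-prefetch window on a P2P bridge, sized for the 512 KB and 256 KB MMIO BARs behind it:

	window:       1 MB   BIOS-assigned P2P non-prefetch window
	ROM BAR:    512 KB   allocated first, 512 KB left
	MMIO BAR 0: 512 KB   fits exactly, 0 left
	MMIO BAR 1: 256 KB   allocation fails

The ROM is not needed for the device to operate, yet its allocation starved a critical BAR.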
The resulting PCI memory allocation failures may be benign when related to memory requests for expansion ROMs themselves but in some cases they can occur when attempting to allocate space for more critical BARs. This can happen when a successful expansion ROM allocation request consumes memory resource that was intended for a non-ROM BAR. We have seen this happen during PCI hotplug of an adapter that contains a P2P bridge, where a successful memory allocation for an expansion ROM BAR on a device behind the bridge consumed memory that was intended for a non-ROM BAR on the P2P bridge. In all cases the allocation failure messages can be very confusing for users. This patch provides a new 'pci=norom' kernel boot parameter that can be used to disable the default PCI expansion ROM memory resource allocation. This provides a way to avoid the above described issues on systems that do not contain PCI devices for which drivers or user-level applications depend on the default PCI expansion ROM memory resource allocation behavior. Signed-off-by: Gary Hade Signed-off-by: Jesse Barnes --- Documentation/kernel-parameters.txt | 3 +++ arch/x86/pci/common.c | 22 ++++++++++++++++++++++ arch/x86/pci/pci.h | 1 + 3 files changed, 26 insertions(+) (limited to 'arch/x86') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index e07c432c731f..9cf7b34f2db0 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1496,6 +1496,9 @@ and is between 256 and 4096 characters. It is defined in the file Use with caution as certain devices share address decoders between ROMs and other resources. + norom [X86-32,X86_64] Do not assign address space to + expansion ROMs that do not already have + BIOS assigned address ranges. irqmask=0xMMMM [X86-32] Set a bit mask of IRQs allowed to be assigned automatically to PCI devices. You can make the kernel exclude IRQs of your ISA cards diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c index 6e64aaf00d1d..3a5261bdff5d 100644 --- a/arch/x86/pci/common.c +++ b/arch/x86/pci/common.c @@ -121,6 +121,21 @@ void __init dmi_check_skip_isa_align(void) dmi_check_system(can_skip_pciprobe_dmi_table); } +static void __devinit pcibios_fixup_device_resources(struct pci_dev *dev) +{ + struct resource *rom_r = &dev->resource[PCI_ROM_RESOURCE]; + + if (pci_probe & PCI_NOASSIGN_ROMS) { + if (rom_r->parent) + return; + if (rom_r->start) { + /* we deal with BIOS assigned ROM later */ + return; + } + rom_r->start = rom_r->end = rom_r->flags = 0; + } +} + /* * Called after each bus is probed, but before its children * are examined.
@@ -128,7 +143,11 @@ void __init dmi_check_skip_isa_align(void) void __devinit pcibios_fixup_bus(struct pci_bus *b) { + struct pci_dev *dev; + pci_read_bridge_bases(b); + list_for_each_entry(dev, &b->devices, bus_list) + pcibios_fixup_device_resources(dev); } /* @@ -483,6 +502,9 @@ char * __devinit pcibios_setup(char *str) else if (!strcmp(str, "rom")) { pci_probe |= PCI_ASSIGN_ROMS; return NULL; + } else if (!strcmp(str, "norom")) { + pci_probe |= PCI_NOASSIGN_ROMS; + return NULL; } else if (!strcmp(str, "assign-busses")) { pci_probe |= PCI_ASSIGN_ALL_BUSSES; return NULL; diff --git a/arch/x86/pci/pci.h b/arch/x86/pci/pci.h index 720c4c554534..291dafec07b7 100644 --- a/arch/x86/pci/pci.h +++ b/arch/x86/pci/pci.h @@ -27,6 +27,7 @@ #define PCI_CAN_SKIP_ISA_ALIGN 0x8000 #define PCI_USE__CRS 0x10000 #define PCI_CHECK_ENABLE_AMD_MMCONF 0x20000 +#define PCI_NOASSIGN_ROMS 0x40000 extern unsigned int pci_probe; extern unsigned long pirq_table_addr; -- cgit v1.2.3 From 1eede070a59e1cc73da51e1aaa00d9ab86572cfc Mon Sep 17 00:00:00 2001 From: Rafael J. Wysocki Date: Tue, 20 May 2008 23:00:01 +0200 Subject: Introduce new top level suspend and hibernation callbacks Introduce 'struct pm_ops' and 'struct pm_ext_ops' ('ext' meaning 'extended') representing suspend and hibernation operations for bus types, device classes, device types and device drivers. Modify the PM core to use 'struct pm_ops' and 'struct pm_ext_ops' objects, if defined, instead of the ->suspend(), ->resume(), ->suspend_late(), and ->resume_early() callbacks (the old callbacks will be considered as legacy and gradually phased out). The main purpose of doing this is to separate suspend (aka S2RAM and standby) callbacks from hibernation callbacks in such a way that the new callbacks won't take arguments and the semantics of each of them will be clearly specified. This has been requested multiple times by many people, including Linus himself, and the reason is that within the current scheme if ->resume() is called, for example, it's difficult to say why it's been called (i.e. is it a resume from RAM or from hibernation or a suspend/hibernation failure etc.?). The second purpose is to make the suspend/hibernation callbacks more flexible so that device drivers can handle more than they can within the current scheme. For example, some drivers may need to prevent new children of the device from being registered before their ->suspend() callbacks are executed or they may want to carry out some operations requiring the availability of some other devices, not directly bound via the parent-child relationship, in order to prepare for the execution of ->suspend(), etc. Ultimately, we'd like to stop using the freezing of tasks for suspend and therefore the drivers' suspend/hibernation code will have to take care of the handling of the user space during suspend/hibernation. That, in turn, would be difficult within the current scheme, without the new ->prepare() and ->complete() callbacks. Signed-off-by: Rafael J.
Acked-by: Pavel Machek
Signed-off-by: Jesse Barnes
---
 arch/x86/kernel/apm_32.c   |    8 +-
 drivers/base/power/main.c  |  675 +++++++++++++++++++++++++++++++++----------
 drivers/base/power/power.h |    2 +-
 drivers/base/power/trace.c |    4 +-
 include/linux/device.h     |    9 +
 include/linux/pm.h         |  314 +++++++++++++++++++--
 kernel/power/disk.c        |   22 +-
 kernel/power/main.c        |    6 +-
 8 files changed, 845 insertions(+), 195 deletions(-)
(limited to 'arch/x86')

diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index bf9290e29013..c1735f61a2c0 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -1211,9 +1211,9 @@ static int suspend(int vetoable)
 	if (err != APM_SUCCESS)
 		apm_error("suspend", err);
 	err = (err == APM_SUCCESS) ? 0 : -EIO;
-	device_power_up();
+	device_power_up(PMSG_RESUME);
 	local_irq_enable();
-	device_resume();
+	device_resume(PMSG_RESUME);
 	queue_event(APM_NORMAL_RESUME, NULL);
 	spin_lock(&user_list_lock);
 	for (as = user_list; as != NULL; as = as->next) {
@@ -1238,7 +1238,7 @@ static void standby(void)
 		apm_error("standby", err);
 
 	local_irq_disable();
-	device_power_up();
+	device_power_up(PMSG_RESUME);
 	local_irq_enable();
 }
 
@@ -1324,7 +1324,7 @@ static void check_events(void)
 			ignore_bounce = 1;
 		if ((event != APM_NORMAL_RESUME) || (ignore_normal_resume == 0)) {
-			device_resume();
+			device_resume(PMSG_RESUME);
 			queue_event(event, NULL);
 		}
 		ignore_normal_resume = 0;
diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c
index 45cc3d9eacb8..d571204aaff7 100644
--- a/drivers/base/power/main.c
+++ b/drivers/base/power/main.c
@@ -12,11 +12,9 @@
  * and add it to the list of power-controlled devices. sysfs entries for
  * controlling device power management will also be added.
  *
- * A different set of lists than the global subsystem list are used to
- * keep track of power info because we use different lists to hold
- * devices based on what stage of the power management process they
- * are in. The power domain dependencies may also differ from the
- * ancestral dependencies that the subsystem list maintains.
+ * A separate list is used for keeping track of power info, because the power
+ * domain dependencies may differ from the ancestral dependencies that the
+ * subsystem list maintains.
  */
 
 #include
@@ -30,31 +28,40 @@
 #include "power.h"
 
 /*
- * The entries in the dpm_active list are in a depth first order, simply
+ * The entries in dpm_list are in a depth first order, simply
  * because children are guaranteed to be discovered after parents, and
  * are inserted at the back of the list on discovery.
  *
- * All the other lists are kept in the same order, for consistency.
- * However the lists aren't always traversed in the same order.
- * Semaphores must be acquired from the top (i.e., front) down
- * and released in the opposite order. Devices must be suspended
- * from the bottom (i.e., end) up and resumed in the opposite order.
- * That way no parent will be suspended while it still has an active
- * child.
- *
 * Since device_pm_add() may be called with a device semaphore held,
 * we must never try to acquire a device semaphore while holding
 * dpm_list_mutex.
 */
-LIST_HEAD(dpm_active);
-static LIST_HEAD(dpm_off);
-static LIST_HEAD(dpm_off_irq);
+LIST_HEAD(dpm_list);
 
 static DEFINE_MUTEX(dpm_list_mtx);
 
-/* 'true' if all devices have been suspended, protected by dpm_list_mtx */
-static bool all_sleeping;
+/*
+ * Set once the preparation of devices for a PM transition has started, reset
+ * before starting to resume devices. Protected by dpm_list_mtx.
+ */
+static bool transition_started;
+
+/**
+ * device_pm_lock - lock the list of active devices used by the PM core
+ */
+void device_pm_lock(void)
+{
+	mutex_lock(&dpm_list_mtx);
+}
+
+/**
+ * device_pm_unlock - unlock the list of active devices used by the PM core
+ */
+void device_pm_unlock(void)
+{
+	mutex_unlock(&dpm_list_mtx);
+}
 
 /**
  * device_pm_add - add a device to the list of active devices
@@ -68,17 +75,25 @@ int device_pm_add(struct device *dev)
 		 dev->bus ? dev->bus->name : "No Bus",
 		 kobject_name(&dev->kobj));
 	mutex_lock(&dpm_list_mtx);
-	if ((dev->parent && dev->parent->power.sleeping) || all_sleeping) {
-		if (dev->parent->power.sleeping)
-			dev_warn(dev, "parent %s is sleeping\n",
+	if (dev->parent) {
+		if (dev->parent->power.status >= DPM_SUSPENDING) {
+			dev_warn(dev, "parent %s is sleeping, will not add\n",
 				dev->parent->bus_id);
-		else
-			dev_warn(dev, "all devices are sleeping\n");
+			WARN_ON(true);
+		}
+	} else if (transition_started) {
+		/*
+		 * We refuse to register parentless devices while a PM
+		 * transition is in progress in order to avoid leaving them
+		 * unhandled down the road
+		 */
 		WARN_ON(true);
 	}
 	error = dpm_sysfs_add(dev);
-	if (!error)
-		list_add_tail(&dev->power.entry, &dpm_active);
+	if (!error) {
+		dev->power.status = DPM_ON;
+		list_add_tail(&dev->power.entry, &dpm_list);
+	}
 	mutex_unlock(&dpm_list_mtx);
 	return error;
 }
@@ -100,73 +115,243 @@ void device_pm_remove(struct device *dev)
 	mutex_unlock(&dpm_list_mtx);
 }
 
+/**
+ * pm_op - execute the PM operation appropriate for given PM event
+ * @dev:	Device.
+ * @ops:	PM operations to choose from.
+ * @state:	PM transition of the system being carried out.
+ */
+static int pm_op(struct device *dev, struct pm_ops *ops, pm_message_t state)
+{
+	int error = 0;
+
+	switch (state.event) {
+#ifdef CONFIG_SUSPEND
+	case PM_EVENT_SUSPEND:
+		if (ops->suspend) {
+			error = ops->suspend(dev);
+			suspend_report_result(ops->suspend, error);
+		}
+		break;
+	case PM_EVENT_RESUME:
+		if (ops->resume) {
+			error = ops->resume(dev);
+			suspend_report_result(ops->resume, error);
+		}
+		break;
+#endif /* CONFIG_SUSPEND */
+#ifdef CONFIG_HIBERNATION
+	case PM_EVENT_FREEZE:
+	case PM_EVENT_QUIESCE:
+		if (ops->freeze) {
+			error = ops->freeze(dev);
+			suspend_report_result(ops->freeze, error);
+		}
+		break;
+	case PM_EVENT_HIBERNATE:
+		if (ops->poweroff) {
+			error = ops->poweroff(dev);
+			suspend_report_result(ops->poweroff, error);
+		}
+		break;
+	case PM_EVENT_THAW:
+	case PM_EVENT_RECOVER:
+		if (ops->thaw) {
+			error = ops->thaw(dev);
+			suspend_report_result(ops->thaw, error);
+		}
+		break;
+	case PM_EVENT_RESTORE:
+		if (ops->restore) {
+			error = ops->restore(dev);
+			suspend_report_result(ops->restore, error);
+		}
+		break;
+#endif /* CONFIG_HIBERNATION */
+	default:
+		error = -EINVAL;
+	}
+	return error;
+}
+
+/**
+ * pm_noirq_op - execute the PM operation appropriate for given PM event
+ * @dev:	Device.
+ * @ops:	PM operations to choose from.
+ * @state:	PM transition of the system being carried out.
+ *
+ * The operation is executed with interrupts disabled by the only remaining
+ * functional CPU in the system.
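/*
 * (Illustration, not part of the patch: device_pm_lock() and
 * device_pm_unlock() above are meant to bracket the "noirq" phase in the
 * platform code, exactly as the kernel/power/disk.c hunks later in this
 * patch do. A minimal sketch of the calling pattern, assuming the
 * hibernation path where the undo message is PMSG_THAW:)
 */
static int example_noirq_phase(void)
{
	int error;

	device_pm_lock();
	local_irq_disable();
	error = device_power_down(PMSG_FREEZE);
	if (!error) {
		/* ... enter the sleep state / create the image ... */
		device_power_up(PMSG_THAW);
	}
	local_irq_enable();
	device_pm_unlock();
	return error;
}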
+ */ +static int pm_noirq_op(struct device *dev, struct pm_ext_ops *ops, + pm_message_t state) +{ + int error = 0; + + switch (state.event) { +#ifdef CONFIG_SUSPEND + case PM_EVENT_SUSPEND: + if (ops->suspend_noirq) { + error = ops->suspend_noirq(dev); + suspend_report_result(ops->suspend_noirq, error); + } + break; + case PM_EVENT_RESUME: + if (ops->resume_noirq) { + error = ops->resume_noirq(dev); + suspend_report_result(ops->resume_noirq, error); + } + break; +#endif /* CONFIG_SUSPEND */ +#ifdef CONFIG_HIBERNATION + case PM_EVENT_FREEZE: + case PM_EVENT_QUIESCE: + if (ops->freeze_noirq) { + error = ops->freeze_noirq(dev); + suspend_report_result(ops->freeze_noirq, error); + } + break; + case PM_EVENT_HIBERNATE: + if (ops->poweroff_noirq) { + error = ops->poweroff_noirq(dev); + suspend_report_result(ops->poweroff_noirq, error); + } + break; + case PM_EVENT_THAW: + case PM_EVENT_RECOVER: + if (ops->thaw_noirq) { + error = ops->thaw_noirq(dev); + suspend_report_result(ops->thaw_noirq, error); + } + break; + case PM_EVENT_RESTORE: + if (ops->restore_noirq) { + error = ops->restore_noirq(dev); + suspend_report_result(ops->restore_noirq, error); + } + break; +#endif /* CONFIG_HIBERNATION */ + default: + error = -EINVAL; + } + return error; +} + +static char *pm_verb(int event) +{ + switch (event) { + case PM_EVENT_SUSPEND: + return "suspend"; + case PM_EVENT_RESUME: + return "resume"; + case PM_EVENT_FREEZE: + return "freeze"; + case PM_EVENT_QUIESCE: + return "quiesce"; + case PM_EVENT_HIBERNATE: + return "hibernate"; + case PM_EVENT_THAW: + return "thaw"; + case PM_EVENT_RESTORE: + return "restore"; + case PM_EVENT_RECOVER: + return "recover"; + default: + return "(unknown PM event)"; + } +} + +static void pm_dev_dbg(struct device *dev, pm_message_t state, char *info) +{ + dev_dbg(dev, "%s%s%s\n", info, pm_verb(state.event), + ((state.event & PM_EVENT_SLEEP) && device_may_wakeup(dev)) ? + ", may wakeup" : ""); +} + +static void pm_dev_err(struct device *dev, pm_message_t state, char *info, + int error) +{ + printk(KERN_ERR "PM: Device %s failed to %s%s: error %d\n", + kobject_name(&dev->kobj), pm_verb(state.event), info, error); +} + /*------------------------- Resume routines -------------------------*/ /** - * resume_device_early - Power on one device (early resume). + * resume_device_noirq - Power on one device (early resume). * @dev: Device. + * @state: PM transition of the system being carried out. * * Must be called with interrupts disabled. */ -static int resume_device_early(struct device *dev) +static int resume_device_noirq(struct device *dev, pm_message_t state) { int error = 0; TRACE_DEVICE(dev); TRACE_RESUME(0); - if (dev->bus && dev->bus->resume_early) { - dev_dbg(dev, "EARLY resume\n"); + if (!dev->bus) + goto End; + + if (dev->bus->pm) { + pm_dev_dbg(dev, state, "EARLY "); + error = pm_noirq_op(dev, dev->bus->pm, state); + } else if (dev->bus->resume_early) { + pm_dev_dbg(dev, state, "legacy EARLY "); error = dev->bus->resume_early(dev); } - + End: TRACE_RESUME(error); return error; } /** * dpm_power_up - Power on all regular (non-sysdev) devices. + * @state: PM transition of the system being carried out. * - * Walk the dpm_off_irq list and power each device up. This - * is used for devices that required they be powered down with - * interrupts disabled. As devices are powered on, they are moved - * to the dpm_off list. + * Execute the appropriate "noirq resume" callback for all devices marked + * as DPM_OFF_IRQ. 
* * Must be called with interrupts disabled and only one CPU running. */ -static void dpm_power_up(void) +static void dpm_power_up(pm_message_t state) { + struct device *dev; - while (!list_empty(&dpm_off_irq)) { - struct list_head *entry = dpm_off_irq.next; - struct device *dev = to_device(entry); + list_for_each_entry(dev, &dpm_list, power.entry) + if (dev->power.status > DPM_OFF) { + int error; - list_move_tail(entry, &dpm_off); - resume_device_early(dev); - } + dev->power.status = DPM_OFF; + error = resume_device_noirq(dev, state); + if (error) + pm_dev_err(dev, state, " early", error); + } } /** * device_power_up - Turn on all devices that need special attention. + * @state: PM transition of the system being carried out. * * Power on system devices, then devices that required we shut them down * with interrupts disabled. * * Must be called with interrupts disabled. */ -void device_power_up(void) +void device_power_up(pm_message_t state) { sysdev_resume(); - dpm_power_up(); + dpm_power_up(state); } EXPORT_SYMBOL_GPL(device_power_up); /** * resume_device - Restore state for one device. * @dev: Device. - * + * @state: PM transition of the system being carried out. */ -static int resume_device(struct device *dev) +static int resume_device(struct device *dev, pm_message_t state) { int error = 0; @@ -175,21 +360,40 @@ static int resume_device(struct device *dev) down(&dev->sem); - if (dev->bus && dev->bus->resume) { - dev_dbg(dev,"resuming\n"); - error = dev->bus->resume(dev); + if (dev->bus) { + if (dev->bus->pm) { + pm_dev_dbg(dev, state, ""); + error = pm_op(dev, &dev->bus->pm->base, state); + } else if (dev->bus->resume) { + pm_dev_dbg(dev, state, "legacy "); + error = dev->bus->resume(dev); + } + if (error) + goto End; } - if (!error && dev->type && dev->type->resume) { - dev_dbg(dev,"resuming\n"); - error = dev->type->resume(dev); + if (dev->type) { + if (dev->type->pm) { + pm_dev_dbg(dev, state, "type "); + error = pm_op(dev, dev->type->pm, state); + } else if (dev->type->resume) { + pm_dev_dbg(dev, state, "legacy type "); + error = dev->type->resume(dev); + } + if (error) + goto End; } - if (!error && dev->class && dev->class->resume) { - dev_dbg(dev,"class resume\n"); - error = dev->class->resume(dev); + if (dev->class) { + if (dev->class->pm) { + pm_dev_dbg(dev, state, "class "); + error = pm_op(dev, dev->class->pm, state); + } else if (dev->class->resume) { + pm_dev_dbg(dev, state, "legacy class "); + error = dev->class->resume(dev); + } } - + End: up(&dev->sem); TRACE_RESUME(error); @@ -198,78 +402,161 @@ static int resume_device(struct device *dev) /** * dpm_resume - Resume every device. + * @state: PM transition of the system being carried out. * - * Resume the devices that have either not gone through - * the late suspend, or that did go through it but also - * went through the early resume. + * Execute the appropriate "resume" callback for all devices the status of + * which indicates that they are inactive. 
+ */ +static void dpm_resume(pm_message_t state) +{ + struct list_head list; + + INIT_LIST_HEAD(&list); + mutex_lock(&dpm_list_mtx); + transition_started = false; + while (!list_empty(&dpm_list)) { + struct device *dev = to_device(dpm_list.next); + + get_device(dev); + if (dev->power.status >= DPM_OFF) { + int error; + + dev->power.status = DPM_RESUMING; + mutex_unlock(&dpm_list_mtx); + + error = resume_device(dev, state); + + mutex_lock(&dpm_list_mtx); + if (error) + pm_dev_err(dev, state, "", error); + } else if (dev->power.status == DPM_SUSPENDING) { + /* Allow new children of the device to be registered */ + dev->power.status = DPM_RESUMING; + } + if (!list_empty(&dev->power.entry)) + list_move_tail(&dev->power.entry, &list); + put_device(dev); + } + list_splice(&list, &dpm_list); + mutex_unlock(&dpm_list_mtx); +} + +/** + * complete_device - Complete a PM transition for given device + * @dev: Device. + * @state: PM transition of the system being carried out. + */ +static void complete_device(struct device *dev, pm_message_t state) +{ + down(&dev->sem); + + if (dev->class && dev->class->pm && dev->class->pm->complete) { + pm_dev_dbg(dev, state, "completing class "); + dev->class->pm->complete(dev); + } + + if (dev->type && dev->type->pm && dev->type->pm->complete) { + pm_dev_dbg(dev, state, "completing type "); + dev->type->pm->complete(dev); + } + + if (dev->bus && dev->bus->pm && dev->bus->pm->base.complete) { + pm_dev_dbg(dev, state, "completing "); + dev->bus->pm->base.complete(dev); + } + + up(&dev->sem); +} + +/** + * dpm_complete - Complete a PM transition for all devices. + * @state: PM transition of the system being carried out. * - * Take devices from the dpm_off_list, resume them, - * and put them on the dpm_locked list. + * Execute the ->complete() callbacks for all devices that are not marked + * as DPM_ON. */ -static void dpm_resume(void) +static void dpm_complete(pm_message_t state) { + struct list_head list; + + INIT_LIST_HEAD(&list); mutex_lock(&dpm_list_mtx); - all_sleeping = false; - while(!list_empty(&dpm_off)) { - struct list_head *entry = dpm_off.next; - struct device *dev = to_device(entry); + while (!list_empty(&dpm_list)) { + struct device *dev = to_device(dpm_list.prev); - list_move_tail(entry, &dpm_active); - dev->power.sleeping = false; - mutex_unlock(&dpm_list_mtx); - resume_device(dev); - mutex_lock(&dpm_list_mtx); + get_device(dev); + if (dev->power.status > DPM_ON) { + dev->power.status = DPM_ON; + mutex_unlock(&dpm_list_mtx); + + complete_device(dev, state); + + mutex_lock(&dpm_list_mtx); + } + if (!list_empty(&dev->power.entry)) + list_move(&dev->power.entry, &list); + put_device(dev); } + list_splice(&list, &dpm_list); mutex_unlock(&dpm_list_mtx); } /** * device_resume - Restore state of each device in system. + * @state: PM transition of the system being carried out. * * Resume all the devices, unlock them all, and allow new * devices to be registered once again. */ -void device_resume(void) +void device_resume(pm_message_t state) { might_sleep(); - dpm_resume(); + dpm_resume(state); + dpm_complete(state); } EXPORT_SYMBOL_GPL(device_resume); /*------------------------- Suspend routines -------------------------*/ -static inline char *suspend_verb(u32 event) +/** + * resume_event - return a PM message representing the resume event + * corresponding to given sleep state. + * @sleep_state: PM message representing a sleep state. 
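/*
 * (Illustration, not part of the patch: dpm_resume() and dpm_complete()
 * above, like dpm_suspend() and dpm_prepare() further down, share one
 * traversal pattern, reduced here to its core. The reference keeps the
 * device alive while dpm_list_mtx is dropped around the driver callback,
 * and moving each entry to a private list keeps the walk terminating
 * even if devices are added or removed concurrently:)
 */
static void example_walk(pm_message_t state)
{
	struct list_head list;

	INIT_LIST_HEAD(&list);
	mutex_lock(&dpm_list_mtx);
	while (!list_empty(&dpm_list)) {
		struct device *dev = to_device(dpm_list.next);

		get_device(dev);
		mutex_unlock(&dpm_list_mtx);

		/* ... driver callback runs without the list mutex held ... */

		mutex_lock(&dpm_list_mtx);
		if (!list_empty(&dev->power.entry))
			list_move_tail(&dev->power.entry, &list);
		put_device(dev);
	}
	list_splice(&list, &dpm_list);
	mutex_unlock(&dpm_list_mtx);
}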
+ */ +static pm_message_t resume_event(pm_message_t sleep_state) { - switch (event) { - case PM_EVENT_SUSPEND: return "suspend"; - case PM_EVENT_FREEZE: return "freeze"; - case PM_EVENT_PRETHAW: return "prethaw"; - default: return "(unknown suspend event)"; + switch (sleep_state.event) { + case PM_EVENT_SUSPEND: + return PMSG_RESUME; + case PM_EVENT_FREEZE: + case PM_EVENT_QUIESCE: + return PMSG_RECOVER; + case PM_EVENT_HIBERNATE: + return PMSG_RESTORE; } -} - -static void -suspend_device_dbg(struct device *dev, pm_message_t state, char *info) -{ - dev_dbg(dev, "%s%s%s\n", info, suspend_verb(state.event), - ((state.event == PM_EVENT_SUSPEND) && device_may_wakeup(dev)) ? - ", may wakeup" : ""); + return PMSG_ON; } /** - * suspend_device_late - Shut down one device (late suspend). + * suspend_device_noirq - Shut down one device (late suspend). * @dev: Device. - * @state: Power state device is entering. + * @state: PM transition of the system being carried out. * * This is called with interrupts off and only a single CPU running. */ -static int suspend_device_late(struct device *dev, pm_message_t state) +static int suspend_device_noirq(struct device *dev, pm_message_t state) { int error = 0; - if (dev->bus && dev->bus->suspend_late) { - suspend_device_dbg(dev, state, "LATE "); + if (!dev->bus) + return 0; + + if (dev->bus->pm) { + pm_dev_dbg(dev, state, "LATE "); + error = pm_noirq_op(dev, dev->bus->pm, state); + } else if (dev->bus->suspend_late) { + pm_dev_dbg(dev, state, "legacy LATE "); error = dev->bus->suspend_late(dev, state); suspend_report_result(dev->bus->suspend_late, error); } @@ -278,37 +565,30 @@ static int suspend_device_late(struct device *dev, pm_message_t state) /** * device_power_down - Shut down special devices. - * @state: Power state to enter. + * @state: PM transition of the system being carried out. * - * Power down devices that require interrupts to be disabled - * and move them from the dpm_off list to the dpm_off_irq list. + * Power down devices that require interrupts to be disabled. * Then power down system devices. * * Must be called with interrupts disabled and only one CPU running. */ int device_power_down(pm_message_t state) { + struct device *dev; int error = 0; - while (!list_empty(&dpm_off)) { - struct list_head *entry = dpm_off.prev; - struct device *dev = to_device(entry); - - error = suspend_device_late(dev, state); + list_for_each_entry_reverse(dev, &dpm_list, power.entry) { + error = suspend_device_noirq(dev, state); if (error) { - printk(KERN_ERR "Could not power down device %s: " - "error %d\n", - kobject_name(&dev->kobj), error); + pm_dev_err(dev, state, " late", error); break; } - if (!list_empty(&dev->power.entry)) - list_move(&dev->power.entry, &dpm_off_irq); + dev->power.status = DPM_OFF_IRQ; } - if (!error) error = sysdev_suspend(state); if (error) - dpm_power_up(); + dpm_power_up(resume_event(state)); return error; } EXPORT_SYMBOL_GPL(device_power_down); @@ -316,7 +596,7 @@ EXPORT_SYMBOL_GPL(device_power_down); /** * suspend_device - Save state of one device. * @dev: Device. - * @state: Power state device is entering. + * @state: PM transition of the system being carried out. 
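/*
 * (Worked example of the mapping resume_event() implements, not part of
 * the patch: if device_power_down(PMSG_FREEZE) fails part-way through
 * hibernation, the devices that were already powered down must not see
 * a plain resume:)
 */
static void example_recover_after_failed_freeze(void)
{
	pm_message_t sleep = PMSG_FREEZE;
	pm_message_t undo = resume_event(sleep);

	/* undo.event == PM_EVENT_RECOVER: dpm_power_up(undo) runs the
	 * ->thaw_noirq() callbacks rather than ->resume_noirq() */
	dpm_power_up(undo);
}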
*/ static int suspend_device(struct device *dev, pm_message_t state) { @@ -324,24 +604,43 @@ static int suspend_device(struct device *dev, pm_message_t state) down(&dev->sem); - if (dev->class && dev->class->suspend) { - suspend_device_dbg(dev, state, "class "); - error = dev->class->suspend(dev, state); - suspend_report_result(dev->class->suspend, error); + if (dev->class) { + if (dev->class->pm) { + pm_dev_dbg(dev, state, "class "); + error = pm_op(dev, dev->class->pm, state); + } else if (dev->class->suspend) { + pm_dev_dbg(dev, state, "legacy class "); + error = dev->class->suspend(dev, state); + suspend_report_result(dev->class->suspend, error); + } + if (error) + goto End; } - if (!error && dev->type && dev->type->suspend) { - suspend_device_dbg(dev, state, "type "); - error = dev->type->suspend(dev, state); - suspend_report_result(dev->type->suspend, error); + if (dev->type) { + if (dev->type->pm) { + pm_dev_dbg(dev, state, "type "); + error = pm_op(dev, dev->type->pm, state); + } else if (dev->type->suspend) { + pm_dev_dbg(dev, state, "legacy type "); + error = dev->type->suspend(dev, state); + suspend_report_result(dev->type->suspend, error); + } + if (error) + goto End; } - if (!error && dev->bus && dev->bus->suspend) { - suspend_device_dbg(dev, state, ""); - error = dev->bus->suspend(dev, state); - suspend_report_result(dev->bus->suspend, error); + if (dev->bus) { + if (dev->bus->pm) { + pm_dev_dbg(dev, state, ""); + error = pm_op(dev, &dev->bus->pm->base, state); + } else if (dev->bus->suspend) { + pm_dev_dbg(dev, state, "legacy "); + error = dev->bus->suspend(dev, state); + suspend_report_result(dev->bus->suspend, error); + } } - + End: up(&dev->sem); return error; @@ -349,67 +648,141 @@ static int suspend_device(struct device *dev, pm_message_t state) /** * dpm_suspend - Suspend every device. - * @state: Power state to put each device in. + * @state: PM transition of the system being carried out. * - * Walk the dpm_locked list. Suspend each device and move it - * to the dpm_off list. - * - * (For historical reasons, if it returns -EAGAIN, that used to mean - * that the device would be called again with interrupts disabled. - * These days, we use the "suspend_late()" callback for that, so we - * print a warning and consider it an error). + * Execute the appropriate "suspend" callbacks for all devices. */ static int dpm_suspend(pm_message_t state) { + struct list_head list; int error = 0; + INIT_LIST_HEAD(&list); mutex_lock(&dpm_list_mtx); - while (!list_empty(&dpm_active)) { - struct list_head *entry = dpm_active.prev; - struct device *dev = to_device(entry); - - WARN_ON(dev->parent && dev->parent->power.sleeping); + while (!list_empty(&dpm_list)) { + struct device *dev = to_device(dpm_list.prev); - dev->power.sleeping = true; + get_device(dev); mutex_unlock(&dpm_list_mtx); + error = suspend_device(dev, state); + mutex_lock(&dpm_list_mtx); if (error) { - printk(KERN_ERR "Could not suspend device %s: " - "error %d%s\n", - kobject_name(&dev->kobj), - error, - (error == -EAGAIN ? 
- " (please convert to suspend_late)" : - "")); - dev->power.sleeping = false; + pm_dev_err(dev, state, "", error); + put_device(dev); break; } + dev->power.status = DPM_OFF; if (!list_empty(&dev->power.entry)) - list_move(&dev->power.entry, &dpm_off); + list_move(&dev->power.entry, &list); + put_device(dev); } - if (!error) - all_sleeping = true; + list_splice(&list, dpm_list.prev); mutex_unlock(&dpm_list_mtx); + return error; +} + +/** + * prepare_device - Execute the ->prepare() callback(s) for given device. + * @dev: Device. + * @state: PM transition of the system being carried out. + */ +static int prepare_device(struct device *dev, pm_message_t state) +{ + int error = 0; + + down(&dev->sem); + + if (dev->bus && dev->bus->pm && dev->bus->pm->base.prepare) { + pm_dev_dbg(dev, state, "preparing "); + error = dev->bus->pm->base.prepare(dev); + suspend_report_result(dev->bus->pm->base.prepare, error); + if (error) + goto End; + } + + if (dev->type && dev->type->pm && dev->type->pm->prepare) { + pm_dev_dbg(dev, state, "preparing type "); + error = dev->type->pm->prepare(dev); + suspend_report_result(dev->type->pm->prepare, error); + if (error) + goto End; + } + + if (dev->class && dev->class->pm && dev->class->pm->prepare) { + pm_dev_dbg(dev, state, "preparing class "); + error = dev->class->pm->prepare(dev); + suspend_report_result(dev->class->pm->prepare, error); + } + End: + up(&dev->sem); + + return error; +} +/** + * dpm_prepare - Prepare all devices for a PM transition. + * @state: PM transition of the system being carried out. + * + * Execute the ->prepare() callback for all devices. + */ +static int dpm_prepare(pm_message_t state) +{ + struct list_head list; + int error = 0; + + INIT_LIST_HEAD(&list); + mutex_lock(&dpm_list_mtx); + transition_started = true; + while (!list_empty(&dpm_list)) { + struct device *dev = to_device(dpm_list.next); + + get_device(dev); + dev->power.status = DPM_PREPARING; + mutex_unlock(&dpm_list_mtx); + + error = prepare_device(dev, state); + + mutex_lock(&dpm_list_mtx); + if (error) { + dev->power.status = DPM_ON; + if (error == -EAGAIN) { + put_device(dev); + continue; + } + printk(KERN_ERR "PM: Failed to prepare device %s " + "for power transition: error %d\n", + kobject_name(&dev->kobj), error); + put_device(dev); + break; + } + dev->power.status = DPM_SUSPENDING; + if (!list_empty(&dev->power.entry)) + list_move_tail(&dev->power.entry, &list); + put_device(dev); + } + list_splice(&list, &dpm_list); + mutex_unlock(&dpm_list_mtx); return error; } /** * device_suspend - Save state and stop all devices in system. - * @state: new power management state + * @state: PM transition of the system being carried out. * - * Prevent new devices from being registered, then lock all devices - * and suspend them. + * Prepare and suspend all devices. 
*/ int device_suspend(pm_message_t state) { int error; might_sleep(); - error = dpm_suspend(state); + error = dpm_prepare(state); + if (!error) + error = dpm_suspend(state); if (error) - device_resume(); + device_resume(resume_event(state)); return error; } EXPORT_SYMBOL_GPL(device_suspend); diff --git a/drivers/base/power/power.h b/drivers/base/power/power.h index a6894f2a4b99..a3252c0e2887 100644 --- a/drivers/base/power/power.h +++ b/drivers/base/power/power.h @@ -4,7 +4,7 @@ * main.c */ -extern struct list_head dpm_active; /* The active device list */ +extern struct list_head dpm_list; /* The active device list */ static inline struct device *to_device(struct list_head *entry) { diff --git a/drivers/base/power/trace.c b/drivers/base/power/trace.c index 2b4b392dcbc1..8c1e656b5f8b 100644 --- a/drivers/base/power/trace.c +++ b/drivers/base/power/trace.c @@ -188,9 +188,9 @@ static int show_file_hash(unsigned int value) static int show_dev_hash(unsigned int value) { int match = 0; - struct list_head * entry = dpm_active.prev; + struct list_head *entry = dpm_list.prev; - while (entry != &dpm_active) { + while (entry != &dpm_list) { struct device * dev = to_device(entry); unsigned int hash = hash_string(DEVSEED, dev->bus_id, DEVHASH); if (hash == value) { diff --git a/include/linux/device.h b/include/linux/device.h index 6a2d04c011bc..f71a78d123ae 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -68,6 +68,8 @@ struct bus_type { int (*resume_early)(struct device *dev); int (*resume)(struct device *dev); + struct pm_ext_ops *pm; + struct bus_type_private *p; }; @@ -131,6 +133,8 @@ struct device_driver { int (*resume) (struct device *dev); struct attribute_group **groups; + struct pm_ops *pm; + struct driver_private *p; }; @@ -197,6 +201,8 @@ struct class { int (*suspend)(struct device *dev, pm_message_t state); int (*resume)(struct device *dev); + + struct pm_ops *pm; }; extern int __must_check class_register(struct class *class); @@ -248,8 +254,11 @@ struct device_type { struct attribute_group **groups; int (*uevent)(struct device *dev, struct kobj_uevent_env *env); void (*release)(struct device *dev); + int (*suspend)(struct device *dev, pm_message_t state); int (*resume)(struct device *dev); + + struct pm_ops *pm; }; /* interface for exporting device attributes */ diff --git a/include/linux/pm.h b/include/linux/pm.h index 39a7ee859b67..4ad9de94449a 100644 --- a/include/linux/pm.h +++ b/include/linux/pm.h @@ -112,7 +112,9 @@ typedef struct pm_message { int event; } pm_message_t; -/* +/** + * struct pm_ops - device PM callbacks + * * Several driver power state transitions are externally visible, affecting * the state of pending I/O queues and (for drivers that touch hardware) * interrupts, wakeups, DMA, and other hardware state. There may also be @@ -120,6 +122,284 @@ typedef struct pm_message { * to the rest of the driver stack (such as a driver that's ON gating off * clocks which are not in active use). * + * The externally visible transitions are handled with the help of the following + * callbacks included in this structure: + * + * @prepare: Prepare the device for the upcoming transition, but do NOT change + * its hardware state. Prevent new children of the device from being + * registered after @prepare() returns (the driver's subsystem and + * generally the rest of the kernel is supposed to prevent new calls to the + * probe method from being made too once @prepare() has succeeded). If + * @prepare() detects a situation it cannot handle (e.g. 
registration of a + * child already in progress), it may return -EAGAIN, so that the PM core + * can execute it once again (e.g. after the new child has been registered) + * to recover from the race condition. This method is executed for all + * kinds of suspend transitions and is followed by one of the suspend + * callbacks: @suspend(), @freeze(), or @poweroff(). + * The PM core executes @prepare() for all devices before starting to + * execute suspend callbacks for any of them, so drivers may assume all of + * the other devices to be present and functional while @prepare() is being + * executed. In particular, it is safe to make GFP_KERNEL memory + * allocations from within @prepare(). However, drivers may NOT assume + * anything about the availability of the user space at that time and it + * is not correct to request firmware from within @prepare() (it's too + * late to do that). [To work around this limitation, drivers may + * register suspend and hibernation notifiers that are executed before the + * freezing of tasks.] + * + * @complete: Undo the changes made by @prepare(). This method is executed for + * all kinds of resume transitions, following one of the resume callbacks: + * @resume(), @thaw(), @restore(). Also called if the state transition + * fails before the driver's suspend callback (@suspend(), @freeze(), + * @poweroff()) can be executed (e.g. if the suspend callback fails for one + * of the other devices that the PM core has unsuccessfully attempted to + * suspend earlier). + * The PM core executes @complete() after it has executed the appropriate + * resume callback for all devices. + * + * @suspend: Executed before putting the system into a sleep state in which the + * contents of main memory are preserved. Quiesce the device, put it into + * a low power state appropriate for the upcoming system state (such as + * PCI_D3hot), and enable wakeup events as appropriate. + * + * @resume: Executed after waking the system up from a sleep state in which the + * contents of main memory were preserved. Put the device into the + * appropriate state, according to the information saved in memory by the + * preceding @suspend(). The driver starts working again, responding to + * hardware events and software requests. The hardware may have gone + * through a power-off reset, or it may have maintained state from the + * previous suspend() which the driver may rely on while resuming. On most + * platforms, there are no restrictions on availability of resources like + * clocks during @resume(). + * + * @freeze: Hibernation-specific, executed before creating a hibernation image. + * Quiesce operations so that a consistent image can be created, but do NOT + * otherwise put the device into a low power device state and do NOT emit + * system wakeup events. Save in main memory the device settings to be + * used by @restore() during the subsequent resume from hibernation or by + * the subsequent @thaw(), if the creation of the image or the restoration + * of main memory contents from it fails. + * + * @thaw: Hibernation-specific, executed after creating a hibernation image OR + * if the creation of the image fails. Also executed after a failing + * attempt to restore the contents of main memory from such an image. + * Undo the changes made by the preceding @freeze(), so the device can be + * operated in the same way as immediately before the call to @freeze(). + * + * @poweroff: Hibernation-specific, executed after saving a hibernation image. 
+ *	Quiesce the device, put it into a low power state appropriate for the
+ *	upcoming system state (such as PCI_D3hot), and enable wakeup events as
+ *	appropriate.
+ *
+ * @restore: Hibernation-specific, executed after restoring the contents of main
+ *	memory from a hibernation image. The driver starts working again,
+ *	responding to hardware events and software requests. Drivers may NOT
+ *	make ANY assumptions about the hardware state right prior to @restore().
+ *	On most platforms, there are no restrictions on availability of
+ *	resources like clocks during @restore().
+ *
+ * All of the above callbacks, except for @complete(), return error codes.
+ * However, the error codes returned by the resume operations, @resume(),
+ * @thaw(), and @restore(), do not cause the PM core to abort the resume
+ * transition during which they are returned. The error codes returned in
+ * those cases are only printed by the PM core to the system logs for debugging
+ * purposes. Still, it is recommended that drivers only return error codes
+ * from their resume methods in case of an unrecoverable failure (i.e. when the
+ * device being handled refuses to resume and becomes unusable) to allow us to
+ * modify the PM core in the future, so that it can avoid attempting to handle
+ * devices that failed to resume and their children.
+ *
+ * It is allowed to unregister devices while the above callbacks are being
+ * executed. However, it is not allowed to unregister a device from within any
+ * of its own callbacks.
+ */
+
+struct pm_ops {
+	int (*prepare)(struct device *dev);
+	void (*complete)(struct device *dev);
+	int (*suspend)(struct device *dev);
+	int (*resume)(struct device *dev);
+	int (*freeze)(struct device *dev);
+	int (*thaw)(struct device *dev);
+	int (*poweroff)(struct device *dev);
+	int (*restore)(struct device *dev);
+};
+
+/**
+ * struct pm_ext_ops - extended device PM callbacks
+ *
+ * Some devices require certain operations related to suspend and hibernation
+ * to be carried out with interrupts disabled. Thus, 'struct pm_ext_ops' below
+ * is defined, adding callbacks to be executed with interrupts disabled to
+ * 'struct pm_ops'.
+ *
+ * The following callbacks included in 'struct pm_ext_ops' are executed with
+ * the nonboot CPUs switched off and with interrupts disabled on the only
+ * functional CPU. They are also executed with the PM core list of devices
+ * locked, so they must NOT unregister any devices.
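/*
 * (Illustration, not from the patch: the suspend/hibernation split that
 * struct pm_ops makes expressible. ->freeze() only quiesces, because the
 * hibernation image still has to be written out afterwards; ->suspend()
 * also enters low power. All foo_* helpers are hypothetical:)
 */
static int foo_suspend(struct device *dev)
{
	foo_quiesce(dev);		/* stop I/O, save hardware context */
	foo_enter_low_power(dev);	/* e.g. PCI_D3hot, wakeup armed */
	return 0;
}

static int foo_freeze(struct device *dev)
{
	foo_quiesce(dev);		/* no low-power state, no wakeup events */
	return 0;
}

static struct pm_ops foo_pm_ops = {
	.suspend	= foo_suspend,
	.resume		= foo_resume,	/* undoes foo_suspend() */
	.freeze		= foo_freeze,
	.thaw		= foo_thaw,	/* undoes foo_freeze() */
};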
+ *
+ * @suspend_noirq: Complete the operations of ->suspend() by carrying out any
+ *	actions required for suspending the device that need interrupts to be
+ *	disabled
+ *
+ * @resume_noirq: Prepare for the execution of ->resume() by carrying out any
+ *	actions required for resuming the device that need interrupts to be
+ *	disabled
+ *
+ * @freeze_noirq: Complete the operations of ->freeze() by carrying out any
+ *	actions required for freezing the device that need interrupts to be
+ *	disabled
+ *
+ * @thaw_noirq: Prepare for the execution of ->thaw() by carrying out any
+ *	actions required for thawing the device that need interrupts to be
+ *	disabled
+ *
+ * @poweroff_noirq: Complete the operations of ->poweroff() by carrying out any
+ *	actions required for handling the device that need interrupts to be
+ *	disabled
+ *
+ * @restore_noirq: Prepare for the execution of ->restore() by carrying out any
+ *	actions required for restoring the operations of the device that need
+ *	interrupts to be disabled
+ *
+ * All of the above callbacks return error codes, but the error codes returned
+ * by the resume operations, @resume_noirq(), @thaw_noirq(), and
+ * @restore_noirq(), do not cause the PM core to abort the resume transition
+ * during which they are returned. The error codes returned in those cases are
+ * only printed by the PM core to the system logs for debugging purposes.
+ * Still, as stated above, it is recommended that drivers only return error
+ * codes from their resume methods if the device being handled fails to resume
+ * and is not usable any more.
+ */
+
+struct pm_ext_ops {
+	struct pm_ops base;
+	int (*suspend_noirq)(struct device *dev);
+	int (*resume_noirq)(struct device *dev);
+	int (*freeze_noirq)(struct device *dev);
+	int (*thaw_noirq)(struct device *dev);
+	int (*poweroff_noirq)(struct device *dev);
+	int (*restore_noirq)(struct device *dev);
+};
+
+/**
+ * PM_EVENT_ messages
+ *
+ * The following PM_EVENT_ messages are defined for the internal use of the PM
+ * core, in order to provide a mechanism allowing the high level suspend and
+ * hibernation code to convey the necessary information to the device PM core
+ * code:
+ *
+ * ON		No transition.
+ *
+ * FREEZE	System is going to hibernate, call ->prepare() and ->freeze()
+ *		for all devices.
+ *
+ * SUSPEND	System is going to suspend, call ->prepare() and ->suspend()
+ *		for all devices.
+ *
+ * HIBERNATE	Hibernation image has been saved, call ->prepare() and
+ *		->poweroff() for all devices.
+ *
+ * QUIESCE	Contents of main memory are going to be restored from a (loaded)
+ *		hibernation image, call ->prepare() and ->freeze() for all
+ *		devices.
+ *
+ * RESUME	System is resuming, call ->resume() and ->complete() for all
+ *		devices.
+ *
+ * THAW		Hibernation image has been created, call ->thaw() and
+ *		->complete() for all devices.
+ *
+ * RESTORE	Contents of main memory have been restored from a hibernation
+ *		image, call ->restore() and ->complete() for all devices.
+ *
+ * RECOVER	Creation of a hibernation image or restoration of the main
+ *		memory contents from a hibernation image has failed, call
+ *		->thaw() and ->complete() for all devices.
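/*
 * (Illustration, not from the patch: how a bus type would plug into the
 * new dispatch. pm_op() consumes the embedded .base, pm_noirq_op() the
 * *_noirq members; the .pm pointer is the field added to struct bus_type
 * in the include/linux/device.h hunk above. The foo_bus_* callbacks are
 * hypothetical:)
 */
static struct pm_ext_ops foo_bus_pm_ops = {
	.base = {
		.prepare	= foo_bus_prepare,
		.complete	= foo_bus_complete,
		.suspend	= foo_bus_suspend,
		.resume		= foo_bus_resume,
	},
	.suspend_noirq	= foo_bus_suspend_noirq,
	.resume_noirq	= foo_bus_resume_noirq,
};

static struct bus_type foo_bus_type = {
	.name	= "foo",
	.pm	= &foo_bus_pm_ops,
};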
+ */
+
+#define PM_EVENT_ON		0x0000
+#define PM_EVENT_FREEZE		0x0001
+#define PM_EVENT_SUSPEND	0x0002
+#define PM_EVENT_HIBERNATE	0x0004
+#define PM_EVENT_QUIESCE	0x0008
+#define PM_EVENT_RESUME		0x0010
+#define PM_EVENT_THAW		0x0020
+#define PM_EVENT_RESTORE	0x0040
+#define PM_EVENT_RECOVER	0x0080
+
+#define PM_EVENT_SLEEP	(PM_EVENT_SUSPEND | PM_EVENT_HIBERNATE)
+
+#define PMSG_FREEZE	((struct pm_message){ .event = PM_EVENT_FREEZE, })
+#define PMSG_QUIESCE	((struct pm_message){ .event = PM_EVENT_QUIESCE, })
+#define PMSG_SUSPEND	((struct pm_message){ .event = PM_EVENT_SUSPEND, })
+#define PMSG_HIBERNATE	((struct pm_message){ .event = PM_EVENT_HIBERNATE, })
+#define PMSG_RESUME	((struct pm_message){ .event = PM_EVENT_RESUME, })
+#define PMSG_THAW	((struct pm_message){ .event = PM_EVENT_THAW, })
+#define PMSG_RESTORE	((struct pm_message){ .event = PM_EVENT_RESTORE, })
+#define PMSG_RECOVER	((struct pm_message){ .event = PM_EVENT_RECOVER, })
+#define PMSG_ON		((struct pm_message){ .event = PM_EVENT_ON, })
+
+/**
+ * Device power management states
+ *
+ * These state labels are used internally by the PM core to indicate the
+ * current status of a device with respect to the PM core operations.
+ *
+ * DPM_ON		Device is regarded as operational. Set this way
+ *			initially and when ->complete() is about to be called.
+ *			Also set when ->prepare() fails.
+ *
+ * DPM_PREPARING	Device is going to be prepared for a PM transition. Set
+ *			when ->prepare() is about to be called.
+ *
+ * DPM_RESUMING		Device is going to be resumed. Set when ->resume(),
+ *			->thaw(), or ->restore() is about to be called.
+ *
+ * DPM_SUSPENDING	Device has been prepared for a power transition. Set
+ *			when ->prepare() has just succeeded.
+ *
+ * DPM_OFF		Device is regarded as inactive. Set immediately after
+ *			->suspend(), ->freeze(), or ->poweroff() has succeeded.
+ *			Also set when ->resume_noirq(), ->thaw_noirq(), or
+ *			->restore_noirq() is about to be called.
+ *
+ * DPM_OFF_IRQ		Device is in a "deep sleep". Set immediately after
+ *			->suspend_noirq(), ->freeze_noirq(), or
+ *			->poweroff_noirq() has just succeeded.
+ */
+
+enum dpm_state {
+	DPM_INVALID,
+	DPM_ON,
+	DPM_PREPARING,
+	DPM_RESUMING,
+	DPM_SUSPENDING,
+	DPM_OFF,
+	DPM_OFF_IRQ,
+};
+
+struct dev_pm_info {
+	pm_message_t		power_state;
+	unsigned		can_wakeup:1;
+	unsigned		should_wakeup:1;
+	enum dpm_state		status;		/* Owned by the PM core */
+#ifdef CONFIG_PM_SLEEP
+	struct list_head	entry;
+#endif
+};
+
+/*
+ * The PM_EVENT_ messages are also used by drivers implementing the legacy
+ * suspend framework, based on the ->suspend() and ->resume() callbacks common
+ * for suspend and hibernation transitions, according to the rules below.
+ */
+
+/* Necessary, because several drivers use PM_EVENT_PRETHAW */
+#define PM_EVENT_PRETHAW PM_EVENT_QUIESCE
+
+/*
  * One transition is triggered by resume(), after a suspend() call; the
  * message is implicit:
  *
@@ -164,35 +444,13 @@ typedef struct pm_message {
  * or from system low-power states such as standby or suspend-to-RAM.
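/*
 * (Aside: the new PM_EVENT_ values are all distinct bits, so a transition
 * can be classified with a simple mask, as pm_dev_dbg() does with
 * PM_EVENT_SLEEP earlier in this patch:)
 */
static inline int pm_event_is_sleep(pm_message_t state)
{
	return (state.event & PM_EVENT_SLEEP) != 0;	/* SUSPEND or HIBERNATE */
}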
*/ -#define PM_EVENT_ON 0 -#define PM_EVENT_FREEZE 1 -#define PM_EVENT_SUSPEND 2 -#define PM_EVENT_HIBERNATE 4 -#define PM_EVENT_PRETHAW 8 - -#define PM_EVENT_SLEEP (PM_EVENT_SUSPEND | PM_EVENT_HIBERNATE) - -#define PMSG_FREEZE ((struct pm_message){ .event = PM_EVENT_FREEZE, }) -#define PMSG_PRETHAW ((struct pm_message){ .event = PM_EVENT_PRETHAW, }) -#define PMSG_SUSPEND ((struct pm_message){ .event = PM_EVENT_SUSPEND, }) -#define PMSG_HIBERNATE ((struct pm_message){ .event = PM_EVENT_HIBERNATE, }) -#define PMSG_ON ((struct pm_message){ .event = PM_EVENT_ON, }) - -struct dev_pm_info { - pm_message_t power_state; - unsigned can_wakeup:1; - unsigned should_wakeup:1; - bool sleeping:1; /* Owned by the PM core */ -#ifdef CONFIG_PM_SLEEP - struct list_head entry; -#endif -}; +#ifdef CONFIG_PM_SLEEP +extern void device_pm_lock(void); +extern void device_power_up(pm_message_t state); +extern void device_resume(pm_message_t state); +extern void device_pm_unlock(void); extern int device_power_down(pm_message_t state); -extern void device_power_up(void); -extern void device_resume(void); - -#ifdef CONFIG_PM_SLEEP extern int device_suspend(pm_message_t state); extern int device_prepare_suspend(pm_message_t state); diff --git a/kernel/power/disk.c b/kernel/power/disk.c index 14a656cdc652..d416be0efa8a 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -193,6 +193,7 @@ static int create_image(int platform_mode) if (error) return error; + device_pm_lock(); local_irq_disable(); /* At this point, device_suspend() has been called, but *not* * device_power_down(). We *must* call device_power_down() now. @@ -224,9 +225,11 @@ static int create_image(int platform_mode) /* NOTE: device_power_up() is just a resume() for devices * that suspended with irqs off ... no overall powerup. */ - device_power_up(); + device_power_up(in_suspend ? + (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); Enable_irqs: local_irq_enable(); + device_pm_unlock(); return error; } @@ -280,7 +283,8 @@ int hibernation_snapshot(int platform_mode) Finish: platform_finish(platform_mode); Resume_devices: - device_resume(); + device_resume(in_suspend ? + (error ? 
PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); Resume_console: resume_console(); Close: @@ -300,8 +304,9 @@ static int resume_target_kernel(void) { int error; + device_pm_lock(); local_irq_disable(); - error = device_power_down(PMSG_PRETHAW); + error = device_power_down(PMSG_QUIESCE); if (error) { printk(KERN_ERR "PM: Some devices failed to power down, " "aborting resume\n"); @@ -329,9 +334,10 @@ static int resume_target_kernel(void) swsusp_free(); restore_processor_state(); touch_softlockup_watchdog(); - device_power_up(); + device_power_up(PMSG_RECOVER); Enable_irqs: local_irq_enable(); + device_pm_unlock(); return error; } @@ -350,7 +356,7 @@ int hibernation_restore(int platform_mode) pm_prepare_console(); suspend_console(); - error = device_suspend(PMSG_PRETHAW); + error = device_suspend(PMSG_QUIESCE); if (error) goto Finish; @@ -362,7 +368,7 @@ int hibernation_restore(int platform_mode) enable_nonboot_cpus(); } platform_restore_cleanup(platform_mode); - device_resume(); + device_resume(PMSG_RECOVER); Finish: resume_console(); pm_restore_console(); @@ -403,6 +409,7 @@ int hibernation_platform_enter(void) if (error) goto Finish; + device_pm_lock(); local_irq_disable(); error = device_power_down(PMSG_HIBERNATE); if (!error) { @@ -411,6 +418,7 @@ int hibernation_platform_enter(void) while (1); } local_irq_enable(); + device_pm_unlock(); /* * We don't need to reenable the nonboot CPUs or resume consoles, since @@ -419,7 +427,7 @@ int hibernation_platform_enter(void) Finish: hibernation_ops->finish(); Resume_devices: - device_resume(); + device_resume(PMSG_RESTORE); Resume_console: resume_console(); Close: diff --git a/kernel/power/main.c b/kernel/power/main.c index 6a6d5eb3524e..d023b6b584e5 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -228,6 +228,7 @@ static int suspend_enter(suspend_state_t state) { int error = 0; + device_pm_lock(); arch_suspend_disable_irqs(); BUG_ON(!irqs_disabled()); @@ -239,10 +240,11 @@ static int suspend_enter(suspend_state_t state) if (!suspend_test(TEST_CORE)) error = suspend_ops->enter(state); - device_power_up(); + device_power_up(PMSG_RESUME); Done: arch_suspend_enable_irqs(); BUG_ON(irqs_disabled()); + device_pm_unlock(); return error; } @@ -291,7 +293,7 @@ int suspend_devices_and_enter(suspend_state_t state) if (suspend_ops->finish) suspend_ops->finish(); Resume_devices: - device_resume(); + device_resume(PMSG_RESUME); Resume_console: resume_console(); Close: -- cgit v1.2.3 From e7891c733f9b26c851edde50cf886a30bd133dbd Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 22 May 2008 14:35:21 -0700 Subject: PCI/x86: write_pci_config_byte fix offset also add write_pci_config_16 Signed-off-by: Yinghai Lu Signed-off-by: Jesse Barnes --- arch/x86/pci/early.c | 9 ++++++++- include/asm-x86/pci-direct.h | 1 + 2 files changed, 9 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/early.c b/arch/x86/pci/early.c index 42df4b6606df..8e2821e8dac5 100644 --- a/arch/x86/pci/early.c +++ b/arch/x86/pci/early.c @@ -49,7 +49,14 @@ void write_pci_config_byte(u8 bus, u8 slot, u8 func, u8 offset, u8 val) { PDprintk("%x writing to %x: %x\n", slot, offset, val); outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8); - outb(val, 0xcfc); + outb(val, 0xcfc + (offset&3)); +} + +void write_pci_config_16(u8 bus, u8 slot, u8 func, u8 offset, u16 val) +{ + PDprintk("%x writing to %x: %x\n", slot, offset, val); + outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8); + outw(val, 0xcfc + (offset&2)); } int 
early_pci_allowed(void) diff --git a/include/asm-x86/pci-direct.h b/include/asm-x86/pci-direct.h index 5b21485be573..7bd40b4de751 100644 --- a/include/asm-x86/pci-direct.h +++ b/include/asm-x86/pci-direct.h @@ -11,6 +11,7 @@ extern u8 read_pci_config_byte(u8 bus, u8 slot, u8 func, u8 offset); extern u16 read_pci_config_16(u8 bus, u8 slot, u8 func, u8 offset); extern void write_pci_config(u8 bus, u8 slot, u8 func, u8 offset, u32 val); extern void write_pci_config_byte(u8 bus, u8 slot, u8 func, u8 offset, u8 val); +extern void write_pci_config_16(u8 bus, u8 slot, u8 func, u8 offset, u16 val); extern int early_pci_allowed(void); -- cgit v1.2.3 From e3f2baebf4209b5927e23fa65d5977d31db936b3 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 22 May 2008 14:35:11 -0700 Subject: PCI/x86: early dump pci conf space v2 Allows us to dump PCI space before any kernel changes have been made. Signed-off-by: Yinghai Lu Signed-off-by: Jesse Barnes --- arch/x86/kernel/setup_64.c | 5 +++++ arch/x86/pci/common.c | 4 ++++ arch/x86/pci/early.c | 51 ++++++++++++++++++++++++++++++++++++++++++++ include/asm-x86/pci-direct.h | 3 +++ 4 files changed, 63 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c index 6dff1286ad8a..524b6850b2c0 100644 --- a/arch/x86/kernel/setup_64.c +++ b/arch/x86/kernel/setup_64.c @@ -361,6 +361,11 @@ void __init setup_arch(char **cmdline_p) parse_early_param(); +#ifdef CONFIG_PCI + if (pci_early_dump_regs) + early_dump_pci_devices(); +#endif + #ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT if (init_ohci1394_dma_early) init_ohci1394_dma_on_all_controllers(); diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c index 3a5261bdff5d..d19fd07bafd6 100644 --- a/arch/x86/pci/common.c +++ b/arch/x86/pci/common.c @@ -20,6 +20,7 @@ unsigned int pci_probe = PCI_PROBE_BIOS | PCI_PROBE_CONF1 | PCI_PROBE_CONF2 | PCI_PROBE_MMCONF; +unsigned int pci_early_dump_regs; static int pci_bf_sort; int pci_routeirq; int pcibios_last_bus = -1; @@ -511,6 +512,9 @@ char * __devinit pcibios_setup(char *str) } else if (!strcmp(str, "use_crs")) { pci_probe |= PCI_USE__CRS; return NULL; + } else if (!strcmp(str, "earlydump")) { + pci_early_dump_regs = 1; + return NULL; } else if (!strcmp(str, "routeirq")) { pci_routeirq = 1; return NULL; diff --git a/arch/x86/pci/early.c b/arch/x86/pci/early.c index 8e2821e8dac5..858dbe3399f9 100644 --- a/arch/x86/pci/early.c +++ b/arch/x86/pci/early.c @@ -64,3 +64,54 @@ int early_pci_allowed(void) return (pci_probe & (PCI_PROBE_CONF1|PCI_PROBE_NOEARLY)) == PCI_PROBE_CONF1; } + +void early_dump_pci_device(u8 bus, u8 slot, u8 func) +{ + int i; + int j; + u32 val; + + printk("PCI: %02x:%02x:%02x", bus, slot, func); + + for (i = 0; i < 256; i += 4) { + if (!(i & 0x0f)) + printk("\n%04x:",i); + + val = read_pci_config(bus, slot, func, i); + for (j = 0; j < 4; j++) { + printk(" %02x", val & 0xff); + val >>= 8; + } + } + printk("\n"); +} + +void early_dump_pci_devices(void) +{ + unsigned bus, slot, func; + + if (!early_pci_allowed()) + return; + + for (bus = 0; bus < 256; bus++) { + for (slot = 0; slot < 32; slot++) { + for (func = 0; func < 8; func++) { + u32 class; + u8 type; + class = read_pci_config(bus, slot, func, + PCI_CLASS_REVISION); + if (class == 0xffffffff) + break; + + early_dump_pci_device(bus, slot, func); + + /* No multi-function device? 
*/
+			type = read_pci_config_byte(bus, slot, func,
+						    PCI_HEADER_TYPE);
+			if (!(type & 0x80))
+				break;
+			}
+		}
+	}
+}
+
diff --git a/include/asm-x86/pci-direct.h b/include/asm-x86/pci-direct.h
index 7bd40b4de751..80c775d9fe20 100644
--- a/include/asm-x86/pci-direct.h
+++ b/include/asm-x86/pci-direct.h
@@ -15,4 +15,7 @@ extern void write_pci_config_16(u8 bus, u8 slot, u8 func, u8 offset, u16 val);
 
 extern int early_pci_allowed(void);
 
+extern unsigned int pci_early_dump_regs;
+extern void early_dump_pci_device(u8 bus, u8 slot, u8 func);
+extern void early_dump_pci_devices(void);
 #endif
--
cgit v1.2.3

From d8f3de0d2412bb91639cfefc5b3c79dbf3812212 Mon Sep 17 00:00:00 2001
From: Rafael J. Wysocki
Date: Thu, 12 Jun 2008 23:24:06 +0200
Subject: Suspend-related patches for 2.6.27

ACPI PM: Add possibility to change suspend sequence

There are some systems out there that don't work correctly with our
current suspend/hibernation code ordering. Provide a workaround for
these systems, allowing them to pass 'acpi_sleep=old_ordering' in the
kernel command line so that it will use the pre-ACPI 2.0 ("old")
suspend code ordering.

Unfortunately, this requires us to add a platform hook to the resuming
of devices for recovering the platform in case one of the device
drivers' .suspend() routines returns an error code. Namely, ACPI 1.0
specifies that _PTS should be called before suspending devices, but
_WAK still should be called before resuming them in order to undo the
changes made by _PTS. However, if there is an error during suspending
devices, they are automatically resumed without returning control to
the PM core, so _WAK has to be called from within device_resume() in
those cases.

The patch also reorders and refactors the ACPI suspend/hibernation
code to avoid duplication as far as reasonably possible.

Signed-off-by: Rafael J. Wysocki
Acked-by: Pavel Machek
Signed-off-by: Jesse Barnes
---
 Documentation/kernel-parameters.txt |    6 +-
 arch/x86/kernel/acpi/sleep.c        |    2 +
 drivers/acpi/sleep/main.c           |  276 +++++++++++++++++++++---------
 drivers/base/power/main.c           |    2 -
 include/linux/acpi.h                |    3 +
 include/linux/suspend.h             |   14 +-
 kernel/power/disk.c                 |   28 +++-
 kernel/power/main.c                 |   10 +-
 8 files changed, 215 insertions(+), 126 deletions(-)
(limited to 'arch/x86')

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 9cf7b34f2db0..18d793ea0dd3 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -147,10 +147,14 @@ and is between 256 and 4096 characters. It is defined in the file
 			default: 0
 
 	acpi_sleep=	[HW,ACPI] Sleep options
-			Format: { s3_bios, s3_mode, s3_beep }
+			Format: { s3_bios, s3_mode, s3_beep, old_ordering }
 			See Documentation/power/video.txt for s3_bios and s3_mode.
 			s3_beep is for debugging; it makes the PC's speaker beep
 			as soon as the kernel's real-mode entry point is called.
+			old_ordering causes the ACPI 1.0 ordering of the _PTS
+			control method, wrt putting devices into low power
+			states, to be enforced (the ACPI 2.0 ordering of _PTS is
+			used by default).
acpi_sci= [HW,ACPI] ACPI System Control Interrupt trigger mode Format: { level | edge | high | low } diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index afc25ee9964b..882e970032d5 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -124,6 +124,8 @@ static int __init acpi_sleep_setup(char *str) acpi_realmode_flags |= 2; if (strncmp(str, "s3_beep", 7) == 0) acpi_realmode_flags |= 4; + if (strncmp(str, "old_ordering", 12) == 0) + acpi_old_suspend_ordering(); str = strchr(str, ','); if (str != NULL) str += strspn(str, ", \t"); diff --git a/drivers/acpi/sleep/main.c b/drivers/acpi/sleep/main.c index 0f2caea2fc83..4addf8ad50ae 100644 --- a/drivers/acpi/sleep/main.c +++ b/drivers/acpi/sleep/main.c @@ -24,10 +24,6 @@ u8 sleep_states[ACPI_S_STATE_COUNT]; -#ifdef CONFIG_PM_SLEEP -static u32 acpi_target_sleep_state = ACPI_STATE_S0; -#endif - static int acpi_sleep_prepare(u32 acpi_state) { #ifdef CONFIG_ACPI_SLEEP @@ -50,9 +46,96 @@ static int acpi_sleep_prepare(u32 acpi_state) return 0; } -#ifdef CONFIG_SUSPEND -static struct platform_suspend_ops acpi_suspend_ops; +#ifdef CONFIG_PM_SLEEP +static u32 acpi_target_sleep_state = ACPI_STATE_S0; + +/* + * ACPI 1.0 wants us to execute _PTS before suspending devices, so we allow the + * user to request that behavior by using the 'acpi_old_suspend_ordering' + * kernel command line option that causes the following variable to be set. + */ +static bool old_suspend_ordering; + +void __init acpi_old_suspend_ordering(void) +{ + old_suspend_ordering = true; +} + +/** + * acpi_pm_disable_gpes - Disable the GPEs. + */ +static int acpi_pm_disable_gpes(void) +{ + acpi_hw_disable_all_gpes(); + return 0; +} + +/** + * __acpi_pm_prepare - Prepare the platform to enter the target state. + * + * If necessary, set the firmware waking vector and do arch-specific + * nastiness to get the wakeup code to the waking vector. + */ +static int __acpi_pm_prepare(void) +{ + int error = acpi_sleep_prepare(acpi_target_sleep_state); + + if (error) + acpi_target_sleep_state = ACPI_STATE_S0; + return error; +} + +/** + * acpi_pm_prepare - Prepare the platform to enter the target sleep + * state and disable the GPEs. + */ +static int acpi_pm_prepare(void) +{ + int error = __acpi_pm_prepare(); + + if (!error) + acpi_hw_disable_all_gpes(); + return error; +} + +/** + * acpi_pm_finish - Instruct the platform to leave a sleep state. + * + * This is called after we wake back up (or if entering the sleep state + * failed). + */ +static void acpi_pm_finish(void) +{ + u32 acpi_state = acpi_target_sleep_state; + + if (acpi_state == ACPI_STATE_S0) + return; + printk(KERN_INFO PREFIX "Waking up from system sleep state S%d\n", + acpi_state); + acpi_disable_wakeup_device(acpi_state); + acpi_leave_sleep_state(acpi_state); + + /* reset firmware waking vector */ + acpi_set_firmware_waking_vector((acpi_physical_address) 0); + + acpi_target_sleep_state = ACPI_STATE_S0; +} + +/** + * acpi_pm_end - Finish up suspend sequence. + */ +static void acpi_pm_end(void) +{ + /* + * This is necessary in case acpi_pm_finish() is not called during a + * failing transition to a sleep state. + */ + acpi_target_sleep_state = ACPI_STATE_S0; +} +#endif /* CONFIG_PM_SLEEP */ + +#ifdef CONFIG_SUSPEND extern void do_suspend_lowlevel(void); static u32 acpi_suspend_states[] = { @@ -66,7 +149,6 @@ static u32 acpi_suspend_states[] = { * acpi_suspend_begin - Set the target system sleep state to the state * associated with given @pm_state, if supported. 
*/ - static int acpi_suspend_begin(suspend_state_t pm_state) { u32 acpi_state = acpi_suspend_states[pm_state]; @@ -82,25 +164,6 @@ static int acpi_suspend_begin(suspend_state_t pm_state) return error; } -/** - * acpi_suspend_prepare - Do preliminary suspend work. - * - * If necessary, set the firmware waking vector and do arch-specific - * nastiness to get the wakeup code to the waking vector. - */ - -static int acpi_suspend_prepare(void) -{ - int error = acpi_sleep_prepare(acpi_target_sleep_state); - - if (error) { - acpi_target_sleep_state = ACPI_STATE_S0; - return error; - } - - return ACPI_SUCCESS(acpi_hw_disable_all_gpes()) ? 0 : -EFAULT; -} - /** * acpi_suspend_enter - Actually enter a sleep state. * @pm_state: ignored @@ -109,7 +172,6 @@ static int acpi_suspend_prepare(void) * assembly, which in turn call acpi_enter_sleep_state(). * It's unfortunate, but it works. Please fix if you're feeling frisky. */ - static int acpi_suspend_enter(suspend_state_t pm_state) { acpi_status status = AE_OK; @@ -166,39 +228,6 @@ static int acpi_suspend_enter(suspend_state_t pm_state) return ACPI_SUCCESS(status) ? 0 : -EFAULT; } -/** - * acpi_suspend_finish - Instruct the platform to leave a sleep state. - * - * This is called after we wake back up (or if entering the sleep state - * failed). - */ - -static void acpi_suspend_finish(void) -{ - u32 acpi_state = acpi_target_sleep_state; - - acpi_disable_wakeup_device(acpi_state); - acpi_leave_sleep_state(acpi_state); - - /* reset firmware waking vector */ - acpi_set_firmware_waking_vector((acpi_physical_address) 0); - - acpi_target_sleep_state = ACPI_STATE_S0; -} - -/** - * acpi_suspend_end - Finish up suspend sequence. - */ - -static void acpi_suspend_end(void) -{ - /* - * This is necessary in case acpi_suspend_finish() is not called during a - * failing transition to a sleep state. - */ - acpi_target_sleep_state = ACPI_STATE_S0; -} - static int acpi_suspend_state_valid(suspend_state_t pm_state) { u32 acpi_state; @@ -218,10 +247,39 @@ static int acpi_suspend_state_valid(suspend_state_t pm_state) static struct platform_suspend_ops acpi_suspend_ops = { .valid = acpi_suspend_state_valid, .begin = acpi_suspend_begin, - .prepare = acpi_suspend_prepare, + .prepare = acpi_pm_prepare, + .enter = acpi_suspend_enter, + .finish = acpi_pm_finish, + .end = acpi_pm_end, +}; + +/** + * acpi_suspend_begin_old - Set the target system sleep state to the + * state associated with given @pm_state, if supported, and + * execute the _PTS control method. This function is used if the + * pre-ACPI 2.0 suspend ordering has been requested. + */ +static int acpi_suspend_begin_old(suspend_state_t pm_state) +{ + int error = acpi_suspend_begin(pm_state); + + if (!error) + error = __acpi_pm_prepare(); + return error; +} + +/* + * The following callbacks are used if the pre-ACPI 2.0 suspend ordering has + * been requested. 
+ */ +static struct platform_suspend_ops acpi_suspend_ops_old = { + .valid = acpi_suspend_state_valid, + .begin = acpi_suspend_begin_old, + .prepare = acpi_pm_disable_gpes, .enter = acpi_suspend_enter, - .finish = acpi_suspend_finish, - .end = acpi_suspend_end, + .finish = acpi_pm_finish, + .end = acpi_pm_end, + .recover = acpi_pm_finish, }; #endif /* CONFIG_SUSPEND */ @@ -229,22 +287,9 @@ static struct platform_suspend_ops acpi_suspend_ops = { static int acpi_hibernation_begin(void) { acpi_target_sleep_state = ACPI_STATE_S4; - return 0; } -static int acpi_hibernation_prepare(void) -{ - int error = acpi_sleep_prepare(ACPI_STATE_S4); - - if (error) { - acpi_target_sleep_state = ACPI_STATE_S0; - return error; - } - - return ACPI_SUCCESS(acpi_hw_disable_all_gpes()) ? 0 : -EFAULT; -} - static int acpi_hibernation_enter(void) { acpi_status status = AE_OK; @@ -274,52 +319,55 @@ static void acpi_hibernation_leave(void) acpi_leave_sleep_state_prep(ACPI_STATE_S4); } -static void acpi_hibernation_finish(void) +static void acpi_pm_enable_gpes(void) { - acpi_disable_wakeup_device(ACPI_STATE_S4); - acpi_leave_sleep_state(ACPI_STATE_S4); - - /* reset firmware waking vector */ - acpi_set_firmware_waking_vector((acpi_physical_address) 0); - - acpi_target_sleep_state = ACPI_STATE_S0; + acpi_hw_enable_all_runtime_gpes(); } -static void acpi_hibernation_end(void) -{ - /* - * This is necessary in case acpi_hibernation_finish() is not called - * during a failing transition to the sleep state. - */ - acpi_target_sleep_state = ACPI_STATE_S0; -} +static struct platform_hibernation_ops acpi_hibernation_ops = { + .begin = acpi_hibernation_begin, + .end = acpi_pm_end, + .pre_snapshot = acpi_pm_prepare, + .finish = acpi_pm_finish, + .prepare = acpi_pm_prepare, + .enter = acpi_hibernation_enter, + .leave = acpi_hibernation_leave, + .pre_restore = acpi_pm_disable_gpes, + .restore_cleanup = acpi_pm_enable_gpes, +}; -static int acpi_hibernation_pre_restore(void) +/** + * acpi_hibernation_begin_old - Set the target system sleep state to + * ACPI_STATE_S4 and execute the _PTS control method. This + * function is used if the pre-ACPI 2.0 suspend ordering has been + * requested. + */ +static int acpi_hibernation_begin_old(void) { - acpi_status status; - - status = acpi_hw_disable_all_gpes(); - - return ACPI_SUCCESS(status) ? 0 : -EFAULT; -} + int error = acpi_sleep_prepare(ACPI_STATE_S4); -static void acpi_hibernation_restore_cleanup(void) -{ - acpi_hw_enable_all_runtime_gpes(); + if (!error) + acpi_target_sleep_state = ACPI_STATE_S4; + return error; } -static struct platform_hibernation_ops acpi_hibernation_ops = { - .begin = acpi_hibernation_begin, - .end = acpi_hibernation_end, - .pre_snapshot = acpi_hibernation_prepare, - .finish = acpi_hibernation_finish, - .prepare = acpi_hibernation_prepare, +/* + * The following callbacks are used if the pre-ACPI 2.0 suspend ordering has + * been requested. 
+ */ +static struct platform_hibernation_ops acpi_hibernation_ops_old = { + .begin = acpi_hibernation_begin_old, + .end = acpi_pm_end, + .pre_snapshot = acpi_pm_disable_gpes, + .finish = acpi_pm_finish, + .prepare = acpi_pm_disable_gpes, .enter = acpi_hibernation_enter, .leave = acpi_hibernation_leave, - .pre_restore = acpi_hibernation_pre_restore, - .restore_cleanup = acpi_hibernation_restore_cleanup, + .pre_restore = acpi_pm_disable_gpes, + .restore_cleanup = acpi_pm_enable_gpes, + .recover = acpi_pm_finish, }; -#endif /* CONFIG_HIBERNATION */ +#endif /* CONFIG_HIBERNATION */ int acpi_suspend(u32 acpi_state) { @@ -461,13 +509,15 @@ int __init acpi_sleep_init(void) } } - suspend_set_ops(&acpi_suspend_ops); + suspend_set_ops(old_suspend_ordering ? + &acpi_suspend_ops_old : &acpi_suspend_ops); #endif #ifdef CONFIG_HIBERNATION status = acpi_get_sleep_type_data(ACPI_STATE_S4, &type_a, &type_b); if (ACPI_SUCCESS(status)) { - hibernation_set_ops(&acpi_hibernation_ops); + hibernation_set_ops(old_suspend_ordering ? + &acpi_hibernation_ops_old : &acpi_hibernation_ops); sleep_states[ACPI_STATE_S4] = 1; printk(" S4"); } diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c index d571204aaff7..3250c5257b74 100644 --- a/drivers/base/power/main.c +++ b/drivers/base/power/main.c @@ -781,8 +781,6 @@ int device_suspend(pm_message_t state) error = dpm_prepare(state); if (!error) error = dpm_suspend(state); - if (error) - device_resume(resume_event(state)); return error; } EXPORT_SYMBOL_GPL(device_suspend); diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 41f7ce7edd7a..33adcf91ef41 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -234,6 +234,9 @@ int acpi_check_region(resource_size_t start, resource_size_t n, int acpi_check_mem_region(resource_size_t start, resource_size_t n, const char *name); +#ifdef CONFIG_PM_SLEEP +void __init acpi_old_suspend_ordering(void); +#endif /* CONFIG_PM_SLEEP */ #else /* CONFIG_ACPI */ static inline int early_acpi_boot_init(void) diff --git a/include/linux/suspend.h b/include/linux/suspend.h index a6977423baf7..e8e69159af71 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -86,6 +86,11 @@ typedef int __bitwise suspend_state_t; * that implement @begin(), but platforms implementing @begin() should * also provide a @end() which cleans up transitions aborted before * @enter(). + * + * @recover: Recover the platform from a suspend failure. + * Called by the PM core if the suspending of devices fails. + * This callback is optional and should only be implemented by platforms + * which require special recovery actions in that situation. */ struct platform_suspend_ops { int (*valid)(suspend_state_t state); @@ -94,6 +99,7 @@ struct platform_suspend_ops { int (*enter)(suspend_state_t state); void (*finish)(void); void (*end)(void); + void (*recover)(void); }; #ifdef CONFIG_SUSPEND @@ -149,7 +155,7 @@ extern void mark_free_pages(struct zone *zone); * The methods in this structure allow a platform to carry out special * operations required by it during a hibernation transition. * - * All the methods below must be implemented. + * All the methods below, except for @recover(), must be implemented. * * @begin: Tell the platform driver that we're starting hibernation. * Called right after shrinking memory and before freezing devices. @@ -189,6 +195,11 @@ extern void mark_free_pages(struct zone *zone); * @restore_cleanup: Clean up after a failing image restoration. 
* Called right after the nonboot CPUs have been enabled and before * thawing devices (runs with IRQs on). + * + * @recover: Recover the platform from a failure to suspend devices. + * Called by the PM core if the suspending of devices during hibernation + * fails. This callback is optional and should only be implemented by + * platforms which require special recovery actions in that situation. */ struct platform_hibernation_ops { int (*begin)(void); @@ -200,6 +211,7 @@ struct platform_hibernation_ops { void (*leave)(void); int (*pre_restore)(void); void (*restore_cleanup)(void); + void (*recover)(void); }; #ifdef CONFIG_HIBERNATION diff --git a/kernel/power/disk.c b/kernel/power/disk.c index d416be0efa8a..f011e0870b52 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -179,6 +179,17 @@ static void platform_restore_cleanup(int platform_mode) hibernation_ops->restore_cleanup(); } +/** + * platform_recover - recover the platform from a failure to suspend + * devices. + */ + +static void platform_recover(int platform_mode) +{ + if (platform_mode && hibernation_ops && hibernation_ops->recover) + hibernation_ops->recover(); +} + /** * create_image - freeze devices that need to be frozen with interrupts * off, create the hibernation image and thaw those devices. Control @@ -258,10 +269,10 @@ int hibernation_snapshot(int platform_mode) suspend_console(); error = device_suspend(PMSG_FREEZE); if (error) - goto Resume_console; + goto Recover_platform; if (hibernation_test(TEST_DEVICES)) - goto Resume_devices; + goto Recover_platform; error = platform_pre_snapshot(platform_mode); if (error || hibernation_test(TEST_PLATFORM)) @@ -285,11 +296,14 @@ int hibernation_snapshot(int platform_mode) Resume_devices: device_resume(in_suspend ? (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); - Resume_console: resume_console(); Close: platform_end(platform_mode); return error; + + Recover_platform: + platform_recover(platform_mode); + goto Resume_devices; } /** @@ -398,8 +412,11 @@ int hibernation_platform_enter(void) suspend_console(); error = device_suspend(PMSG_HIBERNATE); - if (error) - goto Resume_console; + if (error) { + if (hibernation_ops->recover) + hibernation_ops->recover(); + goto Resume_devices; + } error = hibernation_ops->prepare(); if (error) @@ -428,7 +445,6 @@ int hibernation_platform_enter(void) hibernation_ops->finish(); Resume_devices: device_resume(PMSG_RESTORE); - Resume_console: resume_console(); Close: hibernation_ops->end(); diff --git a/kernel/power/main.c b/kernel/power/main.c index d023b6b584e5..3398f4651aa1 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -269,11 +269,11 @@ int suspend_devices_and_enter(suspend_state_t state) error = device_suspend(PMSG_SUSPEND); if (error) { printk(KERN_ERR "PM: Some devices failed to suspend\n"); - goto Resume_console; + goto Recover_platform; } if (suspend_test(TEST_DEVICES)) - goto Resume_devices; + goto Recover_platform; if (suspend_ops->prepare) { error = suspend_ops->prepare(); @@ -294,12 +294,16 @@ int suspend_devices_and_enter(suspend_state_t state) suspend_ops->finish(); Resume_devices: device_resume(PMSG_RESUME); - Resume_console: resume_console(); Close: if (suspend_ops->end) suspend_ops->end(); return error; + + Recover_platform: + if (suspend_ops->recover) + suspend_ops->recover(); + goto Resume_devices; } /** -- cgit v1.2.3 From 15650a2f644a2f15738cf22807c090d89328f500 Mon Sep 17 00:00:00 2001 From: Jesse Barnes Date: Mon, 16 Jun 2008 15:29:45 -0700 Subject: x86/PCI: fixup early quirk probing On x86, we do 
early PCI probing to apply some quirks for chipset bugs. However, in a recent cleanup (7bcbc78dea92fdf0947fa48e248da3c993a5690f) a thinko was introduced that causes us to probe all subfunctions of even single function devices (a function was factored out of an inner loop and a "break" became a "return"). Fix that up by making check_dev_quirk() return a value so we can keep the factored code intact. Acked-by: Ingo Molnar Signed-off-by: Jesse Barnes --- arch/x86/kernel/early-quirks.c | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index 9f51e1ea9e82..8566fea647eb 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c @@ -133,7 +133,18 @@ static struct chipset early_qrk[] __initdata = { {} }; -static void __init check_dev_quirk(int num, int slot, int func) +/** + * check_dev_quirk - apply early quirks to a given PCI device + * @num: bus number + * @slot: slot number + * @func: PCI function + * + * Check the vendor & device ID against the early quirks table. + * + * If the device is single function, let early_quirks() know so we don't + * poke at this device again. + */ +static int __init check_dev_quirk(int num, int slot, int func) { u16 class; u16 vendor; @@ -144,7 +155,7 @@ static void __init check_dev_quirk(int num, int slot, int func) class = read_pci_config_16(num, slot, func, PCI_CLASS_DEVICE); if (class == 0xffff) - return; + return -1; /* no class, treat as single function */ vendor = read_pci_config_16(num, slot, func, PCI_VENDOR_ID); @@ -167,7 +178,9 @@ static void __init check_dev_quirk(int num, int slot, int func) type = read_pci_config_byte(num, slot, func, PCI_HEADER_TYPE); if (!(type & 0x80)) - return; + return -1; + + return 0; } void __init early_quirks(void) @@ -180,6 +193,9 @@ void __init early_quirks(void) /* Poor man's PCI discovery */ for (num = 0; num < 32; num++) for (slot = 0; slot < 32; slot++) - for (func = 0; func < 8; func++) - check_dev_quirk(num, slot, func); + for (func = 0; func < 8; func++) { + /* Only probe function 0 on single fn devices */ + if (check_dev_quirk(num, slot, func)) + break; + } } -- cgit v1.2.3 From ee4311adf105f4d740f52e3948acc1d81598afcc Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 17 Jun 2008 17:43:02 +0200 Subject: ftrace: build fix with gcc 4.3 fix: arch/x86/kernel/ftrace.c: Assembler messages: arch/x86/kernel/ftrace.c:82: Error: bad register name `%sil' make[1]: *** [arch/x86/kernel/ftrace.o] Error 1 Signed-off-by: Ingo Molnar --- arch/x86/kernel/ftrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index bc5cf8d46742..55828149e01e 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -88,7 +88,7 @@ ftrace_modify_code(unsigned long ip, unsigned char *old_code, ".previous\n" _ASM_EXTABLE(1b, 3b) : "=r"(faulted), "=a"(replaced) - : "r"(ip), "r"(new), "r"(newch), + : "r"(ip), "r"(new), "c"(newch), "0"(faulted), "a"(old) : "memory"); sync_core(); -- cgit v1.2.3 From e17ba73b0ee6c0f24393c48b455e0d8db761782c Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Mon, 12 May 2008 15:44:40 +0200 Subject: x86, generic: mark early_printk as asmlinkage It's not explicitly marked as asmlinkage, but invoked from x86_32 startup code with parameters on stack. 
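(An illustrative aside, not part of the patch: the kernel builds x86_32 C code with -mregparm=3, so an ordinary function receives its first arguments in %eax/%edx/%ecx, while hand-written assembly that does pushl/call passes them on the stack. On x86_32, asmlinkage expands to __attribute__((regparm(0))), pinning a function to the stack-based convention. A hypothetical sketch, with made-up function names:)

#include <linux/linkage.h>

/* Hypothetical sketch of the calling-convention difference; the names
 * below are illustrative and do not appear in this patch. */

/* With -mregparm=3, gcc expects a in %eax and b in %edx: */
void regparm_func(int a, int b);

/* asmlinkage forces both arguments to be read from the stack,
 * matching assembly that does:
 *     pushl $2
 *     pushl $1
 *     call  stack_func
 */
asmlinkage void stack_func(int a, int b);
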
No other architectures define early_printk and none of them are affected by this change, since asmlinkage expands to an empty token on those architectures. Signed-off-by: Jiri Slaby Cc: H. Peter Anvin Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/early_printk.c | 2 +- include/linux/kernel.h | 2 +- kernel/printk.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c index 643fd861b724..ff9e7350da54 100644 --- a/arch/x86/kernel/early_printk.c +++ b/arch/x86/kernel/early_printk.c @@ -196,7 +196,7 @@ static struct console simnow_console = { static struct console *early_console = &early_vga_console; static int early_console_initialized; -void early_printk(const char *fmt, ...) +asmlinkage void early_printk(const char *fmt, ...) { char buf[512]; int n; diff --git a/include/linux/kernel.h b/include/linux/kernel.h index f2a668c195bf..4cb8d3df414e 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -207,7 +207,7 @@ static inline bool printk_timed_ratelimit(unsigned long *caller_jiffies, \ { return false; } #endif -extern void __attribute__((format(printf, 1, 2))) +extern void asmlinkage __attribute__((format(printf, 1, 2))) early_printk(const char *fmt, ...); unsigned long int_sqrt(unsigned long); diff --git a/kernel/printk.c b/kernel/printk.c index 70cfa5ac75ce..de1a4f4470c3 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -38,7 +38,7 @@ /* * Architectures can override it: */ -void __attribute__((weak)) early_printk(const char *fmt, ...) +void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...) { } -- cgit v1.2.3 From 864fe51671c9e44fb9d02765623daac9acc26a8b Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 20 May 2008 19:15:34 +0200 Subject: apm_32: BKL pushdown Signed-off-by: Arnd Bergmann --- arch/x86/kernel/apm_32.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index bf9290e29013..82222366477f 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -204,6 +204,7 @@ #include #include +#include #include #include #include @@ -1544,10 +1545,12 @@ static int do_open(struct inode *inode, struct file *filp) { struct apm_user *as; + lock_kernel(); as = kmalloc(sizeof(*as), GFP_KERNEL); if (as == NULL) { printk(KERN_ERR "apm: cannot allocate struct of size %d bytes\n", sizeof(*as)); + unlock_kernel(); return -ENOMEM; } as->magic = APM_BIOS_MAGIC; @@ -1569,6 +1572,7 @@ static int do_open(struct inode *inode, struct file *filp) user_list = as; spin_unlock(&user_list_lock); filp->private_data = as; + unlock_kernel(); return 0; } -- cgit v1.2.3 From 77149367dade50af8370420265bd4f818cde8afd Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 20 May 2008 19:16:16 +0200 Subject: microcode: BKL pushdown Signed-off-by: Arnd Bergmann --- arch/x86/kernel/microcode.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/microcode.c b/arch/x86/kernel/microcode.c index 69729e38b78a..9f0a994eef9b 100644 --- a/arch/x86/kernel/microcode.c +++ b/arch/x86/kernel/microcode.c @@ -75,6 +75,7 @@ #include #include #include +#include #include #include #include @@ -422,6 +423,7 @@ out: static int microcode_open (struct inode *unused1, struct file *unused2) { + cycle_kernel_lock(); return capable(CAP_SYS_RAWIO) ?
0 : -EPERM; } -- cgit v1.2.3 From 395a59d0f8e86bb39cd700c3d185d30c670bb958 Mon Sep 17 00:00:00 2001 From: Abhishek Sagar Date: Sat, 21 Jun 2008 23:47:27 +0530 Subject: ftrace: store mcount address in rec->ip Record the address of the mcount call-site. Currently all archs except sparc64 record the address of the instruction following the mcount call-site. Some general cleanups are entailed. Storing mcount addresses in rec->ip enables looking them up in the kprobe hash table later on to check if they're kprobe'd. Signed-off-by: Abhishek Sagar Cc: davem@davemloft.net Cc: Steven Rostedt Signed-off-by: Ingo Molnar --- arch/arm/kernel/armksyms.c | 10 +++++----- arch/arm/kernel/entry-common.S | 4 ++++ arch/arm/kernel/ftrace.c | 16 +++++++--------- arch/powerpc/kernel/entry_32.S | 4 ++++ arch/powerpc/kernel/entry_64.S | 5 ++++- arch/powerpc/kernel/ftrace.c | 21 +++++++-------------- arch/sparc64/kernel/ftrace.c | 10 ++++++---- arch/sparc64/kernel/sparc64_ksyms.c | 2 +- arch/x86/kernel/entry_32.S | 4 ++++ arch/x86/kernel/entry_64.S | 4 ++++ arch/x86/kernel/ftrace.c | 26 +++++++++----------------- arch/x86/kernel/i386_ksyms_32.c | 2 +- arch/x86/kernel/x8664_ksyms_64.c | 2 +- include/asm-arm/ftrace.h | 14 ++++++++++++++ include/asm-powerpc/ftrace.h | 8 ++++++++ include/asm-sparc64/ftrace.h | 14 ++++++++++++++ include/asm-x86/ftrace.h | 14 ++++++++++++++ include/linux/ftrace.h | 3 +-- kernel/trace/ftrace.c | 3 ++- 19 files changed, 110 insertions(+), 56 deletions(-) create mode 100644 include/asm-arm/ftrace.h create mode 100644 include/asm-sparc64/ftrace.h create mode 100644 include/asm-x86/ftrace.h (limited to 'arch/x86') diff --git a/arch/arm/kernel/armksyms.c b/arch/arm/kernel/armksyms.c index 3b132215cbf8..cc7b246e9652 100644 --- a/arch/arm/kernel/armksyms.c +++ b/arch/arm/kernel/armksyms.c @@ -18,6 +18,7 @@ #include #include #include +#include /* * libgcc functions - functions that are used internally by the @@ -48,11 +49,6 @@ extern void __aeabi_ulcmp(void); extern void fpundefinstr(void); extern void fp_enter(void); -#ifdef CONFIG_FTRACE -extern void mcount(void); -EXPORT_SYMBOL(mcount); -#endif - /* * This has a special calling convention; it doesn't * modify any of the usual registers, except for LR. 
@@ -186,3 +182,7 @@ EXPORT_SYMBOL(_find_next_bit_be); #endif EXPORT_SYMBOL(copy_page); + +#ifdef CONFIG_FTRACE +EXPORT_SYMBOL(mcount); +#endif diff --git a/arch/arm/kernel/entry-common.S b/arch/arm/kernel/entry-common.S index 8f79a4789ed4..84694e88b428 100644 --- a/arch/arm/kernel/entry-common.S +++ b/arch/arm/kernel/entry-common.S @@ -9,6 +9,7 @@ */ #include +#include #include #include "entry-header.S" @@ -104,6 +105,7 @@ ENTRY(ret_from_fork) ENTRY(mcount) stmdb sp!, {r0-r3, lr} mov r0, lr + sub r0, r0, #MCOUNT_INSN_SIZE .globl mcount_call mcount_call: @@ -114,6 +116,7 @@ ENTRY(ftrace_caller) stmdb sp!, {r0-r3, lr} ldr r1, [fp, #-4] mov r0, lr + sub r0, r0, #MCOUNT_INSN_SIZE .globl ftrace_call ftrace_call: @@ -134,6 +137,7 @@ ENTRY(mcount) trace: ldr r1, [fp, #-4] mov r0, lr + sub r0, r0, #MCOUNT_INSN_SIZE mov lr, pc mov pc, r2 ldmia sp!, {r0-r3, pc} diff --git a/arch/arm/kernel/ftrace.c b/arch/arm/kernel/ftrace.c index 22f3d6e309f9..76d50e6091bc 100644 --- a/arch/arm/kernel/ftrace.c +++ b/arch/arm/kernel/ftrace.c @@ -12,9 +12,10 @@ */ #include + #include +#include -#define INSN_SIZE 4 #define PC_OFFSET 8 #define BL_OPCODE 0xeb000000 #define BL_OFFSET_MASK 0x00ffffff @@ -32,10 +33,10 @@ unsigned char *ftrace_call_replace(unsigned long pc, unsigned long addr) { long offset; - offset = (long)addr - (long)(pc - INSN_SIZE + PC_OFFSET); + offset = (long)addr - (long)(pc + PC_OFFSET); if (unlikely(offset < -33554432 || offset > 33554428)) { /* Can't generate branches that far (from ARM ARM). Ftrace - * doesn't generate branches outside of core kernel text. + * doesn't generate branches outside of kernel text. */ WARN_ON_ONCE(1); return NULL; @@ -52,7 +53,6 @@ int ftrace_modify_code(unsigned long pc, unsigned char *old_code, old = *(unsigned long *)old_code; new = *(unsigned long *)new_code; - pc -= INSN_SIZE; __asm__ __volatile__ ( "1: ldr %1, [%2] \n" @@ -77,7 +77,7 @@ int ftrace_modify_code(unsigned long pc, unsigned char *old_code, : "memory"); if (!err && (replaced == old)) - flush_icache_range(pc, pc + INSN_SIZE); + flush_icache_range(pc, pc + MCOUNT_INSN_SIZE); return err; } @@ -89,8 +89,7 @@ int ftrace_update_ftrace_func(ftrace_func_t func) unsigned char *new; pc = (unsigned long)&ftrace_call; - pc += INSN_SIZE; - memcpy(&old, &ftrace_call, INSN_SIZE); + memcpy(&old, &ftrace_call, MCOUNT_INSN_SIZE); new = ftrace_call_replace(pc, (unsigned long)func); ret = ftrace_modify_code(pc, (unsigned char *)&old, new); return ret; @@ -103,8 +102,7 @@ int ftrace_mcount_set(unsigned long *data) unsigned char *new; pc = (unsigned long)&mcount_call; - pc += INSN_SIZE; - memcpy(&old, &mcount_call, INSN_SIZE); + memcpy(&old, &mcount_call, MCOUNT_INSN_SIZE); new = ftrace_call_replace(pc, *addr); *addr = ftrace_modify_code(pc, (unsigned char *)&old, new); return 0; diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S index 3b1dd29d9f91..7231a708af0d 100644 --- a/arch/powerpc/kernel/entry_32.S +++ b/arch/powerpc/kernel/entry_32.S @@ -30,6 +30,7 @@ #include #include #include +#include #undef SHOW_SYSCALLS #undef SHOW_SYSCALLS_TASK @@ -1053,6 +1054,7 @@ _GLOBAL(_mcount) stw r10,40(r1) stw r3, 44(r1) stw r5, 8(r1) + subi r3, r3, MCOUNT_INSN_SIZE .globl mcount_call mcount_call: bl ftrace_stub @@ -1090,6 +1092,7 @@ _GLOBAL(ftrace_caller) stw r10,40(r1) stw r3, 44(r1) stw r5, 8(r1) + subi r3, r3, MCOUNT_INSN_SIZE .globl ftrace_call ftrace_call: bl ftrace_stub @@ -1128,6 +1131,7 @@ _GLOBAL(_mcount) stw r3, 44(r1) stw r5, 8(r1) + subi r3, r3, MCOUNT_INSN_SIZE LOAD_REG_ADDR(r5, 
ftrace_trace_function) lwz r5,0(r5) diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S index 2c4d9e056ead..2f511a969d2c 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S @@ -31,6 +31,7 @@ #include #include #include +#include /* * System calls. @@ -879,6 +880,7 @@ _GLOBAL(_mcount) mflr r3 stdu r1, -112(r1) std r3, 128(r1) + subi r3, r3, MCOUNT_INSN_SIZE .globl mcount_call mcount_call: bl ftrace_stub @@ -895,6 +897,7 @@ _GLOBAL(ftrace_caller) stdu r1, -112(r1) std r3, 128(r1) ld r4, 16(r11) + subi r3, r3, MCOUNT_INSN_SIZE .globl ftrace_call ftrace_call: bl ftrace_stub @@ -916,7 +919,7 @@ _GLOBAL(_mcount) std r3, 128(r1) ld r4, 16(r11) - + subi r3, r3, MCOUNT_INSN_SIZE LOAD_REG_ADDR(r5,ftrace_trace_function) ld r5,0(r5) ld r5,0(r5) diff --git a/arch/powerpc/kernel/ftrace.c b/arch/powerpc/kernel/ftrace.c index e12c593ab9ca..3855ceb937b0 100644 --- a/arch/powerpc/kernel/ftrace.c +++ b/arch/powerpc/kernel/ftrace.c @@ -15,8 +15,8 @@ #include #include +#include -#define CALL_BACK 4 static unsigned int ftrace_nop = 0x60000000; @@ -27,9 +27,10 @@ static unsigned int ftrace_nop = 0x60000000; # define GET_ADDR(addr) *(unsigned long *)addr #endif + static unsigned int notrace ftrace_calc_offset(long ip, long addr) { - return (int)((addr + CALL_BACK) - ip); + return (int)(addr - ip); } notrace unsigned char *ftrace_nop_replace(void) @@ -76,9 +77,6 @@ ftrace_modify_code(unsigned long ip, unsigned char *old_code, unsigned new = *(unsigned *)new_code; int faulted = 0; - /* move the IP back to the start of the call */ - ip -= CALL_BACK; - /* * Note: Due to modules and __init, code can * disappear and change, we need to protect against faulting @@ -118,12 +116,10 @@ ftrace_modify_code(unsigned long ip, unsigned char *old_code, notrace int ftrace_update_ftrace_func(ftrace_func_t func) { unsigned long ip = (unsigned long)(&ftrace_call); - unsigned char old[4], *new; + unsigned char old[MCOUNT_INSN_SIZE], *new; int ret; - ip += CALL_BACK; - - memcpy(old, &ftrace_call, 4); + memcpy(old, &ftrace_call, MCOUNT_INSN_SIZE); new = ftrace_call_replace(ip, (unsigned long)func); ret = ftrace_modify_code(ip, old, new); @@ -134,16 +130,13 @@ notrace int ftrace_mcount_set(unsigned long *data) { unsigned long ip = (long)(&mcount_call); unsigned long *addr = data; - unsigned char old[4], *new; - - /* ip is at the location, but modify code will subtact this */ - ip += CALL_BACK; + unsigned char old[MCOUNT_INSN_SIZE], *new; /* * Replace the mcount stub with a pointer to the * ip recorder function. 
*/ - memcpy(old, &mcount_call, 4); + memcpy(old, &mcount_call, MCOUNT_INSN_SIZE); new = ftrace_call_replace(ip, *addr); *addr = ftrace_modify_code(ip, old, new); diff --git a/arch/sparc64/kernel/ftrace.c b/arch/sparc64/kernel/ftrace.c index c17373195b1e..4298d0aee713 100644 --- a/arch/sparc64/kernel/ftrace.c +++ b/arch/sparc64/kernel/ftrace.c @@ -5,6 +5,8 @@ #include #include +#include + static const u32 ftrace_nop = 0x01000000; notrace unsigned char *ftrace_nop_replace(void) @@ -60,9 +62,9 @@ ftrace_modify_code(unsigned long ip, unsigned char *old_code, notrace int ftrace_update_ftrace_func(ftrace_func_t func) { unsigned long ip = (unsigned long)(&ftrace_call); - unsigned char old[4], *new; + unsigned char old[MCOUNT_INSN_SIZE], *new; - memcpy(old, &ftrace_call, 4); + memcpy(old, &ftrace_call, MCOUNT_INSN_SIZE); new = ftrace_call_replace(ip, (unsigned long)func); return ftrace_modify_code(ip, old, new); } @@ -71,13 +73,13 @@ notrace int ftrace_mcount_set(unsigned long *data) { unsigned long ip = (long)(&mcount_call); unsigned long *addr = data; - unsigned char old[4], *new; + unsigned char old[MCOUNT_INSN_SIZE], *new; /* * Replace the mcount stub with a pointer to the * ip recorder function. */ - memcpy(old, &mcount_call, 4); + memcpy(old, &mcount_call, MCOUNT_INSN_SIZE); new = ftrace_call_replace(ip, *addr); *addr = ftrace_modify_code(ip, old, new); diff --git a/arch/sparc64/kernel/sparc64_ksyms.c b/arch/sparc64/kernel/sparc64_ksyms.c index 8ac0b99f2c55..b80d982a29c6 100644 --- a/arch/sparc64/kernel/sparc64_ksyms.c +++ b/arch/sparc64/kernel/sparc64_ksyms.c @@ -53,6 +53,7 @@ #include #include #include +#include struct poll { int fd; @@ -112,7 +113,6 @@ EXPORT_SYMBOL(smp_call_function); #endif /* CONFIG_SMP */ #if defined(CONFIG_MCOUNT) -extern void _mcount(void); EXPORT_SYMBOL(_mcount); #endif diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 04ea83ccb979..95e6bbe3665e 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -51,6 +51,7 @@ #include #include #include +#include #include "irq_vectors.h" /* @@ -1118,6 +1119,7 @@ ENTRY(mcount) pushl %ecx pushl %edx movl 0xc(%esp), %eax + subl $MCOUNT_INSN_SIZE, %eax .globl mcount_call mcount_call: @@ -1136,6 +1138,7 @@ ENTRY(ftrace_caller) pushl %edx movl 0xc(%esp), %eax movl 0x4(%ebp), %edx + subl $MCOUNT_INSN_SIZE, %eax .globl ftrace_call ftrace_call: @@ -1166,6 +1169,7 @@ trace: pushl %edx movl 0xc(%esp), %eax movl 0x4(%ebp), %edx + subl $MCOUNT_INSN_SIZE, %eax call *ftrace_trace_function diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index fe25e5febca3..b0f7308f78a6 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -51,6 +51,7 @@ #include #include #include +#include .code64 @@ -68,6 +69,7 @@ ENTRY(mcount) movq %r9, 48(%rsp) movq 0x38(%rsp), %rdi + subq $MCOUNT_INSN_SIZE, %rdi .globl mcount_call mcount_call: @@ -99,6 +101,7 @@ ENTRY(ftrace_caller) movq 0x38(%rsp), %rdi movq 8(%rbp), %rsi + subq $MCOUNT_INSN_SIZE, %rdi .globl ftrace_call ftrace_call: @@ -139,6 +142,7 @@ trace: movq 0x38(%rsp), %rdi movq 8(%rbp), %rsi + subq $MCOUNT_INSN_SIZE, %rdi call *ftrace_trace_function diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 55828149e01e..ab115cd15fdf 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -17,20 +17,21 @@ #include #include +#include -#define CALL_BACK 5 /* Long is fine, even if it is only 4 bytes ;-) */ static long *ftrace_nop; union ftrace_code_union { - char code[5]; + char 
code[MCOUNT_INSN_SIZE]; struct { char e8; int offset; } __attribute__((packed)); }; + static int notrace ftrace_calc_offset(long ip, long addr) { return (int)(addr - ip); @@ -46,7 +47,7 @@ notrace unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr) static union ftrace_code_union calc; calc.e8 = 0xe8; - calc.offset = ftrace_calc_offset(ip, addr); + calc.offset = ftrace_calc_offset(ip + MCOUNT_INSN_SIZE, addr); /* * No locking needed, this must be called via kstop_machine @@ -65,9 +66,6 @@ ftrace_modify_code(unsigned long ip, unsigned char *old_code, unsigned char newch = new_code[4]; int faulted = 0; - /* move the IP back to the start of the call */ - ip -= CALL_BACK; - /* * Note: Due to modules and __init, code can * disappear and change, we need to protect against faulting @@ -102,12 +100,10 @@ ftrace_modify_code(unsigned long ip, unsigned char *old_code, notrace int ftrace_update_ftrace_func(ftrace_func_t func) { unsigned long ip = (unsigned long)(&ftrace_call); - unsigned char old[5], *new; + unsigned char old[MCOUNT_INSN_SIZE], *new; int ret; - ip += CALL_BACK; - - memcpy(old, &ftrace_call, 5); + memcpy(old, &ftrace_call, MCOUNT_INSN_SIZE); new = ftrace_call_replace(ip, (unsigned long)func); ret = ftrace_modify_code(ip, old, new); @@ -118,16 +114,13 @@ notrace int ftrace_mcount_set(unsigned long *data) { unsigned long ip = (long)(&mcount_call); unsigned long *addr = data; - unsigned char old[5], *new; - - /* ip is at the location, but modify code will subtact this */ - ip += CALL_BACK; + unsigned char old[MCOUNT_INSN_SIZE], *new; /* * Replace the mcount stub with a pointer to the * ip recorder function. */ - memcpy(old, &mcount_call, 5); + memcpy(old, &mcount_call, MCOUNT_INSN_SIZE); new = ftrace_call_replace(ip, *addr); *addr = ftrace_modify_code(ip, old, new); @@ -142,8 +135,7 @@ int __init ftrace_dyn_arch_init(void *data) ftrace_mcount_set(data); - ftrace_nop = (unsigned long *)noptable[CALL_BACK]; + ftrace_nop = (unsigned long *)noptable[MCOUNT_INSN_SIZE]; return 0; } - diff --git a/arch/x86/kernel/i386_ksyms_32.c b/arch/x86/kernel/i386_ksyms_32.c index 29999dbb754c..dd7ebee446af 100644 --- a/arch/x86/kernel/i386_ksyms_32.c +++ b/arch/x86/kernel/i386_ksyms_32.c @@ -1,9 +1,9 @@ -#include #include #include #include #include +#include #ifdef CONFIG_FTRACE /* mcount is defined in assembly */ diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c index 122885bc5f3b..16ff4bf418d9 100644 --- a/arch/x86/kernel/x8664_ksyms_64.c +++ b/arch/x86/kernel/x8664_ksyms_64.c @@ -1,7 +1,6 @@ /* Exports for assembly files. All C exports should go in the respective C files. 
*/ -#include #include @@ -11,6 +10,7 @@ #include #include #include +#include #ifdef CONFIG_FTRACE /* mcount is defined in assembly */ diff --git a/include/asm-arm/ftrace.h b/include/asm-arm/ftrace.h new file mode 100644 index 000000000000..584ef9a8e5a5 --- /dev/null +++ b/include/asm-arm/ftrace.h @@ -0,0 +1,14 @@ +#ifndef _ASM_ARM_FTRACE +#define _ASM_ARM_FTRACE + +#ifdef CONFIG_FTRACE +#define MCOUNT_ADDR ((long)(mcount)) +#define MCOUNT_INSN_SIZE 4 /* sizeof mcount call */ + +#ifndef __ASSEMBLY__ +extern void mcount(void); +#endif + +#endif + +#endif /* _ASM_ARM_FTRACE */ diff --git a/include/asm-powerpc/ftrace.h b/include/asm-powerpc/ftrace.h index b1bfa704b6e5..de921326cca8 100644 --- a/include/asm-powerpc/ftrace.h +++ b/include/asm-powerpc/ftrace.h @@ -1,6 +1,14 @@ #ifndef _ASM_POWERPC_FTRACE #define _ASM_POWERPC_FTRACE +#ifdef CONFIG_FTRACE +#define MCOUNT_ADDR ((long)(_mcount)) +#define MCOUNT_INSN_SIZE 4 /* sizeof mcount call */ + +#ifndef __ASSEMBLY__ extern void _mcount(void); +#endif #endif + +#endif /* _ASM_POWERPC_FTRACE */ diff --git a/include/asm-sparc64/ftrace.h b/include/asm-sparc64/ftrace.h new file mode 100644 index 000000000000..f76a40a338bb --- /dev/null +++ b/include/asm-sparc64/ftrace.h @@ -0,0 +1,14 @@ +#ifndef _ASM_SPARC64_FTRACE +#define _ASM_SPARC64_FTRACE + +#ifdef CONFIG_FTRACE +#define MCOUNT_ADDR ((long)(_mcount)) +#define MCOUNT_INSN_SIZE 4 /* sizeof mcount call */ + +#ifndef __ASSEMBLY__ +extern void _mcount(void); +#endif + +#endif + +#endif /* _ASM_SPARC64_FTRACE */ diff --git a/include/asm-x86/ftrace.h b/include/asm-x86/ftrace.h new file mode 100644 index 000000000000..c184441133f2 --- /dev/null +++ b/include/asm-x86/ftrace.h @@ -0,0 +1,14 @@ +#ifndef _ASM_X86_FTRACE +#define _ASM_X86_FTRACE + +#ifdef CONFIG_FTRACE +#define MCOUNT_ADDR ((long)(mcount)) +#define MCOUNT_INSN_SIZE 5 /* sizeof mcount call */ + +#ifndef __ASSEMBLY__ +extern void mcount(void); +#endif + +#endif /* CONFIG_FTRACE */ + +#endif /* _ASM_X86_FTRACE */ diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 20e14d0093c7..366098d591de 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -31,7 +31,6 @@ int unregister_ftrace_function(struct ftrace_ops *ops); void clear_ftrace_function(void); extern void ftrace_stub(unsigned long a0, unsigned long a1); -extern void mcount(void); #else /* !CONFIG_FTRACE */ # define register_ftrace_function(ops) do { } while (0) @@ -54,7 +53,7 @@ enum { struct dyn_ftrace { struct hlist_node node; - unsigned long ip; + unsigned long ip; /* address of mcount call-site */ unsigned long flags; }; diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 0d5bcf69952d..f1e9e5c74e64 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -27,6 +27,8 @@ #include #include +#include + #include "trace.h" /* ftrace_enabled is a method to turn ftrace on or off */ @@ -329,7 +331,6 @@ ftrace_record_ip(unsigned long ip) } #define FTRACE_ADDR ((long)(ftrace_caller)) -#define MCOUNT_ADDR ((long)(mcount)) static int __ftrace_replace_code(struct dyn_ftrace *rec, -- cgit v1.2.3 From 3b16cf874861436725c43ba0b68bdd799297be7c Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 26 Jun 2008 11:21:54 +0200 Subject: x86: convert to generic helpers for IPI function calls This converts x86, x86-64, and xen to use the new helpers for smp_call_function() and friends, and adds support for smp_call_function_single().
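(An illustrative aside, not part of the patch: with the generic helpers, arch code only has to supply the IPI senders wired up in the diffs that follow, while callers keep the existing cross-call API. A minimal hypothetical caller, using the 2.6.26-era signatures that still carry the nonatomic/retry argument; the commit after this one removes it:)

/* Hypothetical caller sketch, not from this patch; it assumes the
 * era's smp_call_function()/smp_call_function_single() prototypes
 * visible in the surrounding diffs. */
#include <linux/smp.h>
#include <asm/atomic.h>

static atomic_t hits = ATOMIC_INIT(0);

/* Runs on each target CPU in IPI context: must be fast and must not sleep. */
static void bump(void *info)
{
	atomic_inc((atomic_t *)info);
}

static void example(void)
{
	/* All other online CPUs; nonatomic=0, wait=1 (block until done). */
	smp_call_function(bump, &hits, 0, 1);

	/* One specific CPU, now carried by the new single-call IPI vector. */
	smp_call_function_single(1, bump, &hits, 0, 1);
}
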
Acked-by: Ingo Molnar Acked-by: Jeremy Fitzhardinge Signed-off-by: Jens Axboe --- arch/x86/Kconfig | 1 + arch/x86/kernel/apic_32.c | 4 + arch/x86/kernel/entry_64.S | 3 + arch/x86/kernel/i8259_64.c | 4 + arch/x86/kernel/smp.c | 158 ++++------------------------- arch/x86/kernel/smpboot.c | 4 +- arch/x86/kernel/smpcommon.c | 56 ---------- arch/x86/mach-voyager/voyager_smp.c | 94 ++++------------- arch/x86/xen/enlighten.c | 4 +- arch/x86/xen/mmu.c | 2 +- arch/x86/xen/smp.c | 133 +++++++++--------------- arch/x86/xen/xen-ops.h | 9 +- include/asm-x86/hw_irq_32.h | 1 + include/asm-x86/hw_irq_64.h | 2 + include/asm-x86/mach-default/entry_arch.h | 1 + include/asm-x86/mach-default/irq_vectors.h | 1 + include/asm-x86/mach-voyager/entry_arch.h | 2 +- include/asm-x86/mach-voyager/irq_vectors.h | 4 +- include/asm-x86/smp.h | 21 ++-- include/asm-x86/xen/events.h | 1 + 20 files changed, 125 insertions(+), 380 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index e0edaaa6920a..2f3fbebf51d8 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -168,6 +168,7 @@ config GENERIC_PENDING_IRQ config X86_SMP bool depends on SMP && ((X86_32 && !X86_VOYAGER) || X86_64) + select USE_GENERIC_SMP_HELPERS default y config X86_32_SMP diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c index 4b99b1bdeb6c..71017f71f4bc 100644 --- a/arch/x86/kernel/apic_32.c +++ b/arch/x86/kernel/apic_32.c @@ -1358,6 +1358,10 @@ void __init smp_intr_init(void) /* IPI for generic function call */ set_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); + + /* IPI for single call function */ + set_intr_gate(CALL_FUNCTION_SINGLE_VECTOR, + call_function_single_interrupt); } #endif diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 556a8df522a7..6d1fe270a96d 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -711,6 +711,9 @@ END(invalidate_interrupt\num) ENTRY(call_function_interrupt) apicinterrupt CALL_FUNCTION_VECTOR,smp_call_function_interrupt END(call_function_interrupt) +ENTRY(call_function_single_interrupt) + apicinterrupt CALL_FUNCTION_SINGLE_VECTOR,smp_call_function_single_interrupt +END(call_function_single_interrupt) ENTRY(irq_move_cleanup_interrupt) apicinterrupt IRQ_MOVE_CLEANUP_VECTOR,smp_irq_move_cleanup_interrupt END(irq_move_cleanup_interrupt) diff --git a/arch/x86/kernel/i8259_64.c b/arch/x86/kernel/i8259_64.c index fa57a1568508..00d2ccdc69f8 100644 --- a/arch/x86/kernel/i8259_64.c +++ b/arch/x86/kernel/i8259_64.c @@ -494,6 +494,10 @@ void __init native_init_IRQ(void) /* IPI for generic function call */ set_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); + /* IPI for generic single function call */ + set_intr_gate(CALL_FUNCTION_SINGLE_VECTOR, + call_function_single_interrupt); + /* Low priority IPI to cleanup after moving an irq */ set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt); #endif diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index 0cb7aadc87cd..575aa3d7248a 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -121,132 +121,23 @@ static void native_smp_send_reschedule(int cpu) send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR); } -/* - * Structure and data for smp_call_function(). This is designed to minimise - * static memory requirements. It also looks cleaner. 
- */ -static DEFINE_SPINLOCK(call_lock); - -struct call_data_struct { - void (*func) (void *info); - void *info; - atomic_t started; - atomic_t finished; - int wait; -}; - -void lock_ipi_call_lock(void) +void native_send_call_func_single_ipi(int cpu) { - spin_lock_irq(&call_lock); -} - -void unlock_ipi_call_lock(void) -{ - spin_unlock_irq(&call_lock); -} - -static struct call_data_struct *call_data; - -static void __smp_call_function(void (*func) (void *info), void *info, - int nonatomic, int wait) -{ - struct call_data_struct data; - int cpus = num_online_cpus() - 1; - - if (!cpus) - return; - - data.func = func; - data.info = info; - atomic_set(&data.started, 0); - data.wait = wait; - if (wait) - atomic_set(&data.finished, 0); - - call_data = &data; - mb(); - - /* Send a message to all other CPUs and wait for them to respond */ - send_IPI_allbutself(CALL_FUNCTION_VECTOR); - - /* Wait for response */ - while (atomic_read(&data.started) != cpus) - cpu_relax(); - - if (wait) - while (atomic_read(&data.finished) != cpus) - cpu_relax(); + send_IPI_mask(cpumask_of_cpu(cpu), CALL_FUNCTION_SINGLE_VECTOR); } - -/** - * smp_call_function_mask(): Run a function on a set of other CPUs. - * @mask: The set of cpus to run on. Must not include the current cpu. - * @func: The function to run. This must be fast and non-blocking. - * @info: An arbitrary pointer to pass to the function. - * @wait: If true, wait (atomically) until function has completed on other CPUs. - * - * Returns 0 on success, else a negative status code. - * - * If @wait is true, then returns once @func has returned; otherwise - * it returns just before the target cpu calls @func. - * - * You must not call this function with disabled interrupts or from a - * hardware interrupt handler or from a bottom half handler. - */ -static int -native_smp_call_function_mask(cpumask_t mask, - void (*func)(void *), void *info, - int wait) +void native_send_call_func_ipi(cpumask_t mask) { - struct call_data_struct data; cpumask_t allbutself; - int cpus; - - /* Can deadlock when called with interrupts disabled */ - WARN_ON(irqs_disabled()); - - /* Holding any lock stops cpus from going down. 
*/ - spin_lock(&call_lock); allbutself = cpu_online_map; cpu_clear(smp_processor_id(), allbutself); - cpus_and(mask, mask, allbutself); - cpus = cpus_weight(mask); - - if (!cpus) { - spin_unlock(&call_lock); - return 0; - } - - data.func = func; - data.info = info; - atomic_set(&data.started, 0); - data.wait = wait; - if (wait) - atomic_set(&data.finished, 0); - - call_data = &data; - wmb(); - - /* Send a message to other CPUs */ if (cpus_equal(mask, allbutself) && cpus_equal(cpu_online_map, cpu_callout_map)) send_IPI_allbutself(CALL_FUNCTION_VECTOR); else send_IPI_mask(mask, CALL_FUNCTION_VECTOR); - - /* Wait for response */ - while (atomic_read(&data.started) != cpus) - cpu_relax(); - - if (wait) - while (atomic_read(&data.finished) != cpus) - cpu_relax(); - spin_unlock(&call_lock); - - return 0; } static void stop_this_cpu(void *dummy) @@ -268,18 +159,13 @@ static void stop_this_cpu(void *dummy) static void native_smp_send_stop(void) { - int nolock; unsigned long flags; if (reboot_force) return; - /* Don't deadlock on the call lock in panic */ - nolock = !spin_trylock(&call_lock); + smp_call_function(stop_this_cpu, NULL, 0, 0); local_irq_save(flags); - __smp_call_function(stop_this_cpu, NULL, 0, 0); - if (!nolock) - spin_unlock(&call_lock); disable_local_APIC(); local_irq_restore(flags); } @@ -301,33 +187,28 @@ void smp_reschedule_interrupt(struct pt_regs *regs) void smp_call_function_interrupt(struct pt_regs *regs) { - void (*func) (void *info) = call_data->func; - void *info = call_data->info; - int wait = call_data->wait; - ack_APIC_irq(); - /* - * Notify initiating CPU that I've grabbed the data and am - * about to execute the function - */ - mb(); - atomic_inc(&call_data->started); - /* - * At this point the info structure may be out of scope unless wait==1 - */ irq_enter(); - (*func)(info); + generic_smp_call_function_interrupt(); #ifdef CONFIG_X86_32 __get_cpu_var(irq_stat).irq_call_count++; #else add_pda(irq_call_count, 1); #endif irq_exit(); +} - if (wait) { - mb(); - atomic_inc(&call_data->finished); - } +void smp_call_function_single_interrupt(void) +{ + ack_APIC_irq(); + irq_enter(); + generic_smp_call_function_single_interrupt(); +#ifdef CONFIG_X86_32 + __get_cpu_var(irq_stat).irq_call_count++; +#else + add_pda(irq_call_count, 1); +#endif + irq_exit(); } struct smp_ops smp_ops = { @@ -338,7 +219,8 @@ struct smp_ops smp_ops = { .smp_send_stop = native_smp_send_stop, .smp_send_reschedule = native_smp_send_reschedule, - .smp_call_function_mask = native_smp_call_function_mask, + + .send_call_func_ipi = native_send_call_func_ipi, + .send_call_func_single_ipi = native_send_call_func_single_ipi, }; EXPORT_SYMBOL_GPL(smp_ops); - diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 56078d61c793..89647898f546 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -345,7 +345,7 @@ static void __cpuinit start_secondary(void *unused) * lock helps us to not include this cpu in a currently in progress * smp_call_function(). 
*/ - lock_ipi_call_lock(); + ipi_call_lock_irq(); #ifdef CONFIG_X86_64 spin_lock(&vector_lock); @@ -357,7 +357,7 @@ static void __cpuinit start_secondary(void *unused) spin_unlock(&vector_lock); #endif cpu_set(smp_processor_id(), cpu_online_map); - unlock_ipi_call_lock(); + ipi_call_unlock_irq(); per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE; setup_secondary_clock(); diff --git a/arch/x86/kernel/smpcommon.c b/arch/x86/kernel/smpcommon.c index 3449064d141a..99941b37eca0 100644 --- a/arch/x86/kernel/smpcommon.c +++ b/arch/x86/kernel/smpcommon.c @@ -25,59 +25,3 @@ __cpuinit void init_gdt(int cpu) per_cpu(cpu_number, cpu) = cpu; } #endif - -/** - * smp_call_function(): Run a function on all other CPUs. - * @func: The function to run. This must be fast and non-blocking. - * @info: An arbitrary pointer to pass to the function. - * @nonatomic: Unused. - * @wait: If true, wait (atomically) until function has completed on other CPUs. - * - * Returns 0 on success, else a negative status code. - * - * If @wait is true, then returns once @func has returned; otherwise - * it returns just before the target cpu calls @func. - * - * You must not call this function with disabled interrupts or from a - * hardware interrupt handler or from a bottom half handler. - */ -int smp_call_function(void (*func) (void *info), void *info, int nonatomic, - int wait) -{ - return smp_call_function_mask(cpu_online_map, func, info, wait); -} -EXPORT_SYMBOL(smp_call_function); - -/** - * smp_call_function_single - Run a function on a specific CPU - * @cpu: The target CPU. Cannot be the calling CPU. - * @func: The function to run. This must be fast and non-blocking. - * @info: An arbitrary pointer to pass to the function. - * @nonatomic: Unused. - * @wait: If true, wait until function has completed on other CPUs. - * - * Returns 0 on success, else a negative status code. - * - * If @wait is true, then returns once @func has returned; otherwise - * it returns just before the target cpu calls @func. - */ -int smp_call_function_single(int cpu, void (*func) (void *info), void *info, - int nonatomic, int wait) -{ - /* prevent preemption and reschedule on another processor */ - int ret; - int me = get_cpu(); - if (cpu == me) { - local_irq_disable(); - func(info); - local_irq_enable(); - put_cpu(); - return 0; - } - - ret = smp_call_function_mask(cpumask_of_cpu(cpu), func, info, wait); - - put_cpu(); - return ret; -} -EXPORT_SYMBOL(smp_call_function_single); diff --git a/arch/x86/mach-voyager/voyager_smp.c b/arch/x86/mach-voyager/voyager_smp.c index 8acbf0cdf1a5..cb34407a9930 100644 --- a/arch/x86/mach-voyager/voyager_smp.c +++ b/arch/x86/mach-voyager/voyager_smp.c @@ -955,94 +955,24 @@ static void smp_stop_cpu_function(void *dummy) halt(); } -static DEFINE_SPINLOCK(call_lock); - -struct call_data_struct { - void (*func) (void *info); - void *info; - volatile unsigned long started; - volatile unsigned long finished; - int wait; -}; - -static struct call_data_struct *call_data; - /* execute a thread on a new CPU. The function to be called must be * previously set up. 
This is used to schedule a function for * execution on all CPUs - set up the function then broadcast a * function_interrupt CPI to come here on each CPU */ static void smp_call_function_interrupt(void) { - void (*func) (void *info) = call_data->func; - void *info = call_data->info; - /* must take copy of wait because call_data may be replaced - * unless the function is waiting for us to finish */ - int wait = call_data->wait; - __u8 cpu = smp_processor_id(); - - /* - * Notify initiating CPU that I've grabbed the data and am - * about to execute the function - */ - mb(); - if (!test_and_clear_bit(cpu, &call_data->started)) { - /* If the bit wasn't set, this could be a replay */ - printk(KERN_WARNING "VOYAGER SMP: CPU %d received call funtion" - " with no call pending\n", cpu); - return; - } - /* - * At this point the info structure may be out of scope unless wait==1 - */ irq_enter(); - (*func) (info); + generic_smp_call_function_interrupt(); __get_cpu_var(irq_stat).irq_call_count++; irq_exit(); - if (wait) { - mb(); - clear_bit(cpu, &call_data->finished); - } } -static int -voyager_smp_call_function_mask(cpumask_t cpumask, - void (*func) (void *info), void *info, int wait) +static void smp_call_function_single_interrupt(void) { - struct call_data_struct data; - u32 mask = cpus_addr(cpumask)[0]; - - mask &= ~(1 << smp_processor_id()); - - if (!mask) - return 0; - - /* Can deadlock when called with interrupts disabled */ - WARN_ON(irqs_disabled()); - - data.func = func; - data.info = info; - data.started = mask; - data.wait = wait; - if (wait) - data.finished = mask; - - spin_lock(&call_lock); - call_data = &data; - wmb(); - /* Send a message to all other CPUs and wait for them to respond */ - send_CPI(mask, VIC_CALL_FUNCTION_CPI); - - /* Wait for response */ - while (data.started) - barrier(); - - if (wait) - while (data.finished) - barrier(); - - spin_unlock(&call_lock); - - return 0; + irq_enter(); + generic_smp_call_function_single_interrupt(); + __get_cpu_var(irq_stat).irq_call_count++; + irq_exit(); } /* Sorry about the name. 
In an APIC based system, the APICs @@ -1099,6 +1029,12 @@ void smp_qic_call_function_interrupt(struct pt_regs *regs) smp_call_function_interrupt(); } +void smp_qic_call_function_single_interrupt(struct pt_regs *regs) +{ + ack_QIC_CPI(QIC_CALL_FUNCTION_SINGLE_CPI); + smp_call_function_single_interrupt(); +} + void smp_vic_cpi_interrupt(struct pt_regs *regs) { struct pt_regs *old_regs = set_irq_regs(regs); @@ -1119,6 +1055,8 @@ void smp_vic_cpi_interrupt(struct pt_regs *regs) smp_enable_irq_interrupt(); if (test_and_clear_bit(VIC_CALL_FUNCTION_CPI, &vic_cpi_mailbox[cpu])) smp_call_function_interrupt(); + if (test_and_clear_bit(VIC_CALL_FUNCTION_SINGLE_CPI, &vic_cpi_mailbox[cpu])) + smp_call_function_single_interrupt(); set_irq_regs(old_regs); } @@ -1862,5 +1800,7 @@ struct smp_ops smp_ops = { .smp_send_stop = voyager_smp_send_stop, .smp_send_reschedule = voyager_smp_send_reschedule, - .smp_call_function_mask = voyager_smp_call_function_mask, + + .send_call_func_ipi = native_send_call_func_ipi, + .send_call_func_single_ipi = native_send_call_func_single_ipi, }; diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index f09c1c69c37a..8e317782fe37 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1108,7 +1108,9 @@ static const struct smp_ops xen_smp_ops __initdata = { .smp_send_stop = xen_smp_send_stop, .smp_send_reschedule = xen_smp_send_reschedule, - .smp_call_function_mask = xen_smp_call_function_mask, + + .send_call_func_ipi = xen_smp_send_call_function_ipi, + .send_call_func_single_ipi = xen_smp_send_call_function_single_ipi, }; #endif /* CONFIG_SMP */ diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index df40bf74ea75..5c01590380bc 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -558,7 +558,7 @@ static void drop_mm_ref(struct mm_struct *mm) } if (!cpus_empty(mask)) - xen_smp_call_function_mask(mask, drop_other_mm_ref, mm, 1); + smp_call_function_mask(mask, drop_other_mm_ref, mm, 1); } #else static void drop_mm_ref(struct mm_struct *mm) diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 94e69000f982..b3786e749b8e 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -36,27 +36,14 @@ #include "mmu.h" static cpumask_t xen_cpu_initialized_map; -static DEFINE_PER_CPU(int, resched_irq) = -1; -static DEFINE_PER_CPU(int, callfunc_irq) = -1; -static DEFINE_PER_CPU(int, debug_irq) = -1; - -/* - * Structure and data for smp_call_function(). This is designed to minimise - * static memory requirements. It also looks cleaner. - */ -static DEFINE_SPINLOCK(call_lock); -struct call_data_struct { - void (*func) (void *info); - void *info; - atomic_t started; - atomic_t finished; - int wait; -}; +static DEFINE_PER_CPU(int, resched_irq); +static DEFINE_PER_CPU(int, callfunc_irq); +static DEFINE_PER_CPU(int, callfuncsingle_irq); +static DEFINE_PER_CPU(int, debug_irq) = -1; static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id); - -static struct call_data_struct *call_data; +static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id); /* * Reschedule call back. 
Nothing to do, @@ -122,6 +109,17 @@ static int xen_smp_intr_init(unsigned int cpu) goto fail; per_cpu(debug_irq, cpu) = rc; + callfunc_name = kasprintf(GFP_KERNEL, "callfuncsingle%d", cpu); + rc = bind_ipi_to_irqhandler(XEN_CALL_FUNCTION_SINGLE_VECTOR, + cpu, + xen_call_function_single_interrupt, + IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING, + callfunc_name, + NULL); + if (rc < 0) + goto fail; + per_cpu(callfuncsingle_irq, cpu) = rc; + return 0; fail: @@ -131,6 +129,9 @@ static int xen_smp_intr_init(unsigned int cpu) unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL); if (per_cpu(debug_irq, cpu) >= 0) unbind_from_irqhandler(per_cpu(debug_irq, cpu), NULL); + if (per_cpu(callfuncsingle_irq, cpu) >= 0) + unbind_from_irqhandler(per_cpu(callfuncsingle_irq, cpu), NULL); + return rc; } @@ -338,7 +339,6 @@ void xen_smp_send_reschedule(int cpu) xen_send_IPI_one(cpu, XEN_RESCHEDULE_VECTOR); } - static void xen_send_IPI_mask(cpumask_t mask, enum ipi_vector vector) { unsigned cpu; @@ -349,83 +349,42 @@ static void xen_send_IPI_mask(cpumask_t mask, enum ipi_vector vector) xen_send_IPI_one(cpu, vector); } +void xen_smp_send_call_function_ipi(cpumask_t mask) +{ + int cpu; + + xen_send_IPI_mask(mask, XEN_CALL_FUNCTION_VECTOR); + + /* Make sure other vcpus get a chance to run if they need to. */ + for_each_cpu_mask(cpu, mask) { + if (xen_vcpu_stolen(cpu)) { + HYPERVISOR_sched_op(SCHEDOP_yield, 0); + break; + } + } +} + +void xen_smp_send_call_function_single_ipi(int cpu) +{ + xen_send_IPI_mask(cpumask_of_cpu(cpu), XEN_CALL_FUNCTION_SINGLE_VECTOR); +} + static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id) { - void (*func) (void *info) = call_data->func; - void *info = call_data->info; - int wait = call_data->wait; - - /* - * Notify initiating CPU that I've grabbed the data and am - * about to execute the function - */ - mb(); - atomic_inc(&call_data->started); - /* - * At this point the info structure may be out of scope unless wait==1 - */ irq_enter(); - (*func)(info); + generic_smp_call_function_interrupt(); __get_cpu_var(irq_stat).irq_call_count++; irq_exit(); - if (wait) { - mb(); /* commit everything before setting finished */ - atomic_inc(&call_data->finished); - } - return IRQ_HANDLED; } -int xen_smp_call_function_mask(cpumask_t mask, void (*func)(void *), - void *info, int wait) +static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id) { - struct call_data_struct data; - int cpus, cpu; - bool yield; - - /* Holding any lock stops cpus from going down. */ - spin_lock(&call_lock); - - cpu_clear(smp_processor_id(), mask); - - cpus = cpus_weight(mask); - if (!cpus) { - spin_unlock(&call_lock); - return 0; - } - - /* Can deadlock when called with interrupts disabled */ - WARN_ON(irqs_disabled()); - - data.func = func; - data.info = info; - atomic_set(&data.started, 0); - data.wait = wait; - if (wait) - atomic_set(&data.finished, 0); - - call_data = &data; - mb(); /* write everything before IPI */ - - /* Send a message to other CPUs and wait for them to respond */ - xen_send_IPI_mask(mask, XEN_CALL_FUNCTION_VECTOR); - - /* Make sure other vcpus get a chance to run if they need to. 
*/ - yield = false; - for_each_cpu_mask(cpu, mask) - if (xen_vcpu_stolen(cpu)) - yield = true; - - if (yield) - HYPERVISOR_sched_op(SCHEDOP_yield, 0); - - /* Wait for response */ - while (atomic_read(&data.started) != cpus || - (wait && atomic_read(&data.finished) != cpus)) - cpu_relax(); - - spin_unlock(&call_lock); + irq_enter(); + generic_smp_call_function_single_interrupt(); + __get_cpu_var(irq_stat).irq_call_count++; + irq_exit(); - return 0; + return IRQ_HANDLED; } diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index f1063ae08037..a636ab5e1341 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -46,13 +46,8 @@ void xen_smp_cpus_done(unsigned int max_cpus); void xen_smp_send_stop(void); void xen_smp_send_reschedule(int cpu); -int xen_smp_call_function (void (*func) (void *info), void *info, int nonatomic, - int wait); -int xen_smp_call_function_single(int cpu, void (*func) (void *info), void *info, - int nonatomic, int wait); - -int xen_smp_call_function_mask(cpumask_t mask, void (*func)(void *), - void *info, int wait); +void xen_smp_send_call_function_ipi(cpumask_t mask); +void xen_smp_send_call_function_single_ipi(int cpu); /* Declare an asm function, along with symbols needed to make it diff --git a/include/asm-x86/hw_irq_32.h b/include/asm-x86/hw_irq_32.h index ea88054e03f3..a87b1320c78f 100644 --- a/include/asm-x86/hw_irq_32.h +++ b/include/asm-x86/hw_irq_32.h @@ -32,6 +32,7 @@ extern void (*const interrupt[NR_IRQS])(void); void reschedule_interrupt(void); void invalidate_interrupt(void); void call_function_interrupt(void); +void call_function_single_interrupt(void); #endif #ifdef CONFIG_X86_LOCAL_APIC diff --git a/include/asm-x86/hw_irq_64.h b/include/asm-x86/hw_irq_64.h index 0062ef390f67..fe657812d4df 100644 --- a/include/asm-x86/hw_irq_64.h +++ b/include/asm-x86/hw_irq_64.h @@ -68,6 +68,7 @@ #define ERROR_APIC_VECTOR 0xfe #define RESCHEDULE_VECTOR 0xfd #define CALL_FUNCTION_VECTOR 0xfc +#define CALL_FUNCTION_SINGLE_VECTOR 0xfb /* fb free - please don't readd KDB here because it's useless (hint - think what a NMI bit does to a vector) */ #define THERMAL_APIC_VECTOR 0xfa @@ -102,6 +103,7 @@ void spurious_interrupt(void); void error_interrupt(void); void reschedule_interrupt(void); void call_function_interrupt(void); +void call_function_single_interrupt(void); void irq_move_cleanup_interrupt(void); void invalidate_interrupt0(void); void invalidate_interrupt1(void); diff --git a/include/asm-x86/mach-default/entry_arch.h b/include/asm-x86/mach-default/entry_arch.h index bc861469bdba..9283b60a1dd2 100644 --- a/include/asm-x86/mach-default/entry_arch.h +++ b/include/asm-x86/mach-default/entry_arch.h @@ -13,6 +13,7 @@ BUILD_INTERRUPT(reschedule_interrupt,RESCHEDULE_VECTOR) BUILD_INTERRUPT(invalidate_interrupt,INVALIDATE_TLB_VECTOR) BUILD_INTERRUPT(call_function_interrupt,CALL_FUNCTION_VECTOR) +BUILD_INTERRUPT(call_function_single_interrupt,CALL_FUNCTION_SINGLE_VECTOR) #endif /* diff --git a/include/asm-x86/mach-default/irq_vectors.h b/include/asm-x86/mach-default/irq_vectors.h index 881c63ca61ad..ed7d4955c653 100644 --- a/include/asm-x86/mach-default/irq_vectors.h +++ b/include/asm-x86/mach-default/irq_vectors.h @@ -48,6 +48,7 @@ #define INVALIDATE_TLB_VECTOR 0xfd #define RESCHEDULE_VECTOR 0xfc #define CALL_FUNCTION_VECTOR 0xfb +#define CALL_FUNCTION_SINGLE_VECTOR 0xfa #define THERMAL_APIC_VECTOR 0xf0 /* diff --git a/include/asm-x86/mach-voyager/entry_arch.h b/include/asm-x86/mach-voyager/entry_arch.h index 4a1e1e8c10b6..ae52624b5937 100644 --- 
a/include/asm-x86/mach-voyager/entry_arch.h +++ b/include/asm-x86/mach-voyager/entry_arch.h @@ -23,4 +23,4 @@ BUILD_INTERRUPT(qic_invalidate_interrupt, QIC_INVALIDATE_CPI); BUILD_INTERRUPT(qic_reschedule_interrupt, QIC_RESCHEDULE_CPI); BUILD_INTERRUPT(qic_enable_irq_interrupt, QIC_ENABLE_IRQ_CPI); BUILD_INTERRUPT(qic_call_function_interrupt, QIC_CALL_FUNCTION_CPI); - +BUILD_INTERRUPT(qic_call_function_single_interrupt, QIC_CALL_FUNCTION_SINGLE_CPI); diff --git a/include/asm-x86/mach-voyager/irq_vectors.h b/include/asm-x86/mach-voyager/irq_vectors.h index 165421f5821c..fda57ad37b5d 100644 --- a/include/asm-x86/mach-voyager/irq_vectors.h +++ b/include/asm-x86/mach-voyager/irq_vectors.h @@ -33,6 +33,7 @@ #define VIC_RESCHEDULE_CPI 4 #define VIC_ENABLE_IRQ_CPI 5 #define VIC_CALL_FUNCTION_CPI 6 +#define VIC_CALL_FUNCTION_SINGLE_CPI 7 /* Now the QIC CPIs: Since we don't need the two initial levels, * these are 2 less than the VIC CPIs */ @@ -42,9 +43,10 @@ #define QIC_RESCHEDULE_CPI (VIC_RESCHEDULE_CPI - QIC_CPI_OFFSET) #define QIC_ENABLE_IRQ_CPI (VIC_ENABLE_IRQ_CPI - QIC_CPI_OFFSET) #define QIC_CALL_FUNCTION_CPI (VIC_CALL_FUNCTION_CPI - QIC_CPI_OFFSET) +#define QIC_CALL_FUNCTION_SINGLE_CPI (VIC_CALL_FUNCTION_SINGLE_CPI - QIC_CPI_OFFSET) #define VIC_START_FAKE_CPI VIC_TIMER_CPI -#define VIC_END_FAKE_CPI VIC_CALL_FUNCTION_CPI +#define VIC_END_FAKE_CPI VIC_CALL_FUNCTION_SINGLE_CPI /* this is the SYS_INT CPI. */ #define VIC_SYS_INT 8 diff --git a/include/asm-x86/smp.h b/include/asm-x86/smp.h index 1ebaa5cd3112..e3c24807b59b 100644 --- a/include/asm-x86/smp.h +++ b/include/asm-x86/smp.h @@ -59,9 +59,9 @@ struct smp_ops { void (*smp_send_stop)(void); void (*smp_send_reschedule)(int cpu); - int (*smp_call_function_mask)(cpumask_t mask, - void (*func)(void *info), void *info, - int wait); + + void (*send_call_func_ipi)(cpumask_t mask); + void (*send_call_func_single_ipi)(int cpu); }; /* Globals due to paravirt */ @@ -103,17 +103,22 @@ static inline void smp_send_reschedule(int cpu) smp_ops.smp_send_reschedule(cpu); } -static inline int smp_call_function_mask(cpumask_t mask, - void (*func) (void *info), void *info, - int wait) +static inline void arch_send_call_function_single_ipi(int cpu) +{ + smp_ops.send_call_func_single_ipi(cpu); +} + +static inline void arch_send_call_function_ipi(cpumask_t mask) { - return smp_ops.smp_call_function_mask(mask, func, info, wait); + smp_ops.send_call_func_ipi(mask); } void native_smp_prepare_boot_cpu(void); void native_smp_prepare_cpus(unsigned int max_cpus); void native_smp_cpus_done(unsigned int max_cpus); int native_cpu_up(unsigned int cpunum); +void native_send_call_func_ipi(cpumask_t mask); +void native_send_call_func_single_ipi(int cpu); extern int __cpu_disable(void); extern void __cpu_die(unsigned int cpu); @@ -202,7 +207,5 @@ extern void cpu_uninit(void); #endif extern void smp_alloc_memory(void); -extern void lock_ipi_call_lock(void); -extern void unlock_ipi_call_lock(void); #endif /* __ASSEMBLY__ */ #endif diff --git a/include/asm-x86/xen/events.h b/include/asm-x86/xen/events.h index 596312a7bfc9..f8d57ea1f05f 100644 --- a/include/asm-x86/xen/events.h +++ b/include/asm-x86/xen/events.h @@ -4,6 +4,7 @@ enum ipi_vector { XEN_RESCHEDULE_VECTOR, XEN_CALL_FUNCTION_VECTOR, + XEN_CALL_FUNCTION_SINGLE_VECTOR, XEN_NR_IPIS, }; -- cgit v1.2.3 From 8691e5a8f691cc2a4fda0651e8d307aaba0e7d68 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 6 Jun 2008 11:18:06 +0200 Subject: smp_call_function: get rid of the unused nonatomic/retry argument It's never used and the 
comments refer to nonatomic and retry interchangeably. So get rid of it. Acked-by: Jeremy Fitzhardinge Signed-off-by: Jens Axboe --- arch/alpha/kernel/core_marvel.c | 2 +- arch/alpha/kernel/smp.c | 6 +++--- arch/alpha/oprofile/common.c | 6 +++--- arch/arm/oprofile/op_model_mpcore.c | 2 +- arch/arm/vfp/vfpmodule.c | 2 +- arch/cris/arch-v32/kernel/smp.c | 5 ++--- arch/ia64/kernel/mca.c | 2 +- arch/ia64/kernel/palinfo.c | 2 +- arch/ia64/kernel/perfmon.c | 2 +- arch/ia64/kernel/process.c | 2 +- arch/ia64/kernel/smpboot.c | 2 +- arch/ia64/kernel/uncached.c | 5 ++--- arch/ia64/sn/kernel/sn2/sn_hwperf.c | 2 +- arch/m32r/kernel/smp.c | 4 ++-- arch/mips/kernel/smp.c | 4 ++-- arch/mips/mm/c-r4k.c | 18 +++++++++--------- arch/mips/pmc-sierra/yosemite/prom.c | 2 +- arch/mips/sibyte/cfe/setup.c | 2 +- arch/mips/sibyte/sb1250/prom.c | 2 +- arch/powerpc/kernel/smp.c | 2 +- arch/s390/appldata/appldata_base.c | 4 ++-- arch/s390/kernel/smp.c | 16 ++++++---------- arch/s390/kernel/time.c | 4 ++-- arch/sh/kernel/smp.c | 10 +++++----- arch/sparc64/kernel/smp.c | 12 ++++-------- arch/um/kernel/smp.c | 3 +-- arch/x86/kernel/cpu/mtrr/main.c | 4 ++-- arch/x86/kernel/cpuid.c | 2 +- arch/x86/kernel/ldt.c | 2 +- arch/x86/kernel/nmi_32.c | 2 +- arch/x86/kernel/nmi_64.c | 2 +- arch/x86/kernel/smp.c | 2 +- arch/x86/kernel/vsyscall_64.c | 2 +- arch/x86/kvm/vmx.c | 2 +- arch/x86/kvm/x86.c | 2 +- arch/x86/lib/msr-on-cpu.c | 8 ++++---- arch/x86/mach-voyager/voyager_smp.c | 2 +- arch/x86/xen/smp.c | 2 +- drivers/acpi/processor_idle.c | 2 +- drivers/cpuidle/cpuidle.c | 2 +- include/asm-alpha/smp.h | 2 +- include/asm-sparc/smp.h | 2 +- include/linux/smp.h | 8 ++++---- kernel/smp.c | 6 ++---- kernel/softirq.c | 2 +- kernel/time/tick-broadcast.c | 2 +- net/core/flow.c | 2 +- net/iucv/iucv.c | 14 +++++++------- virt/kvm/kvm_main.c | 6 +++--- 49 files changed, 95 insertions(+), 108 deletions(-) (limited to 'arch/x86') diff --git a/arch/alpha/kernel/core_marvel.c b/arch/alpha/kernel/core_marvel.c index ced4aae8b804..04dcc5e5d4c1 100644 --- a/arch/alpha/kernel/core_marvel.c +++ b/arch/alpha/kernel/core_marvel.c @@ -662,7 +662,7 @@ __marvel_rtc_io(u8 b, unsigned long addr, int write) if (smp_processor_id() != boot_cpuid) smp_call_function_single(boot_cpuid, __marvel_access_rtc, - &rtc_access, 1, 1); + &rtc_access, 1); else __marvel_access_rtc(&rtc_access); #else diff --git a/arch/alpha/kernel/smp.c b/arch/alpha/kernel/smp.c index 95c905be9154..44114c8dbb2a 100644 --- a/arch/alpha/kernel/smp.c +++ b/arch/alpha/kernel/smp.c @@ -710,7 +710,7 @@ flush_tlb_mm(struct mm_struct *mm) } } - if (smp_call_function(ipi_flush_tlb_mm, mm, 1, 1)) { + if (smp_call_function(ipi_flush_tlb_mm, mm, 1)) { printk(KERN_CRIT "flush_tlb_mm: timed out\n"); } @@ -763,7 +763,7 @@ flush_tlb_page(struct vm_area_struct *vma, unsigned long addr) data.mm = mm; data.addr = addr; - if (smp_call_function(ipi_flush_tlb_page, &data, 1, 1)) { + if (smp_call_function(ipi_flush_tlb_page, &data, 1)) { printk(KERN_CRIT "flush_tlb_page: timed out\n"); } @@ -815,7 +815,7 @@ flush_icache_user_range(struct vm_area_struct *vma, struct page *page, } } - if (smp_call_function(ipi_flush_icache_page, mm, 1, 1)) { + if (smp_call_function(ipi_flush_icache_page, mm, 1)) { printk(KERN_CRIT "flush_icache_page: timed out\n"); } diff --git a/arch/alpha/oprofile/common.c b/arch/alpha/oprofile/common.c index 9fc0eeb4f0ab..7c3d5ec6ec67 100644 --- a/arch/alpha/oprofile/common.c +++ b/arch/alpha/oprofile/common.c @@ -65,7 +65,7 @@ op_axp_setup(void) model->reg_setup(&reg, ctr, &sys); /* Configure the
registers on all cpus. */ - (void)smp_call_function(model->cpu_setup, ®, 0, 1); + (void)smp_call_function(model->cpu_setup, ®, 1); model->cpu_setup(®); return 0; } @@ -86,7 +86,7 @@ op_axp_cpu_start(void *dummy) static int op_axp_start(void) { - (void)smp_call_function(op_axp_cpu_start, NULL, 0, 1); + (void)smp_call_function(op_axp_cpu_start, NULL, 1); op_axp_cpu_start(NULL); return 0; } @@ -101,7 +101,7 @@ op_axp_cpu_stop(void *dummy) static void op_axp_stop(void) { - (void)smp_call_function(op_axp_cpu_stop, NULL, 0, 1); + (void)smp_call_function(op_axp_cpu_stop, NULL, 1); op_axp_cpu_stop(NULL); } diff --git a/arch/arm/oprofile/op_model_mpcore.c b/arch/arm/oprofile/op_model_mpcore.c index 74fae6045650..4458705021e0 100644 --- a/arch/arm/oprofile/op_model_mpcore.c +++ b/arch/arm/oprofile/op_model_mpcore.c @@ -201,7 +201,7 @@ static int em_call_function(int (*fn)(void)) data.ret = 0; preempt_disable(); - smp_call_function(em_func, &data, 1, 1); + smp_call_function(em_func, &data, 1); em_func(&data); preempt_enable(); diff --git a/arch/arm/vfp/vfpmodule.c b/arch/arm/vfp/vfpmodule.c index 32455c633f1c..c0d2c9bb952b 100644 --- a/arch/arm/vfp/vfpmodule.c +++ b/arch/arm/vfp/vfpmodule.c @@ -352,7 +352,7 @@ static int __init vfp_init(void) else if (vfpsid & FPSID_NODOUBLE) { printk("no double precision support\n"); } else { - smp_call_function(vfp_enable, NULL, 1, 1); + smp_call_function(vfp_enable, NULL, 1); VFP_arch = (vfpsid & FPSID_ARCH_MASK) >> FPSID_ARCH_BIT; /* Extract the architecture version */ printk("implementor %02x architecture %d part %02x variant %x rev %x\n", diff --git a/arch/cris/arch-v32/kernel/smp.c b/arch/cris/arch-v32/kernel/smp.c index a9c3334e46c9..952a24b2f5a9 100644 --- a/arch/cris/arch-v32/kernel/smp.c +++ b/arch/cris/arch-v32/kernel/smp.c @@ -194,7 +194,7 @@ void stop_this_cpu(void* dummy) /* Other calls */ void smp_send_stop(void) { - smp_call_function(stop_this_cpu, NULL, 1, 0); + smp_call_function(stop_this_cpu, NULL, 0); } int setup_profiling_timer(unsigned int multiplier) @@ -316,8 +316,7 @@ int send_ipi(int vector, int wait, cpumask_t cpu_mask) * You must not call this function with disabled interrupts or from a * hardware interrupt handler or from a bottom half handler. 
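The change demanded of every caller in this patch is purely mechanical: the third argument disappears and only the wait flag survives. A before/after sketch (do_flush and do_init are illustrative names, not taken from any hunk):

	/* Before: the nonatomic/retry argument was accepted but never used. */
	smp_call_function(do_flush, mm, 1 /* nonatomic, ignored */, 1 /* wait */);
	smp_call_function_single(cpu, do_init, NULL, 0 /* retry, ignored */, 1);

	/* After: only the wait flag remains. */
	smp_call_function(do_flush, mm, 1);
	smp_call_function_single(cpu, do_init, NULL, 1);
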
*/ -int smp_call_function(void (*func)(void *info), void *info, - int nonatomic, int wait) +int smp_call_function(void (*func)(void *info), void *info, int wait) { cpumask_t cpu_mask = CPU_MASK_ALL; struct call_data_struct data; diff --git a/arch/ia64/kernel/mca.c b/arch/ia64/kernel/mca.c index 705176b434b3..9cd818cc7008 100644 --- a/arch/ia64/kernel/mca.c +++ b/arch/ia64/kernel/mca.c @@ -1881,7 +1881,7 @@ static int __cpuinit mca_cpu_callback(struct notifier_block *nfb, case CPU_ONLINE: case CPU_ONLINE_FROZEN: smp_call_function_single(hotcpu, ia64_mca_cmc_vector_adjust, - NULL, 1, 0); + NULL, 0); break; } return NOTIFY_OK; diff --git a/arch/ia64/kernel/palinfo.c b/arch/ia64/kernel/palinfo.c index 9dc00f7fe10e..e5c57f413ca2 100644 --- a/arch/ia64/kernel/palinfo.c +++ b/arch/ia64/kernel/palinfo.c @@ -921,7 +921,7 @@ int palinfo_handle_smp(pal_func_cpu_u_t *f, char *page) /* will send IPI to other CPU and wait for completion of remote call */ - if ((ret=smp_call_function_single(f->req_cpu, palinfo_smp_call, &ptr, 0, 1))) { + if ((ret=smp_call_function_single(f->req_cpu, palinfo_smp_call, &ptr, 1))) { printk(KERN_ERR "palinfo: remote CPU call from %d to %d on function %d: " "error %d\n", smp_processor_id(), f->req_cpu, f->func_id, ret); return 0; diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c index 7714a97b0104..9baa48255c12 100644 --- a/arch/ia64/kernel/perfmon.c +++ b/arch/ia64/kernel/perfmon.c @@ -1820,7 +1820,7 @@ pfm_syswide_cleanup_other_cpu(pfm_context_t *ctx) int ret; DPRINT(("calling CPU%d for cleanup\n", ctx->ctx_cpu)); - ret = smp_call_function_single(ctx->ctx_cpu, pfm_syswide_force_stop, ctx, 0, 1); + ret = smp_call_function_single(ctx->ctx_cpu, pfm_syswide_force_stop, ctx, 1); DPRINT(("called CPU%d for cleanup ret=%d\n", ctx->ctx_cpu, ret)); } #endif /* CONFIG_SMP */ diff --git a/arch/ia64/kernel/process.c b/arch/ia64/kernel/process.c index a3a34b4eb038..fabaf08d9a69 100644 --- a/arch/ia64/kernel/process.c +++ b/arch/ia64/kernel/process.c @@ -286,7 +286,7 @@ void cpu_idle_wait(void) { smp_mb(); /* kick all the CPUs so that they exit out of pm_idle */ - smp_call_function(do_nothing, NULL, 0, 1); + smp_call_function(do_nothing, NULL, 1); } EXPORT_SYMBOL_GPL(cpu_idle_wait); diff --git a/arch/ia64/kernel/smpboot.c b/arch/ia64/kernel/smpboot.c index eaa1b6795a13..9d1d429c6c59 100644 --- a/arch/ia64/kernel/smpboot.c +++ b/arch/ia64/kernel/smpboot.c @@ -317,7 +317,7 @@ ia64_sync_itc (unsigned int master) go[MASTER] = 1; - if (smp_call_function_single(master, sync_master, NULL, 1, 0) < 0) { + if (smp_call_function_single(master, sync_master, NULL, 0) < 0) { printk(KERN_ERR "sync_itc: failed to get attention of CPU %u!\n", master); return; } diff --git a/arch/ia64/kernel/uncached.c b/arch/ia64/kernel/uncached.c index e77995a6e3ed..8eff8c1d40a6 100644 --- a/arch/ia64/kernel/uncached.c +++ b/arch/ia64/kernel/uncached.c @@ -123,8 +123,7 @@ static int uncached_add_chunk(struct uncached_pool *uc_pool, int nid) status = ia64_pal_prefetch_visibility(PAL_VISIBILITY_PHYSICAL); if (status == PAL_VISIBILITY_OK_REMOTE_NEEDED) { atomic_set(&uc_pool->status, 0); - status = smp_call_function(uncached_ipi_visibility, uc_pool, - 0, 1); + status = smp_call_function(uncached_ipi_visibility, uc_pool, 1); if (status || atomic_read(&uc_pool->status)) goto failed; } else if (status != PAL_VISIBILITY_OK) @@ -146,7 +145,7 @@ static int uncached_add_chunk(struct uncached_pool *uc_pool, int nid) if (status != PAL_STATUS_SUCCESS) goto failed; atomic_set(&uc_pool->status, 0); - status = 
smp_call_function(uncached_ipi_mc_drain, uc_pool, 0, 1); + status = smp_call_function(uncached_ipi_mc_drain, uc_pool, 1); if (status || atomic_read(&uc_pool->status)) goto failed; diff --git a/arch/ia64/sn/kernel/sn2/sn_hwperf.c b/arch/ia64/sn/kernel/sn2/sn_hwperf.c index 8cc0c4753d89..636588e7e068 100644 --- a/arch/ia64/sn/kernel/sn2/sn_hwperf.c +++ b/arch/ia64/sn/kernel/sn2/sn_hwperf.c @@ -629,7 +629,7 @@ static int sn_hwperf_op_cpu(struct sn_hwperf_op_info *op_info) if (use_ipi) { /* use an interprocessor interrupt to call SAL */ smp_call_function_single(cpu, sn_hwperf_call_sal, - op_info, 1, 1); + op_info, 1); } else { /* migrate the task before calling SAL */ diff --git a/arch/m32r/kernel/smp.c b/arch/m32r/kernel/smp.c index 74eb7bcd5a40..7577f971ea4e 100644 --- a/arch/m32r/kernel/smp.c +++ b/arch/m32r/kernel/smp.c @@ -212,7 +212,7 @@ void smp_flush_tlb_all(void) local_irq_save(flags); __flush_tlb_all(); local_irq_restore(flags); - smp_call_function(flush_tlb_all_ipi, NULL, 1, 1); + smp_call_function(flush_tlb_all_ipi, NULL, 1); preempt_enable(); } @@ -505,7 +505,7 @@ void smp_invalidate_interrupt(void) *==========================================================================*/ void smp_send_stop(void) { - smp_call_function(stop_this_cpu, NULL, 1, 0); + smp_call_function(stop_this_cpu, NULL, 0); } /*==========================================================================* diff --git a/arch/mips/kernel/smp.c b/arch/mips/kernel/smp.c index c75b26cb61df..7a9ae830be86 100644 --- a/arch/mips/kernel/smp.c +++ b/arch/mips/kernel/smp.c @@ -167,7 +167,7 @@ static void stop_this_cpu(void *dummy) void smp_send_stop(void) { - smp_call_function(stop_this_cpu, NULL, 1, 0); + smp_call_function(stop_this_cpu, NULL, 0); } void __init smp_cpus_done(unsigned int max_cpus) @@ -266,7 +266,7 @@ static void flush_tlb_mm_ipi(void *mm) static inline void smp_on_other_tlbs(void (*func) (void *info), void *info) { #ifndef CONFIG_MIPS_MT_SMTC - smp_call_function(func, info, 1, 1); + smp_call_function(func, info, 1); #endif } diff --git a/arch/mips/mm/c-r4k.c b/arch/mips/mm/c-r4k.c index 27096751ddce..71df3390c07b 100644 --- a/arch/mips/mm/c-r4k.c +++ b/arch/mips/mm/c-r4k.c @@ -43,12 +43,12 @@ * primary cache. 
*/ static inline void r4k_on_each_cpu(void (*func) (void *info), void *info, - int retry, int wait) + int wait) { preempt_disable(); #if !defined(CONFIG_MIPS_MT_SMP) && !defined(CONFIG_MIPS_MT_SMTC) - smp_call_function(func, info, retry, wait); + smp_call_function(func, info, wait); #endif func(info); preempt_enable(); @@ -350,7 +350,7 @@ static inline void local_r4k___flush_cache_all(void * args) static void r4k___flush_cache_all(void) { - r4k_on_each_cpu(local_r4k___flush_cache_all, NULL, 1, 1); + r4k_on_each_cpu(local_r4k___flush_cache_all, NULL, 1); } static inline int has_valid_asid(const struct mm_struct *mm) @@ -397,7 +397,7 @@ static void r4k_flush_cache_range(struct vm_area_struct *vma, int exec = vma->vm_flags & VM_EXEC; if (cpu_has_dc_aliases || (exec && !cpu_has_ic_fills_f_dc)) - r4k_on_each_cpu(local_r4k_flush_cache_range, vma, 1, 1); + r4k_on_each_cpu(local_r4k_flush_cache_range, vma, 1); } static inline void local_r4k_flush_cache_mm(void * args) @@ -429,7 +429,7 @@ static void r4k_flush_cache_mm(struct mm_struct *mm) if (!cpu_has_dc_aliases) return; - r4k_on_each_cpu(local_r4k_flush_cache_mm, mm, 1, 1); + r4k_on_each_cpu(local_r4k_flush_cache_mm, mm, 1); } struct flush_cache_page_args { @@ -521,7 +521,7 @@ static void r4k_flush_cache_page(struct vm_area_struct *vma, args.addr = addr; args.pfn = pfn; - r4k_on_each_cpu(local_r4k_flush_cache_page, &args, 1, 1); + r4k_on_each_cpu(local_r4k_flush_cache_page, &args, 1); } static inline void local_r4k_flush_data_cache_page(void * addr) @@ -535,7 +535,7 @@ static void r4k_flush_data_cache_page(unsigned long addr) local_r4k_flush_data_cache_page((void *)addr); else r4k_on_each_cpu(local_r4k_flush_data_cache_page, (void *) addr, - 1, 1); + 1); } struct flush_icache_range_args { @@ -571,7 +571,7 @@ static void r4k_flush_icache_range(unsigned long start, unsigned long end) args.start = start; args.end = end; - r4k_on_each_cpu(local_r4k_flush_icache_range, &args, 1, 1); + r4k_on_each_cpu(local_r4k_flush_icache_range, &args, 1); instruction_hazard(); } @@ -672,7 +672,7 @@ static void local_r4k_flush_cache_sigtramp(void * arg) static void r4k_flush_cache_sigtramp(unsigned long addr) { - r4k_on_each_cpu(local_r4k_flush_cache_sigtramp, (void *) addr, 1, 1); + r4k_on_each_cpu(local_r4k_flush_cache_sigtramp, (void *) addr, 1); } static void r4k_flush_icache_all(void) diff --git a/arch/mips/pmc-sierra/yosemite/prom.c b/arch/mips/pmc-sierra/yosemite/prom.c index 35dc435846a6..cf4c868715ac 100644 --- a/arch/mips/pmc-sierra/yosemite/prom.c +++ b/arch/mips/pmc-sierra/yosemite/prom.c @@ -64,7 +64,7 @@ static void prom_exit(void) #ifdef CONFIG_SMP if (smp_processor_id()) /* CPU 1 */ - smp_call_function(prom_cpu0_exit, NULL, 1, 1); + smp_call_function(prom_cpu0_exit, NULL, 1); #endif prom_cpu0_exit(NULL); } diff --git a/arch/mips/sibyte/cfe/setup.c b/arch/mips/sibyte/cfe/setup.c index 33fce826f8bf..fd9604d5555a 100644 --- a/arch/mips/sibyte/cfe/setup.c +++ b/arch/mips/sibyte/cfe/setup.c @@ -74,7 +74,7 @@ static void __noreturn cfe_linux_exit(void *arg) if (!reboot_smp) { /* Get CPU 0 to do the cfe_exit */ reboot_smp = 1; - smp_call_function(cfe_linux_exit, arg, 1, 0); + smp_call_function(cfe_linux_exit, arg, 0); } } else { printk("Passing control back to CFE...\n"); diff --git a/arch/mips/sibyte/sb1250/prom.c b/arch/mips/sibyte/sb1250/prom.c index cf8f6b3de86c..65b1af66b674 100644 --- a/arch/mips/sibyte/sb1250/prom.c +++ b/arch/mips/sibyte/sb1250/prom.c @@ -66,7 +66,7 @@ static void prom_linux_exit(void) { #ifdef CONFIG_SMP if (smp_processor_id()) { - 
smp_call_function(prom_cpu0_exit, NULL, 1, 1); + smp_call_function(prom_cpu0_exit, NULL, 1); } #endif while(1); diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index 37a5ab410dcc..5191b46a611e 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -168,7 +168,7 @@ static void stop_this_cpu(void *dummy) void smp_send_stop(void) { - smp_call_function(stop_this_cpu, NULL, 0, 0); + smp_call_function(stop_this_cpu, NULL, 0); } extern struct gettimeofday_struct do_gtod; diff --git a/arch/s390/appldata/appldata_base.c b/arch/s390/appldata/appldata_base.c index ad40729bec3d..837a3b3e7759 100644 --- a/arch/s390/appldata/appldata_base.c +++ b/arch/s390/appldata/appldata_base.c @@ -209,7 +209,7 @@ __appldata_vtimer_setup(int cmd) per_cpu(appldata_timer, i).expires = per_cpu_interval; smp_call_function_single(i, add_virt_timer_periodic, &per_cpu(appldata_timer, i), - 0, 1); + 1); } appldata_timer_active = 1; P_INFO("Monitoring timer started.\n"); @@ -236,7 +236,7 @@ __appldata_vtimer_setup(int cmd) args.timer = &per_cpu(appldata_timer, i); args.expires = per_cpu_interval; smp_call_function_single(i, __appldata_mod_vtimer_wrap, - &args, 0, 1); + &args, 1); } } } diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c index 5d4fa4b1c74c..276b105fb2a4 100644 --- a/arch/s390/kernel/smp.c +++ b/arch/s390/kernel/smp.c @@ -109,7 +109,7 @@ static void do_call_function(void) } static void __smp_call_function_map(void (*func) (void *info), void *info, - int nonatomic, int wait, cpumask_t map) + int wait, cpumask_t map) { struct call_data_struct data; int cpu, local = 0; @@ -162,7 +162,6 @@ out: * smp_call_function: * @func: the function to run; this must be fast and non-blocking * @info: an arbitrary pointer to pass to the function - * @nonatomic: unused * @wait: if true, wait (atomically) until function has completed on other CPUs * * Run a function on all other CPUs. @@ -170,15 +169,14 @@ out: * You must not call this function with disabled interrupts, from a * hardware interrupt handler or from a bottom half. */ -int smp_call_function(void (*func) (void *info), void *info, int nonatomic, - int wait) +int smp_call_function(void (*func) (void *info), void *info, int wait) { cpumask_t map; spin_lock(&call_lock); map = cpu_online_map; cpu_clear(smp_processor_id(), map); - __smp_call_function_map(func, info, nonatomic, wait, map); + __smp_call_function_map(func, info, wait, map); spin_unlock(&call_lock); return 0; } @@ -189,7 +187,6 @@ EXPORT_SYMBOL(smp_call_function); * @cpu: the CPU where func should run * @func: the function to run; this must be fast and non-blocking * @info: an arbitrary pointer to pass to the function - * @nonatomic: unused * @wait: if true, wait (atomically) until function has completed on other CPUs * * Run a function on one processor. @@ -198,11 +195,10 @@ EXPORT_SYMBOL(smp_call_function); * hardware interrupt handler or from a bottom half. 
*/ int smp_call_function_single(int cpu, void (*func) (void *info), void *info, - int nonatomic, int wait) + int wait) { spin_lock(&call_lock); - __smp_call_function_map(func, info, nonatomic, wait, - cpumask_of_cpu(cpu)); + __smp_call_function_map(func, info, wait, cpumask_of_cpu(cpu)); spin_unlock(&call_lock); return 0; } @@ -228,7 +224,7 @@ int smp_call_function_mask(cpumask_t mask, void (*func)(void *), void *info, { spin_lock(&call_lock); cpu_clear(smp_processor_id(), mask); - __smp_call_function_map(func, info, 0, wait, mask); + __smp_call_function_map(func, info, wait, mask); spin_unlock(&call_lock); return 0; } diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c index 7aec676fefd5..bf7bf2c2236a 100644 --- a/arch/s390/kernel/time.c +++ b/arch/s390/kernel/time.c @@ -690,7 +690,7 @@ static int etr_sync_clock(struct etr_aib *aib, int port) */ memset(&etr_sync, 0, sizeof(etr_sync)); preempt_disable(); - smp_call_function(etr_sync_cpu_start, NULL, 0, 0); + smp_call_function(etr_sync_cpu_start, NULL, 0); local_irq_disable(); etr_enable_sync_clock(); @@ -729,7 +729,7 @@ static int etr_sync_clock(struct etr_aib *aib, int port) rc = -EAGAIN; } local_irq_enable(); - smp_call_function(etr_sync_cpu_end,NULL,0,0); + smp_call_function(etr_sync_cpu_end,NULL,0); preempt_enable(); return rc; } diff --git a/arch/sh/kernel/smp.c b/arch/sh/kernel/smp.c index 2ed8dceb297b..71781ba2675b 100644 --- a/arch/sh/kernel/smp.c +++ b/arch/sh/kernel/smp.c @@ -168,7 +168,7 @@ static void stop_this_cpu(void *unused) void smp_send_stop(void) { - smp_call_function(stop_this_cpu, 0, 1, 0); + smp_call_function(stop_this_cpu, 0, 0); } void arch_send_call_function_ipi(cpumask_t mask) @@ -223,7 +223,7 @@ void flush_tlb_mm(struct mm_struct *mm) preempt_disable(); if ((atomic_read(&mm->mm_users) != 1) || (current->mm != mm)) { - smp_call_function(flush_tlb_mm_ipi, (void *)mm, 1, 1); + smp_call_function(flush_tlb_mm_ipi, (void *)mm, 1); } else { int i; for (i = 0; i < num_online_cpus(); i++) @@ -260,7 +260,7 @@ void flush_tlb_range(struct vm_area_struct *vma, fd.vma = vma; fd.addr1 = start; fd.addr2 = end; - smp_call_function(flush_tlb_range_ipi, (void *)&fd, 1, 1); + smp_call_function(flush_tlb_range_ipi, (void *)&fd, 1); } else { int i; for (i = 0; i < num_online_cpus(); i++) @@ -303,7 +303,7 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long page) fd.vma = vma; fd.addr1 = page; - smp_call_function(flush_tlb_page_ipi, (void *)&fd, 1, 1); + smp_call_function(flush_tlb_page_ipi, (void *)&fd, 1); } else { int i; for (i = 0; i < num_online_cpus(); i++) @@ -327,6 +327,6 @@ void flush_tlb_one(unsigned long asid, unsigned long vaddr) fd.addr1 = asid; fd.addr2 = vaddr; - smp_call_function(flush_tlb_one_ipi, (void *)&fd, 1, 1); + smp_call_function(flush_tlb_one_ipi, (void *)&fd, 1); local_flush_tlb_one(asid, vaddr); } diff --git a/arch/sparc64/kernel/smp.c b/arch/sparc64/kernel/smp.c index b82d017a1744..c099d96f1239 100644 --- a/arch/sparc64/kernel/smp.c +++ b/arch/sparc64/kernel/smp.c @@ -807,7 +807,6 @@ extern unsigned long xcall_call_function; * smp_call_function(): Run a function on all other CPUs. * @func: The function to run. This must be fast and non-blocking. * @info: An arbitrary pointer to pass to the function. - * @nonatomic: currently unused. * @wait: If true, wait (atomically) until function has completed on other CPUs. * * Returns 0 on success, else a negative status code. 
Does not return until @@ -817,8 +816,7 @@ extern unsigned long xcall_call_function; * hardware interrupt handler or from a bottom half handler. */ static int sparc64_smp_call_function_mask(void (*func)(void *info), void *info, - int nonatomic, int wait, - cpumask_t mask) + int wait, cpumask_t mask) { struct call_data_struct data; int cpus; @@ -853,11 +851,9 @@ out_unlock: return 0; } -int smp_call_function(void (*func)(void *info), void *info, - int nonatomic, int wait) +int smp_call_function(void (*func)(void *info), void *info, int wait) { - return sparc64_smp_call_function_mask(func, info, nonatomic, wait, - cpu_online_map); + return sparc64_smp_call_function_mask(func, info, wait, cpu_online_map); } void smp_call_function_client(int irq, struct pt_regs *regs) @@ -894,7 +890,7 @@ static void tsb_sync(void *info) void smp_tsb_sync(struct mm_struct *mm) { - sparc64_smp_call_function_mask(tsb_sync, mm, 0, 1, mm->cpu_vm_mask); + sparc64_smp_call_function_mask(tsb_sync, mm, 1, mm->cpu_vm_mask); } extern unsigned long xcall_flush_tlb_mm; diff --git a/arch/um/kernel/smp.c b/arch/um/kernel/smp.c index e1062ec36d40..be2d50c3aa95 100644 --- a/arch/um/kernel/smp.c +++ b/arch/um/kernel/smp.c @@ -214,8 +214,7 @@ void smp_call_function_slave(int cpu) atomic_inc(&scf_finished); } -int smp_call_function(void (*_func)(void *info), void *_info, int nonatomic, - int wait) +int smp_call_function(void (*_func)(void *info), void *_info, int wait) { int cpus = num_online_cpus() - 1; int i; diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 6a1e278d9323..290652cefddb 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -222,7 +222,7 @@ static void set_mtrr(unsigned int reg, unsigned long base, atomic_set(&data.gate,0); /* Start the ball rolling on other CPUs */ - if (smp_call_function(ipi_handler, &data, 1, 0) != 0) + if (smp_call_function(ipi_handler, &data, 0) != 0) panic("mtrr: timed out waiting for other CPUs\n"); local_irq_save(flags); @@ -822,7 +822,7 @@ void mtrr_ap_init(void) */ void mtrr_save_state(void) { - smp_call_function_single(0, mtrr_save_fixed_ranges, NULL, 1, 1); + smp_call_function_single(0, mtrr_save_fixed_ranges, NULL, 1); } static int __init mtrr_init_finialize(void) diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c index daff52a62248..336dd43c9158 100644 --- a/arch/x86/kernel/cpuid.c +++ b/arch/x86/kernel/cpuid.c @@ -95,7 +95,7 @@ static ssize_t cpuid_read(struct file *file, char __user *buf, for (; count; count -= 16) { cmd.eax = pos; cmd.ecx = pos >> 32; - smp_call_function_single(cpu, cpuid_smp_cpuid, &cmd, 1, 1); + smp_call_function_single(cpu, cpuid_smp_cpuid, &cmd, 1); if (copy_to_user(tmp, &cmd, 16)) return -EFAULT; tmp += 16; diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index 0224c3637c73..cb0a6398c64b 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -68,7 +68,7 @@ static int alloc_ldt(mm_context_t *pc, int mincount, int reload) load_LDT(pc); mask = cpumask_of_cpu(smp_processor_id()); if (!cpus_equal(current->mm->cpu_vm_mask, mask)) - smp_call_function(flush_ldt, NULL, 1, 1); + smp_call_function(flush_ldt, NULL, 1); preempt_enable(); #else load_LDT(pc); diff --git a/arch/x86/kernel/nmi_32.c b/arch/x86/kernel/nmi_32.c index 84160f74eeb0..5562dab0bd20 100644 --- a/arch/x86/kernel/nmi_32.c +++ b/arch/x86/kernel/nmi_32.c @@ -87,7 +87,7 @@ int __init check_nmi_watchdog(void) #ifdef CONFIG_SMP if (nmi_watchdog == NMI_LOCAL_APIC) - smp_call_function(nmi_cpu_busy, (void 
*)&endflag, 0, 0); + smp_call_function(nmi_cpu_busy, (void *)&endflag, 0); #endif for_each_possible_cpu(cpu) diff --git a/arch/x86/kernel/nmi_64.c b/arch/x86/kernel/nmi_64.c index 5a29ded994fa..2f1e4f503c9e 100644 --- a/arch/x86/kernel/nmi_64.c +++ b/arch/x86/kernel/nmi_64.c @@ -96,7 +96,7 @@ int __init check_nmi_watchdog(void) #ifdef CONFIG_SMP if (nmi_watchdog == NMI_LOCAL_APIC) - smp_call_function(nmi_cpu_busy, (void *)&endflag, 0, 0); + smp_call_function(nmi_cpu_busy, (void *)&endflag, 0); #endif for (cpu = 0; cpu < NR_CPUS; cpu++) diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index 575aa3d7248a..56546e8a13ac 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -164,7 +164,7 @@ static void native_smp_send_stop(void) if (reboot_force) return; - smp_call_function(stop_this_cpu, NULL, 0, 0); + smp_call_function(stop_this_cpu, NULL, 0); local_irq_save(flags); disable_local_APIC(); local_irq_restore(flags); diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index 61efa2f7d564..0a03d57f9b3b 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -278,7 +278,7 @@ cpu_vsyscall_notifier(struct notifier_block *n, unsigned long action, void *arg) { long cpu = (long)arg; if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) - smp_call_function_single(cpu, cpu_vsyscall_init, NULL, 0, 1); + smp_call_function_single(cpu, cpu_vsyscall_init, NULL, 1); return NOTIFY_DONE; } diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 540e95179074..5534fe59b5fc 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -335,7 +335,7 @@ static void vcpu_clear(struct vcpu_vmx *vmx) { if (vmx->vcpu.cpu == -1) return; - smp_call_function_single(vmx->vcpu.cpu, __vcpu_clear, vmx, 0, 1); + smp_call_function_single(vmx->vcpu.cpu, __vcpu_clear, vmx, 1); vmx->launched = 0; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 63a77caa59f1..0faa2546b1cd 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4044,6 +4044,6 @@ void kvm_vcpu_kick(struct kvm_vcpu *vcpu) * So need not to call smp_call_function_single() in that case. 
*/ if (vcpu->guest_mode && vcpu->cpu != cpu) - smp_call_function_single(ipi_pcpu, vcpu_kick_intr, vcpu, 0, 0); + smp_call_function_single(ipi_pcpu, vcpu_kick_intr, vcpu, 0); put_cpu(); } diff --git a/arch/x86/lib/msr-on-cpu.c b/arch/x86/lib/msr-on-cpu.c index 57d043fa893e..d5a2b39f882b 100644 --- a/arch/x86/lib/msr-on-cpu.c +++ b/arch/x86/lib/msr-on-cpu.c @@ -30,10 +30,10 @@ static int _rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h, int safe) rv.msr_no = msr_no; if (safe) { - smp_call_function_single(cpu, __rdmsr_safe_on_cpu, &rv, 0, 1); + smp_call_function_single(cpu, __rdmsr_safe_on_cpu, &rv, 1); err = rv.err; } else { - smp_call_function_single(cpu, __rdmsr_on_cpu, &rv, 0, 1); + smp_call_function_single(cpu, __rdmsr_on_cpu, &rv, 1); } *l = rv.l; *h = rv.h; @@ -64,10 +64,10 @@ static int _wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h, int safe) rv.l = l; rv.h = h; if (safe) { - smp_call_function_single(cpu, __wrmsr_safe_on_cpu, &rv, 0, 1); + smp_call_function_single(cpu, __wrmsr_safe_on_cpu, &rv, 1); err = rv.err; } else { - smp_call_function_single(cpu, __wrmsr_on_cpu, &rv, 0, 1); + smp_call_function_single(cpu, __wrmsr_on_cpu, &rv, 1); } return err; diff --git a/arch/x86/mach-voyager/voyager_smp.c b/arch/x86/mach-voyager/voyager_smp.c index cb34407a9930..04f596eab749 100644 --- a/arch/x86/mach-voyager/voyager_smp.c +++ b/arch/x86/mach-voyager/voyager_smp.c @@ -1113,7 +1113,7 @@ int safe_smp_processor_id(void) /* broadcast a halt to all other CPUs */ static void voyager_smp_send_stop(void) { - smp_call_function(smp_stop_cpu_function, NULL, 1, 1); + smp_call_function(smp_stop_cpu_function, NULL, 1); } /* this function is triggered in time.c when a clock tick fires diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index b3786e749b8e..a1651d029ea8 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -331,7 +331,7 @@ static void stop_self(void *v) void xen_smp_send_stop(void) { - smp_call_function(stop_self, NULL, 0, 0); + smp_call_function(stop_self, NULL, 0); } void xen_smp_send_reschedule(int cpu) diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c index 556ee1585192..4976e5db2b3f 100644 --- a/drivers/acpi/processor_idle.c +++ b/drivers/acpi/processor_idle.c @@ -1339,7 +1339,7 @@ static void smp_callback(void *v) static int acpi_processor_latency_notify(struct notifier_block *b, unsigned long l, void *v) { - smp_call_function(smp_callback, NULL, 0, 1); + smp_call_function(smp_callback, NULL, 1); return NOTIFY_OK; } diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c index 23554b676d6e..5405769020a1 100644 --- a/drivers/cpuidle/cpuidle.c +++ b/drivers/cpuidle/cpuidle.c @@ -340,7 +340,7 @@ static void smp_callback(void *v) static int cpuidle_latency_notify(struct notifier_block *b, unsigned long l, void *v) { - smp_call_function(smp_callback, NULL, 0, 1); + smp_call_function(smp_callback, NULL, 1); return NOTIFY_OK; } diff --git a/include/asm-alpha/smp.h b/include/asm-alpha/smp.h index 2f60a362d75e..544c69af8168 100644 --- a/include/asm-alpha/smp.h +++ b/include/asm-alpha/smp.h @@ -53,7 +53,7 @@ extern void arch_send_call_function_ipi(cpumask_t mask); #else /* CONFIG_SMP */ #define hard_smp_processor_id() 0 -#define smp_call_function_on_cpu(func,info,retry,wait,cpu) ({ 0; }) +#define smp_call_function_on_cpu(func,info,wait,cpu) ({ 0; }) #endif /* CONFIG_SMP */ diff --git a/include/asm-sparc/smp.h b/include/asm-sparc/smp.h index e6d561599726..b61e74bea06a 100644 --- a/include/asm-sparc/smp.h +++ b/include/asm-sparc/smp.h 
@@ -72,7 +72,7 @@ static inline void xc5(smpfunc_t func, unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5) { smp_cross_call(func, arg1, arg2, arg3, arg4, arg5); } -static inline int smp_call_function(void (*func)(void *info), void *info, int nonatomic, int wait) +static inline int smp_call_function(void (*func)(void *info), void *info, int wait) { xc1((smpfunc_t)func, (unsigned long)info); return 0; diff --git a/include/linux/smp.h b/include/linux/smp.h index eac3e062250f..338cad1b9548 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -62,11 +62,11 @@ extern void smp_cpus_done(unsigned int max_cpus); /* * Call a function on all other processors */ -int smp_call_function(void(*func)(void *info), void *info, int retry, int wait); +int smp_call_function(void(*func)(void *info), void *info, int wait); int smp_call_function_mask(cpumask_t mask, void(*func)(void *info), void *info, int wait); int smp_call_function_single(int cpuid, void (*func) (void *info), void *info, - int retry, int wait); + int wait); void __smp_call_function_single(int cpuid, struct call_single_data *data); /* @@ -119,7 +119,7 @@ static inline int up_smp_call_function(void (*func)(void *), void *info) { return 0; } -#define smp_call_function(func, info, retry, wait) \ +#define smp_call_function(func, info, wait) \ (up_smp_call_function(func, info)) #define on_each_cpu(func,info,retry,wait) \ ({ \ @@ -131,7 +131,7 @@ static inline int up_smp_call_function(void (*func)(void *), void *info) static inline void smp_send_reschedule(int cpu) { } #define num_booting_cpus() 1 #define smp_prepare_boot_cpu() do {} while (0) -#define smp_call_function_single(cpuid, func, info, retry, wait) \ +#define smp_call_function_single(cpuid, func, info, wait) \ ({ \ WARN_ON(cpuid != 0); \ local_irq_disable(); \ diff --git a/kernel/smp.c b/kernel/smp.c index f77b75c027ad..7e0432a4a0e2 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -195,7 +195,6 @@ void generic_smp_call_function_single_interrupt(void) * smp_call_function_single - Run a function on a specific CPU * @func: The function to run. This must be fast and non-blocking. * @info: An arbitrary pointer to pass to the function. - * @retry: Unused * @wait: If true, wait until function has completed on other CPUs. * * Returns 0 on success, else a negative status code. Note that @wait @@ -203,7 +202,7 @@ void generic_smp_call_function_single_interrupt(void) * we fall back to on-stack allocation. */ int smp_call_function_single(int cpu, void (*func) (void *info), void *info, - int retry, int wait) + int wait) { struct call_single_data d; unsigned long flags; @@ -339,7 +338,6 @@ EXPORT_SYMBOL(smp_call_function_mask); * smp_call_function(): Run a function on all other CPUs. * @func: The function to run. This must be fast and non-blocking. * @info: An arbitrary pointer to pass to the function. - * @natomic: Unused * @wait: If true, wait (atomically) until function has completed on other CPUs. * * Returns 0 on success, else a negative status code. @@ -351,7 +349,7 @@ EXPORT_SYMBOL(smp_call_function_mask); * You must not call this function with disabled interrupts or from a * hardware interrupt handler or from a bottom half handler. 
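On !CONFIG_SMP builds the same calls collapse to a plain local invocation, so callers keep a single code path. A sketch of the UP fallback after this patch, following the include/linux/smp.h hunk above; the tail of the _single macro is reconstructed from context rather than shown in the hunk:

	#define smp_call_function(func, info, wait) \
		(up_smp_call_function(func, info))

	#define smp_call_function_single(cpuid, func, info, wait)	\
	({								\
		WARN_ON(cpuid != 0);	/* only CPU 0 exists */		\
		local_irq_disable();					\
		(func)(info);		/* run it right here */		\
		local_irq_enable();					\
		0;							\
	})
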
*/ -int smp_call_function(void (*func)(void *), void *info, int natomic, int wait) +int smp_call_function(void (*func)(void *), void *info, int wait) { int ret; diff --git a/kernel/softirq.c b/kernel/softirq.c index 36e061740047..d73afb4764ef 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -679,7 +679,7 @@ int on_each_cpu(void (*func) (void *info), void *info, int retry, int wait) int ret = 0; preempt_disable(); - ret = smp_call_function(func, info, retry, wait); + ret = smp_call_function(func, info, wait); local_irq_disable(); func(info); local_irq_enable(); diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 57a1f02e5ec0..75e718539dcb 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -266,7 +266,7 @@ void tick_broadcast_on_off(unsigned long reason, int *oncpu) "offline CPU #%d\n", *oncpu); else smp_call_function_single(*oncpu, tick_do_broadcast_on_off, - &reason, 1, 1); + &reason, 1); } /* diff --git a/net/core/flow.c b/net/core/flow.c index 19991175fdeb..5cf81052d044 100644 --- a/net/core/flow.c +++ b/net/core/flow.c @@ -298,7 +298,7 @@ void flow_cache_flush(void) init_completion(&info.completion); local_bh_disable(); - smp_call_function(flow_cache_flush_per_cpu, &info, 1, 0); + smp_call_function(flow_cache_flush_per_cpu, &info, 0); flow_cache_flush_tasklet((unsigned long)&info); local_bh_enable(); diff --git a/net/iucv/iucv.c b/net/iucv/iucv.c index 918970762131..94d5a45c3a57 100644 --- a/net/iucv/iucv.c +++ b/net/iucv/iucv.c @@ -480,7 +480,7 @@ static void iucv_setmask_mp(void) if (cpu_isset(cpu, iucv_buffer_cpumask) && !cpu_isset(cpu, iucv_irq_cpumask)) smp_call_function_single(cpu, iucv_allow_cpu, - NULL, 0, 1); + NULL, 1); preempt_enable(); } @@ -498,7 +498,7 @@ static void iucv_setmask_up(void) cpumask = iucv_irq_cpumask; cpu_clear(first_cpu(iucv_irq_cpumask), cpumask); for_each_cpu_mask(cpu, cpumask) - smp_call_function_single(cpu, iucv_block_cpu, NULL, 0, 1); + smp_call_function_single(cpu, iucv_block_cpu, NULL, 1); } /** @@ -523,7 +523,7 @@ static int iucv_enable(void) rc = -EIO; preempt_disable(); for_each_online_cpu(cpu) - smp_call_function_single(cpu, iucv_declare_cpu, NULL, 0, 1); + smp_call_function_single(cpu, iucv_declare_cpu, NULL, 1); preempt_enable(); if (cpus_empty(iucv_buffer_cpumask)) /* No cpu could declare an iucv buffer. */ @@ -580,7 +580,7 @@ static int __cpuinit iucv_cpu_notify(struct notifier_block *self, case CPU_ONLINE_FROZEN: case CPU_DOWN_FAILED: case CPU_DOWN_FAILED_FROZEN: - smp_call_function_single(cpu, iucv_declare_cpu, NULL, 0, 1); + smp_call_function_single(cpu, iucv_declare_cpu, NULL, 1); break; case CPU_DOWN_PREPARE: case CPU_DOWN_PREPARE_FROZEN: @@ -589,10 +589,10 @@ static int __cpuinit iucv_cpu_notify(struct notifier_block *self, if (cpus_empty(cpumask)) /* Can't offline last IUCV enabled cpu. */ return NOTIFY_BAD; - smp_call_function_single(cpu, iucv_retrieve_cpu, NULL, 0, 1); + smp_call_function_single(cpu, iucv_retrieve_cpu, NULL, 1); if (cpus_empty(iucv_irq_cpumask)) smp_call_function_single(first_cpu(iucv_buffer_cpumask), - iucv_allow_cpu, NULL, 0, 1); + iucv_allow_cpu, NULL, 1); break; } return NOTIFY_OK; @@ -652,7 +652,7 @@ static void iucv_cleanup_queue(void) * pending interrupts force them to the work queue by calling * an empty function on all cpus. 
*/ - smp_call_function(__iucv_cleanup_queue, NULL, 0, 1); + smp_call_function(__iucv_cleanup_queue, NULL, 1); spin_lock_irq(&iucv_queue_lock); list_for_each_entry_safe(p, n, &iucv_task_queue, list) { /* Remove stale work items from the task queue. */ diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 2d29e260da3d..ea1f595f8a87 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1266,12 +1266,12 @@ static int kvm_cpu_hotplug(struct notifier_block *notifier, unsigned long val, case CPU_UP_CANCELED: printk(KERN_INFO "kvm: disabling virtualization on CPU%d\n", cpu); - smp_call_function_single(cpu, hardware_disable, NULL, 0, 1); + smp_call_function_single(cpu, hardware_disable, NULL, 1); break; case CPU_ONLINE: printk(KERN_INFO "kvm: enabling virtualization on CPU%d\n", cpu); - smp_call_function_single(cpu, hardware_enable, NULL, 0, 1); + smp_call_function_single(cpu, hardware_enable, NULL, 1); break; } return NOTIFY_OK; @@ -1474,7 +1474,7 @@ int kvm_init(void *opaque, unsigned int vcpu_size, for_each_online_cpu(cpu) { smp_call_function_single(cpu, kvm_arch_check_processor_compat, - &r, 0, 1); + &r, 1); if (r < 0) goto out_free_1; } -- cgit v1.2.3 From 15c8b6c1aaaf1c4edd67e2f02e4d8e1bd1a51c0d Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 9 May 2008 09:39:44 +0200 Subject: on_each_cpu(): kill unused 'retry' parameter It's not even passed on to smp_call_function() anymore, since that was removed. So kill it. Acked-by: Jeremy Fitzhardinge Reviewed-by: Paul E. McKenney Signed-off-by: Jens Axboe --- arch/alpha/kernel/process.c | 2 +- arch/alpha/kernel/smp.c | 4 ++-- arch/arm/kernel/smp.c | 6 +++--- arch/ia64/kernel/mca.c | 4 ++-- arch/ia64/kernel/perfmon.c | 4 ++-- arch/ia64/kernel/smp.c | 4 ++-- arch/mips/kernel/irq-rm9000.c | 4 ++-- arch/mips/kernel/smp.c | 4 ++-- arch/mips/oprofile/common.c | 6 +++--- arch/parisc/kernel/cache.c | 6 +++--- arch/parisc/kernel/smp.c | 2 +- arch/parisc/mm/init.c | 2 +- arch/powerpc/kernel/rtas.c | 2 +- arch/powerpc/kernel/tau_6xx.c | 4 ++-- arch/powerpc/kernel/time.c | 2 +- arch/powerpc/mm/slice.c | 2 +- arch/powerpc/oprofile/common.c | 6 +++--- arch/s390/kernel/smp.c | 6 +++--- arch/s390/kernel/time.c | 2 +- arch/sh/kernel/smp.c | 4 ++-- arch/sparc64/mm/hugetlbpage.c | 2 +- arch/x86/kernel/cpu/mcheck/mce_64.c | 6 +++--- arch/x86/kernel/cpu/mcheck/non-fatal.c | 2 +- arch/x86/kernel/cpu/perfctr-watchdog.c | 4 ++-- arch/x86/kernel/io_apic_32.c | 2 +- arch/x86/kernel/io_apic_64.c | 2 +- arch/x86/kernel/nmi_32.c | 4 ++-- arch/x86/kernel/nmi_64.c | 4 ++-- arch/x86/kernel/tlb_32.c | 2 +- arch/x86/kernel/tlb_64.c | 2 +- arch/x86/kernel/vsyscall_64.c | 2 +- arch/x86/kvm/vmx.c | 2 +- arch/x86/mach-voyager/voyager_smp.c | 2 +- arch/x86/mm/pageattr.c | 4 ++-- arch/x86/oprofile/nmi_int.c | 10 +++++----- drivers/char/agp/generic.c | 2 +- drivers/lguest/x86/core.c | 4 ++-- fs/buffer.c | 2 +- include/linux/smp.h | 4 ++-- kernel/hrtimer.c | 2 +- kernel/profile.c | 6 +++--- kernel/rcupdate.c | 2 +- kernel/softirq.c | 2 +- mm/page_alloc.c | 2 +- mm/slab.c | 4 ++-- mm/slub.c | 2 +- net/iucv/iucv.c | 2 +- virt/kvm/kvm_main.c | 8 ++++---- 48 files changed, 84 insertions(+), 84 deletions(-) (limited to 'arch/x86') diff --git a/arch/alpha/kernel/process.c b/arch/alpha/kernel/process.c index 96ed82fd9eef..351407e07e71 100644 --- a/arch/alpha/kernel/process.c +++ b/arch/alpha/kernel/process.c @@ -160,7 +160,7 @@ common_shutdown(int mode, char *restart_cmd) struct halt_info args; args.mode = mode; args.restart_cmd = restart_cmd; - on_each_cpu(common_shutdown_1, 
&args, 1, 0); + on_each_cpu(common_shutdown_1, &args, 0); } void diff --git a/arch/alpha/kernel/smp.c b/arch/alpha/kernel/smp.c index 44114c8dbb2a..83df541650fc 100644 --- a/arch/alpha/kernel/smp.c +++ b/arch/alpha/kernel/smp.c @@ -657,7 +657,7 @@ void smp_imb(void) { /* Must wait other processors to flush their icache before continue. */ - if (on_each_cpu(ipi_imb, NULL, 1, 1)) + if (on_each_cpu(ipi_imb, NULL, 1)) printk(KERN_CRIT "smp_imb: timed out\n"); } EXPORT_SYMBOL(smp_imb); @@ -673,7 +673,7 @@ flush_tlb_all(void) { /* Although we don't have any data to pass, we do want to synchronize with the other processors. */ - if (on_each_cpu(ipi_flush_tlb_all, NULL, 1, 1)) { + if (on_each_cpu(ipi_flush_tlb_all, NULL, 1)) { printk(KERN_CRIT "flush_tlb_all: timed out\n"); } } diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c index 6344466b2113..5a7c09564d13 100644 --- a/arch/arm/kernel/smp.c +++ b/arch/arm/kernel/smp.c @@ -604,7 +604,7 @@ static inline void ipi_flush_tlb_kernel_range(void *arg) void flush_tlb_all(void) { - on_each_cpu(ipi_flush_tlb_all, NULL, 1, 1); + on_each_cpu(ipi_flush_tlb_all, NULL, 1); } void flush_tlb_mm(struct mm_struct *mm) @@ -631,7 +631,7 @@ void flush_tlb_kernel_page(unsigned long kaddr) ta.ta_start = kaddr; - on_each_cpu(ipi_flush_tlb_kernel_page, &ta, 1, 1); + on_each_cpu(ipi_flush_tlb_kernel_page, &ta, 1); } void flush_tlb_range(struct vm_area_struct *vma, @@ -654,5 +654,5 @@ void flush_tlb_kernel_range(unsigned long start, unsigned long end) ta.ta_start = start; ta.ta_end = end; - on_each_cpu(ipi_flush_tlb_kernel_range, &ta, 1, 1); + on_each_cpu(ipi_flush_tlb_kernel_range, &ta, 1); } diff --git a/arch/ia64/kernel/mca.c b/arch/ia64/kernel/mca.c index 9cd818cc7008..7dd96c127177 100644 --- a/arch/ia64/kernel/mca.c +++ b/arch/ia64/kernel/mca.c @@ -707,7 +707,7 @@ ia64_mca_cmc_vector_enable (void *dummy) static void ia64_mca_cmc_vector_disable_keventd(struct work_struct *unused) { - on_each_cpu(ia64_mca_cmc_vector_disable, NULL, 1, 0); + on_each_cpu(ia64_mca_cmc_vector_disable, NULL, 0); } /* @@ -719,7 +719,7 @@ ia64_mca_cmc_vector_disable_keventd(struct work_struct *unused) static void ia64_mca_cmc_vector_enable_keventd(struct work_struct *unused) { - on_each_cpu(ia64_mca_cmc_vector_enable, NULL, 1, 0); + on_each_cpu(ia64_mca_cmc_vector_enable, NULL, 0); } /* diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c index 9baa48255c12..19d4493c6193 100644 --- a/arch/ia64/kernel/perfmon.c +++ b/arch/ia64/kernel/perfmon.c @@ -6508,7 +6508,7 @@ pfm_install_alt_pmu_interrupt(pfm_intr_handler_desc_t *hdl) } /* save the current system wide pmu states */ - ret = on_each_cpu(pfm_alt_save_pmu_state, NULL, 0, 1); + ret = on_each_cpu(pfm_alt_save_pmu_state, NULL, 1); if (ret) { DPRINT(("on_each_cpu() failed: %d\n", ret)); goto cleanup_reserve; @@ -6553,7 +6553,7 @@ pfm_remove_alt_pmu_interrupt(pfm_intr_handler_desc_t *hdl) pfm_alt_intr_handler = NULL; - ret = on_each_cpu(pfm_alt_restore_pmu_state, NULL, 0, 1); + ret = on_each_cpu(pfm_alt_restore_pmu_state, NULL, 1); if (ret) { DPRINT(("on_each_cpu() failed: %d\n", ret)); } diff --git a/arch/ia64/kernel/smp.c b/arch/ia64/kernel/smp.c index 19152dcbf6e4..3676468612b6 100644 --- a/arch/ia64/kernel/smp.c +++ b/arch/ia64/kernel/smp.c @@ -285,7 +285,7 @@ smp_flush_tlb_cpumask(cpumask_t xcpumask) void smp_flush_tlb_all (void) { - on_each_cpu((void (*)(void *))local_flush_tlb_all, NULL, 1, 1); + on_each_cpu((void (*)(void *))local_flush_tlb_all, NULL, 1); } void @@ -308,7 +308,7 @@ smp_flush_tlb_mm (struct mm_struct 
*mm) * anyhow, and once a CPU is interrupted, the cost of local_flush_tlb_all() is * rather trivial. */ - on_each_cpu((void (*)(void *))local_finish_flush_tlb_mm, mm, 1, 1); + on_each_cpu((void (*)(void *))local_finish_flush_tlb_mm, mm, 1); } void arch_send_call_function_single_ipi(int cpu) diff --git a/arch/mips/kernel/irq-rm9000.c b/arch/mips/kernel/irq-rm9000.c index ed9febe63d72..b47e4615ec12 100644 --- a/arch/mips/kernel/irq-rm9000.c +++ b/arch/mips/kernel/irq-rm9000.c @@ -49,7 +49,7 @@ static void local_rm9k_perfcounter_irq_startup(void *args) static unsigned int rm9k_perfcounter_irq_startup(unsigned int irq) { - on_each_cpu(local_rm9k_perfcounter_irq_startup, (void *) irq, 0, 1); + on_each_cpu(local_rm9k_perfcounter_irq_startup, (void *) irq, 1); return 0; } @@ -66,7 +66,7 @@ static void local_rm9k_perfcounter_irq_shutdown(void *args) static void rm9k_perfcounter_irq_shutdown(unsigned int irq) { - on_each_cpu(local_rm9k_perfcounter_irq_shutdown, (void *) irq, 0, 1); + on_each_cpu(local_rm9k_perfcounter_irq_shutdown, (void *) irq, 1); } static struct irq_chip rm9k_irq_controller = { diff --git a/arch/mips/kernel/smp.c b/arch/mips/kernel/smp.c index 7a9ae830be86..4410f172b8ab 100644 --- a/arch/mips/kernel/smp.c +++ b/arch/mips/kernel/smp.c @@ -246,7 +246,7 @@ static void flush_tlb_all_ipi(void *info) void flush_tlb_all(void) { - on_each_cpu(flush_tlb_all_ipi, NULL, 1, 1); + on_each_cpu(flush_tlb_all_ipi, NULL, 1); } static void flush_tlb_mm_ipi(void *mm) @@ -366,7 +366,7 @@ void flush_tlb_kernel_range(unsigned long start, unsigned long end) .addr2 = end, }; - on_each_cpu(flush_tlb_kernel_range_ipi, &fd, 1, 1); + on_each_cpu(flush_tlb_kernel_range_ipi, &fd, 1); } static void flush_tlb_page_ipi(void *info) diff --git a/arch/mips/oprofile/common.c b/arch/mips/oprofile/common.c index b5f6f71b27bc..dd2fbd6645c1 100644 --- a/arch/mips/oprofile/common.c +++ b/arch/mips/oprofile/common.c @@ -27,7 +27,7 @@ static int op_mips_setup(void) model->reg_setup(ctr); /* Configure the registers on all cpus. */ - on_each_cpu(model->cpu_setup, NULL, 0, 1); + on_each_cpu(model->cpu_setup, NULL, 1); return 0; } @@ -58,7 +58,7 @@ static int op_mips_create_files(struct super_block * sb, struct dentry * root) static int op_mips_start(void) { - on_each_cpu(model->cpu_start, NULL, 0, 1); + on_each_cpu(model->cpu_start, NULL, 1); return 0; } @@ -66,7 +66,7 @@ static int op_mips_start(void) static void op_mips_stop(void) { /* Disable performance monitoring for all counters. 
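With retry gone, on_each_cpu() keeps a single knob. A sketch of the helper as it looks after this patch, matching the kernel/softirq.c hunk from the previous commit with the parameter dropped:

	/* Run func on every online CPU, including the calling one.
	 * wait: if nonzero, return only after all CPUs have finished. */
	int on_each_cpu(void (*func)(void *info), void *info, int wait)
	{
		int ret = 0;

		preempt_disable();
		ret = smp_call_function(func, info, wait);	/* other CPUs */
		local_irq_disable();
		func(info);					/* this CPU */
		local_irq_enable();
		preempt_enable();

		return ret;
	}
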
*/ - on_each_cpu(model->cpu_stop, NULL, 0, 1); + on_each_cpu(model->cpu_stop, NULL, 1); } int __init oprofile_arch_init(struct oprofile_operations *ops) diff --git a/arch/parisc/kernel/cache.c b/arch/parisc/kernel/cache.c index e10d25d2d9c9..5259d8c20676 100644 --- a/arch/parisc/kernel/cache.c +++ b/arch/parisc/kernel/cache.c @@ -51,12 +51,12 @@ static struct pdc_btlb_info btlb_info __read_mostly; void flush_data_cache(void) { - on_each_cpu(flush_data_cache_local, NULL, 1, 1); + on_each_cpu(flush_data_cache_local, NULL, 1); } void flush_instruction_cache(void) { - on_each_cpu(flush_instruction_cache_local, NULL, 1, 1); + on_each_cpu(flush_instruction_cache_local, NULL, 1); } #endif @@ -515,7 +515,7 @@ static void cacheflush_h_tmp_function(void *dummy) void flush_cache_all(void) { - on_each_cpu(cacheflush_h_tmp_function, NULL, 1, 1); + on_each_cpu(cacheflush_h_tmp_function, NULL, 1); } void flush_cache_mm(struct mm_struct *mm) diff --git a/arch/parisc/kernel/smp.c b/arch/parisc/kernel/smp.c index 126105c76a44..d47f3975c9c6 100644 --- a/arch/parisc/kernel/smp.c +++ b/arch/parisc/kernel/smp.c @@ -292,7 +292,7 @@ void arch_send_call_function_single_ipi(int cpu) void smp_flush_tlb_all(void) { - on_each_cpu(flush_tlb_all_local, NULL, 1, 1); + on_each_cpu(flush_tlb_all_local, NULL, 1); } /* diff --git a/arch/parisc/mm/init.c b/arch/parisc/mm/init.c index ce0da689a89d..b4d6c8777ed0 100644 --- a/arch/parisc/mm/init.c +++ b/arch/parisc/mm/init.c @@ -1053,7 +1053,7 @@ void flush_tlb_all(void) do_recycle++; } spin_unlock(&sid_lock); - on_each_cpu(flush_tlb_all_local, NULL, 1, 1); + on_each_cpu(flush_tlb_all_local, NULL, 1); if (do_recycle) { spin_lock(&sid_lock); recycle_sids(recycle_ndirty,recycle_dirty_array); diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c index 34843c318419..647f3e8677dc 100644 --- a/arch/powerpc/kernel/rtas.c +++ b/arch/powerpc/kernel/rtas.c @@ -747,7 +747,7 @@ static int rtas_ibm_suspend_me(struct rtas_args *args) /* Call function on all CPUs. 
One of us will make the * rtas call */ - if (on_each_cpu(rtas_percpu_suspend_me, &data, 1, 0)) + if (on_each_cpu(rtas_percpu_suspend_me, &data, 0)) data.error = -EINVAL; wait_for_completion(&done); diff --git a/arch/powerpc/kernel/tau_6xx.c b/arch/powerpc/kernel/tau_6xx.c index 368a4934f7ee..c3a56d65c5a9 100644 --- a/arch/powerpc/kernel/tau_6xx.c +++ b/arch/powerpc/kernel/tau_6xx.c @@ -192,7 +192,7 @@ static void tau_timeout_smp(unsigned long unused) /* schedule ourselves to be run again */ mod_timer(&tau_timer, jiffies + shrink_timer) ; - on_each_cpu(tau_timeout, NULL, 1, 0); + on_each_cpu(tau_timeout, NULL, 0); } /* @@ -234,7 +234,7 @@ int __init TAU_init(void) tau_timer.expires = jiffies + shrink_timer; add_timer(&tau_timer); - on_each_cpu(TAU_init_smp, NULL, 1, 0); + on_each_cpu(TAU_init_smp, NULL, 0); printk("Thermal assist unit "); #ifdef CONFIG_TAU_INT diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index 73401e83739a..f1a38a6c1e2d 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -322,7 +322,7 @@ void snapshot_timebases(void) { if (!cpu_has_feature(CPU_FTR_PURR)) return; - on_each_cpu(snapshot_tb_and_purr, NULL, 0, 1); + on_each_cpu(snapshot_tb_and_purr, NULL, 1); } /* diff --git a/arch/powerpc/mm/slice.c b/arch/powerpc/mm/slice.c index ad928edafb0a..2bd12d965db1 100644 --- a/arch/powerpc/mm/slice.c +++ b/arch/powerpc/mm/slice.c @@ -218,7 +218,7 @@ static void slice_convert(struct mm_struct *mm, struct slice_mask mask, int psiz mb(); /* XXX this is sub-optimal but will do for now */ - on_each_cpu(slice_flush_segments, mm, 0, 1); + on_each_cpu(slice_flush_segments, mm, 1); #ifdef CONFIG_SPU_BASE spu_flush_all_slbs(mm); #endif diff --git a/arch/powerpc/oprofile/common.c b/arch/powerpc/oprofile/common.c index 4908dc98f9ca..17807acb05d9 100644 --- a/arch/powerpc/oprofile/common.c +++ b/arch/powerpc/oprofile/common.c @@ -65,7 +65,7 @@ static int op_powerpc_setup(void) /* Configure the registers on all cpus. If an error occurs on one * of the cpus, op_per_cpu_rc will be set to the error */ - on_each_cpu(op_powerpc_cpu_setup, NULL, 0, 1); + on_each_cpu(op_powerpc_cpu_setup, NULL, 1); out: if (op_per_cpu_rc) { /* error on setup release the performance counter hardware */ @@ -100,7 +100,7 @@ static int op_powerpc_start(void) if (model->global_start) return model->global_start(ctr); if (model->start) { - on_each_cpu(op_powerpc_cpu_start, NULL, 0, 1); + on_each_cpu(op_powerpc_cpu_start, NULL, 1); return op_per_cpu_rc; } return -EIO; /* No start function is defined for this @@ -115,7 +115,7 @@ static inline void op_powerpc_cpu_stop(void *dummy) static void op_powerpc_stop(void) { if (model->stop) - on_each_cpu(op_powerpc_cpu_stop, NULL, 0, 1); + on_each_cpu(op_powerpc_cpu_stop, NULL, 1); if (model->global_stop) model->global_stop(); } diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c index 276b105fb2a4..b6781030cfbd 100644 --- a/arch/s390/kernel/smp.c +++ b/arch/s390/kernel/smp.c @@ -299,7 +299,7 @@ static void smp_ptlb_callback(void *info) void smp_ptlb_all(void) { - on_each_cpu(smp_ptlb_callback, NULL, 0, 1); + on_each_cpu(smp_ptlb_callback, NULL, 1); } EXPORT_SYMBOL(smp_ptlb_all); #endif /* ! 
CONFIG_64BIT */ @@ -347,7 +347,7 @@ void smp_ctl_set_bit(int cr, int bit) memset(&parms.orvals, 0, sizeof(parms.orvals)); memset(&parms.andvals, 0xff, sizeof(parms.andvals)); parms.orvals[cr] = 1 << bit; - on_each_cpu(smp_ctl_bit_callback, &parms, 0, 1); + on_each_cpu(smp_ctl_bit_callback, &parms, 1); } EXPORT_SYMBOL(smp_ctl_set_bit); @@ -361,7 +361,7 @@ void smp_ctl_clear_bit(int cr, int bit) memset(&parms.orvals, 0, sizeof(parms.orvals)); memset(&parms.andvals, 0xff, sizeof(parms.andvals)); parms.andvals[cr] = ~(1L << bit); - on_each_cpu(smp_ctl_bit_callback, &parms, 0, 1); + on_each_cpu(smp_ctl_bit_callback, &parms, 1); } EXPORT_SYMBOL(smp_ctl_clear_bit); diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c index bf7bf2c2236a..6037ed2b7471 100644 --- a/arch/s390/kernel/time.c +++ b/arch/s390/kernel/time.c @@ -909,7 +909,7 @@ static void etr_work_fn(struct work_struct *work) if (!eacr.ea) { /* Both ports offline. Reset everything. */ eacr.dp = eacr.es = eacr.sl = 0; - on_each_cpu(etr_disable_sync_clock, NULL, 0, 1); + on_each_cpu(etr_disable_sync_clock, NULL, 1); del_timer_sync(&etr_timer); etr_update_eacr(eacr); set_bit(ETR_FLAG_EACCES, &etr_flags); diff --git a/arch/sh/kernel/smp.c b/arch/sh/kernel/smp.c index 71781ba2675b..60c50841143e 100644 --- a/arch/sh/kernel/smp.c +++ b/arch/sh/kernel/smp.c @@ -197,7 +197,7 @@ static void flush_tlb_all_ipi(void *info) void flush_tlb_all(void) { - on_each_cpu(flush_tlb_all_ipi, 0, 1, 1); + on_each_cpu(flush_tlb_all_ipi, 0, 1); } static void flush_tlb_mm_ipi(void *mm) @@ -284,7 +284,7 @@ void flush_tlb_kernel_range(unsigned long start, unsigned long end) fd.addr1 = start; fd.addr2 = end; - on_each_cpu(flush_tlb_kernel_range_ipi, (void *)&fd, 1, 1); + on_each_cpu(flush_tlb_kernel_range_ipi, (void *)&fd, 1); } static void flush_tlb_page_ipi(void *info) diff --git a/arch/sparc64/mm/hugetlbpage.c b/arch/sparc64/mm/hugetlbpage.c index 6cfab2e4d340..ebefd2a14375 100644 --- a/arch/sparc64/mm/hugetlbpage.c +++ b/arch/sparc64/mm/hugetlbpage.c @@ -344,7 +344,7 @@ void hugetlb_prefault_arch_hook(struct mm_struct *mm) * also executing in this address space. */ mm->context.sparc64_ctx_val = ctx; - on_each_cpu(context_reload, mm, 0, 0); + on_each_cpu(context_reload, mm, 0); } spin_unlock(&ctx_alloc_lock); } diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c index e07e8c068ae0..43b7cb594912 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_64.c @@ -363,7 +363,7 @@ static void mcheck_check_cpu(void *info) static void mcheck_timer(struct work_struct *work) { - on_each_cpu(mcheck_check_cpu, NULL, 1, 1); + on_each_cpu(mcheck_check_cpu, NULL, 1); /* * Alert userspace if needed. If we logged an MCE, reduce the @@ -612,7 +612,7 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, * Collect entries that were still getting written before the * synchronize. 
*/ - on_each_cpu(collect_tscs, cpu_tsc, 1, 1); + on_each_cpu(collect_tscs, cpu_tsc, 1); for (i = next; i < MCE_LOG_LEN; i++) { if (mcelog.entry[i].finished && mcelog.entry[i].tsc < cpu_tsc[mcelog.entry[i].cpu]) { @@ -737,7 +737,7 @@ static void mce_restart(void) if (next_interval) cancel_delayed_work(&mcheck_work); /* Timer race is harmless here */ - on_each_cpu(mce_init, NULL, 1, 1); + on_each_cpu(mce_init, NULL, 1); next_interval = check_interval * HZ; if (next_interval) schedule_delayed_work(&mcheck_work, diff --git a/arch/x86/kernel/cpu/mcheck/non-fatal.c b/arch/x86/kernel/cpu/mcheck/non-fatal.c index 00ccb6c14ec2..cc1fccdd31e0 100644 --- a/arch/x86/kernel/cpu/mcheck/non-fatal.c +++ b/arch/x86/kernel/cpu/mcheck/non-fatal.c @@ -59,7 +59,7 @@ static DECLARE_DELAYED_WORK(mce_work, mce_work_fn); static void mce_work_fn(struct work_struct *work) { - on_each_cpu(mce_checkregs, NULL, 1, 1); + on_each_cpu(mce_checkregs, NULL, 1); schedule_delayed_work(&mce_work, round_jiffies_relative(MCE_RATE)); } diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c index f9ae93adffe5..58043f06d7e2 100644 --- a/arch/x86/kernel/cpu/perfctr-watchdog.c +++ b/arch/x86/kernel/cpu/perfctr-watchdog.c @@ -180,7 +180,7 @@ void disable_lapic_nmi_watchdog(void) if (atomic_read(&nmi_active) <= 0) return; - on_each_cpu(stop_apic_nmi_watchdog, NULL, 0, 1); + on_each_cpu(stop_apic_nmi_watchdog, NULL, 1); wd_ops->unreserve(); BUG_ON(atomic_read(&nmi_active) != 0); @@ -202,7 +202,7 @@ void enable_lapic_nmi_watchdog(void) return; } - on_each_cpu(setup_apic_nmi_watchdog, NULL, 0, 1); + on_each_cpu(setup_apic_nmi_watchdog, NULL, 1); touch_nmi_watchdog(); } diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c index 4dc8600d9d20..720640ff36ca 100644 --- a/arch/x86/kernel/io_apic_32.c +++ b/arch/x86/kernel/io_apic_32.c @@ -1565,7 +1565,7 @@ void /*__init*/ print_local_APIC(void * dummy) void print_all_local_APICs (void) { - on_each_cpu(print_local_APIC, NULL, 1, 1); + on_each_cpu(print_local_APIC, NULL, 1); } void /*__init*/ print_PIC(void) diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c index ef1a8dfcc529..4504c7f50012 100644 --- a/arch/x86/kernel/io_apic_64.c +++ b/arch/x86/kernel/io_apic_64.c @@ -1146,7 +1146,7 @@ void __apicdebuginit print_local_APIC(void * dummy) void print_all_local_APICs (void) { - on_each_cpu(print_local_APIC, NULL, 1, 1); + on_each_cpu(print_local_APIC, NULL, 1); } void __apicdebuginit print_PIC(void) diff --git a/arch/x86/kernel/nmi_32.c b/arch/x86/kernel/nmi_32.c index 5562dab0bd20..11008e0857c0 100644 --- a/arch/x86/kernel/nmi_32.c +++ b/arch/x86/kernel/nmi_32.c @@ -218,7 +218,7 @@ static void __acpi_nmi_enable(void *__unused) void acpi_nmi_enable(void) { if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC) - on_each_cpu(__acpi_nmi_enable, NULL, 0, 1); + on_each_cpu(__acpi_nmi_enable, NULL, 1); } static void __acpi_nmi_disable(void *__unused) @@ -232,7 +232,7 @@ static void __acpi_nmi_disable(void *__unused) void acpi_nmi_disable(void) { if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC) - on_each_cpu(__acpi_nmi_disable, NULL, 0, 1); + on_each_cpu(__acpi_nmi_disable, NULL, 1); } void setup_apic_nmi_watchdog(void *unused) diff --git a/arch/x86/kernel/nmi_64.c b/arch/x86/kernel/nmi_64.c index 2f1e4f503c9e..bbdcb17b3dfe 100644 --- a/arch/x86/kernel/nmi_64.c +++ b/arch/x86/kernel/nmi_64.c @@ -225,7 +225,7 @@ static void __acpi_nmi_enable(void *__unused) void acpi_nmi_enable(void) { if 
(atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC) - on_each_cpu(__acpi_nmi_enable, NULL, 0, 1); + on_each_cpu(__acpi_nmi_enable, NULL, 1); } static void __acpi_nmi_disable(void *__unused) @@ -239,7 +239,7 @@ static void __acpi_nmi_disable(void *__unused) void acpi_nmi_disable(void) { if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC) - on_each_cpu(__acpi_nmi_disable, NULL, 0, 1); + on_each_cpu(__acpi_nmi_disable, NULL, 1); } void setup_apic_nmi_watchdog(void *unused) diff --git a/arch/x86/kernel/tlb_32.c b/arch/x86/kernel/tlb_32.c index 9bb2363851af..fec1ecedc9b7 100644 --- a/arch/x86/kernel/tlb_32.c +++ b/arch/x86/kernel/tlb_32.c @@ -238,6 +238,6 @@ static void do_flush_tlb_all(void *info) void flush_tlb_all(void) { - on_each_cpu(do_flush_tlb_all, NULL, 1, 1); + on_each_cpu(do_flush_tlb_all, NULL, 1); } diff --git a/arch/x86/kernel/tlb_64.c b/arch/x86/kernel/tlb_64.c index a1f07d793202..184a367516d3 100644 --- a/arch/x86/kernel/tlb_64.c +++ b/arch/x86/kernel/tlb_64.c @@ -270,5 +270,5 @@ static void do_flush_tlb_all(void *info) void flush_tlb_all(void) { - on_each_cpu(do_flush_tlb_all, NULL, 1, 1); + on_each_cpu(do_flush_tlb_all, NULL, 1); } diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index 0a03d57f9b3b..0dcae19ed627 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -301,7 +301,7 @@ static int __init vsyscall_init(void) #ifdef CONFIG_SYSCTL register_sysctl_table(kernel_root_table2); #endif - on_each_cpu(cpu_vsyscall_init, NULL, 0, 1); + on_each_cpu(cpu_vsyscall_init, NULL, 1); hotcpu_notifier(cpu_vsyscall_notifier, 0); return 0; } diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 5534fe59b5fc..10ce6ee4c491 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2968,7 +2968,7 @@ static void vmx_free_vmcs(struct kvm_vcpu *vcpu) struct vcpu_vmx *vmx = to_vmx(vcpu); if (vmx->vmcs) { - on_each_cpu(__vcpu_clear, vmx, 0, 1); + on_each_cpu(__vcpu_clear, vmx, 1); free_vmcs(vmx->vmcs); vmx->vmcs = NULL; } diff --git a/arch/x86/mach-voyager/voyager_smp.c b/arch/x86/mach-voyager/voyager_smp.c index 04f596eab749..abea08459a73 100644 --- a/arch/x86/mach-voyager/voyager_smp.c +++ b/arch/x86/mach-voyager/voyager_smp.c @@ -1072,7 +1072,7 @@ static void do_flush_tlb_all(void *info) /* flush the TLB of every active CPU in the system */ void flush_tlb_all(void) { - on_each_cpu(do_flush_tlb_all, 0, 1, 1); + on_each_cpu(do_flush_tlb_all, 0, 1); } /* used to set up the trampoline for other CPUs when the memory manager diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 60bcb5b6a37e..9b836ba9dedd 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -106,7 +106,7 @@ static void cpa_flush_all(unsigned long cache) { BUG_ON(irqs_disabled()); - on_each_cpu(__cpa_flush_all, (void *) cache, 1, 1); + on_each_cpu(__cpa_flush_all, (void *) cache, 1); } static void __cpa_flush_range(void *arg) @@ -127,7 +127,7 @@ static void cpa_flush_range(unsigned long start, int numpages, int cache) BUG_ON(irqs_disabled()); WARN_ON(PAGE_ALIGN(start) != start); - on_each_cpu(__cpa_flush_range, NULL, 1, 1); + on_each_cpu(__cpa_flush_range, NULL, 1); if (!cache) return; diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index cc48d3fde545..3238ad32ffd8 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -218,8 +218,8 @@ static int nmi_setup(void) } } - on_each_cpu(nmi_save_registers, NULL, 0, 1); - on_each_cpu(nmi_cpu_setup, NULL, 0, 1); + on_each_cpu(nmi_save_registers, 
NULL, 1); + on_each_cpu(nmi_cpu_setup, NULL, 1); nmi_enabled = 1; return 0; } @@ -271,7 +271,7 @@ static void nmi_shutdown(void) { struct op_msrs *msrs = &__get_cpu_var(cpu_msrs); nmi_enabled = 0; - on_each_cpu(nmi_cpu_shutdown, NULL, 0, 1); + on_each_cpu(nmi_cpu_shutdown, NULL, 1); unregister_die_notifier(&profile_exceptions_nb); model->shutdown(msrs); free_msrs(); @@ -285,7 +285,7 @@ static void nmi_cpu_start(void *dummy) static int nmi_start(void) { - on_each_cpu(nmi_cpu_start, NULL, 0, 1); + on_each_cpu(nmi_cpu_start, NULL, 1); return 0; } @@ -297,7 +297,7 @@ static void nmi_cpu_stop(void *dummy) static void nmi_stop(void) { - on_each_cpu(nmi_cpu_stop, NULL, 0, 1); + on_each_cpu(nmi_cpu_stop, NULL, 1); } struct op_counter_config counter_config[OP_MAX_COUNTER]; diff --git a/drivers/char/agp/generic.c b/drivers/char/agp/generic.c index 564daaa6c7d0..eaa1a355bb32 100644 --- a/drivers/char/agp/generic.c +++ b/drivers/char/agp/generic.c @@ -1249,7 +1249,7 @@ static void ipi_handler(void *null) void global_cache_flush(void) { - if (on_each_cpu(ipi_handler, NULL, 1, 1) != 0) + if (on_each_cpu(ipi_handler, NULL, 1) != 0) panic(PFX "timed out waiting for the other CPUs!\n"); } EXPORT_SYMBOL(global_cache_flush); diff --git a/drivers/lguest/x86/core.c b/drivers/lguest/x86/core.c index 2e554a4ab337..95dfda52b4f9 100644 --- a/drivers/lguest/x86/core.c +++ b/drivers/lguest/x86/core.c @@ -478,7 +478,7 @@ void __init lguest_arch_host_init(void) cpu_had_pge = 1; /* adjust_pge is a helper function which sets or unsets the PGE * bit on its CPU, depending on the argument (0 == unset). */ - on_each_cpu(adjust_pge, (void *)0, 0, 1); + on_each_cpu(adjust_pge, (void *)0, 1); /* Turn off the feature in the global feature set. */ clear_bit(X86_FEATURE_PGE, boot_cpu_data.x86_capability); } @@ -493,7 +493,7 @@ void __exit lguest_arch_host_fini(void) if (cpu_had_pge) { set_bit(X86_FEATURE_PGE, boot_cpu_data.x86_capability); /* adjust_pge's argument "1" means set PGE. 
*/ - on_each_cpu(adjust_pge, (void *)1, 0, 1); + on_each_cpu(adjust_pge, (void *)1, 1); } put_online_cpus(); } diff --git a/fs/buffer.c b/fs/buffer.c index a073f3f4f013..5c23ef560d01 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1464,7 +1464,7 @@ static void invalidate_bh_lru(void *arg) void invalidate_bh_lrus(void) { - on_each_cpu(invalidate_bh_lru, NULL, 1, 1); + on_each_cpu(invalidate_bh_lru, NULL, 1); } EXPORT_SYMBOL_GPL(invalidate_bh_lrus); diff --git a/include/linux/smp.h b/include/linux/smp.h index 338cad1b9548..55261101d09a 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -89,7 +89,7 @@ static inline void init_call_single_data(void) /* * Call a function on all processors */ -int on_each_cpu(void (*func) (void *info), void *info, int retry, int wait); +int on_each_cpu(void (*func) (void *info), void *info, int wait); #define MSG_ALL_BUT_SELF 0x8000 /* Assume <32768 CPU's */ #define MSG_ALL 0x8001 @@ -121,7 +121,7 @@ static inline int up_smp_call_function(void (*func)(void *), void *info) } #define smp_call_function(func, info, wait) \ (up_smp_call_function(func, info)) -#define on_each_cpu(func,info,retry,wait) \ +#define on_each_cpu(func,info,wait) \ ({ \ local_irq_disable(); \ func(info); \ diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 421be5fe5cc7..50e8616d7955 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -623,7 +623,7 @@ static void retrigger_next_event(void *arg) void clock_was_set(void) { /* Retrigger the CPU local events everywhere */ - on_each_cpu(retrigger_next_event, NULL, 0, 1); + on_each_cpu(retrigger_next_event, NULL, 1); } /* diff --git a/kernel/profile.c b/kernel/profile.c index ae7ead82cbc9..58926411eb2a 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -252,7 +252,7 @@ static void profile_flip_buffers(void) mutex_lock(&profile_flip_mutex); j = per_cpu(cpu_profile_flip, get_cpu()); put_cpu(); - on_each_cpu(__profile_flip_buffers, NULL, 0, 1); + on_each_cpu(__profile_flip_buffers, NULL, 1); for_each_online_cpu(cpu) { struct profile_hit *hits = per_cpu(cpu_profile_hits, cpu)[j]; for (i = 0; i < NR_PROFILE_HIT; ++i) { @@ -275,7 +275,7 @@ static void profile_discard_flip_buffers(void) mutex_lock(&profile_flip_mutex); i = per_cpu(cpu_profile_flip, get_cpu()); put_cpu(); - on_each_cpu(__profile_flip_buffers, NULL, 0, 1); + on_each_cpu(__profile_flip_buffers, NULL, 1); for_each_online_cpu(cpu) { struct profile_hit *hits = per_cpu(cpu_profile_hits, cpu)[i]; memset(hits, 0, NR_PROFILE_HIT*sizeof(struct profile_hit)); @@ -558,7 +558,7 @@ static int __init create_hash_tables(void) out_cleanup: prof_on = 0; smp_mb(); - on_each_cpu(profile_nop, NULL, 0, 1); + on_each_cpu(profile_nop, NULL, 1); for_each_online_cpu(cpu) { struct page *page; diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index c09605f8d16c..6addab5e6d88 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -127,7 +127,7 @@ void rcu_barrier(void) * until all the callbacks are queued. 
*/ rcu_read_lock(); - on_each_cpu(rcu_barrier_func, NULL, 0, 1); + on_each_cpu(rcu_barrier_func, NULL, 1); rcu_read_unlock(); wait_for_completion(&rcu_barrier_completion); mutex_unlock(&rcu_barrier_mutex); diff --git a/kernel/softirq.c b/kernel/softirq.c index d73afb4764ef..c159fd094772 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -674,7 +674,7 @@ __init int spawn_ksoftirqd(void) /* * Call a function on all processors */ -int on_each_cpu(void (*func) (void *info), void *info, int retry, int wait) +int on_each_cpu(void (*func) (void *info), void *info, int wait) { int ret = 0; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2f552955a02f..53242344a774 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -918,7 +918,7 @@ void drain_local_pages(void *arg) */ void drain_all_pages(void) { - on_each_cpu(drain_local_pages, NULL, 0, 1); + on_each_cpu(drain_local_pages, NULL, 1); } #ifdef CONFIG_HIBERNATION diff --git a/mm/slab.c b/mm/slab.c index 046607f05f3e..0772abb412b9 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -2454,7 +2454,7 @@ static void drain_cpu_caches(struct kmem_cache *cachep) struct kmem_list3 *l3; int node; - on_each_cpu(do_drain, cachep, 1, 1); + on_each_cpu(do_drain, cachep, 1); check_irq_on(); for_each_online_node(node) { l3 = cachep->nodelists[node]; @@ -3939,7 +3939,7 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit, } new->cachep = cachep; - on_each_cpu(do_ccupdate_local, (void *)new, 1, 1); + on_each_cpu(do_ccupdate_local, (void *)new, 1); check_irq_on(); cachep->batchcount = batchcount; diff --git a/mm/slub.c b/mm/slub.c index 0987d1cd943c..44715eb70c06 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1497,7 +1497,7 @@ static void flush_cpu_slab(void *d) static void flush_all(struct kmem_cache *s) { #ifdef CONFIG_SMP - on_each_cpu(flush_cpu_slab, s, 1, 1); + on_each_cpu(flush_cpu_slab, s, 1); #else unsigned long flags; diff --git a/net/iucv/iucv.c b/net/iucv/iucv.c index 94d5a45c3a57..a178e27e7b1a 100644 --- a/net/iucv/iucv.c +++ b/net/iucv/iucv.c @@ -545,7 +545,7 @@ out: */ static void iucv_disable(void) { - on_each_cpu(iucv_retrieve_cpu, NULL, 0, 1); + on_each_cpu(iucv_retrieve_cpu, NULL, 1); kfree(iucv_path_table); } diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index ea1f595f8a87..d4eae6af0738 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1286,7 +1286,7 @@ static int kvm_reboot(struct notifier_block *notifier, unsigned long val, * in vmx root mode. 
*/ printk(KERN_INFO "kvm: exiting hardware virtualization\n"); - on_each_cpu(hardware_disable, NULL, 0, 1); + on_each_cpu(hardware_disable, NULL, 1); } return NOTIFY_OK; } @@ -1479,7 +1479,7 @@ int kvm_init(void *opaque, unsigned int vcpu_size, goto out_free_1; } - on_each_cpu(hardware_enable, NULL, 0, 1); + on_each_cpu(hardware_enable, NULL, 1); r = register_cpu_notifier(&kvm_cpu_notifier); if (r) goto out_free_2; @@ -1525,7 +1525,7 @@ out_free_3: unregister_reboot_notifier(&kvm_reboot_notifier); unregister_cpu_notifier(&kvm_cpu_notifier); out_free_2: - on_each_cpu(hardware_disable, NULL, 0, 1); + on_each_cpu(hardware_disable, NULL, 1); out_free_1: kvm_arch_hardware_unsetup(); out_free_0: @@ -1547,7 +1547,7 @@ void kvm_exit(void) sysdev_class_unregister(&kvm_sysdev_class); unregister_reboot_notifier(&kvm_reboot_notifier); unregister_cpu_notifier(&kvm_cpu_notifier); - on_each_cpu(hardware_disable, NULL, 0, 1); + on_each_cpu(hardware_disable, NULL, 1); kvm_arch_hardware_unsetup(); kvm_arch_exit(); kvm_exit_debug(); -- cgit v1.2.3 From 127a237a1ff49fa5b8e00af91e841598aeea3513 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 27 Jun 2008 11:48:22 +0200 Subject: fix "smp_call_function: get rid of the unused nonatomic/retry argument" fix: arch/x86/kernel/process.c: In function 'cpu_idle_wait': arch/x86/kernel/process.c:64: error: too many arguments to function 'smp_call_function' Signed-off-by: Ingo Molnar --- arch/x86/kernel/process.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index ba370dc8685b..2dad8fef391c 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -61,7 +61,7 @@ void cpu_idle_wait(void) { smp_mb(); /* kick all the CPUs so that they exit out of pm_idle */ - smp_call_function(do_nothing, NULL, 0, 1); + smp_call_function(do_nothing, NULL, 1); } EXPORT_SYMBOL_GPL(cpu_idle_wait); -- cgit v1.2.3 From 8594698ebddeef5443b7da8258ae33b3eaca61d5 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 27 Jun 2008 21:20:17 +0200 Subject: stacktrace: fix modular build, export print_stack_trace and save_stack_trace fix: ERROR: "print_stack_trace" [kernel/backtracetest.ko] undefined! ERROR: "save_stack_trace" [kernel/backtracetest.ko] undefined! Signed-off-by: Ingo Molnar and fix: Building modules, stage 2. MODPOST 376 modules ERROR: "print_stack_trace" [kernel/backtracetest.ko] undefined! 
make[1]: *** [__modpost] Error 1 Signed-off-by: Ingo Molnar --- arch/x86/kernel/stacktrace.c | 2 ++ kernel/stacktrace.c | 2 ++ 2 files changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c index c28c342c162f..a03e7f6d90c3 100644 --- a/arch/x86/kernel/stacktrace.c +++ b/arch/x86/kernel/stacktrace.c @@ -74,6 +74,7 @@ void save_stack_trace(struct stack_trace *trace) if (trace->nr_entries < trace->max_entries) trace->entries[trace->nr_entries++] = ULONG_MAX; } +EXPORT_SYMBOL_GPL(save_stack_trace); void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) { @@ -81,3 +82,4 @@ void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) if (trace->nr_entries < trace->max_entries) trace->entries[trace->nr_entries++] = ULONG_MAX; } +EXPORT_SYMBOL_GPL(save_stack_trace_tsk); diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c index 7eaea9d02a52..94b527ef1d1e 100644 --- a/kernel/stacktrace.c +++ b/kernel/stacktrace.c @@ -6,6 +6,7 @@ * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar */ #include +#include #include #include @@ -21,4 +22,5 @@ void print_stack_trace(struct stack_trace *trace, int spaces) print_ip_sym(trace->entries[i]); } } +EXPORT_SYMBOL_GPL(print_stack_trace); -- cgit v1.2.3 From 38c4c97c62a30aef276663c1128a2051a25ead7d Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 20 May 2008 19:17:02 +0200 Subject: x86-mce: BKL pushdown Signed-off-by: Arnd Bergmann --- arch/x86/kernel/cpu/mcheck/mce_64.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c index e07e8c068ae0..4ef151633e8b 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_64.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -527,10 +528,12 @@ static int open_exclu; /* already open exclusive? */ static int mce_open(struct inode *inode, struct file *file) { + lock_kernel(); spin_lock(&mce_state_lock); if (open_exclu || (open_count && (file->f_flags & O_EXCL))) { spin_unlock(&mce_state_lock); + unlock_kernel(); return -EBUSY; } @@ -539,6 +542,7 @@ static int mce_open(struct inode *inode, struct file *file) open_count++; spin_unlock(&mce_state_lock); + unlock_kernel(); return nonseekable_open(inode, file); } -- cgit v1.2.3 From 5e374fb62621aca9522f76c2317c9acda75a8e88 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 1 Jul 2008 13:12:04 +0200 Subject: generic-ipi: fixlet create proper stackframe. 
Signed-off-by: Ingo Molnar --- arch/x86/kernel/smp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index 56546e8a13ac..361b7a4c640c 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -198,7 +198,7 @@ void smp_call_function_interrupt(struct pt_regs *regs) irq_exit(); } -void smp_call_function_single_interrupt(void) +void smp_call_function_single_interrupt(struct pt_regs *regs) { ack_APIC_irq(); irq_enter(); -- cgit v1.2.3 From a13b04af713bfa60d44cbac956ab00d3a5793de7 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Thu, 29 May 2008 10:05:08 -0700 Subject: x86 microcode: firmware data is const Signed-off-by: Greg Kroah-Hartman Signed-off-by: David Woodhouse --- arch/x86/kernel/microcode.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/microcode.c b/arch/x86/kernel/microcode.c index 69729e38b78a..649dfd761f23 100644 --- a/arch/x86/kernel/microcode.c +++ b/arch/x86/kernel/microcode.c @@ -488,7 +488,7 @@ MODULE_ALIAS_MISCDEV(MICROCODE_MINOR); #define microcode_dev_exit() do { } while(0) #endif -static long get_next_ucode_from_buffer(void **mc, void *buf, +static long get_next_ucode_from_buffer(void **mc, const u8 *buf, unsigned long size, long offset) { microcode_header_t *mc_header; @@ -522,7 +522,7 @@ static int cpu_request_microcode(int cpu) char name[30]; struct cpuinfo_x86 *c = &cpu_data(cpu); const struct firmware *firmware; - void *buf; + const u8 *buf; unsigned long size; long offset = 0; int error; -- cgit v1.2.3 From 7ab073b6e0cde1544f4e79fadb75532528af7595 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sat, 12 Jul 2008 14:30:35 -0700 Subject: x86: max_low_pfn_mapped fix, #1 fix crash on Ingo's big box: calling pci_iommu_init+0x0/0x17 PCI-DMA: Disabling AGP. PCI-DMA: aperture base @ d0000000 size 65536 KB PCI-DMA: using GART IOMMU. PCI-DMA: Reserving 64MB of IOMMU area in the AGP aperture BUG: unable to handle kernel paging request at ffff88000003be88 IP: [] __alloc_pages_internal+0xc3/0x3f2 PGD 202063 PUD 206063 PMD 22fc00163 PTE 3b162 Oops: 0000 [1] SMP and e820 is: BIOS-e820: 0000000000000000 - 000000000009ac00 (usable) BIOS-e820: 000000000009ac00 - 00000000000a0000 (reserved) BIOS-e820: 00000000000ca000 - 0000000000100000 (reserved) BIOS-e820: 0000000000100000 - 000000007ff70000 (usable) BIOS-e820: 000000007ff70000 - 000000007ff86000 (ACPI data) BIOS-e820: 000000007ff86000 - 0000000080000000 (ACPI NVS) BIOS-e820: 0000000080000000 - 00000000cfe00000 (usable) BIOS-e820: 00000000cfe00000 - 00000000d0000000 (reserved) BIOS-e820: 00000000e0000000 - 00000000f0000000 (reserved) BIOS-e820: 00000000fec00000 - 00000000fec10000 (reserved) BIOS-e820: 00000000fee00000 - 00000000fee01000 (reserved) BIOS-e820: 00000000fff80000 - 0000000100000000 (reserved) BIOS-e820: 0000000100000000 - 0000000830000000 (usable) system has 32 GB RAM installed. max_low_pfn_mapped is 0xcfe00, and GART aperture is not mapped. So try to use init_memory_mapping to map that area, because the iommu thinks that area is ram ... 
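For illustration, the shape of the check this patch adds, as a minimal sketch: it assumes this tree's init_memory_mapping(), which maps a [start, end) physical range and returns the new highest directly-mapped pfn; the helper name here is ours, not the kernel's:

	/* Make sure the GART aperture is covered by the direct mapping. */
	static void __init map_aperture_range(unsigned long aper_base,
					      unsigned long aper_size)
	{
		unsigned long start_pfn, end_pfn;

		end_pfn = (aper_base >> PAGE_SHIFT) + (aper_size >> PAGE_SHIFT);
		if (end_pfn > max_low_pfn_mapped) {
			/* Extend the kernel mapping up to the aperture end. */
			start_pfn = max_low_pfn_mapped;
			max_low_pfn_mapped =
				init_memory_mapping(start_pfn << PAGE_SHIFT,
						    end_pfn << PAGE_SHIFT);
		}
	}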
Signed-off-by: Yinghai Lu Cc: Suresh Siddha Signed-off-by: Ingo Molnar --- arch/x86/kernel/pci-gart_64.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index d0d18db5d2a4..a614ee10f846 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -630,6 +630,7 @@ static __init int init_k8_gatt(struct agp_kern_info *info) struct pci_dev *dev; void *gatt; int i, error; + unsigned long start_pfn, end_pfn; printk(KERN_INFO "PCI-DMA: Disabling AGP.\n"); aper_size = aper_base = info->aper_size = 0; @@ -674,6 +675,16 @@ static __init int init_k8_gatt(struct agp_kern_info *info) printk(KERN_INFO "PCI-DMA: aperture base @ %x size %u KB\n", aper_base, aper_size>>10); + + /* need to map that range */ + end_pfn = (aper_base>>PAGE_SHIFT) + (aper_size>>PAGE_SHIFT); + if (end_pfn > max_low_pfn_mapped) { + start_pfn = max_low_pfn_mapped; + max_low_pfn_mapped = init_memory_mapping(start_pfn< Cc: Suresh Siddha Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/amd_64.c | 1 + arch/x86/mm/pageattr.c | 4 ++-- arch/x86/mm/pat.c | 4 ++-- arch/x86/pci/i386.c | 4 ++-- 4 files changed, 7 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/amd_64.c b/arch/x86/kernel/cpu/amd_64.c index bd182b7616ee..7c36fb8a28d4 100644 --- a/arch/x86/kernel/cpu/amd_64.c +++ b/arch/x86/kernel/cpu/amd_64.c @@ -200,6 +200,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) * benefit in doing so. */ if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg)) { + printk(KERN_DEBUG "tseg: %010llx\n", tseg); if ((tseg>>PMD_SHIFT) < (max_low_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) || ((tseg>>PMD_SHIFT) < diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 0389cb8f6b1a..fb6f2ab40dda 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -658,11 +658,11 @@ static int cpa_process_alias(struct cpa_data *cpa) struct cpa_data alias_cpa; int ret = 0; - if (cpa->pfn > max_pfn_mapped) + if (cpa->pfn >= max_pfn_mapped) return 0; #ifdef CONFIG_X86_64 - if (cpa->pfn > max_low_pfn_mapped && cpa->pfn < (1UL<<(32-PAGE_SHIFT))) + if (cpa->pfn >= max_low_pfn_mapped && cpa->pfn < (1UL<<(32-PAGE_SHIFT))) return 0; #endif /* diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 749766c3c5cd..d4585077977a 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -449,8 +449,8 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, if (retval < 0) return 0; - if (((pfn <= max_low_pfn_mapped) || - (pfn >= (1UL<<(32 - PAGE_SHIFT)) && pfn <= max_pfn_mapped)) && + if (((pfn < max_low_pfn_mapped) || + (pfn >= (1UL<<(32 - PAGE_SHIFT)) && pfn < max_pfn_mapped)) && ioremap_change_attr((unsigned long)__va(offset), size, flags) < 0) { free_memtype(offset, offset + size); printk(KERN_INFO diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c index 5281e343dd9f..2aafb67dc5f1 100644 --- a/arch/x86/pci/i386.c +++ b/arch/x86/pci/i386.c @@ -334,9 +334,9 @@ int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma, flags = new_flags; } - if (((vma->vm_pgoff <= max_low_pfn_mapped) || + if (((vma->vm_pgoff < max_low_pfn_mapped) || (vma->vm_pgoff >= (1UL<<(32 - PAGE_SHIFT)) && - vma->vm_pgoff <= max_pfn_mapped)) && + vma->vm_pgoff < max_pfn_mapped)) && ioremap_change_attr((unsigned long)__va(addr), len, flags)) { free_memtype(addr, addr + len); return -EINVAL; -- cgit v1.2.3 From 9958e810f8ac92f8a447035ee6555420ba27b847 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sat, 12 Jul 
2008 14:32:45 -0700 Subject: x86: max_low_pfn_mapped fix, #3 optimization: try to merge the range with same page size in init_memory_mapping, to get the best possible linear mappings set up. thus when GBpages is not there, we could do 2M pages. Signed-off-by: Yinghai Lu Cc: Suresh Siddha Signed-off-by: Ingo Molnar --- arch/x86/mm/init_64.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 122bcef222fc..a25cc6fa2207 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -763,6 +763,20 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, end_pfn = end>>PAGE_SHIFT; nr_range = save_mr(mr, nr_range, start_pfn, end_pfn, 0); + /* try to merge same page size and continuous */ + for (i = 0; nr_range > 1 && i < nr_range - 1; i++) { + unsigned long old_start; + if (mr[i].end != mr[i+1].start || + mr[i].page_size_mask != mr[i+1].page_size_mask) + continue; + /* move it */ + old_start = mr[i].start; + memmove(&mr[i], &mr[i+1], + (nr_range - 1 - i) * sizeof (struct map_range)); + mr[i].start = old_start; + nr_range--; + } + for (i = 0; i < nr_range; i++) printk(KERN_DEBUG " %010lx - %010lx page %s\n", mr[i].start, mr[i].end, -- cgit v1.2.3 From 7b479becdb8c1fb4ff6fbb2a4076c471c737b54c Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sat, 12 Jul 2008 22:57:07 -0700 Subject: x86, e820: remove end_user_pfn end_user_pfn used to modify the meaning of the e820 maps. Now that all e820 operations are cleaned up, unified, tightened up, the e820 map always get updated to reality, we don't need to keep this secondary mechanism anymore. If you hit this commit in bisection it means something slipped through. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/e820.c | 19 ++----------------- 1 file changed, 2 insertions(+), 17 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index a5383ae2cbe3..28c29180b380 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -1048,11 +1048,6 @@ u64 __init early_reserve_e820(u64 startt, u64 sizet, u64 align) # define MAX_ARCH_PFN MAXMEM>>PAGE_SHIFT #endif -/* - * Last pfn which the user wants to use. - */ -unsigned long __initdata end_user_pfn = MAX_ARCH_PFN; - /* * Find the highest page frame number we have available */ @@ -1085,8 +1080,6 @@ static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type) if (last_pfn > max_arch_pfn) last_pfn = max_arch_pfn; - if (last_pfn > end_user_pfn) - last_pfn = end_user_pfn; printk(KERN_INFO "last_pfn = %#lx max_arch_pfn = %#lx\n", last_pfn, max_arch_pfn); @@ -1131,12 +1124,6 @@ int __init e820_find_active_region(const struct e820entry *ei, if (*ei_endpfn > last_pfn) *ei_endpfn = last_pfn; - /* Obey end_user_pfn to save on memmap */ - if (*ei_startpfn >= end_user_pfn) - return 0; - if (*ei_endpfn > end_user_pfn) - *ei_endpfn = end_user_pfn; - return 1; } @@ -1201,7 +1188,6 @@ static int __init parse_memopt(char *p) userdef = 1; mem_size = memparse(p, &p); - end_user_pfn = mem_size>>PAGE_SHIFT; e820_remove_range(mem_size, ULLONG_MAX - mem_size, E820_RAM, 1); return 0; @@ -1245,10 +1231,9 @@ static int __init parse_memmap_opt(char *p) } else if (*p == '$') { start_at = memparse(p+1, &p); e820_add_region(start_at, mem_size, E820_RESERVED); - } else { - end_user_pfn = (mem_size >> PAGE_SHIFT); + } else e820_remove_range(mem_size, ULLONG_MAX - mem_size, E820_RAM, 1); - } + return *p == '\0' ? 
0 : -EINVAL; } early_param("memmap", parse_memmap_opt); -- cgit v1.2.3 From 3d88cca7085cffce077f808f36551e9050eb9e3a Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sat, 12 Jul 2008 22:52:55 -0700 Subject: x86: fix numaq_tsc_disable calling Got this on a test system: calling numaq_tsc_disable+0x0/0x39 NUMAQ: disabling TSC initcall numaq_tsc_disable+0x0/0x39 returned 0 after 0 msecs That is because we should not be using arch_initcall to call numaq_tsc_disable. We need to call it in setup_arch() before time_init()/tsc_init(), and in init_intel(), to get the CPU feature bits right. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/intel.c | 4 ++++ arch/x86/kernel/numaq_32.c | 7 ++++--- arch/x86/kernel/setup.c | 8 ++++++++ include/asm-x86/numaq.h | 2 ++ 4 files changed, 18 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index fe9224c51d37..70609efdf1da 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -226,6 +226,10 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c) if (cpu_has_bts) ds_init_intel(c); + +#ifdef CONFIG_X86_NUMAQ + numaq_tsc_disable(); +#endif } static unsigned int __cpuinit intel_size_cache(struct cpuinfo_x86 *c, unsigned int size) diff --git a/arch/x86/kernel/numaq_32.c b/arch/x86/kernel/numaq_32.c index f0f1de1c4a1d..5b20a5e7ac28 100644 --- a/arch/x86/kernel/numaq_32.c +++ b/arch/x86/kernel/numaq_32.c @@ -93,12 +93,13 @@ int __init get_memcfg_numaq(void) return 1; } -static int __init numaq_tsc_disable(void) +void __init numaq_tsc_disable(void) { + if (!found_numaq) + return; + if (num_online_nodes() > 1) { printk(KERN_DEBUG "NUMAQ: disabling TSC\n"); setup_clear_cpu_cap(X86_FEATURE_TSC); } - return 0; } -arch_initcall(numaq_tsc_disable); diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 987b6fde3a99..36c540d4ac4b 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -849,6 +849,14 @@ void __init setup_arch(char **cmdline_p) init_cpu_to_node(); #endif +#ifdef CONFIG_X86_NUMAQ + /* + * Need to check the number of online nodes, so call this + * here, before time_init()/tsc_init(). + */ + numaq_tsc_disable(); +#endif + init_apic_mappings(); ioapic_init_mappings(); diff --git a/include/asm-x86/numaq.h b/include/asm-x86/numaq.h index ef068d2465d6..34b92d581fa3 100644 --- a/include/asm-x86/numaq.h +++ b/include/asm-x86/numaq.h @@ -157,6 +157,8 @@ struct sys_cfg_data { struct eachquadmem eq[MAX_NUMNODES]; /* indexed by quad id */ }; +void numaq_tsc_disable(void); + #else static inline int get_memcfg_numaq(void) { -- cgit v1.2.3 From ce8b06b985ae48f9425de6e4641e77cb3613ef00 Mon Sep 17 00:00:00 2001 From: Maciej W. Rozycki Date: Sun, 13 Jul 2008 03:29:42 +0100 Subject: x86: I/O APIC: remove an IRQ2-mask hack Now that IRQ2 is never made available to the I/O APIC, there is no need to special-case it and mask it as a workaround for broken systems. Actually, because of the former, mask_IO_APIC_irq(2) is a no-op already. Signed-off-by: Maciej W. Rozycki Cc: "Rafael J.
Wysocki" Cc: Matthew Garrett Cc: Andreas Herrmann Cc: Stephen Rothwell Signed-off-by: Ingo Molnar --- arch/x86/kernel/acpi/boot.c | 1 - arch/x86/kernel/io_apic_32.c | 10 ---------- arch/x86/kernel/io_apic_64.c | 10 ---------- include/asm-x86/genapic_32.h | 5 ----- include/asm-x86/genapic_64.h | 6 ------ 5 files changed, 32 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 785700a08e9d..f489d7a9be92 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -1409,7 +1409,6 @@ static int __init dmi_ignore_irq0_timer_override(const struct dmi_system_id *d) { pr_notice("%s detected: Ignoring BIOS IRQ0 pin2 override\n", d->ident); acpi_skip_timer_override = 1; - force_mask_ioapic_irq_2(); return 0; } diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c index c50adb84ea6f..603261a5885c 100644 --- a/arch/x86/kernel/io_apic_32.c +++ b/arch/x86/kernel/io_apic_32.c @@ -59,13 +59,6 @@ static struct { int pin, apic; } ioapic_i8259 = { -1, -1 }; static DEFINE_SPINLOCK(ioapic_lock); static DEFINE_SPINLOCK(vector_lock); -static bool mask_ioapic_irq_2 __initdata; - -void __init force_mask_ioapic_irq_2(void) -{ - mask_ioapic_irq_2 = true; -} - int timer_through_8259 __initdata; /* @@ -2187,9 +2180,6 @@ static inline void __init check_timer(void) printk(KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n", vector, apic1, pin1, apic2, pin2); - if (mask_ioapic_irq_2) - mask_IO_APIC_irq(2); - /* * Some BIOS writers are clueless and report the ExtINTA * I/O APIC input from the cascaded 8259A as the timer diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c index 9e645cba11c4..b16ef029cf88 100644 --- a/arch/x86/kernel/io_apic_64.c +++ b/arch/x86/kernel/io_apic_64.c @@ -94,13 +94,6 @@ static int no_timer_check; static int disable_timer_pin_1 __initdata; -static bool mask_ioapic_irq_2 __initdata; - -void __init force_mask_ioapic_irq_2(void) -{ - mask_ioapic_irq_2 = true; -} - int timer_through_8259 __initdata; /* Where if anywhere is the i8259 connect in external int mode */ @@ -1706,9 +1699,6 @@ static inline void __init check_timer(void) apic_printk(APIC_VERBOSE,KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n", cfg->vector, apic1, pin1, apic2, pin2); - if (mask_ioapic_irq_2) - mask_IO_APIC_irq(2); - /* * Some BIOS writers are clueless and report the ExtINTA * I/O APIC input from the cascaded 8259A as the timer diff --git a/include/asm-x86/genapic_32.h b/include/asm-x86/genapic_32.h index 33a73f5ed222..b02ea6e17de8 100644 --- a/include/asm-x86/genapic_32.h +++ b/include/asm-x86/genapic_32.h @@ -119,10 +119,5 @@ enum uv_system_type {UV_NONE, UV_LEGACY_APIC, UV_X2APIC, UV_NON_UNIQUE_APIC}; #define is_uv_system() 0 #define uv_wakeup_secondary(a, b) 1 -#ifdef CONFIG_X86_IO_APIC -extern void force_mask_ioapic_irq_2(void); -#else -static inline void force_mask_ioapic_irq_2(void) { } -#endif #endif diff --git a/include/asm-x86/genapic_64.h b/include/asm-x86/genapic_64.h index 647e4e5c2580..0f8504627c41 100644 --- a/include/asm-x86/genapic_64.h +++ b/include/asm-x86/genapic_64.h @@ -46,10 +46,4 @@ extern int uv_wakeup_secondary(int phys_apicid, unsigned int start_rip); extern void setup_apic_routing(void); -#ifdef CONFIG_X86_IO_APIC -extern void force_mask_ioapic_irq_2(void); -#else -static inline void force_mask_ioapic_irq_2(void) { } -#endif - #endif -- cgit v1.2.3 From 11369f356b66d363a615fde2c5526962f7683674 Mon Sep 17 00:00:00 2001 From: Mike Travis Date: 
Tue, 8 Jul 2008 14:35:21 -0700 Subject: x86: change _node_to_cpumask_ptr to return const ptr * Strengthen the return type of _node_to_cpumask_ptr to be a const pointer. This adds compiler checking to ensure that node_to_cpumask_map[] is not changed inadvertently. Signed-off-by: Mike Travis Cc: "akpm@linux-foundation.org" Cc: Yinghai Lu Acked-by: Vegard Nossum Signed-off-by: Ingo Molnar --- arch/x86/kernel/setup_percpu.c | 8 ++++---- include/asm-generic/topology.h | 3 ++- include/asm-x86/topology.h | 10 +++++----- 3 files changed, 11 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 5fc310f746fc..cac68430d31f 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -343,23 +343,23 @@ static const cpumask_t cpu_mask_none; /* * Returns a pointer to the bitmask of CPUs on Node 'node'. */ -cpumask_t *_node_to_cpumask_ptr(int node) +const cpumask_t *_node_to_cpumask_ptr(int node) { if (node_to_cpumask_map == NULL) { printk(KERN_WARNING "_node_to_cpumask_ptr(%d): no node_to_cpumask_map!\n", node); dump_stack(); - return &cpu_online_map; + return (const cpumask_t *)&cpu_online_map; } if (node >= nr_node_ids) { printk(KERN_WARNING "_node_to_cpumask_ptr(%d): node > nr_node_ids(%d)\n", node, nr_node_ids); dump_stack(); - return (cpumask_t *)&cpu_mask_none; + return &cpu_mask_none; } - return (cpumask_t *)&node_to_cpumask_map[node]; + return &node_to_cpumask_map[node]; } EXPORT_SYMBOL(_node_to_cpumask_ptr); diff --git a/include/asm-generic/topology.h b/include/asm-generic/topology.h index a6aea79bca4f..54bbf6e04ee8 100644 --- a/include/asm-generic/topology.h +++ b/include/asm-generic/topology.h @@ -60,7 +60,8 @@ #ifndef node_to_cpumask_ptr #define node_to_cpumask_ptr(v, node) \ - cpumask_t _##v = node_to_cpumask(node), *v = &_##v + cpumask_t _##v = node_to_cpumask(node); \ + const cpumask_t *v = &_##v #define node_to_cpumask_ptr_next(v, node) \ _##v = node_to_cpumask(node) diff --git a/include/asm-x86/topology.h b/include/asm-x86/topology.h index 98e5f17ea856..90ac7718469a 100644 --- a/include/asm-x86/topology.h +++ b/include/asm-x86/topology.h @@ -82,7 +82,7 @@ DECLARE_EARLY_PER_CPU(int, x86_cpu_to_node_map); #ifdef CONFIG_DEBUG_PER_CPU_MAPS extern int cpu_to_node(int cpu); extern int early_cpu_to_node(int cpu); -extern cpumask_t *_node_to_cpumask_ptr(int node); +extern const cpumask_t *_node_to_cpumask_ptr(int node); extern cpumask_t node_to_cpumask(int node); #else /* !CONFIG_DEBUG_PER_CPU_MAPS */ @@ -103,7 +103,7 @@ static inline int early_cpu_to_node(int cpu) } /* Returns a pointer to the cpumask of CPUs on Node 'node'.
*/ -static inline cpumask_t *_node_to_cpumask_ptr(int node) +static inline const cpumask_t *_node_to_cpumask_ptr(int node) { return &node_to_cpumask_map[node]; } @@ -118,7 +118,7 @@ static inline cpumask_t node_to_cpumask(int node) /* Replace default node_to_cpumask_ptr with optimized version */ #define node_to_cpumask_ptr(v, node) \ - cpumask_t *v = _node_to_cpumask_ptr(node) + const cpumask_t *v = _node_to_cpumask_ptr(node) #define node_to_cpumask_ptr_next(v, node) \ v = _node_to_cpumask_ptr(node) @@ -186,7 +186,7 @@ extern int __node_distance(int, int); #define cpu_to_node(cpu) 0 #define early_cpu_to_node(cpu) 0 -static inline cpumask_t *_node_to_cpumask_ptr(int node) +static inline const cpumask_t *_node_to_cpumask_ptr(int node) { return &cpu_online_map; } @@ -201,7 +201,7 @@ static inline int node_to_first_cpu(int node) /* Replace default node_to_cpumask_ptr with optimized version */ #define node_to_cpumask_ptr(v, node) \ - cpumask_t *v = _node_to_cpumask_ptr(node) + const cpumask_t *v = _node_to_cpumask_ptr(node) #define node_to_cpumask_ptr_next(v, node) \ v = _node_to_cpumask_ptr(node) -- cgit v1.2.3 From 32b23e9a7331fce57eb0af52e19e8409fdef831b Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sun, 13 Jul 2008 14:29:41 -0700 Subject: x86: max_low_pfn_mapped fix #4 only add direct mapping for aperture Signed-off-by: Yinghai Lu Cc: Suresh Siddha Signed-off-by: Ingo Molnar --- arch/x86/kernel/pci-gart_64.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index a614ee10f846..c3fe78406d18 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -679,11 +679,8 @@ static __init int init_k8_gatt(struct agp_kern_info *info) /* need to map that range */ end_pfn = (aper_base>>PAGE_SHIFT) + (aper_size>>PAGE_SHIFT); if (end_pfn > max_low_pfn_mapped) { - start_pfn = max_low_pfn_mapped; - max_low_pfn_mapped = init_memory_mapping(start_pfn<>PAGE_SHIFT); + init_memory_mapping(start_pfn<] warn_on_slowpath+0x6c/0xa7 [] ? __slab_alloc+0x20a/0x3fb [] ? mpol_new+0x88/0x17d [] ? mcount_call+0x5/0x31 [] ? mcount_call+0x5/0x31 [] __ioremap_caller+0x8c/0x2f3 [] ? hpet_enable+0x39/0x241 [] ? mcount_call+0x5/0x31 [] ioremap_nocache+0x2a/0x40 [] hpet_enable+0x39/0x241 [] hpet_time_init+0x21/0x4e [] start_kernel+0x302/0x395 [] x86_64_start_reservations+0xb9/0xd4 [] ? x86_64_init_pda+0x39/0x4f [] x86_64_start_kernel+0xec/0x107 ---[ end trace a7919e7f17c0a725 ]--- it seems for amd system that is set later... try to move setting early in early_identify_cpu. and remove same code for intel and centaur. 
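For reference, a minimal sketch of the CPUID query being moved into early_identify_cpu(); the helper name is illustrative, and the field layout follows the CPUID specification for extended leaf 0x80000008 (EAX[7:0] = physical address bits, EAX[15:8] = virtual address bits):

	/* Read the physical/virtual address widths, if the leaf exists. */
	static void __cpuinit detect_address_bits(struct cpuinfo_x86 *c)
	{
		u32 eax;

		if (c->extended_cpuid_level < 0x80000008)
			return;	/* leaf not implemented; keep defaults */

		eax = cpuid_eax(0x80000008);
		c->x86_phys_bits = eax & 0xff;		/* EAX[7:0] */
		c->x86_virt_bits = (eax >> 8) & 0xff;	/* EAX[15:8] */
	}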
Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/centaur_64.c | 10 ---------- arch/x86/kernel/cpu/common_64.c | 14 ++++++++------ arch/x86/kernel/cpu/intel_64.c | 10 ---------- 3 files changed, 8 insertions(+), 26 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/centaur_64.c b/arch/x86/kernel/cpu/centaur_64.c index 2026d2119cdb..1d181c40e2e1 100644 --- a/arch/x86/kernel/cpu/centaur_64.c +++ b/arch/x86/kernel/cpu/centaur_64.c @@ -16,16 +16,6 @@ static void __cpuinit early_init_centaur(struct cpuinfo_x86 *c) static void __cpuinit init_centaur(struct cpuinfo_x86 *c) { - /* Cache sizes */ - unsigned n; - - n = c->extended_cpuid_level; - if (n >= 0x80000008) { - unsigned eax = cpuid_eax(0x80000008); - c->x86_virt_bits = (eax >> 8) & 0xff; - c->x86_phys_bits = eax & 0xff; - } - if (c->x86 == 0x6 && c->x86_model >= 0xf) { c->x86_cache_alignment = c->x86_clflush_size * 2; set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); diff --git a/arch/x86/kernel/cpu/common_64.c b/arch/x86/kernel/cpu/common_64.c index 36537ab9e56a..7b8cc72feb40 100644 --- a/arch/x86/kernel/cpu/common_64.c +++ b/arch/x86/kernel/cpu/common_64.c @@ -98,7 +98,7 @@ int __cpuinit get_model_name(struct cpuinfo_x86 *c) void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c) { - unsigned int n, dummy, eax, ebx, ecx, edx; + unsigned int n, dummy, ebx, ecx, edx; n = c->extended_cpuid_level; @@ -121,11 +121,6 @@ void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c) printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n", c->x86_cache_size, ecx & 0xFF); } - if (n >= 0x80000008) { - cpuid(0x80000008, &eax, &dummy, &dummy, &dummy); - c->x86_virt_bits = (eax >> 8) & 0xff; - c->x86_phys_bits = eax & 0xff; - } } void __cpuinit detect_ht(struct cpuinfo_x86 *c) @@ -314,6 +309,13 @@ static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c) if (c->extended_cpuid_level >= 0x80000007) c->x86_power = cpuid_edx(0x80000007); + if (c->extended_cpuid_level >= 0x80000008) { + u32 eax = cpuid_eax(0x80000008); + + c->x86_virt_bits = (eax >> 8) & 0xff; + c->x86_phys_bits = eax & 0xff; + } + /* Assume all 64-bit CPUs support 32-bit syscall */ set_cpu_cap(c, X86_FEATURE_SYSCALL32); diff --git a/arch/x86/kernel/cpu/intel_64.c b/arch/x86/kernel/cpu/intel_64.c index 02f773399e39..1019c58d39f0 100644 --- a/arch/x86/kernel/cpu/intel_64.c +++ b/arch/x86/kernel/cpu/intel_64.c @@ -54,9 +54,6 @@ static void __cpuinit srat_detect_node(void) static void __cpuinit init_intel(struct cpuinfo_x86 *c) { - /* Cache sizes */ - unsigned n; - init_intel_cacheinfo(c); if (c->cpuid_level > 9) { unsigned eax = cpuid_eax(10); @@ -78,13 +75,6 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c) if (cpu_has_bts) ds_init_intel(c); - n = c->extended_cpuid_level; - if (n >= 0x80000008) { - unsigned eax = cpuid_eax(0x80000008); - c->x86_virt_bits = (eax >> 8) & 0xff; - c->x86_phys_bits = eax & 0xff; - } - if (c->x86 == 15) c->x86_cache_alignment = c->x86_clflush_size * 2; if (c->x86 == 6) -- cgit v1.2.3 From 2387ce57a8167490d3b34a7e1ffa9a64a1a76244 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sun, 13 Jul 2008 14:50:56 -0700 Subject: x86: make 64bit hpet_set_mapping to use ioremap too, v2 keep the one for VSYSCALL_HPET Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/hpet.c | 20 ++++---------------- include/asm-x86/fixmap_64.h | 1 - 2 files changed, 4 insertions(+), 17 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index ea230ec69057..0ea6a19bfdfe 100644 --- 
a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -36,26 +36,15 @@ static inline void hpet_writel(unsigned long d, unsigned long a) } #ifdef CONFIG_X86_64 - #include - -static inline void hpet_set_mapping(void) -{ - set_fixmap_nocache(FIX_HPET_BASE, hpet_address); - __set_fixmap(VSYSCALL_HPET, hpet_address, PAGE_KERNEL_VSYSCALL_NOCACHE); - hpet_virt_address = (void __iomem *)fix_to_virt(FIX_HPET_BASE); -} - -static inline void hpet_clear_mapping(void) -{ - hpet_virt_address = NULL; -} - -#else +#endif static inline void hpet_set_mapping(void) { hpet_virt_address = ioremap_nocache(hpet_address, HPET_MMAP_SIZE); +#ifdef CONFIG_X86_64 + __set_fixmap(VSYSCALL_HPET, hpet_address, PAGE_KERNEL_VSYSCALL_NOCACHE); +#endif } static inline void hpet_clear_mapping(void) @@ -63,7 +52,6 @@ static inline void hpet_clear_mapping(void) iounmap(hpet_virt_address); hpet_virt_address = NULL; } -#endif /* * HPET command line enable / disable diff --git a/include/asm-x86/fixmap_64.h b/include/asm-x86/fixmap_64.h index 6a4789d57e6c..00f3d74a0524 100644 --- a/include/asm-x86/fixmap_64.h +++ b/include/asm-x86/fixmap_64.h @@ -40,7 +40,6 @@ enum fixed_addresses { VSYSCALL_HPET, FIX_DBGP_BASE, FIX_EARLYCON_MEM_BASE, - FIX_HPET_BASE, FIX_APIC_BASE, /* local (CPU) APIC) -- required for SMP or not */ FIX_IO_APIC_BASE_0, FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS - 1, -- cgit v1.2.3 From 1c776bf87c855a6e823e13c3667f0cf7c14635bd Mon Sep 17 00:00:00 2001 From: Krzysztof Oledzki Date: Wed, 4 Jun 2008 03:40:17 +0200 Subject: x86: add another PCI ID for ICH6 force-hpet Tested on Asus P5GDC-V $ lspci -n -n |grep ISA 00:1f.0 ISA bridge [0601]: Intel Corporation 82801FB/FR (ICH6/ICH6R) LPC Interface Bridge [8086:2640] (rev 03) Force enabled HPET at base address 0xfed00000 hpet clockevent registered hpet0: at MMIO 0xfed00000, IRQs 2, 8, 0 hpet0: 3 64-bit timers, 14318180 Hz Signed-off-by: Krzysztof Piotr Oledzki Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/quirks.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c index d89a648fe710..06e1fd6be835 100644 --- a/arch/x86/kernel/quirks.c +++ b/arch/x86/kernel/quirks.c @@ -158,6 +158,8 @@ static void ich_force_enable_hpet(struct pci_dev *dev) DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ESB2_0, ich_force_enable_hpet); +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH6_0, + ich_force_enable_hpet); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH6_1, ich_force_enable_hpet); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ICH7_0, -- cgit v1.2.3 From 4c2a997c34c0aa952ba9c247b0c2043526054919 Mon Sep 17 00:00:00 2001 From: Joe Buehler Date: Mon, 9 Jun 2008 08:55:20 -0400 Subject: x86: add PCI ID for 6300ESB force hpet 00:1f.0 ISA bridge: Intel Corporation 6300ESB LPC Interface Controller (rev 02) 00:1f.0 Class 0601: 8086:25a1 (rev 02) kernel: pci 0000:00:1f.0: Force enabled HPET at 0xfed00000 kernel: hpet clockevent registered kernel: hpet0: at MMIO 0xfed00000, IRQs 2, 8, 0 kernel: hpet0: 3 64-bit timers, 14318180 Hz Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/quirks.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c index 06e1fd6be835..f327abafe3e6 100644 --- a/arch/x86/kernel/quirks.c +++ b/arch/x86/kernel/quirks.c @@ -257,6 +257,8 @@ static void 
old_ich_force_enable_hpet_user(struct pci_dev *dev) old_ich_force_enable_hpet(dev); } +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_ESB_1, + old_ich_force_enable_hpet_user); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801CA_0, old_ich_force_enable_hpet_user); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82801CA_12, old_ich_force_enable_hpet_user); -- cgit v1.2.3 From 3bf2e77453a87c22eb57ed4926760ac131c84459 Mon Sep 17 00:00:00 2001 From: H. Peter Anvin Date: Sun, 13 Jul 2008 21:18:02 -0700 Subject: x86, suspend, acpi: enter Big Real Mode The explanation for recent video BIOS suspend quirk failures is that the VESA BIOS expects to be entered in Big Real Mode (*.limit = 0xffffffff) instead of ordinary Real Mode (*.limit = 0xffff). This patch changes the segment descriptors to Big Real Mode instead. The segment descriptor registers (what Intel calls the "segment cache") are always active. The only thing that changes based on CR0.PE is how they are *loaded* and the interpretation of the CS flags. The segment descriptor registers consist of the following sub-registers: selector (the "visible" part), base, limit and flags. In protected mode or long mode, they are loaded from descriptors (or fs.base or gs.base can be manipulated directly in long mode.) In real mode, the only thing changed by a segment register load is the selector and the base, where the base <- selector << 4. In particular, *the limit and the flags are not changed*. As for the handling of the CS flags: a code segment cannot be writable in protected mode, whereas it is "just another segment" in real mode, so there is some kind of quirk that kicks in for this when CR0.PE <- 0. I'm not sure if this is accomplished by actually changing the cs.flags register or just changing the interpretation; it might be something that is CPU-specific. In particular, the Transmeta CPUs had an explicit "CS is writable if you're in real mode" override, so even if you had loaded CS with an execute-only segment it'd be writable (but not readable!) on return to real mode. I'm not at all sure if that is how other CPUs behave. Signed-off-by: "H. Peter Anvin" Signed-off-by: Ingo Molnar --- arch/x86/kernel/acpi/sleep.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index 36af01f029ed..130711f1454b 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -23,6 +23,15 @@ static unsigned long acpi_realmode; static char temp_stack[10240]; #endif +/* XXX: this macro should move to asm-x86/segment.h and be shared with the boot code...
*/ +#define GDT_ENTRY(flags, base, limit) \ + (((u64)(base & 0xff000000) << 32) | \ + ((u64)flags << 40) | \ + ((u64)(limit & 0x00ff0000) << 32) | \ + ((u64)(base & 0x00ffffff) << 16) | \ + ((u64)(limit & 0x0000ffff))) + /** * acpi_save_state_mem - save kernel state * @@ -58,11 +67,11 @@ int acpi_save_state_mem(void) ((char *)&header->wakeup_gdt - (char *)acpi_realmode)) << 16); /* GDT[1]: real-mode-like code segment */ - header->wakeup_gdt[1] = (0x009bULL << 40) + - ((u64)acpi_wakeup_address << 16) + 0xffff; + header->wakeup_gdt[1] = + GDT_ENTRY(0x809b, acpi_wakeup_address, 0xfffff); /* GDT[2]: real-mode-like data segment */ - header->wakeup_gdt[2] = (0x0093ULL << 40) + - ((u64)acpi_wakeup_address << 16) + 0xffff; + header->wakeup_gdt[2] = + GDT_ENTRY(0x8093, acpi_wakeup_address, 0xfffff); #ifndef CONFIG_64BIT store_gdt((struct desc_ptr *)&header->pmode_gdt); -- cgit v1.2.3 From 065cb3dfe24978651caedfa54da585388ad15dde Mon Sep 17 00:00:00 2001 From: H. Peter Anvin Date: Mon, 14 Jul 2008 11:44:26 -0700 Subject: x86, suspend, acpi: correct and add comments about Big Real Mode Explain that we set up the descriptors for Big Real Mode, and why we do so. In particular, one system that is known to fail without it is the Lenovo X61. Signed-off-by: H. Peter Anvin --- arch/x86/kernel/acpi/sleep.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index 130711f1454b..41bb130c31db 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -60,16 +60,25 @@ int acpi_save_state_mem(void) header->video_mode = saved_video_mode; header->wakeup_jmp_seg = acpi_wakeup_address >> 4; + + /* + * Set up the wakeup GDT. We set these up as Big Real Mode, + * that is, with limits set to 4 GB. At least the Lenovo + * Thinkpad X61 is known to need this for the video BIOS + * initialization quirk to work; this is likely to also + * be the case for other laptops or integrated video devices. + */ + /* GDT[0]: GDT self-pointer */ header->wakeup_gdt[0] = (u64)(sizeof(header->wakeup_gdt) - 1) + ((u64)(acpi_wakeup_address + ((char *)&header->wakeup_gdt - (char *)acpi_realmode)) << 16); - /* GDT[1]: real-mode-like code segment */ + /* GDT[1]: big real mode-like code segment */ header->wakeup_gdt[1] = GDT_ENTRY(0x809b, acpi_wakeup_address, 0xfffff); - /* GDT[2]: real-mode-like data segment */ + /* GDT[2]: big real mode-like data segment */ header->wakeup_gdt[2] = GDT_ENTRY(0x8093, acpi_wakeup_address, 0xfffff); -- cgit v1.2.3 From 7daf705f362e349983e92037a198b8821db198af Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 14 Jul 2008 12:12:53 -0700 Subject: Start using the new '%pS' infrastructure to print symbols This simplifies the code significantly, and was the whole point of the exercise. 
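As a usage sketch mirroring the conversion below: where the old code had to call kallsyms_lookup() and assemble the module/symbol/offset string by hand, a single printk() format now suffices, since %pS resolves a text address to "symbol+0xoff/0xsize [module]" (the wrapper function here is illustrative only):

	/* Print one backtrace entry; "? " marks an unreliable address. */
	static void print_frame(unsigned long addr, int reliable)
	{
		printk(" [<%016lx>] %s%pS\n",
		       addr, reliable ? "" : "? ", (void *)addr);
	}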
Signed-off-by: Linus Torvalds --- arch/x86/kernel/traps_64.c | 25 +------------------------ mm/slub.c | 5 ++--- 2 files changed, 3 insertions(+), 27 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/traps_64.c b/arch/x86/kernel/traps_64.c index adff76ea97c4..f1a95d105953 100644 --- a/arch/x86/kernel/traps_64.c +++ b/arch/x86/kernel/traps_64.c @@ -104,30 +104,7 @@ int kstack_depth_to_print = 12; void printk_address(unsigned long address, int reliable) { -#ifdef CONFIG_KALLSYMS - unsigned long offset = 0, symsize; - const char *symname; - char *modname; - char *delim = ":"; - char namebuf[KSYM_NAME_LEN]; - char reliab[4] = ""; - - symname = kallsyms_lookup(address, &symsize, &offset, - &modname, namebuf); - if (!symname) { - printk(" [<%016lx>]\n", address); - return; - } - if (!reliable) - strcpy(reliab, "? "); - - if (!modname) - modname = delim = ""; - printk(" [<%016lx>] %s%s%s%s%s+0x%lx/0x%lx\n", - address, reliab, delim, modname, delim, symname, offset, symsize); -#else - printk(" [<%016lx>]\n", address); -#endif + printk(" [<%016lx>] %s%pS\n", address, reliable ? "": "? ", (void *) address); } static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, diff --git a/mm/slub.c b/mm/slub.c index 315c392253c7..5f6e2c4a2ba7 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -431,9 +431,8 @@ static void print_track(const char *s, struct track *t) if (!t->addr) return; - printk(KERN_ERR "INFO: %s in ", s); - __print_symbol("%s", (unsigned long)t->addr); - printk(" age=%lu cpu=%u pid=%d\n", jiffies - t->when, t->cpu, t->pid); + printk(KERN_ERR "INFO: %s in %pS age=%lu cpu=%u pid=%d\n", + s, t->addr, jiffies - t->when, t->cpu, t->pid); } static void print_tracking(struct kmem_cache *s, void *object) -- cgit v1.2.3 From beef3129b3afb74817acff72fda4a9d951e3973e Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 11 Jul 2008 15:21:17 -0600 Subject: x86/PCI: Fix PCI config space for domains > 0 John Keller reports that PCI config space access is broken on machines with more than one domain. conf1 accesses only work for domain 0, so make sure we check the domain number in the raw routines before trying conf1. 
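A short aside on why conf1 cannot reach other domains: the legacy type-1 mechanism addresses configuration space through a single 32-bit dword written to I/O port 0xCF8, and that dword has no field for a domain (segment) number. A hedged userspace sketch of the encoding (field layout per the PCI spec; the function name is invented):

#include <stdint.h>

/* Build the CONFIG_ADDRESS dword written to port 0xCF8. */
static inline uint32_t conf1_address(unsigned int bus, unsigned int devfn,
				     unsigned int reg)
{
	return 0x80000000u	/* bit 31: enable configuration cycle */
		| (bus << 16)	/* bits 23-16: bus, 0-255 */
		| (devfn << 8)	/* bits 15-8: device and function */
		| (reg & 0xfc);	/* bits 7-2: dword-aligned register */
}

Bits 30-24 are reserved and everything below bit 24 is spoken for, so only domain 0 is addressable this way; other domains must go through the extended config ops.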
Reported-by: John Keller Signed-off-by: Matthew Wilcox Signed-off-by: Jesse Barnes --- arch/x86/pci/common.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c index d19fd07bafd6..86aff81a0829 100644 --- a/arch/x86/pci/common.c +++ b/arch/x86/pci/common.c @@ -32,7 +32,7 @@ struct pci_raw_ops *raw_pci_ext_ops; int raw_pci_read(unsigned int domain, unsigned int bus, unsigned int devfn, int reg, int len, u32 *val) { - if (reg < 256 && raw_pci_ops) + if (domain == 0 && reg < 256 && raw_pci_ops) return raw_pci_ops->read(domain, bus, devfn, reg, len, val); if (raw_pci_ext_ops) return raw_pci_ext_ops->read(domain, bus, devfn, reg, len, val); @@ -42,7 +42,7 @@ int raw_pci_read(unsigned int domain, unsigned int bus, unsigned int devfn, int raw_pci_write(unsigned int domain, unsigned int bus, unsigned int devfn, int reg, int len, u32 val) { - if (reg < 256 && raw_pci_ops) + if (domain == 0 && reg < 256 && raw_pci_ops) return raw_pci_ops->write(domain, bus, devfn, reg, len, val); if (raw_pci_ext_ops) return raw_pci_ext_ops->write(domain, bus, devfn, reg, len, val); -- cgit v1.2.3 From 116a9fb3ed98c19d1ee0c6c55971f5b753949393 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 14 Jul 2008 15:03:25 -0700 Subject: x86: MMIOTRACE should not default to on Even the help-text makes it clear that normal people shouldn't enable it. Signed-off-by: Linus Torvalds --- arch/x86/Kconfig.debug | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 5236621350bc..ae36bfa814e5 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -179,7 +179,6 @@ config MMIOTRACE depends on DEBUG_KERNEL && PCI select TRACING select MMIOTRACE_HOOKS - default y help Mmiotrace traces Memory Mapped I/O access and is meant for debugging and reverse engineering. It is called from the ioremap -- cgit v1.2.3 From aba3728ce2e8ce85e1e5f6b275131e9332256789 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 15 Jul 2008 14:48:48 +0200 Subject: x86: sanitize Kconfig Set default n for MEMTEST and MTRR_SANITIZER and fix the help texts. Signed-off-by: Thomas Gleixner --- arch/x86/Kconfig | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 6958d6bcaf70..2642b4bf41b9 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -447,7 +447,6 @@ config PARAVIRT_DEBUG config MEMTEST bool "Memtest" depends on X86_64 - default y help This option adds a kernel parameter 'memtest', which allows memtest to be set. @@ -455,7 +454,7 @@ config MEMTEST memtest=1, mean do 1 test pattern; ... memtest=4, mean do 4 test patterns. - If you are unsure how to answer this question, answer Y. + If you are unsure how to answer this question, answer N. config X86_SUMMIT_NUMA def_bool y @@ -1135,21 +1134,18 @@ config MTRR See for more information. config MTRR_SANITIZER - def_bool y + bool prompt "MTRR cleanup support" depends on MTRR help - Convert MTRR layout from continuous to discrete, so some X driver - could add WB entries. + Convert MTRR layout from continuous to discrete, so X drivers can + add writeback entries. - Say N here if you see bootup problems (boot crash, boot hang, - spontaneous reboots). + Can be disabled with disable_mtrr_cleanup on the kernel command line. + The largest mtrr entry size for a continuous block can be set with + mtrr_chunk_size.
- Could be disabled with disable_mtrr_cleanup. Also mtrr_chunk_size - could be used to send largest mtrr entry size for continuous block - to hold holes (aka. UC entries) - - If unsure, say Y. + If unsure, say N. config MTRR_SANITIZER_ENABLE_DEFAULT int "MTRR cleanup enable value (0-1)" @@ -1166,7 +1162,7 @@ config MTRR_SANITIZER_SPARE_REG_NR_DEFAULT depends on MTRR_SANITIZER help mtrr cleanup spare entries default, it can be changed via - mtrr_spare_reg_nr= + mtrr_spare_reg_nr=N on the kernel command line. config X86_PAT bool -- cgit v1.2.3 From b3c9816b9fa9a7b75ab36111eb76eca03e5bab78 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 15 Jul 2008 22:03:56 +0200 Subject: generic-ipi: merge fix fix merge fallout: arch/x86/pci/amd_bus.c: In function ‘enable_pci_io_ecs’: arch/x86/pci/amd_bus.c:581: error: too many arguments to function ‘on_each_cpu’ Signed-off-by: Ingo Molnar --- arch/x86/pci/amd_bus.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c index a18141ae3f02..dbf532369711 100644 --- a/arch/x86/pci/amd_bus.c +++ b/arch/x86/pci/amd_bus.c @@ -578,7 +578,7 @@ static int __init enable_pci_io_ecs(void) /* assume all cpus from fam10h have IO ECS */ if (boot_cpu_data.x86 < 0x10) return 0; - on_each_cpu(enable_pci_io_ecs_per_cpu, NULL, 1, 1); + on_each_cpu(enable_pci_io_ecs_per_cpu, NULL, 1); pci_probe |= PCI_HAS_IO_ECS; return 0; } -- cgit v1.2.3 From 431ceb83f703a343bdd14350480a2224fa4bfedf Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 15 Jul 2008 22:08:04 +0200 Subject: x86: fix TSC build error on 32bit Dave Hansen reported a build error on 32bit which went unnoticed as newer gcc versions seem to optimize unused static functions away before compiling them. Make vread_tsc() depend on CONFIG_X86_64. Signed-off-by: Thomas Gleixner --- arch/x86/kernel/tsc.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 3c36f92160c9..7603c0553909 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -358,6 +358,7 @@ static cycle_t read_tsc(void) ret : clocksource_tsc.cycle_last; } +#ifdef CONFIG_X86_64 static cycle_t __vsyscall_fn vread_tsc(void) { cycle_t ret = (cycle_t)vget_cycles(); @@ -365,6 +366,7 @@ static cycle_t __vsyscall_fn vread_tsc(void) return ret >= __vsyscall_gtod_data.clock.cycle_last ? ret : __vsyscall_gtod_data.clock.cycle_last; } +#endif static struct clocksource clocksource_tsc = { .name = "tsc", -- cgit v1.2.3 From 809d9a8f93bd8504dcc34b16bbfdfd1a8c9bb1ed Mon Sep 17 00:00:00 2001 From: Alok Kataria Date: Tue, 15 Jul 2008 11:59:42 -0700 Subject: x86/PCI: ACPI based PCI gap calculation Using ACPI to find free address space allows us to find a gap for the unallocated PCI resources or MMIO resources for hotplug devices within the BIOS-allowed PCI regions. It works by evaluating the _CRS object under PCI0 looking for producer resources. It then searches the e820 memory space for a gap within these producer resources.
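The chosen gap start is then aligned using the "Fun with two's complement" trick visible in the pci_setup_gap() hunk below. A standalone sketch of that rounding, with invented example values:

#include <stdio.h>

int main(void)
{
	unsigned long gapstart = 0xbff00000;	/* example gap, not real data */
	unsigned long gapsize = 0x4000000;
	unsigned long round = 0x100000;

	/* Grow the alignment with the gap size; doubling keeps round
	 * a power of two. */
	while ((gapsize >> 4) > round)
		round += round;

	/* For a power of two, -round == ~(round - 1), so the AND
	 * clears the low bits and aligns to a round-byte boundary. */
	unsigned long start = (gapstart + round) & -round;

	printf("pci_mem_start would be %#lx\n", start);	/* 0xc0000000 here */
	return 0;
}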
Signed-off-by: Alok N Kataria Cc: Andi Kleen Cc: Len Brown Cc: Ingo Molnar Signed-off-by: Jesse Barnes --- arch/x86/pci/acpi.c | 80 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 80 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index d95de2f199cd..d1ffb5709174 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -4,6 +4,7 @@ #include #include #include +#include #include "pci.h" struct pci_root_info { @@ -14,6 +15,11 @@ struct pci_root_info { int busnum; }; +struct gap_info { + unsigned long gapstart; + unsigned long gapsize; +}; + static acpi_status resource_to_addr(struct acpi_resource *resource, struct acpi_resource_address64 *addr) @@ -110,6 +116,78 @@ adjust_transparent_bridge_resources(struct pci_bus *bus) } } +static acpi_status search_gap(struct acpi_resource *resource, void *data) +{ + struct acpi_resource_address64 addr; + acpi_status status; + struct gap_info *gap = data; + unsigned long long start_addr, end_addr; + + status = resource_to_addr(resource, &addr); + if (ACPI_SUCCESS(status) && + addr.resource_type == ACPI_MEMORY_RANGE && + addr.address_length > gap->gapsize) { + start_addr = addr.minimum + addr.translation_offset; + /* + * We want space only in the 32bit address range + */ + if (start_addr < UINT_MAX) { + end_addr = start_addr + addr.address_length; + e820_search_gap(&gap->gapstart, &gap->gapsize, + start_addr, end_addr); + } + } + + return AE_OK; + } + +/* + * Search for a hole in the 32 bit address space for PCI to assign MMIO + * resources, for hotplug or unconfigured resources. + * We query the CRS object of the PCI root device to look for possible producer + * resources in the tree and consider these while calculating the start address + * for this hole. + */ +static void pci_setup_gap(acpi_handle *handle) +{ + struct gap_info gap; + acpi_status status; + + gap.gapstart = 0; + gap.gapsize = 0x400000; + + status = acpi_walk_resources(handle, METHOD_NAME__CRS, + search_gap, &gap); + + if (ACPI_SUCCESS(status)) { + unsigned long round; + + if (!gap.gapstart) { + printk(KERN_ERR "ACPI: Warning: Cannot find a gap " - "in the 32bit address range for PCI\n" + "ACPI: PCI devices may collide with " + "hotpluggable memory address range\n"); + } + /* + * Round the gapstart, uses the same logic as in + * e820_gap_setup + */ + round = 0x100000; + while ((gap.gapsize >> 4) > round) + round += round; + /* Fun with two's complement */ + pci_mem_start = (gap.gapstart + round) & -round; + + printk(KERN_INFO "ACPI: PCI resources should " + "start at %lx (gap: %lx:%lx)\n", + pci_mem_start, gap.gapstart, gap.gapsize); + } else { + printk(KERN_ERR "ACPI: Error while searching for gap in " + "the 32bit address range for PCI\n"); + } +} + + static void get_current_resources(struct acpi_device *device, int busnum, int domain, struct pci_bus *bus) @@ -215,6 +293,8 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_device *device, int do if (bus && (pci_probe & PCI_USE__CRS)) get_current_resources(device, busnum, domain, bus); + + pci_setup_gap(device->handle); return bus; } -- cgit v1.2.3 From e22146e610bb7aed63282148740ab1d1b91e1d90 Mon Sep 17 00:00:00 2001 From: Jack Steiner Date: Wed, 16 Jul 2008 11:11:59 -0500 Subject: x86: fix kernel_physical_mapping_init() for large x86 systems Fix a bug in kernel_physical_mapping_init() that causes the kernel page table to be built incorrectly for systems with greater than 512GB of memory.
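The fix below masks next back to the pgd boundary. A hedged standalone sketch of the arithmetic (x86-64 constants; the sample start address is invented): without the mask, a start that is not pgd-aligned yields a next that is not a pgd boundary either, so later iterations cover the wrong ranges.

#include <stdio.h>
#include <stdint.h>

#define PGDIR_SHIFT 39	/* x86-64: one pgd entry covers 512 GB */
#define PGDIR_SIZE (1ULL << PGDIR_SHIFT)
#define PGDIR_MASK (~(PGDIR_SIZE - 1))

int main(void)
{
	uint64_t start = PGDIR_SIZE + 0x100000;	/* unaligned, above 512 GB */
	uint64_t next_old = start + PGDIR_SIZE;			/* buggy */
	uint64_t next_new = (start + PGDIR_SIZE) & PGDIR_MASK;	/* fixed */

	printf("old next = %#llx (not a pgd boundary)\n",
	       (unsigned long long)next_old);
	printf("new next = %#llx (pgd boundary)\n",
	       (unsigned long long)next_new);
	return 0;
}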
Signed-off-by: Jack Steiner Cc: linux-mm@kvack.org Signed-off-by: Ingo Molnar --- arch/x86/mm/init_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 27de2435e008..306049edd553 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -644,7 +644,7 @@ static unsigned long __init kernel_physical_mapping_init(unsigned long start, unsigned long pud_phys; pud_t *pud; - next = start + PGDIR_SIZE; + next = (start + PGDIR_SIZE) & PGDIR_MASK; if (next > end) next = end; -- cgit v1.2.3 From 19d0cfe9ddfdf7afa8d1765ab0bd2a7dd30e47c9 Mon Sep 17 00:00:00 2001 From: Bob Moore Date: Tue, 10 Jun 2008 15:54:40 +0800 Subject: ACPICA: Update DMAR and SRAT table definitions Synchronized tables with current specifications. Signed-off-by: Bob Moore Signed-off-by: Lin Ming Signed-off-by: Len Brown Signed-off-by: Andi Kleen --- arch/x86/mm/srat_32.c | 3 +-- drivers/acpi/numa.c | 4 ++-- include/acpi/actbl1.h | 19 +++++++++++++++++-- 3 files changed, 20 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/srat_32.c b/arch/x86/mm/srat_32.c index f41d67f8f831..1eb2973a301c 100644 --- a/arch/x86/mm/srat_32.c +++ b/arch/x86/mm/srat_32.c @@ -156,10 +156,9 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *memory_affinity) num_memory_chunks++; - printk(KERN_DEBUG "Memory range %08lx to %08lx (type %x)" + printk(KERN_DEBUG "Memory range %08lx to %08lx" " in proximity domain %02x %s\n", start_pfn, end_pfn, - memory_affinity->memory_type, pxm, ((memory_affinity->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) ? "enabled and removable" : "enabled" ) ); diff --git a/drivers/acpi/numa.c b/drivers/acpi/numa.c index 658e5f3abae0..cb9864e39bae 100644 --- a/drivers/acpi/numa.c +++ b/drivers/acpi/numa.c @@ -120,10 +120,10 @@ acpi_table_print_srat_entry(struct acpi_subtable_header *header) struct acpi_srat_mem_affinity *p = (struct acpi_srat_mem_affinity *)header; ACPI_DEBUG_PRINT((ACPI_DB_INFO, - "SRAT Memory (0x%lx length 0x%lx type 0x%x) in proximity domain %d %s%s\n", + "SRAT Memory (0x%lx length 0x%lx) in proximity domain %d %s%s\n", (unsigned long)p->base_address, (unsigned long)p->length, - p->memory_type, p->proximity_domain, + p->proximity_domain, (p->flags & ACPI_SRAT_MEM_ENABLED)? "enabled" : "disabled", (p->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE)? diff --git a/include/acpi/actbl1.h b/include/acpi/actbl1.h index 9af239bd1153..dfb0fb577d97 100644 --- a/include/acpi/actbl1.h +++ b/include/acpi/actbl1.h @@ -300,6 +300,7 @@ struct acpi_table_dbgp { /******************************************************************************* * * DMAR - DMA Remapping table + * From "Intel Virtualization Technology for Directed I/O", Sept. 
2007 * ******************************************************************************/ @@ -382,6 +383,20 @@ struct acpi_dmar_reserved_memory { #define ACPI_DMAR_ALLOW_ALL (1) + +/* 2: Root Port ATS Capability Reporting Structure */ + +struct acpi_dmar_atsr { + struct acpi_dmar_header header; + u8 flags; + u8 reserved; + u16 segment; +}; + +/* Flags */ + +#define ACPI_DMAR_ALL_PORTS (1) + /******************************************************************************* * * ECDT - Embedded Controller Boot Resources Table @@ -1156,9 +1171,9 @@ struct acpi_srat_mem_affinity { u16 reserved; /* Reserved, must be zero */ u64 base_address; u64 length; - u32 memory_type; /* See acpi_address_range_id */ + u32 reserved1; u32 flags; - u64 reserved1; /* Reserved, must be zero */ + u64 reserved2; /* Reserved, must be zero */ }; /* Flags */ -- cgit v1.2.3 From 5b53496a5ad79e91052f72761a7c5516b069bc99 Mon Sep 17 00:00:00 2001 From: Zhao Yakui Date: Tue, 17 Jun 2008 14:39:59 +0800 Subject: ACPI: Disable the C2C3_FFH access mode if HW has no MWAIT support 991528d7348667924176f3e29addea0675298944 (ACPI: Processor native C-states using MWAIT) started passing C2C3_FFH to _PDC to tell the BIOS that Linux supports MWAIT for deep C-states. However, we should first double-check with the hardware that it actually supports MWAIT before potentially exposing a BIOS bug of an MWAIT _CST on HW that doesn't support MWAIT. Signed-off-by: Zhao Yakui Signed-off-by: Li Shaohua Signed-off-by: Len Brown Signed-off-by: Andi Kleen --- arch/x86/kernel/acpi/processor.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/acpi/processor.c b/arch/x86/kernel/acpi/processor.c index de2d2e4ebad9..7c074eec39fb 100644 --- a/arch/x86/kernel/acpi/processor.c +++ b/arch/x86/kernel/acpi/processor.c @@ -56,6 +56,12 @@ static void init_intel_pdc(struct acpi_processor *pr, struct cpuinfo_x86 *c) if (cpu_has(c, X86_FEATURE_ACPI)) buf[2] |= ACPI_PDC_T_FFH; + /* + * If mwait/monitor is unsupported, C2/C3_FFH will be disabled + */ + if (!cpu_has(c, X86_FEATURE_MWAIT)) + buf[2] &= ~(ACPI_PDC_C_C2C3_FFH); + obj->type = ACPI_TYPE_BUFFER; obj->buffer.length = 12; obj->buffer.pointer = (u8 *) buf; -- cgit v1.2.3 From c1e3b377ad48febba6f91b8ae42c44ee4d4ab45e Mon Sep 17 00:00:00 2001 From: Zhao Yakui Date: Tue, 24 Jun 2008 17:58:53 +0800 Subject: ACPI: Create "idle=halt" bootparam "idle=halt" limits the idle loop to using the halt instruction. No MWAIT, no IO accesses, no C-states deeper than C1. If something is broken in the idle code, "idle=halt" is a less severe workaround than "idle=poll", which disables all power savings. Signed-off-by: Zhao Yakui Signed-off-by: Len Brown Signed-off-by: Andi Kleen --- Documentation/kernel-parameters.txt | 4 +++- arch/ia64/kernel/process.c | 2 ++ arch/x86/kernel/process.c | 17 ++++++++++++++++- drivers/acpi/processor_idle.c | 22 ++++++++++++++++++++++ include/asm-ia64/processor.h | 1 + include/asm-x86/processor.h | 1 + 6 files changed, 45 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 312fe77764a4..65db7f4711aa 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -818,7 +818,7 @@ and is between 256 and 4096 characters. It is defined in the file See Documentation/ide/ide.txt.
idle= [X86] - Format: idle=poll or idle=mwait + Format: idle=poll or idle=mwait, idle=halt Poll forces a polling idle loop that can slightly improves the performance of waking up a idle CPU, but will use a lot of power and make the system run hot. Not recommended. @@ -826,6 +826,8 @@ and is between 256 and 4096 characters. It is defined in the file to not use it because it doesn't save as much power as a normal idle loop use the MONITOR/MWAIT idle loop anyways. Performance should be the same as idle=poll. + idle=halt. Halt is forced to be used for CPU idle. + In such case C2/C3 won't be used again. ide-pci-generic.all-generic-ide [HW] (E)IDE subsystem Claim all unknown PCI IDE storage controllers. diff --git a/arch/ia64/kernel/process.c b/arch/ia64/kernel/process.c index fabaf08d9a69..612b3c4a0603 100644 --- a/arch/ia64/kernel/process.c +++ b/arch/ia64/kernel/process.c @@ -55,6 +55,8 @@ void (*ia64_mark_idle)(int); unsigned long boot_option_idle_override = 0; EXPORT_SYMBOL(boot_option_idle_override); +unsigned long idle_halt; +EXPORT_SYMBOL(idle_halt); void ia64_do_show_stack (struct unw_frame_info *info, void *arg) diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 7dceea947232..7fc729498760 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -7,6 +7,10 @@ #include #include #include +#include + +unsigned long idle_halt; +EXPORT_SYMBOL(idle_halt); struct kmem_cache *task_xstate_cachep; @@ -325,7 +329,18 @@ static int __init idle_setup(char *str) pm_idle = poll_idle; } else if (!strcmp(str, "mwait")) force_mwait = 1; - else + else if (!strcmp(str, "halt")) { + /* + * When the boot option of idle=halt is added, halt is + * forced to be used for CPU idle. In such case CPU C2/C3 + * won't be used again. + * To continue to load the CPU idle driver, don't touch + * the boot_option_idle_override. + */ + pm_idle = default_idle; + idle_halt = 1; + return 0; + } else return -1; boot_option_idle_override = 1; diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c index 0fc310e7dfd6..c75c7ace8c13 100644 --- a/drivers/acpi/processor_idle.c +++ b/drivers/acpi/processor_idle.c @@ -41,6 +41,7 @@ #include #include #include +#include /* * Include the apic definitions for x86 to have the APIC timer related defines @@ -57,6 +58,7 @@ #include #include +#include #define ACPI_PROCESSOR_COMPONENT 0x01000000 #define ACPI_PROCESSOR_CLASS "processor" @@ -955,6 +957,17 @@ static int acpi_processor_get_power_info_cst(struct acpi_processor *pr) } else { continue; } + if (cx.type == ACPI_STATE_C1 && idle_halt) { + /* + * In most cases the C1 space_id obtained from + * _CST object is FIXED_HARDWARE access mode. + * But when the option of idle=halt is added, + * the entry_method type should be changed from + * CSTATE_FFH to CSTATE_HALT. + */ + cx.entry_method = ACPI_CSTATE_HALT; + snprintf(cx.desc, ACPI_CX_DESC_LEN, "ACPI HLT"); + } } else { snprintf(cx.desc, ACPI_CX_DESC_LEN, "ACPI IOPORT 0x%x", cx.address); @@ -1780,6 +1793,15 @@ int __cpuinit acpi_processor_power_init(struct acpi_processor *pr, return 0; if (!first_run) { + if (idle_halt) { + /* + * When the boot option of "idle=halt" is added, halt + * is used for CPU IDLE. + * In such case C2/C3 is meaningless. So the max_cstate + * is set to one. 
+ */ + max_cstate = 1; + } dmi_check_system(processor_power_dmi_table); max_cstate = acpi_processor_cstate_check(max_cstate); if (max_cstate < ACPI_C_STATES_MAX) diff --git a/include/asm-ia64/processor.h b/include/asm-ia64/processor.h index 6aff126fc07e..f36e28a5f61e 100644 --- a/include/asm-ia64/processor.h +++ b/include/asm-ia64/processor.h @@ -763,6 +763,7 @@ prefetchw (const void *x) #define spin_lock_prefetch(x) prefetchw(x) extern unsigned long boot_option_idle_override; +extern unsigned long idle_halt; #endif /* !__ASSEMBLY__ */ diff --git a/include/asm-x86/processor.h b/include/asm-x86/processor.h index 7f7382704592..bc221623248e 100644 --- a/include/asm-x86/processor.h +++ b/include/asm-x86/processor.h @@ -727,6 +727,7 @@ extern int force_mwait; extern void select_idle_routine(const struct cpuinfo_x86 *c); extern unsigned long boot_option_idle_override; +extern unsigned long idle_halt; extern void enable_sep_cpu(void); extern int sysenter_setup(void); -- cgit v1.2.3 From da5e09a1b3e5a9fc0b15a3feb64e921ccc55ba74 Mon Sep 17 00:00:00 2001 From: Zhao Yakui Date: Tue, 24 Jun 2008 18:01:09 +0800 Subject: ACPI: Create "idle=nomwait" bootparam "idle=nomwait" disables the use of the MWAIT instruction from both C1 (C1_FFH) and deeper (C2C3_FFH) C-states. When MWAIT is unavailable, the BIOS and OS generally negotiate to use the HALT instruction for C1, and use IO accesses for deeper C-states. This option is useful for power and performance comparisons, and also to work around BIOS bugs where broken MWAIT support is advertised. http://bugzilla.kernel.org/show_bug.cgi?id=10807 http://bugzilla.kernel.org/show_bug.cgi?id=10914 Signed-off-by: Zhao Yakui Signed-off-by: Li Shaohua Signed-off-by: Len Brown Signed-off-by: Andi Kleen --- Documentation/kernel-parameters.txt | 3 ++- arch/ia64/kernel/process.c | 2 ++ arch/x86/kernel/process.c | 11 +++++++++++ drivers/acpi/processor_core.c | 13 +++++++++++++ drivers/acpi/processor_idle.c | 6 +++++- include/asm-ia64/processor.h | 1 + include/asm-x86/processor.h | 1 + 7 files changed, 35 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 65db7f4711aa..5e497d16fb51 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -818,7 +818,7 @@ and is between 256 and 4096 characters. It is defined in the file See Documentation/ide/ide.txt. idle= [X86] - Format: idle=poll or idle=mwait, idle=halt + Format: idle=poll or idle=mwait, idle=halt, idle=nomwait Poll forces a polling idle loop that can slightly improves the performance of waking up a idle CPU, but will use a lot of power and make the system run hot. Not recommended. @@ -828,6 +828,7 @@ and is between 256 and 4096 characters. It is defined in the file as idle=poll. idle=halt. Halt is forced to be used for CPU idle. In such case C2/C3 won't be used again. + idle=nomwait. Disable mwait for CPU C-states ide-pci-generic.all-generic-ide [HW] (E)IDE subsystem Claim all unknown PCI IDE storage controllers.
diff --git a/arch/ia64/kernel/process.c b/arch/ia64/kernel/process.c index 612b3c4a0603..3ab8373103ec 100644 --- a/arch/ia64/kernel/process.c +++ b/arch/ia64/kernel/process.c @@ -57,6 +57,8 @@ unsigned long boot_option_idle_override = 0; EXPORT_SYMBOL(boot_option_idle_override); unsigned long idle_halt; EXPORT_SYMBOL(idle_halt); +unsigned long idle_nomwait; +EXPORT_SYMBOL(idle_nomwait); void ia64_do_show_stack (struct unw_frame_info *info, void *arg) diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 7fc729498760..4d629c62f4f8 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -11,6 +11,8 @@ unsigned long idle_halt; EXPORT_SYMBOL(idle_halt); +unsigned long idle_nomwait; +EXPORT_SYMBOL(idle_nomwait); struct kmem_cache *task_xstate_cachep; @@ -340,6 +342,15 @@ static int __init idle_setup(char *str) pm_idle = default_idle; idle_halt = 1; return 0; + } else if (!strcmp(str, "nomwait")) { + /* + * If the boot option of "idle=nomwait" is added, + * it means that mwait will be disabled for CPU C2/C3 + * states. In such case it won't touch the variable + * of boot_option_idle_override. + */ + idle_nomwait = 1; + return 0; } else return -1; diff --git a/drivers/acpi/processor_core.c b/drivers/acpi/processor_core.c index 9a803f85ccfe..4e1bb89fd6c3 100644 --- a/drivers/acpi/processor_core.c +++ b/drivers/acpi/processor_core.c @@ -265,7 +265,20 @@ static int acpi_processor_set_pdc(struct acpi_processor *pr) if (!pdc_in) return status; + if (idle_nomwait) { + /* + * If mwait is disabled for CPU C-states, the C2C3_FFH access + * mode will be disabled in the parameter of _PDC object. + * Of course C1_FFH access mode will also be disabled. + */ + union acpi_object *obj; + u32 *buffer = NULL; + obj = pdc_in->pointer; + buffer = (u32 *)(obj->buffer.pointer); + buffer[2] &= ~(ACPI_PDC_C_C2C3_FFH | ACPI_PDC_C_C1_FFH); + + } status = acpi_evaluate_object(pr->handle, "_PDC", pdc_in, NULL); if (ACPI_FAILURE(status)) diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c index c75c7ace8c13..d592dbb1d12a 100644 --- a/drivers/acpi/processor_idle.c +++ b/drivers/acpi/processor_idle.c @@ -957,13 +957,17 @@ static int acpi_processor_get_power_info_cst(struct acpi_processor *pr) } else { continue; } - if (cx.type == ACPI_STATE_C1 && idle_halt) { + if (cx.type == ACPI_STATE_C1 && + (idle_halt || idle_nomwait)) { /* * In most cases the C1 space_id obtained from * _CST object is FIXED_HARDWARE access mode. * But when the option of idle=halt is added, * the entry_method type should be changed from * CSTATE_FFH to CSTATE_HALT. + * When the option of idle=nomwait is added, + * the C1 entry_method type should be + * CSTATE_HALT. 
*/ cx.entry_method = ACPI_CSTATE_HALT; snprintf(cx.desc, ACPI_CX_DESC_LEN, "ACPI HLT"); diff --git a/include/asm-ia64/processor.h b/include/asm-ia64/processor.h index f36e28a5f61e..f88fa054d01d 100644 --- a/include/asm-ia64/processor.h +++ b/include/asm-ia64/processor.h @@ -764,6 +764,7 @@ prefetchw (const void *x) extern unsigned long boot_option_idle_override; extern unsigned long idle_halt; +extern unsigned long idle_nomwait; #endif /* !__ASSEMBLY__ */ diff --git a/include/asm-x86/processor.h b/include/asm-x86/processor.h index bc221623248e..55402d2ab938 100644 --- a/include/asm-x86/processor.h +++ b/include/asm-x86/processor.h @@ -728,6 +728,7 @@ extern void select_idle_routine(const struct cpuinfo_x86 *c); extern unsigned long boot_option_idle_override; extern unsigned long idle_halt; +extern unsigned long idle_nomwait; extern void enable_sep_cpu(void); extern int sysenter_setup(void); -- cgit v1.2.3 From 58b6e5538460be358fdf1286d9a2fbcfcc2cfaba Mon Sep 17 00:00:00 2001 From: Jesse Barnes Date: Wed, 16 Jul 2008 16:21:47 -0700 Subject: Revert "x86/PCI: ACPI based PCI gap calculation" This reverts commit 809d9a8f93bd8504dcc34b16bbfdfd1a8c9bb1ed. This one isn't quite ready for prime time. It needs more testing and additional feedback from the ACPI guys. --- arch/x86/pci/acpi.c | 80 ----------------------------------------------------- 1 file changed, 80 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index d1ffb5709174..d95de2f199cd 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -4,7 +4,6 @@ #include #include #include -#include #include "pci.h" struct pci_root_info { @@ -15,11 +14,6 @@ struct pci_root_info { int busnum; }; -struct gap_info { - unsigned long gapstart; - unsigned long gapsize; -}; - static acpi_status resource_to_addr(struct acpi_resource *resource, struct acpi_resource_address64 *addr) @@ -116,78 +110,6 @@ adjust_transparent_bridge_resources(struct pci_bus *bus) } } -static acpi_status search_gap(struct acpi_resource *resource, void *data) -{ - struct acpi_resource_address64 addr; - acpi_status status; - struct gap_info *gap = data; - unsigned long long start_addr, end_addr; - - status = resource_to_addr(resource, &addr); - if (ACPI_SUCCESS(status) && - addr.resource_type == ACPI_MEMORY_RANGE && - addr.address_length > gap->gapsize) { - start_addr = addr.minimum + addr.translation_offset; - /* - * We want space only in the 32bit address range - */ - if (start_addr < UINT_MAX) { - end_addr = start_addr + addr.address_length; - e820_search_gap(&gap->gapstart, &gap->gapsize, - start_addr, end_addr); - } - } - - return AE_OK; -} - -/* - * Search for a hole in the 32 bit address space for PCI to assign MMIO - * resources, for hotplug or unconfigured resources. - * We query the CRS object of the PCI root device to look for possible producer - * resources in the tree and consider these while calculating the start address - * for this hole.
- */ -static void pci_setup_gap(acpi_handle *handle) -{ - struct gap_info gap; - acpi_status status; - - gap.gapstart = 0; - gap.gapsize = 0x400000; - - status = acpi_walk_resources(handle, METHOD_NAME__CRS, - search_gap, &gap); - - if (ACPI_SUCCESS(status)) { - unsigned long round; - - if (!gap.gapstart) { - printk(KERN_ERR "ACPI: Warning: Cannot find a gap " - "in the 32bit address range for PCI\n" - "ACPI: PCI devices may collide with " - "hotpluggable memory address range\n"); - } - /* - * Round the gapstart, uses the same logic as in - * e820_gap_setup - */ - round = 0x100000; - while ((gap.gapsize >> 4) > round) - round += round; - /* Fun with two's complement */ - pci_mem_start = (gap.gapstart + round) & -round; - - printk(KERN_INFO "ACPI: PCI resources should " - "start at %lx (gap: %lx:%lx)\n", - pci_mem_start, gap.gapstart, gap.gapsize); - } else { - printk(KERN_ERR "ACPI: Error while searching for gap in " - "the 32bit address range for PCI\n"); - } -} - - static void get_current_resources(struct acpi_device *device, int busnum, int domain, struct pci_bus *bus) @@ -293,8 +215,6 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_device *device, int do if (bus && (pci_probe & PCI_USE__CRS)) get_current_resources(device, busnum, domain, bus); - - pci_setup_gap(device->handle); return bus; } -- cgit v1.2.3 From 8e9509c827a28e2f365c203c04224f9e9dd1b63a Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 17 Jul 2008 13:26:50 +0200 Subject: ftrace: fix merge buglet -tip testing found a bootup hang here: initcall anon_inode_init+0x0/0x130 returned 0 after 0 msecs calling acpi_event_init+0x0/0x57 the bootup should have continued with: initcall acpi_event_init+0x0/0x57 returned 0 after 45 msecs but it hung hard there instead. bisection led to this commit: | commit 5806b81ac1c0c52665b91723fd4146a4f86e386b | Merge: d14c8a6... 6712e29... | Author: Ingo Molnar | Date: Mon Jul 14 16:11:52 2008 +0200 | Merge branch 'auto-ftrace-next' into tracing/for-linus It turns out that I made this mistake in the merge: ifdef CONFIG_FTRACE # Do not profile debug utilities CFLAGS_REMOVE_tsc_64.o = -pg CFLAGS_REMOVE_tsc_32.o = -pg those two files got unified meanwhile - so the don't-profile annotation got lost. The proper rule is: CFLAGS_REMOVE_tsc.o = -pg I guess this could have been caught sooner if the CFLAGS_REMOVE* kbuild rule aborted the build if it met a target that does not exist anymore?
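For context, -pg makes gcc emit a profiling call at every function entry (the mcount hook that ftrace attaches to), which is why files like tsc.c must opt out. A hedged userspace illustration (standalone, not kernel code; the exact symbol gcc calls varies by target):

/* Build with: gcc -pg -O2 -c pg_demo.c && objdump -dr pg_demo.o */
#define notrace __attribute__((no_instrument_function))

int profiled(int x)
{
	return x * 2;	/* objdump shows a profiling call at entry */
}

notrace int not_profiled(int x)
{
	return x * 2;	/* no profiling call: this is the attribute the
			 * kernel's notrace macro relies on */
}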
Signed-off-by: Ingo Molnar --- arch/x86/kernel/Makefile | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 5112c84f5421..da140611bb57 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -8,8 +8,7 @@ CPPFLAGS_vmlinux.lds += -U$(UTS_MACHINE) ifdef CONFIG_FTRACE # Do not profile debug utilities -CFLAGS_REMOVE_tsc_64.o = -pg -CFLAGS_REMOVE_tsc_32.o = -pg +CFLAGS_REMOVE_tsc.o = -pg CFLAGS_REMOVE_rtc.o = -pg endif -- cgit v1.2.3 From 9354094a95aed456a46b353b1051a7e2fab29045 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 14 Jul 2008 23:29:01 -0700 Subject: x86: fix numaq_tsc_disable fix: arch/x86/kernel/numaq_32.c: In function ‘numaq_tsc_disable’: arch/x86/kernel/numaq_32.c:99: warning: ‘return’ with a value, in function returning void Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/numaq_32.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/numaq_32.c b/arch/x86/kernel/numaq_32.c index 5b20a5e7ac28..a23e8233b9ac 100644 --- a/arch/x86/kernel/numaq_32.c +++ b/arch/x86/kernel/numaq_32.c @@ -96,7 +96,7 @@ int __init get_memcfg_numaq(void) void __init numaq_tsc_disable(void) { if (!found_numaq) - return -1; + return; if (num_online_nodes() > 1) { printk(KERN_DEBUG "NUMAQ: disabling TSC\n"); -- cgit v1.2.3
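Returning to the Big Real Mode change earlier in this series: a hedged userspace sketch (the wakeup base address below is invented) that feeds the patch's GDT_ENTRY() macro a 0xfffff limit with the granularity bit set in the flags, then decodes the descriptor to show the resulting 4 GB segment:

#include <stdio.h>
#include <stdint.h>
typedef uint64_t u64;

/* Same layout as the macro added in arch/x86/kernel/acpi/sleep.c */
#define GDT_ENTRY(flags, base, limit) \
	(((u64)(base & 0xff000000) << 32) | \
	 ((u64)flags << 40) | \
	 ((u64)(limit & 0x00ff0000) << 32) | \
	 ((u64)(base & 0x00ffffff) << 16) | \
	 ((u64)(limit & 0x0000ffff)))

int main(void)
{
	u64 d = GDT_ENTRY(0x809b, 0x9a000, 0xfffff);	/* made-up base */
	uint32_t limit = (d & 0xffff) | ((d >> 32) & 0xf0000);
	unsigned int g = (d >> 55) & 1;	/* granularity bit from flags 0x80.. */

	/* G=1 scales the 20-bit limit by 4 KB pages: 0xfffff -> 4 GB - 1 */
	u64 eff = g ? ((u64)limit + 1) * 4096 - 1 : limit;

	printf("limit=%#x G=%u -> effective limit %#llx\n",
	       limit, g, (unsigned long long)eff);
	return 0;
}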