From 7de4e74430139f2484cb16cedf6c281d1a5a696e Mon Sep 17 00:00:00 2001
From: Joe Perches
Date: Fri, 17 Apr 2015 11:39:18 -0700
Subject: timer_list: Reduce SEQ_printf footprint

This macro can be converted to a static function to reduce
object size.

(x86-64 defconfig)
$ size kernel/time/timer_list.o*
   text	   data	    bss	    dec	    hex	filename
   6583	      8	      0	   6591	   19bf	kernel/time/timer_list.o.old
   4647	      8	      0	   4655	   122f	kernel/time/timer_list.o.new

Signed-off-by: Joe Perches <joe@perches.com>
Cc: John Stultz <john.stultz@linaro.org>
Link: http://lkml.kernel.org/r/1429295958.2850.104.camel@perches.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/time/timer_list.c | 21 ++++++++++++++-------
 1 file changed, 14 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index e878c2e0ba45..5960af2196ac 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -35,13 +35,20 @@ DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases);
  * This allows printing both to /proc/timer_list and
  * to the console (on SysRq-Q):
  */
-#define SEQ_printf(m, x...)			\
- do {						\
-	if (m)					\
-		seq_printf(m, x);		\
-	else					\
-		printk(x);			\
- } while (0)
+__printf(2, 3)
+static void SEQ_printf(struct seq_file *m, const char *fmt, ...)
+{
+	va_list args;
+
+	va_start(args, fmt);
+
+	if (m)
+		seq_vprintf(m, fmt, args);
+	else
+		vprintk(fmt, args);
+
+	va_end(args);
+}
 
 static void print_name_offset(struct seq_file *m, void *sym)
 {
-- 
cgit v1.2.3


From 51a03393bac061a4e13fd17214d3ef93a5b296e3 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Wed, 22 Apr 2015 11:44:15 +0200
Subject: timekeeping: Remove stale function prototype

commit 61edec81d260 "timekeeping: Simplify timekeeping_clocktai()"
implemented timekeeping_clocktai() as an inline function, but left the
old extern prototype in the header file. Remove it.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/time/timekeeping.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/time/timekeeping.h b/kernel/time/timekeeping.h
index ead8794b9a4e..5b57f6c9ae34 100644
--- a/kernel/time/timekeeping.h
+++ b/kernel/time/timekeeping.h
@@ -15,7 +15,6 @@ extern u64 timekeeping_max_deferment(void);
 extern int timekeeping_inject_offset(struct timespec *ts);
 extern s32 timekeeping_get_tai_offset(void);
 extern void timekeeping_set_tai_offset(s32 tai_offset);
-extern void timekeeping_clocktai(struct timespec *ts);
 extern int timekeeping_suspend(void);
 extern void timekeeping_resume(void);
 
-- 
cgit v1.2.3


From 91e5a2170e795989da9f90c18ba18984f23acc5b Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Mon, 13 Apr 2015 21:02:22 +0000
Subject: hrtimer: Document hrtimer_forward[_now]() proper

Document the calling context conditions.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/20150413210035.178751779@linutronix.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/hrtimer.h | 17 ++++++++++++++++-
 kernel/time/hrtimer.c   |  8 ++++++++
 2 files changed, 24 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 05f6df1fdf5b..7770676c387a 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -418,7 +418,22 @@ static inline int hrtimer_callback_running(struct hrtimer *timer)
 extern u64
 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval);
 
-/* Forward a hrtimer so it expires after the hrtimer's current now */
+/**
+ * hrtimer_forward_now - forward the timer expiry so it expires after now
+ * @timer:	hrtimer to forward
+ * @interval:	the interval to forward
+ *
+ * Forward the timer expiry so it will expire after the current time
+ * of the hrtimer clock base. Returns the number of overruns.
+ *
+ * Can be safely called from the callback function of @timer. If
+ * called from other contexts @timer must neither be enqueued nor
+ * running the callback and the caller needs to take care of
+ * serialization.
+ *
+ * Note: This only updates the timer expiry value and does not requeue
+ * the timer.
+ */
 static inline u64 hrtimer_forward_now(struct hrtimer *timer,
 				      ktime_t interval)
 {
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 76d4bd962b19..b1a74ee3e99a 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -801,6 +801,14 @@ void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
  *
  * Forward the timer expiry so it will expire in the future.
  * Returns the number of overruns.
+ *
+ * Can be safely called from the callback function of @timer. If
+ * called from other contexts @timer must neither be enqueued nor
+ * running the callback and the caller needs to take care of
+ * serialization.
+ *
+ * Note: This only updates the timer expiry value and does not requeue
+ * the timer.
  */
 u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval)
 {
-- 
cgit v1.2.3


From d9f0acdeef48570c4e6159d3108f12b64571392e Mon Sep 17 00:00:00 2001
From: Viresh Kumar
Date: Tue, 14 Apr 2015 21:08:25 +0000
Subject: hrtimer: Update active_bases before calling hrtimer_force_reprogram()

'active_bases' indicates which clock-base have active timer. The
intention of this bit field was to avoid evaluating inactive bases. It
was introduced with the introduction of the BOOTTIME and TAI clock
bases, but it was never brought into full use.

We want to use it now, but in __remove_hrtimer() the update happens
after the calling hrtimer_force_reprogram() which has to evaluate all
clock bases for the next expiring timer. So in case the last timer of
a clock base got removed we still see the active bit and therefor
evaluate the clock base for no value. There are further optimizations
possible when active_bases is updated in the right place.

Move the update before the call to hrtimer_force_reprogram()

[ tglx: Massaged changelog ]

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Reviewed-by: Preeti U Murthy <preeti@linux.vnet.ibm.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: linaro-kernel@lists.linaro.org
Link: http://lkml.kernel.org/r/20150414203500.533438642@linutronix.de
Link: http://lkml.kernel.org/r/c7c8ebcd9ed88bb09d76059c745a1fafb48314e7.1428039899.git.viresh.kumar@linaro.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/time/hrtimer.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index b1a74ee3e99a..9abd50b60e83 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -887,6 +887,9 @@ static void __remove_hrtimer(struct hrtimer *timer,
 
 	next_timer = timerqueue_getnext(&base->active);
 	timerqueue_del(&base->active, &timer->node);
+	if (!timerqueue_getnext(&base->active))
+		base->cpu_base->active_bases &= ~(1 << base->index);
+
 	if (&timer->node == next_timer) {
 #ifdef CONFIG_HIGH_RES_TIMERS
 		/* Reprogram the clock event device. if enabled */
@@ -900,8 +903,6 @@ static void __remove_hrtimer(struct hrtimer *timer,
 		}
 #endif
 	}
-	if (!timerqueue_getnext(&base->active))
-		base->cpu_base->active_bases &= ~(1 << base->index);
 out:
 	timer->state = newstate;
 }
-- 
cgit v1.2.3


From 398ca17fb54b212cdc9da7ff4a17a35c48dd2103 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Tue, 14 Apr 2015 21:08:27 +0000
Subject: hrtimer: Get rid of the resolution field in hrtimer_clock_base

The field has no value because all clock bases have the same
resolution. The resolution only changes when we switch to high
resolution timer mode. We can evaluate that from a single static
variable as well. In the !HIGHRES case its simply a constant.

Export the variable, so we can simplify the usage sites.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Preeti U Murthy <preeti@linux.vnet.ibm.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Viresh Kumar <viresh.kumar@linaro.org>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Link: http://lkml.kernel.org/r/20150414203500.645454122@linutronix.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/hrtimer.h  |  6 ++++--
 kernel/time/hrtimer.c    | 26 +++++++++-----------------
 kernel/time/timer_list.c |  8 ++++----
 3 files changed, 17 insertions(+), 23 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 7770676c387a..bc6f91b5443b 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -137,7 +137,6 @@ struct hrtimer_sleeper {
  *			timer to a base on another cpu.
  * @clockid:		clock id for per_cpu support
  * @active:		red black tree root node for the active timers
- * @resolution:		the resolution of the clock, in nanoseconds
  * @get_time:		function to retrieve the current time of the clock
  * @softirq_time:	the time when running the hrtimer queue in the softirq
  * @offset:		offset of this clock to the monotonic base
@@ -147,7 +146,6 @@ struct hrtimer_clock_base {
 	int			index;
 	clockid_t		clockid;
 	struct timerqueue_head	active;
-	ktime_t			resolution;
 	ktime_t			(*get_time)(void);
 	ktime_t			softirq_time;
 	ktime_t			offset;
@@ -295,11 +293,15 @@ extern void hrtimer_peek_ahead_timers(void);
 
 extern void clock_was_set_delayed(void);
 
+extern unsigned int hrtimer_resolution;
+
 #else
 
 # define MONOTONIC_RES_NSEC	LOW_RES_NSEC
 # define KTIME_MONOTONIC_RES	KTIME_LOW_RES
 
+#define hrtimer_resolution	LOW_RES_NSEC
+
 static inline void hrtimer_peek_ahead_timers(void) { }
 
 /*
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 9abd50b60e83..965687a1fce6 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -66,7 +66,6 @@
  */
 DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
 {
-
 	.lock = __RAW_SPIN_LOCK_UNLOCKED(hrtimer_bases.lock),
 	.clock_base =
 	{
@@ -74,25 +73,21 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
 			.index = HRTIMER_BASE_MONOTONIC,
 			.clockid = CLOCK_MONOTONIC,
 			.get_time = &ktime_get,
-			.resolution = KTIME_LOW_RES,
 		},
 		{
 			.index = HRTIMER_BASE_REALTIME,
 			.clockid = CLOCK_REALTIME,
 			.get_time = &ktime_get_real,
-			.resolution = KTIME_LOW_RES,
 		},
 		{
 			.index = HRTIMER_BASE_BOOTTIME,
 			.clockid = CLOCK_BOOTTIME,
 			.get_time = &ktime_get_boottime,
-			.resolution = KTIME_LOW_RES,
 		},
 		{
 			.index = HRTIMER_BASE_TAI,
 			.clockid = CLOCK_TAI,
 			.get_time = &ktime_get_clocktai,
-			.resolution = KTIME_LOW_RES,
 		},
 	}
 };
@@ -478,6 +473,8 @@ static ktime_t __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base)
  * High resolution timer enabled ?
  */
 static int hrtimer_hres_enabled __read_mostly  = 1;
+unsigned int hrtimer_resolution __read_mostly = LOW_RES_NSEC;
+EXPORT_SYMBOL_GPL(hrtimer_resolution);
 
 /*
  * Enable / Disable high resolution mode
@@ -660,7 +657,7 @@ static void retrigger_next_event(void *arg)
  */
 static int hrtimer_switch_to_hres(void)
 {
-	int i, cpu = smp_processor_id();
+	int cpu = smp_processor_id();
 	struct hrtimer_cpu_base *base = &per_cpu(hrtimer_bases, cpu);
 	unsigned long flags;
 
@@ -676,8 +673,7 @@ static int hrtimer_switch_to_hres(void)
 		return 0;
 	}
 	base->hres_active = 1;
-	for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++)
-		base->clock_base[i].resolution = KTIME_HIGH_RES;
+	hrtimer_resolution = HIGH_RES_NSEC;
 
 	tick_setup_sched_timer();
 	/* "Retrigger" the interrupt to get things going */
@@ -820,8 +816,8 @@ u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval)
 	if (delta.tv64 < 0)
 		return 0;
 
-	if (interval.tv64 < timer->base->resolution.tv64)
-		interval.tv64 = timer->base->resolution.tv64;
+	if (interval.tv64 < hrtimer_resolution)
+		interval.tv64 = hrtimer_resolution;
 
 	if (unlikely(delta.tv64 >= interval.tv64)) {
 		s64 incr = ktime_to_ns(interval);
@@ -963,7 +959,7 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
 		 * timeouts. This will go away with the GTOD framework.
 		 */
 #ifdef CONFIG_TIME_LOW_RES
-		tim = ktime_add_safe(tim, base->resolution);
+		tim = ktime_add_safe(tim, ktime_set(0, hrtimer_resolution));
 #endif
 	}
 
@@ -1193,12 +1189,8 @@ EXPORT_SYMBOL_GPL(hrtimer_init);
  */
 int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp)
 {
-	struct hrtimer_cpu_base *cpu_base;
-	int base = hrtimer_clockid_to_base(which_clock);
-
-	cpu_base = raw_cpu_ptr(&hrtimer_bases);
-	*tp = ktime_to_timespec(cpu_base->clock_base[base].resolution);
-
+	tp->tv_sec = 0;
+	tp->tv_nsec = hrtimer_resolution;
 	return 0;
 }
 EXPORT_SYMBOL_GPL(hrtimer_get_res);
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index 5960af2196ac..bdd5e987f115 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -127,10 +127,10 @@ static void
 print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now)
 {
 	SEQ_printf(m, "  .base:       %pK\n", base);
-	SEQ_printf(m, "  .index:      %d\n",
-			base->index);
-	SEQ_printf(m, "  .resolution: %Lu nsecs\n",
-			(unsigned long long)ktime_to_ns(base->resolution));
+	SEQ_printf(m, "  .index:      %d\n", base->index);
+
+	SEQ_printf(m, "  .resolution: %u nsecs\n", (unsigned) hrtimer_resolution);
+
 	SEQ_printf(m,   "  .get_time:   ");
 	print_name_offset(m, base->get_time);
 	SEQ_printf(m,   "\n");
-- 
cgit v1.2.3


From 056a3cacbc46e5aca27b350ce4ecb3b33ebb0700 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Tue, 14 Apr 2015 21:08:32 +0000
Subject: hrtimer: Get rid of hrtimer_get_res()

The resolution is directly accessible now. So its simpler just to fill
in the values of the timespec and be done with it.

Text size reduction (combined with "hrtimer: Get rid of the resolution
field in hrtimer_clock_base"):
       x8664 -61, i386 -221, ARM -60, power64 -48

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Preeti U Murthy <preeti@linux.vnet.ibm.com>
Cc: Viresh Kumar <viresh.kumar@linaro.org>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Link: http://lkml.kernel.org/r/20150414203500.879888080@linutronix.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/hrtimer.h    |  1 -
 kernel/time/alarmtimer.c   |  6 +++---
 kernel/time/hrtimer.c      | 16 ----------------
 kernel/time/posix-timers.c | 17 ++++++++++++-----
 4 files changed, 15 insertions(+), 25 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index bc6f91b5443b..8025156c8fa1 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -385,7 +385,6 @@ static inline int hrtimer_restart(struct hrtimer *timer)
 
 /* Query timers: */
 extern ktime_t hrtimer_get_remaining(const struct hrtimer *timer);
-extern int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp);
 
 extern ktime_t hrtimer_get_next_event(void);
 
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 1b001ed1edb9..0b55a7570c90 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -495,12 +495,12 @@ static enum alarmtimer_restart alarm_handle_timer(struct alarm *alarm,
  */
 static int alarm_clock_getres(const clockid_t which_clock, struct timespec *tp)
 {
-	clockid_t baseid = alarm_bases[clock2alarm(which_clock)].base_clockid;
-
 	if (!alarmtimer_get_rtcdev())
 		return -EINVAL;
 
-	return hrtimer_get_res(baseid, tp);
+	tp->tv_sec = 0;
+	tp->tv_nsec = hrtimer_resolution;
+	return 0;
 }
 
 /**
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 965687a1fce6..73131dab787e 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -1179,22 +1179,6 @@ void hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
 }
 EXPORT_SYMBOL_GPL(hrtimer_init);
 
-/**
- * hrtimer_get_res - get the timer resolution for a clock
- * @which_clock: which clock to query
- * @tp:		 pointer to timespec variable to store the resolution
- *
- * Store the resolution of the clock selected by @which_clock in the
- * variable pointed to by @tp.
- */
-int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp)
-{
-	tp->tv_sec = 0;
-	tp->tv_nsec = hrtimer_resolution;
-	return 0;
-}
-EXPORT_SYMBOL_GPL(hrtimer_get_res);
-
 static void __run_hrtimer(struct hrtimer *timer, ktime_t *now)
 {
 	struct hrtimer_clock_base *base = timer->base;
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index 31ea01f42e1f..31d11ac9fa47 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -272,13 +272,20 @@ static int posix_get_tai(clockid_t which_clock, struct timespec *tp)
 	return 0;
 }
 
+static int posix_get_hrtimer_res(clockid_t which_clock, struct timespec *tp)
+{
+	tp->tv_sec = 0;
+	tp->tv_nsec = hrtimer_resolution;
+	return 0;
+}
+
 /*
  * Initialize everything, well, just everything in Posix clocks/timers ;)
  */
 static __init int init_posix_timers(void)
 {
 	struct k_clock clock_realtime = {
-		.clock_getres	= hrtimer_get_res,
+		.clock_getres	= posix_get_hrtimer_res,
 		.clock_get	= posix_clock_realtime_get,
 		.clock_set	= posix_clock_realtime_set,
 		.clock_adj	= posix_clock_realtime_adj,
@@ -290,7 +297,7 @@ static __init int init_posix_timers(void)
 		.timer_del	= common_timer_del,
 	};
 	struct k_clock clock_monotonic = {
-		.clock_getres	= hrtimer_get_res,
+		.clock_getres	= posix_get_hrtimer_res,
 		.clock_get	= posix_ktime_get_ts,
 		.nsleep		= common_nsleep,
 		.nsleep_restart	= hrtimer_nanosleep_restart,
@@ -300,7 +307,7 @@ static __init int init_posix_timers(void)
 		.timer_del	= common_timer_del,
 	};
 	struct k_clock clock_monotonic_raw = {
-		.clock_getres	= hrtimer_get_res,
+		.clock_getres	= posix_get_hrtimer_res,
 		.clock_get	= posix_get_monotonic_raw,
 	};
 	struct k_clock clock_realtime_coarse = {
@@ -312,7 +319,7 @@ static __init int init_posix_timers(void)
 		.clock_get	= posix_get_monotonic_coarse,
 	};
 	struct k_clock clock_tai = {
-		.clock_getres	= hrtimer_get_res,
+		.clock_getres	= posix_get_hrtimer_res,
 		.clock_get	= posix_get_tai,
 		.nsleep		= common_nsleep,
 		.nsleep_restart	= hrtimer_nanosleep_restart,
@@ -322,7 +329,7 @@ static __init int init_posix_timers(void)
 		.timer_del	= common_timer_del,
 	};
 	struct k_clock clock_boottime = {
-		.clock_getres	= hrtimer_get_res,
+		.clock_getres	= posix_get_hrtimer_res,
 		.clock_get	= posix_get_boottime,
 		.nsleep		= common_nsleep,
 		.nsleep_restart	= hrtimer_nanosleep_restart,
-- 
cgit v1.2.3


From a6ffebce7f89f6f97cc22838a5d4383b15d6774f Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Tue, 14 Apr 2015 21:08:34 +0000
Subject: hrtimer: Make the statistics fields smaller

No point in having usigned long for /proc/timer_list statistics. Make
them unsigned int.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Preeti U Murthy <preeti@linux.vnet.ibm.com>
Cc: Viresh Kumar <viresh.kumar@linaro.org>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Link: http://lkml.kernel.org/r/20150414203500.959773467@linutronix.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/hrtimer.h  | 8 ++++----
 kernel/time/hrtimer.c    | 4 ++--
 kernel/time/timer_list.c | 2 +-
 3 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 8025156c8fa1..d39f2847754c 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -187,10 +187,10 @@ struct hrtimer_cpu_base {
 	int				in_hrtirq;
 	int				hres_active;
 	int				hang_detected;
-	unsigned long			nr_events;
-	unsigned long			nr_retries;
-	unsigned long			nr_hangs;
-	ktime_t				max_hang_time;
+	unsigned int			nr_events;
+	unsigned int			nr_retries;
+	unsigned int			nr_hangs;
+	unsigned int			max_hang_time;
 #endif
 	struct hrtimer_clock_base	clock_base[HRTIMER_MAX_CLOCK_BASES];
 };
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 73131dab787e..874e0914eb3c 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -1327,8 +1327,8 @@ retry:
 	cpu_base->hang_detected = 1;
 	raw_spin_unlock(&cpu_base->lock);
 	delta = ktime_sub(now, entry_time);
-	if (delta.tv64 > cpu_base->max_hang_time.tv64)
-		cpu_base->max_hang_time = delta;
+	if ((unsigned int)delta.tv64 > cpu_base->max_hang_time)
+		cpu_base->max_hang_time = (unsigned int) delta.tv64;
 	/*
 	 * Limit it to a sensible value as we enforce a longer
 	 * delay. Give the CPU at least 100ms to catch up.
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index bdd5e987f115..6232fc536185 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -165,7 +165,7 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now)
 	P(nr_events);
 	P(nr_retries);
 	P(nr_hangs);
-	P_ns(max_hang_time);
+	P(max_hang_time);
 #endif
 #undef P
 #undef P_ns
-- 
cgit v1.2.3


From 21d6d52a1b7028e6a6840bd82e354aefa9a5e203 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Tue, 14 Apr 2015 21:08:35 +0000
Subject: hrtimer: Get rid of softirq time

The softirq time field in the clock bases is an optimization from the
early days of hrtimers. It provides a coarse "jiffies" like time
mostly for self rearming timers.

But that comes with a price:
    - Larger code size
    - Extra storage space
    - Duplicated functions with really small differences
   
The benefit of this is optimization is marginal for contemporary
systems.

Consolidate everything on the high resolution timer
implementation. This makes further optimizations possible.

Text size reduction:
       x8664 -95, i386 -356, ARM -148, ARM64 -40, power64 -16

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Preeti U Murthy <preeti@linux.vnet.ibm.com>
Cc: Viresh Kumar <viresh.kumar@linaro.org>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Link: http://lkml.kernel.org/r/20150414203501.039977424@linutronix.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/hrtimer.h   |  24 ++------
 kernel/time/hrtimer.c     | 148 ++++++++++++++++++----------------------------
 kernel/time/timekeeping.c |  32 ----------
 kernel/time/timekeeping.h |   3 -
 4 files changed, 64 insertions(+), 143 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index d39f2847754c..e292830b58f0 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -138,7 +138,6 @@ struct hrtimer_sleeper {
  * @clockid:		clock id for per_cpu support
  * @active:		red black tree root node for the active timers
  * @get_time:		function to retrieve the current time of the clock
- * @softirq_time:	the time when running the hrtimer queue in the softirq
  * @offset:		offset of this clock to the monotonic base
  */
 struct hrtimer_clock_base {
@@ -147,7 +146,6 @@ struct hrtimer_clock_base {
 	clockid_t		clockid;
 	struct timerqueue_head	active;
 	ktime_t			(*get_time)(void);
-	ktime_t			softirq_time;
 	ktime_t			offset;
 };
 
@@ -260,19 +258,16 @@ static inline ktime_t hrtimer_expires_remaining(const struct hrtimer *timer)
 	return ktime_sub(timer->node.expires, timer->base->get_time());
 }
 
-#ifdef CONFIG_HIGH_RES_TIMERS
-struct clock_event_device;
-
-extern void hrtimer_interrupt(struct clock_event_device *dev);
-
-/*
- * In high resolution mode the time reference must be read accurate
- */
 static inline ktime_t hrtimer_cb_get_time(struct hrtimer *timer)
 {
 	return timer->base->get_time();
 }
 
+#ifdef CONFIG_HIGH_RES_TIMERS
+struct clock_event_device;
+
+extern void hrtimer_interrupt(struct clock_event_device *dev);
+
 static inline int hrtimer_is_hres_active(struct hrtimer *timer)
 {
 	return timer->base->cpu_base->hres_active;
@@ -304,15 +299,6 @@ extern unsigned int hrtimer_resolution;
 
 static inline void hrtimer_peek_ahead_timers(void) { }
 
-/*
- * In non high resolution mode the time reference is taken from
- * the base softirq time variable.
- */
-static inline ktime_t hrtimer_cb_get_time(struct hrtimer *timer)
-{
-	return timer->base->softirq_time;
-}
-
 static inline int hrtimer_is_hres_active(struct hrtimer *timer)
 {
 	return 0;
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 874e0914eb3c..9e111dd83ca3 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -104,27 +104,6 @@ static inline int hrtimer_clockid_to_base(clockid_t clock_id)
 	return hrtimer_clock_to_base_table[clock_id];
 }
 
-
-/*
- * Get the coarse grained time at the softirq based on xtime and
- * wall_to_monotonic.
- */
-static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base)
-{
-	ktime_t xtim, mono, boot, tai;
-	ktime_t off_real, off_boot, off_tai;
-
-	mono = ktime_get_update_offsets_tick(&off_real, &off_boot, &off_tai);
-	boot = ktime_add(mono, off_boot);
-	xtim = ktime_add(mono, off_real);
-	tai = ktime_add(mono, off_tai);
-
-	base->clock_base[HRTIMER_BASE_REALTIME].softirq_time = xtim;
-	base->clock_base[HRTIMER_BASE_MONOTONIC].softirq_time = mono;
-	base->clock_base[HRTIMER_BASE_BOOTTIME].softirq_time = boot;
-	base->clock_base[HRTIMER_BASE_TAI].softirq_time = tai;
-}
-
 /*
  * Functions and macros which are different for UP/SMP systems are kept in a
  * single place
@@ -466,6 +445,15 @@ static ktime_t __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base)
 }
 #endif
 
+static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base)
+{
+	ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset;
+	ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset;
+	ktime_t *offs_tai = &base->clock_base[HRTIMER_BASE_TAI].offset;
+
+	return ktime_get_update_offsets_now(offs_real, offs_boot, offs_tai);
+}
+
 /* High resolution timer related functions */
 #ifdef CONFIG_HIGH_RES_TIMERS
 
@@ -516,7 +504,12 @@ static inline int hrtimer_hres_active(void)
 static void
 hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal)
 {
-	ktime_t expires_next = __hrtimer_get_next_event(cpu_base);
+	ktime_t expires_next;
+
+	if (!cpu_base->hres_active)
+		return;
+
+	expires_next = __hrtimer_get_next_event(cpu_base);
 
 	if (skip_equal && expires_next.tv64 == cpu_base->expires_next.tv64)
 		return;
@@ -625,15 +618,6 @@ static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base)
 	base->hres_active = 0;
 }
 
-static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base)
-{
-	ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset;
-	ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset;
-	ktime_t *offs_tai = &base->clock_base[HRTIMER_BASE_TAI].offset;
-
-	return ktime_get_update_offsets_now(offs_real, offs_boot, offs_tai);
-}
-
 /*
  * Retrigger next event is called after clock was set
  *
@@ -1179,10 +1163,10 @@ void hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
 }
 EXPORT_SYMBOL_GPL(hrtimer_init);
 
-static void __run_hrtimer(struct hrtimer *timer, ktime_t *now)
+static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
+			  struct hrtimer_clock_base *base,
+			  struct hrtimer *timer, ktime_t *now)
 {
-	struct hrtimer_clock_base *base = timer->base;
-	struct hrtimer_cpu_base *cpu_base = base->cpu_base;
 	enum hrtimer_restart (*fn)(struct hrtimer *);
 	int restart;
 
@@ -1219,34 +1203,9 @@ static void __run_hrtimer(struct hrtimer *timer, ktime_t *now)
 	timer->state &= ~HRTIMER_STATE_CALLBACK;
 }
 
-#ifdef CONFIG_HIGH_RES_TIMERS
-
-/*
- * High resolution timer interrupt
- * Called with interrupts disabled
- */
-void hrtimer_interrupt(struct clock_event_device *dev)
+static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now)
 {
-	struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
-	ktime_t expires_next, now, entry_time, delta;
-	int i, retries = 0;
-
-	BUG_ON(!cpu_base->hres_active);
-	cpu_base->nr_events++;
-	dev->next_event.tv64 = KTIME_MAX;
-
-	raw_spin_lock(&cpu_base->lock);
-	entry_time = now = hrtimer_update_base(cpu_base);
-retry:
-	cpu_base->in_hrtirq = 1;
-	/*
-	 * We set expires_next to KTIME_MAX here with cpu_base->lock
-	 * held to prevent that a timer is enqueued in our queue via
-	 * the migration code. This does not affect enqueueing of
-	 * timers which run their callback and need to be requeued on
-	 * this CPU.
-	 */
-	cpu_base->expires_next.tv64 = KTIME_MAX;
+	int i;
 
 	for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
 		struct hrtimer_clock_base *base;
@@ -1279,9 +1238,42 @@ retry:
 			if (basenow.tv64 < hrtimer_get_softexpires_tv64(timer))
 				break;
 
-			__run_hrtimer(timer, &basenow);
+			__run_hrtimer(cpu_base, base, timer, &basenow);
 		}
 	}
+}
+
+#ifdef CONFIG_HIGH_RES_TIMERS
+
+/*
+ * High resolution timer interrupt
+ * Called with interrupts disabled
+ */
+void hrtimer_interrupt(struct clock_event_device *dev)
+{
+	struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
+	ktime_t expires_next, now, entry_time, delta;
+	int retries = 0;
+
+	BUG_ON(!cpu_base->hres_active);
+	cpu_base->nr_events++;
+	dev->next_event.tv64 = KTIME_MAX;
+
+	raw_spin_lock(&cpu_base->lock);
+	entry_time = now = hrtimer_update_base(cpu_base);
+retry:
+	cpu_base->in_hrtirq = 1;
+	/*
+	 * We set expires_next to KTIME_MAX here with cpu_base->lock
+	 * held to prevent that a timer is enqueued in our queue via
+	 * the migration code. This does not affect enqueueing of
+	 * timers which run their callback and need to be requeued on
+	 * this CPU.
+	 */
+	cpu_base->expires_next.tv64 = KTIME_MAX;
+
+	__hrtimer_run_queues(cpu_base, now);
+
 	/* Reevaluate the clock bases for the next expiry */
 	expires_next = __hrtimer_get_next_event(cpu_base);
 	/*
@@ -1416,38 +1408,16 @@ void hrtimer_run_pending(void)
  */
 void hrtimer_run_queues(void)
 {
-	struct timerqueue_node *node;
 	struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
-	struct hrtimer_clock_base *base;
-	int index, gettime = 1;
+	ktime_t now;
 
 	if (hrtimer_hres_active())
 		return;
 
-	for (index = 0; index < HRTIMER_MAX_CLOCK_BASES; index++) {
-		base = &cpu_base->clock_base[index];
-		if (!timerqueue_getnext(&base->active))
-			continue;
-
-		if (gettime) {
-			hrtimer_get_softirq_time(cpu_base);
-			gettime = 0;
-		}
-
-		raw_spin_lock(&cpu_base->lock);
-
-		while ((node = timerqueue_getnext(&base->active))) {
-			struct hrtimer *timer;
-
-			timer = container_of(node, struct hrtimer, node);
-			if (base->softirq_time.tv64 <=
-					hrtimer_get_expires_tv64(timer))
-				break;
-
-			__run_hrtimer(timer, &base->softirq_time);
-		}
-		raw_spin_unlock(&cpu_base->lock);
-	}
+	raw_spin_lock(&cpu_base->lock);
+	now = hrtimer_update_base(cpu_base);
+	__hrtimer_run_queues(cpu_base, now);
+	raw_spin_unlock(&cpu_base->lock);
 }
 
 /*
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 946acb72179f..dd1efa6a4ea4 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -1925,37 +1925,6 @@ void do_timer(unsigned long ticks)
 	calc_global_load(ticks);
 }
 
-/**
- * ktime_get_update_offsets_tick - hrtimer helper
- * @offs_real:	pointer to storage for monotonic -> realtime offset
- * @offs_boot:	pointer to storage for monotonic -> boottime offset
- * @offs_tai:	pointer to storage for monotonic -> clock tai offset
- *
- * Returns monotonic time at last tick and various offsets
- */
-ktime_t ktime_get_update_offsets_tick(ktime_t *offs_real, ktime_t *offs_boot,
-							ktime_t *offs_tai)
-{
-	struct timekeeper *tk = &tk_core.timekeeper;
-	unsigned int seq;
-	ktime_t base;
-	u64 nsecs;
-
-	do {
-		seq = read_seqcount_begin(&tk_core.seq);
-
-		base = tk->tkr_mono.base;
-		nsecs = tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift;
-
-		*offs_real = tk->offs_real;
-		*offs_boot = tk->offs_boot;
-		*offs_tai = tk->offs_tai;
-	} while (read_seqcount_retry(&tk_core.seq, seq));
-
-	return ktime_add_ns(base, nsecs);
-}
-
-#ifdef CONFIG_HIGH_RES_TIMERS
 /**
  * ktime_get_update_offsets_now - hrtimer helper
  * @offs_real:	pointer to storage for monotonic -> realtime offset
@@ -1986,7 +1955,6 @@ ktime_t ktime_get_update_offsets_now(ktime_t *offs_real, ktime_t *offs_boot,
 
 	return ktime_add_ns(base, nsecs);
 }
-#endif
 
 /**
  * do_adjtimex() - Accessor function to NTP __do_adjtimex function
diff --git a/kernel/time/timekeeping.h b/kernel/time/timekeeping.h
index 5b57f6c9ae34..4d177fce37d5 100644
--- a/kernel/time/timekeeping.h
+++ b/kernel/time/timekeeping.h
@@ -3,9 +3,6 @@
 /*
  * Internal interfaces for kernel/time/
  */
-extern ktime_t ktime_get_update_offsets_tick(ktime_t *offs_real,
-						ktime_t *offs_boot,
-						ktime_t *offs_tai);
 extern ktime_t ktime_get_update_offsets_now(ktime_t *offs_real,
 						ktime_t *offs_boot,
 						ktime_t *offs_tai);
-- 
cgit v1.2.3


From 868a3e915f7f5eba8f8cb4f7da2276760807c51c Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Tue, 14 Apr 2015 21:08:37 +0000
Subject: hrtimer: Make offset update smarter

On every tick/hrtimer interrupt we update the offset variables of the
clock bases. That's silly because these offsets change very seldom.

Add a sequence counter to the time keeping code which keeps track of
the offset updates (clock_was_set()). Have a sequence cache in the
hrtimer cpu bases to evaluate whether the offsets must be updated or
not. This allows us later to avoid pointless cacheline pollution.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Preeti U Murthy <preeti@linux.vnet.ibm.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Viresh Kumar <viresh.kumar@linaro.org>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: John Stultz <john.stultz@linaro.org>
Link: http://lkml.kernel.org/r/20150414203501.132820245@linutronix.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: John Stultz <john.stultz@linaro.org>
---
 include/linux/hrtimer.h             |  4 ++--
 include/linux/timekeeper_internal.h |  2 ++
 kernel/time/hrtimer.c               |  3 ++-
 kernel/time/timekeeping.c           | 23 ++++++++++++++++-------
 kernel/time/timekeeping.h           |  7 ++++---
 5 files changed, 26 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index e292830b58f0..5e04f8fc26f6 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -163,7 +163,7 @@ enum  hrtimer_base_type {
  *			and timers
  * @cpu:		cpu number
  * @active_bases:	Bitfield to mark bases with active timers
- * @clock_was_set:	Indicates that clock was set from irq context.
+ * @clock_was_set_seq:	Sequence counter of clock was set events
  * @expires_next:	absolute time of the next event which was scheduled
  *			via clock_set_next_event()
  * @in_hrtirq:		hrtimer_interrupt() is currently executing
@@ -179,7 +179,7 @@ struct hrtimer_cpu_base {
 	raw_spinlock_t			lock;
 	unsigned int			cpu;
 	unsigned int			active_bases;
-	unsigned int			clock_was_set;
+	unsigned int			clock_was_set_seq;
 #ifdef CONFIG_HIGH_RES_TIMERS
 	ktime_t				expires_next;
 	int				in_hrtirq;
diff --git a/include/linux/timekeeper_internal.h b/include/linux/timekeeper_internal.h
index fb86963859c7..6f8276ae579c 100644
--- a/include/linux/timekeeper_internal.h
+++ b/include/linux/timekeeper_internal.h
@@ -49,6 +49,7 @@ struct tk_read_base {
  * @offs_boot:		Offset clock monotonic -> clock boottime
  * @offs_tai:		Offset clock monotonic -> clock tai
  * @tai_offset:		The current UTC to TAI offset in seconds
+ * @clock_was_set_seq:	The sequence number of clock was set events
  * @raw_time:		Monotonic raw base time in timespec64 format
  * @cycle_interval:	Number of clock cycles in one NTP interval
  * @xtime_interval:	Number of clock shifted nano seconds in one NTP
@@ -85,6 +86,7 @@ struct timekeeper {
 	ktime_t			offs_boot;
 	ktime_t			offs_tai;
 	s32			tai_offset;
+	unsigned int		clock_was_set_seq;
 	struct timespec64	raw_time;
 
 	/* The following members are for timekeeping internal use */
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 9e111dd83ca3..8ce9b3138017 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -451,7 +451,8 @@ static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base)
 	ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset;
 	ktime_t *offs_tai = &base->clock_base[HRTIMER_BASE_TAI].offset;
 
-	return ktime_get_update_offsets_now(offs_real, offs_boot, offs_tai);
+	return ktime_get_update_offsets_now(&base->clock_was_set_seq,
+					    offs_real, offs_boot, offs_tai);
 }
 
 /* High resolution timer related functions */
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index dd1efa6a4ea4..3365e32dc208 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -602,6 +602,9 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action)
 
 	update_fast_timekeeper(&tk->tkr_mono, &tk_fast_mono);
 	update_fast_timekeeper(&tk->tkr_raw,  &tk_fast_raw);
+
+	if (action & TK_CLOCK_WAS_SET)
+		tk->clock_was_set_seq++;
 }
 
 /**
@@ -1927,15 +1930,19 @@ void do_timer(unsigned long ticks)
 
 /**
  * ktime_get_update_offsets_now - hrtimer helper
+ * @cwsseq:	pointer to check and store the clock was set sequence number
  * @offs_real:	pointer to storage for monotonic -> realtime offset
  * @offs_boot:	pointer to storage for monotonic -> boottime offset
  * @offs_tai:	pointer to storage for monotonic -> clock tai offset
  *
- * Returns current monotonic time and updates the offsets
+ * Returns current monotonic time and updates the offsets if the
+ * sequence number in @cwsseq and timekeeper.clock_was_set_seq are
+ * different.
+ *
  * Called from hrtimer_interrupt() or retrigger_next_event()
  */
-ktime_t ktime_get_update_offsets_now(ktime_t *offs_real, ktime_t *offs_boot,
-							ktime_t *offs_tai)
+ktime_t ktime_get_update_offsets_now(unsigned int *cwsseq, ktime_t *offs_real,
+				     ktime_t *offs_boot, ktime_t *offs_tai)
 {
 	struct timekeeper *tk = &tk_core.timekeeper;
 	unsigned int seq;
@@ -1947,10 +1954,12 @@ ktime_t ktime_get_update_offsets_now(ktime_t *offs_real, ktime_t *offs_boot,
 
 		base = tk->tkr_mono.base;
 		nsecs = timekeeping_get_ns(&tk->tkr_mono);
-
-		*offs_real = tk->offs_real;
-		*offs_boot = tk->offs_boot;
-		*offs_tai = tk->offs_tai;
+		if (*cwsseq != tk->clock_was_set_seq) {
+			*cwsseq = tk->clock_was_set_seq;
+			*offs_real = tk->offs_real;
+			*offs_boot = tk->offs_boot;
+			*offs_tai = tk->offs_tai;
+		}
 	} while (read_seqcount_retry(&tk_core.seq, seq));
 
 	return ktime_add_ns(base, nsecs);
diff --git a/kernel/time/timekeeping.h b/kernel/time/timekeeping.h
index 4d177fce37d5..704f595ce83f 100644
--- a/kernel/time/timekeeping.h
+++ b/kernel/time/timekeeping.h
@@ -3,9 +3,10 @@
 /*
  * Internal interfaces for kernel/time/
  */
-extern ktime_t ktime_get_update_offsets_now(ktime_t *offs_real,
-						ktime_t *offs_boot,
-						ktime_t *offs_tai);
+extern ktime_t ktime_get_update_offsets_now(unsigned int *cwsseq,
+					    ktime_t *offs_real,
+					    ktime_t *offs_boot,
+					    ktime_t *offs_tai);
 
 extern int timekeeping_valid_for_hres(void);
 extern u64 timekeeping_max_deferment(void);
-- 
cgit v1.2.3


From e19ffe8be2cd0a1f726b235443eba21e64f6be5e Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Tue, 14 Apr 2015 21:08:39 +0000
Subject: hrtimer: Use bits for various boolean indicators

No point in wasting 12 byte storage space. Generates better code as well.

Text size reduction:
       x8664 -64, i386 -16, ARM -132, ARM64 -0, power64 -48

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Preeti U Murthy <preeti@linux.vnet.ibm.com>
Cc: Viresh Kumar <viresh.kumar@linaro.org>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Link: http://lkml.kernel.org/r/20150414203501.227955358@linutronix.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/hrtimer.h |  6 +++---
 kernel/time/hrtimer.c   | 24 ++++++++++++++++--------
 2 files changed, 19 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 5e04f8fc26f6..17a59ddcc79a 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -181,10 +181,10 @@ struct hrtimer_cpu_base {
 	unsigned int			active_bases;
 	unsigned int			clock_was_set_seq;
 #ifdef CONFIG_HIGH_RES_TIMERS
+	unsigned int			in_hrtirq	: 1,
+					hres_active	: 1,
+					hang_detected	: 1;
 	ktime_t				expires_next;
-	int				in_hrtirq;
-	int				hres_active;
-	int				hang_detected;
 	unsigned int			nr_events;
 	unsigned int			nr_retries;
 	unsigned int			nr_hangs;
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 8ce9b3138017..9bbfe33f6575 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -492,9 +492,14 @@ static inline int hrtimer_is_hres_enabled(void)
 /*
  * Is the high resolution mode active ?
  */
+static inline int __hrtimer_hres_active(struct hrtimer_cpu_base *cpu_base)
+{
+	return cpu_base->hres_active;
+}
+
 static inline int hrtimer_hres_active(void)
 {
-	return __this_cpu_read(hrtimer_bases.hres_active);
+	return __hrtimer_hres_active(this_cpu_ptr(&hrtimer_bases));
 }
 
 /*
@@ -628,7 +633,7 @@ static void retrigger_next_event(void *arg)
 {
 	struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases);
 
-	if (!hrtimer_hres_active())
+	if (!base->hres_active)
 		return;
 
 	raw_spin_lock(&base->lock);
@@ -685,6 +690,7 @@ void clock_was_set_delayed(void)
 
 #else
 
+static inline int __hrtimer_hres_active(struct hrtimer_cpu_base *b) { return 0; }
 static inline int hrtimer_hres_active(void) { return 0; }
 static inline int hrtimer_is_hres_enabled(void) { return 0; }
 static inline int hrtimer_switch_to_hres(void) { return 0; }
@@ -862,25 +868,27 @@ static void __remove_hrtimer(struct hrtimer *timer,
 			     struct hrtimer_clock_base *base,
 			     unsigned long newstate, int reprogram)
 {
+	struct hrtimer_cpu_base *cpu_base = base->cpu_base;
 	struct timerqueue_node *next_timer;
+
 	if (!(timer->state & HRTIMER_STATE_ENQUEUED))
 		goto out;
 
 	next_timer = timerqueue_getnext(&base->active);
 	timerqueue_del(&base->active, &timer->node);
 	if (!timerqueue_getnext(&base->active))
-		base->cpu_base->active_bases &= ~(1 << base->index);
+		cpu_base->active_bases &= ~(1 << base->index);
 
 	if (&timer->node == next_timer) {
 #ifdef CONFIG_HIGH_RES_TIMERS
 		/* Reprogram the clock event device. if enabled */
-		if (reprogram && hrtimer_hres_active()) {
+		if (reprogram && cpu_base->hres_active) {
 			ktime_t expires;
 
 			expires = ktime_sub(hrtimer_get_expires(timer),
 					    base->offset);
-			if (base->cpu_base->expires_next.tv64 == expires.tv64)
-				hrtimer_force_reprogram(base->cpu_base, 1);
+			if (cpu_base->expires_next.tv64 == expires.tv64)
+				hrtimer_force_reprogram(cpu_base, 1);
 		}
 #endif
 	}
@@ -1114,7 +1122,7 @@ ktime_t hrtimer_get_next_event(void)
 
 	raw_spin_lock_irqsave(&cpu_base->lock, flags);
 
-	if (!hrtimer_hres_active())
+	if (!__hrtimer_hres_active(cpu_base))
 		mindelta = ktime_sub(__hrtimer_get_next_event(cpu_base),
 				     ktime_get());
 
@@ -1412,7 +1420,7 @@ void hrtimer_run_queues(void)
 	struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
 	ktime_t now;
 
-	if (hrtimer_hres_active())
+	if (__hrtimer_hres_active(cpu_base))
 		return;
 
 	raw_spin_lock(&cpu_base->lock);
-- 
cgit v1.2.3


From 34aee88a02ba296a8e8c9523cdf77147731903f1 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Tue, 14 Apr 2015 21:08:41 +0000
Subject: hrtimer: Use cpu_base->active_base for hotpath iterators

The active_bases field is guaranteed to be in sync with the timerqueue
of the corresponding clock base. So we can use it for iterating over
the clock bases. This allows to break out early if no more active
clock bases are available and avoids touching the cache lines of
inactive clock bases.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Preeti U Murthy <preeti@linux.vnet.ibm.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Viresh Kumar <viresh.kumar@linaro.org>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Link: http://lkml.kernel.org/r/20150414203501.322887675@linutronix.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/time/hrtimer.c | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 9bbfe33f6575..fce0ccf97b51 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -419,16 +419,16 @@ static ktime_t __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base)
 {
 	struct hrtimer_clock_base *base = cpu_base->clock_base;
 	ktime_t expires, expires_next = { .tv64 = KTIME_MAX };
-	int i;
+	unsigned int active = cpu_base->active_bases;
 
-	for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++, base++) {
+	for (; active; base++, active >>= 1) {
 		struct timerqueue_node *next;
 		struct hrtimer *timer;
 
-		next = timerqueue_getnext(&base->active);
-		if (!next)
+		if (!(active & 0x01))
 			continue;
 
+		next = timerqueue_getnext(&base->active);
 		timer = container_of(next, struct hrtimer, node);
 		expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
 		if (expires.tv64 < expires_next.tv64)
@@ -1214,17 +1214,16 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
 
 static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now)
 {
-	int i;
+	struct hrtimer_clock_base *base = cpu_base->clock_base;
+	unsigned int active = cpu_base->active_bases;
 
-	for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
-		struct hrtimer_clock_base *base;
+	for (; active; base++, active >>= 1) {
 		struct timerqueue_node *node;
 		ktime_t basenow;
 
-		if (!(cpu_base->active_bases & (1 << i)))
+		if (!(active & 0x01))
 			continue;
 
-		base = cpu_base->clock_base + i;
 		basenow = ktime_add(now, base->offset);
 
 		while ((node = timerqueue_getnext(&base->active))) {
-- 
cgit v1.2.3


From b97f44c9b658d52e0139c947ea5519e51ba38d81 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Tue, 14 Apr 2015 21:08:47 +0000
Subject: hrtimer: Make use of timerqueue_add/del return values

Use the return value instead of reevaluating the information.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Preeti U Murthy <preeti@linux.vnet.ibm.com>
Cc: Viresh Kumar <viresh.kumar@linaro.org>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Link: http://lkml.kernel.org/r/20150414203501.658152945@linutronix.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/time/hrtimer.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index fce0ccf97b51..0cd1e0b8099d 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -842,7 +842,6 @@ static int enqueue_hrtimer(struct hrtimer *timer,
 {
 	debug_activate(timer);
 
-	timerqueue_add(&base->active, &timer->node);
 	base->cpu_base->active_bases |= 1 << base->index;
 
 	/*
@@ -851,7 +850,7 @@ static int enqueue_hrtimer(struct hrtimer *timer,
 	 */
 	timer->state |= HRTIMER_STATE_ENQUEUED;
 
-	return (&timer->node == base->active.next);
+	return timerqueue_add(&base->active, &timer->node);
 }
 
 /*
@@ -875,8 +874,7 @@ static void __remove_hrtimer(struct hrtimer *timer,
 		goto out;
 
 	next_timer = timerqueue_getnext(&base->active);
-	timerqueue_del(&base->active, &timer->node);
-	if (!timerqueue_getnext(&base->active))
+	if (!timerqueue_del(&base->active, &timer->node))
 		cpu_base->active_bases &= ~(1 << base->index);
 
 	if (&timer->node == next_timer) {
-- 
cgit v1.2.3


From 895bdfa793f6e912d1a58fc445b3dd4d686f7bd3 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Tue, 14 Apr 2015 21:08:49 +0000
Subject: hrtimer: Keep pointer to first timer and simplify __remove_hrtimer()

__remove_hrtimer() needs to evaluate the expiry time to figure out
whether the timer which is removed is eventually the first expiring
timer on the cpu. Keep a pointer to it, which is lazily updated, so we
can avoid the evaluation dance and retrieve the information from there.

Generates slightly better code.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Preeti U Murthy <preeti@linux.vnet.ibm.com>
Cc: Viresh Kumar <viresh.kumar@linaro.org>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Link: http://lkml.kernel.org/r/20150414203501.752838019@linutronix.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/hrtimer.h |  6 ++++++
 kernel/time/hrtimer.c   | 46 ++++++++++++++++++++++++++++------------------
 2 files changed, 34 insertions(+), 18 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index e5c22d611850..d194c1dacdaa 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -172,6 +172,7 @@ enum  hrtimer_base_type {
  * @clock_was_set_seq:	Sequence counter of clock was set events
  * @expires_next:	absolute time of the next event which was scheduled
  *			via clock_set_next_event()
+ * @next_timer:		Pointer to the first expiring timer
  * @in_hrtirq:		hrtimer_interrupt() is currently executing
  * @hres_active:	State of high resolution mode
  * @hang_detected:	The last hrtimer interrupt detected a hang
@@ -180,6 +181,10 @@ enum  hrtimer_base_type {
  * @nr_hangs:		Total number of hrtimer interrupt hangs
  * @max_hang_time:	Maximum time spent in hrtimer_interrupt
  * @clock_base:		array of clock bases for this cpu
+ *
+ * Note: next_timer is just an optimization for __remove_hrtimer().
+ *	 Do not dereference the pointer because it is not reliable on
+ *	 cross cpu removals.
  */
 struct hrtimer_cpu_base {
 	raw_spinlock_t			lock;
@@ -191,6 +196,7 @@ struct hrtimer_cpu_base {
 					hres_active	: 1,
 					hang_detected	: 1;
 	ktime_t				expires_next;
+	struct hrtimer			*next_timer;
 	unsigned int			nr_events;
 	unsigned int			nr_retries;
 	unsigned int			nr_hangs;
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 0cd1e0b8099d..30178d0656cf 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -415,12 +415,21 @@ static inline void debug_deactivate(struct hrtimer *timer)
 }
 
 #if defined(CONFIG_NO_HZ_COMMON) || defined(CONFIG_HIGH_RES_TIMERS)
+static inline void hrtimer_update_next_timer(struct hrtimer_cpu_base *cpu_base,
+					     struct hrtimer *timer)
+{
+#ifdef CONFIG_HIGH_RES_TIMERS
+	cpu_base->next_timer = timer;
+#endif
+}
+
 static ktime_t __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base)
 {
 	struct hrtimer_clock_base *base = cpu_base->clock_base;
 	ktime_t expires, expires_next = { .tv64 = KTIME_MAX };
 	unsigned int active = cpu_base->active_bases;
 
+	hrtimer_update_next_timer(cpu_base, NULL);
 	for (; active; base++, active >>= 1) {
 		struct timerqueue_node *next;
 		struct hrtimer *timer;
@@ -431,8 +440,10 @@ static ktime_t __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base)
 		next = timerqueue_getnext(&base->active);
 		timer = container_of(next, struct hrtimer, node);
 		expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
-		if (expires.tv64 < expires_next.tv64)
+		if (expires.tv64 < expires_next.tv64) {
 			expires_next = expires;
+			hrtimer_update_next_timer(cpu_base, timer);
+		}
 	}
 	/*
 	 * clock_was_set() might have changed base->offset of any of
@@ -597,6 +608,8 @@ static int hrtimer_reprogram(struct hrtimer *timer,
 	if (cpu_base->in_hrtirq)
 		return 0;
 
+	cpu_base->next_timer = timer;
+
 	/*
 	 * If a hang was detected in the last timer interrupt then we
 	 * do not schedule a timer which is earlier than the expiry
@@ -868,30 +881,27 @@ static void __remove_hrtimer(struct hrtimer *timer,
 			     unsigned long newstate, int reprogram)
 {
 	struct hrtimer_cpu_base *cpu_base = base->cpu_base;
-	struct timerqueue_node *next_timer;
+	unsigned int state = timer->state;
 
-	if (!(timer->state & HRTIMER_STATE_ENQUEUED))
-		goto out;
+	timer->state = newstate;
+	if (!(state & HRTIMER_STATE_ENQUEUED))
+		return;
 
-	next_timer = timerqueue_getnext(&base->active);
 	if (!timerqueue_del(&base->active, &timer->node))
 		cpu_base->active_bases &= ~(1 << base->index);
 
-	if (&timer->node == next_timer) {
 #ifdef CONFIG_HIGH_RES_TIMERS
-		/* Reprogram the clock event device. if enabled */
-		if (reprogram && cpu_base->hres_active) {
-			ktime_t expires;
-
-			expires = ktime_sub(hrtimer_get_expires(timer),
-					    base->offset);
-			if (cpu_base->expires_next.tv64 == expires.tv64)
-				hrtimer_force_reprogram(cpu_base, 1);
-		}
+	/*
+	 * Note: If reprogram is false we do not update
+	 * cpu_base->next_timer. This happens when we remove the first
+	 * timer on a remote cpu. No harm as we never dereference
+	 * cpu_base->next_timer. So the worst thing what can happen is
+	 * an superflous call to hrtimer_force_reprogram() on the
+	 * remote cpu later on if the same timer gets enqueued again.
+	 */
+	if (reprogram && timer == cpu_base->next_timer)
+		hrtimer_force_reprogram(cpu_base, 1);
 #endif
-	}
-out:
-	timer->state = newstate;
 }
 
 /*
-- 
cgit v1.2.3


From c6eb3f70d4482806dc2d3e1e3c7736f497b1d418 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Tue, 14 Apr 2015 21:08:51 +0000
Subject: hrtimer: Get rid of hrtimer softirq

hrtimer softirq is a leftover from the initial implementation and
serves only the purpose to handle the enqueueing of already expired
timers in the high resolution timer mode. We discussed whether we
change the return value and force all start sites to handle that the
timer is already expired, but that would be a Herculean task and I'm
not sure whether its a good idea to enforce that handling on
everyone.

A simpler solution is to enforce a timer interrupt instead of raising
and scheduling a softirq. Just use the existing infrastructure to do
so and remove all the softirq leftovers.

The HRTIMER softirq enum is now unused, but kept around because trace
parsers rely on the existing numbering.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Preeti U Murthy <preeti@linux.vnet.ibm.com>
Cc: Viresh Kumar <viresh.kumar@linaro.org>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Link: http://lkml.kernel.org/r/20150414203501.840834708@linutronix.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/hrtimer.h   |   1 -
 include/linux/interrupt.h |   3 +-
 kernel/time/hrtimer.c     | 163 ++++++++++++----------------------------------
 kernel/time/tick-common.c |  10 +++
 kernel/time/timer.c       |   2 -
 5 files changed, 55 insertions(+), 124 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index d194c1dacdaa..048270a27bc5 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -459,7 +459,6 @@ extern int schedule_hrtimeout(ktime_t *expires, const enum hrtimer_mode mode);
 
 /* Soft interrupt function to run the hrtimer queues: */
 extern void hrtimer_run_queues(void);
-extern void hrtimer_run_pending(void);
 
 /* Bootup initialization: */
 extern void __init hrtimers_init(void);
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 950ae4501826..6bf15a66bce7 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -413,7 +413,8 @@ enum
 	BLOCK_IOPOLL_SOFTIRQ,
 	TASKLET_SOFTIRQ,
 	SCHED_SOFTIRQ,
-	HRTIMER_SOFTIRQ,
+	HRTIMER_SOFTIRQ, /* Unused, but kept as tools rely on the
+			    numbering. Sigh! */
 	RCU_SOFTIRQ,    /* Preferable RCU should always be the last softirq */
 
 	NR_SOFTIRQS
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 30178d0656cf..fc6b6d25f93d 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -555,59 +555,48 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal)
 }
 
 /*
- * Shared reprogramming for clock_realtime and clock_monotonic
- *
  * When a timer is enqueued and expires earlier than the already enqueued
  * timers, we have to check, whether it expires earlier than the timer for
  * which the clock event device was armed.
  *
- * Note, that in case the state has HRTIMER_STATE_CALLBACK set, no reprogramming
- * and no expiry check happens. The timer gets enqueued into the rbtree. The
- * reprogramming and expiry check is done in the hrtimer_interrupt or in the
- * softirq.
- *
  * Called with interrupts disabled and base->cpu_base.lock held
  */
-static int hrtimer_reprogram(struct hrtimer *timer,
-			     struct hrtimer_clock_base *base)
+static void hrtimer_reprogram(struct hrtimer *timer,
+			      struct hrtimer_clock_base *base)
 {
 	struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
 	ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
-	int res;
 
 	WARN_ON_ONCE(hrtimer_get_expires_tv64(timer) < 0);
 
 	/*
-	 * When the callback is running, we do not reprogram the clock event
-	 * device. The timer callback is either running on a different CPU or
-	 * the callback is executed in the hrtimer_interrupt context. The
-	 * reprogramming is handled either by the softirq, which called the
-	 * callback or at the end of the hrtimer_interrupt.
+	 * If the timer is not on the current cpu, we cannot reprogram
+	 * the other cpus clock event device.
 	 */
-	if (hrtimer_callback_running(timer))
-		return 0;
+	if (base->cpu_base != cpu_base)
+		return;
+
+	/*
+	 * If the hrtimer interrupt is running, then it will
+	 * reevaluate the clock bases and reprogram the clock event
+	 * device. The callbacks are always executed in hard interrupt
+	 * context so we don't need an extra check for a running
+	 * callback.
+	 */
+	if (cpu_base->in_hrtirq)
+		return;
 
 	/*
 	 * CLOCK_REALTIME timer might be requested with an absolute
-	 * expiry time which is less than base->offset. Nothing wrong
-	 * about that, just avoid to call into the tick code, which
-	 * has now objections against negative expiry values.
+	 * expiry time which is less than base->offset. Set it to 0.
 	 */
 	if (expires.tv64 < 0)
-		return -ETIME;
+		expires.tv64 = 0;
 
 	if (expires.tv64 >= cpu_base->expires_next.tv64)
-		return 0;
-
-	/*
-	 * When the target cpu of the timer is currently executing
-	 * hrtimer_interrupt(), then we do not touch the clock event
-	 * device. hrtimer_interrupt() will reevaluate all clock bases
-	 * before reprogramming the device.
-	 */
-	if (cpu_base->in_hrtirq)
-		return 0;
+		return;
 
+	/* Update the pointer to the next expiring timer */
 	cpu_base->next_timer = timer;
 
 	/*
@@ -617,15 +606,14 @@ static int hrtimer_reprogram(struct hrtimer *timer,
 	 * to make progress.
 	 */
 	if (cpu_base->hang_detected)
-		return 0;
+		return;
 
 	/*
-	 * Clockevents returns -ETIME, when the event was in the past.
+	 * Program the timer hardware. We enforce the expiry for
+	 * events which are already in the past.
 	 */
-	res = tick_program_event(expires, 0);
-	if (!IS_ERR_VALUE(res))
-		cpu_base->expires_next = expires;
-	return res;
+	cpu_base->expires_next = expires;
+	tick_program_event(expires, 1);
 }
 
 /*
@@ -660,19 +648,11 @@ static void retrigger_next_event(void *arg)
  */
 static int hrtimer_switch_to_hres(void)
 {
-	int cpu = smp_processor_id();
-	struct hrtimer_cpu_base *base = &per_cpu(hrtimer_bases, cpu);
-	unsigned long flags;
-
-	if (base->hres_active)
-		return 1;
-
-	local_irq_save(flags);
+	struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases);
 
 	if (tick_init_highres()) {
-		local_irq_restore(flags);
 		printk(KERN_WARNING "Could not switch to high resolution "
-				    "mode on CPU %d\n", cpu);
+				    "mode on CPU %d\n", base->cpu);
 		return 0;
 	}
 	base->hres_active = 1;
@@ -681,7 +661,6 @@ static int hrtimer_switch_to_hres(void)
 	tick_setup_sched_timer();
 	/* "Retrigger" the interrupt to get things going */
 	retrigger_next_event(NULL);
-	local_irq_restore(flags);
 	return 1;
 }
 
@@ -984,26 +963,8 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
 		 * on dynticks target.
 		 */
 		wake_up_nohz_cpu(new_base->cpu_base->cpu);
-	} else if (new_base->cpu_base == this_cpu_ptr(&hrtimer_bases) &&
-			hrtimer_reprogram(timer, new_base)) {
-		/*
-		 * Only allow reprogramming if the new base is on this CPU.
-		 * (it might still be on another CPU if the timer was pending)
-		 *
-		 * XXX send_remote_softirq() ?
-		 */
-		if (wakeup) {
-			/*
-			 * We need to drop cpu_base->lock to avoid a
-			 * lock ordering issue vs. rq->lock.
-			 */
-			raw_spin_unlock(&new_base->cpu_base->lock);
-			raise_softirq_irqoff(HRTIMER_SOFTIRQ);
-			local_irq_restore(flags);
-			return ret;
-		} else {
-			__raise_softirq_irqoff(HRTIMER_SOFTIRQ);
-		}
+	} else {
+		hrtimer_reprogram(timer, new_base);
 	}
 
 	unlock_hrtimer_base(timer, &flags);
@@ -1354,7 +1315,7 @@ retry:
  * local version of hrtimer_peek_ahead_timers() called with interrupts
  * disabled.
  */
-static void __hrtimer_peek_ahead_timers(void)
+static inline void __hrtimer_peek_ahead_timers(void)
 {
 	struct tick_device *td;
 
@@ -1366,29 +1327,6 @@ static void __hrtimer_peek_ahead_timers(void)
 		hrtimer_interrupt(td->evtdev);
 }
 
-/**
- * hrtimer_peek_ahead_timers -- run soft-expired timers now
- *
- * hrtimer_peek_ahead_timers will peek at the timer queue of
- * the current cpu and check if there are any timers for which
- * the soft expires time has passed. If any such timers exist,
- * they are run immediately and then removed from the timer queue.
- *
- */
-void hrtimer_peek_ahead_timers(void)
-{
-	unsigned long flags;
-
-	local_irq_save(flags);
-	__hrtimer_peek_ahead_timers();
-	local_irq_restore(flags);
-}
-
-static void run_hrtimer_softirq(struct softirq_action *h)
-{
-	hrtimer_peek_ahead_timers();
-}
-
 #else /* CONFIG_HIGH_RES_TIMERS */
 
 static inline void __hrtimer_peek_ahead_timers(void) { }
@@ -1396,31 +1334,7 @@ static inline void __hrtimer_peek_ahead_timers(void) { }
 #endif	/* !CONFIG_HIGH_RES_TIMERS */
 
 /*
- * Called from timer softirq every jiffy, expire hrtimers:
- *
- * For HRT its the fall back code to run the softirq in the timer
- * softirq context in case the hrtimer initialization failed or has
- * not been done yet.
- */
-void hrtimer_run_pending(void)
-{
-	if (hrtimer_hres_active())
-		return;
-
-	/*
-	 * This _is_ ugly: We have to check in the softirq context,
-	 * whether we can switch to highres and / or nohz mode. The
-	 * clocksource switch happens in the timer interrupt with
-	 * xtime_lock held. Notification from there only sets the
-	 * check bit in the tick_oneshot code, otherwise we might
-	 * deadlock vs. xtime_lock.
-	 */
-	if (tick_check_oneshot_change(!hrtimer_is_hres_enabled()))
-		hrtimer_switch_to_hres();
-}
-
-/*
- * Called from hardirq context every jiffy
+ * Called from run_local_timers in hardirq context every jiffy
  */
 void hrtimer_run_queues(void)
 {
@@ -1430,6 +1344,18 @@ void hrtimer_run_queues(void)
 	if (__hrtimer_hres_active(cpu_base))
 		return;
 
+	/*
+	 * This _is_ ugly: We have to check periodically, whether we
+	 * can switch to highres and / or nohz mode. The clocksource
+	 * switch happens with xtime_lock held. Notification from
+	 * there only sets the check bit in the tick_oneshot code,
+	 * otherwise we might deadlock vs. xtime_lock.
+	 */
+	if (tick_check_oneshot_change(!hrtimer_is_hres_enabled())) {
+		hrtimer_switch_to_hres();
+		return;
+	}
+
 	raw_spin_lock(&cpu_base->lock);
 	now = hrtimer_update_base(cpu_base);
 	__hrtimer_run_queues(cpu_base, now);
@@ -1700,9 +1626,6 @@ void __init hrtimers_init(void)
 	hrtimer_cpu_notify(&hrtimers_nb, (unsigned long)CPU_UP_PREPARE,
 			  (void *)(long)smp_processor_id());
 	register_cpu_notifier(&hrtimers_nb);
-#ifdef CONFIG_HIGH_RES_TIMERS
-	open_softirq(HRTIMER_SOFTIRQ, run_hrtimer_softirq);
-#endif
 }
 
 /**
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 3ae6afa1eb98..ea5f9eae8f74 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -102,6 +102,16 @@ void tick_handle_periodic(struct clock_event_device *dev)
 
 	tick_periodic(cpu);
 
+#if defined(CONFIG_HIGH_RES_TIMERS) || defined(CONFIG_NO_HZ_COMMON)
+	/*
+	 * The cpu might have transitioned to HIGHRES or NOHZ mode via
+	 * update_process_times() -> run_local_timers() ->
+	 * hrtimer_run_queues().
+	 */
+	if (dev->event_handler != tick_handle_periodic)
+		return;
+#endif
+
 	if (dev->state != CLOCK_EVT_STATE_ONESHOT)
 		return;
 	for (;;) {
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 2ece3aa5069c..b31f13f4fe41 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -1409,8 +1409,6 @@ static void run_timer_softirq(struct softirq_action *h)
 {
 	struct tvec_base *base = __this_cpu_read(tvec_bases);
 
-	hrtimer_run_pending();
-
 	if (time_after_eq(jiffies, base->timer_jiffies))
 		__run_timers(base);
 }
-- 
cgit v1.2.3


From afc08b15cc2a3d2c48cbd427be8e0eea05698363 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Tue, 14 Apr 2015 21:08:52 +0000
Subject: tick: sched: Remove hrtimer_active() checks

hrtimer_start() enforces a timer interrupt if the timer is already
expired. Get rid of the checks and the forward loop.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Frederic Weisbecker <fweisbec@gmail.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Preeti U Murthy <preeti@linux.vnet.ibm.com>
Cc: Viresh Kumar <viresh.kumar@linaro.org>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: John Stultz <john.stultz@linaro.org>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Link: http://lkml.kernel.org/r/20150414203501.943658239@linutronix.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

---
 kernel/time/tick-sched.c | 19 ++++---------------
 1 file changed, 4 insertions(+), 15 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 914259128145..dc586c371687 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -696,11 +696,9 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
 		if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
 			hrtimer_start(&ts->sched_timer, expires,
 				      HRTIMER_MODE_ABS_PINNED);
-			/* Check, if the timer was already in the past */
-			if (hrtimer_active(&ts->sched_timer))
-				goto out;
+			goto out;
 		} else if (!tick_program_event(expires, 0))
-				goto out;
+			goto out;
 		/*
 		 * We are past the event already. So we crossed a
 		 * jiffie boundary. Update jiffies and raise the
@@ -888,8 +886,6 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)
 		if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
 			hrtimer_start_expires(&ts->sched_timer,
 					      HRTIMER_MODE_ABS_PINNED);
-			/* Check, if the timer was already in the past */
-			if (hrtimer_active(&ts->sched_timer))
 				break;
 		} else {
 			if (!tick_program_event(
@@ -1167,15 +1163,8 @@ void tick_setup_sched_timer(void)
 		hrtimer_add_expires_ns(&ts->sched_timer, offset);
 	}
 
-	for (;;) {
-		hrtimer_forward(&ts->sched_timer, now, tick_period);
-		hrtimer_start_expires(&ts->sched_timer,
-				      HRTIMER_MODE_ABS_PINNED);
-		/* Check, if the timer was already in the past */
-		if (hrtimer_active(&ts->sched_timer))
-			break;
-		now = ktime_get();
-	}
+	hrtimer_forward(&ts->sched_timer, now, tick_period);
+	hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED);
 
 #ifdef CONFIG_NO_HZ_COMMON
 	if (tick_nohz_enabled) {
-- 
cgit v1.2.3


From 0ff53d09642204c648424def0caa9117e7a3caaf Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Tue, 14 Apr 2015 21:08:54 +0000
Subject: tick: sched: Force tick interrupt and get rid of softirq magic

We already got rid of the hrtimer reprogramming loops and hoops as
hrtimer now enforces an interrupt if the enqueued time is in the past.

Do the same for the nohz non highres mode. That gets rid of the need
to raise the softirq which only serves the purpose of getting the
machine out of the inner idle loop.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Preeti U Murthy <preeti@linux.vnet.ibm.com>
Cc: Viresh Kumar <viresh.kumar@linaro.org>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: John Stultz <john.stultz@linaro.org>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Link: http://lkml.kernel.org/r/20150414203502.023464878@linutronix.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

---
 kernel/time/tick-sched.c | 83 +++++++++++++++++-------------------------------
 1 file changed, 29 insertions(+), 54 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index dc586c371687..0f07ff2ba22b 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -565,6 +565,20 @@ u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time)
 }
 EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us);
 
+static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)
+{
+	hrtimer_cancel(&ts->sched_timer);
+	hrtimer_set_expires(&ts->sched_timer, ts->last_tick);
+
+	/* Forward the time to expire in the future */
+	hrtimer_forward(&ts->sched_timer, now, tick_period);
+
+	if (ts->nohz_mode == NOHZ_MODE_HIGHRES)
+		hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED);
+	else
+		tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1);
+}
+
 static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
 					 ktime_t now, int cpu)
 {
@@ -691,22 +705,18 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
 			if (ts->nohz_mode == NOHZ_MODE_HIGHRES)
 				hrtimer_cancel(&ts->sched_timer);
 			goto out;
-		}
+		 }
 
-		if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
-			hrtimer_start(&ts->sched_timer, expires,
-				      HRTIMER_MODE_ABS_PINNED);
-			goto out;
-		} else if (!tick_program_event(expires, 0))
-			goto out;
-		/*
-		 * We are past the event already. So we crossed a
-		 * jiffie boundary. Update jiffies and raise the
-		 * softirq.
-		 */
-		tick_do_update_jiffies64(ktime_get());
+		 if (ts->nohz_mode == NOHZ_MODE_HIGHRES)
+			 hrtimer_start(&ts->sched_timer, expires,
+				       HRTIMER_MODE_ABS_PINNED);
+		 else
+			 tick_program_event(expires, 1);
+	} else {
+		/* Tick is stopped, but required now. Enforce it */
+		tick_nohz_restart(ts, now);
 	}
-	raise_softirq_irqoff(TIMER_SOFTIRQ);
+
 out:
 	ts->next_jiffies = next_jiffies;
 	ts->last_jiffies = last_jiffies;
@@ -874,30 +884,6 @@ ktime_t tick_nohz_get_sleep_length(void)
 	return ts->sleep_length;
 }
 
-static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)
-{
-	hrtimer_cancel(&ts->sched_timer);
-	hrtimer_set_expires(&ts->sched_timer, ts->last_tick);
-
-	while (1) {
-		/* Forward the time to expire in the future */
-		hrtimer_forward(&ts->sched_timer, now, tick_period);
-
-		if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
-			hrtimer_start_expires(&ts->sched_timer,
-					      HRTIMER_MODE_ABS_PINNED);
-				break;
-		} else {
-			if (!tick_program_event(
-				hrtimer_get_expires(&ts->sched_timer), 0))
-				break;
-		}
-		/* Reread time and update jiffies */
-		now = ktime_get();
-		tick_do_update_jiffies64(now);
-	}
-}
-
 static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now)
 {
 	/* Update jiffies first */
@@ -968,12 +954,6 @@ void tick_nohz_idle_exit(void)
 	local_irq_enable();
 }
 
-static int tick_nohz_reprogram(struct tick_sched *ts, ktime_t now)
-{
-	hrtimer_forward(&ts->sched_timer, now, tick_period);
-	return tick_program_event(hrtimer_get_expires(&ts->sched_timer), 0);
-}
-
 /*
  * The nohz low res interrupt handler
  */
@@ -992,10 +972,8 @@ static void tick_nohz_handler(struct clock_event_device *dev)
 	if (unlikely(ts->tick_stopped))
 		return;
 
-	while (tick_nohz_reprogram(ts, now)) {
-		now = ktime_get();
-		tick_do_update_jiffies64(now);
-	}
+	hrtimer_forward(&ts->sched_timer, now, tick_period);
+	tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1);
 }
 
 /**
@@ -1025,12 +1003,9 @@ static void tick_nohz_switch_to_nohz(void)
 	/* Get the next period */
 	next = tick_init_jiffy_update();
 
-	for (;;) {
-		hrtimer_set_expires(&ts->sched_timer, next);
-		if (!tick_program_event(next, 0))
-			break;
-		next = ktime_add(next, tick_period);
-	}
+	hrtimer_forward_now(&ts->sched_timer, tick_period);
+	hrtimer_set_expires(&ts->sched_timer, next);
+	tick_program_event(next, 1);
 	local_irq_enable();
 }
 
-- 
cgit v1.2.3


From 157d29e101c7d032e886df067aeea1b21a366cc5 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Tue, 14 Apr 2015 21:08:56 +0000
Subject: tick: Sched: Restructure code

Get rid of one indentation level. Preparatory patch for a major
rework. No functional change.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Preeti U Murthy <preeti@linux.vnet.ibm.com>
Cc: Viresh Kumar <viresh.kumar@linaro.org>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: John Stultz <john.stultz@linaro.org>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Link: http://lkml.kernel.org/r/20150414203502.101563235@linutronix.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

---
 kernel/time/tick-sched.c | 171 ++++++++++++++++++++++-------------------------
 1 file changed, 81 insertions(+), 90 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 0f07ff2ba22b..4c5f4a9dcc0a 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -611,112 +611,103 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
 		}
 	}
 
+	if ((long)delta_jiffies <= 1) {
+		if (!ts->tick_stopped)
+			goto out;
+		if (delta_jiffies == 0) {
+			/* Tick is stopped, but required now. Enforce it */
+			tick_nohz_restart(ts, now);
+			goto out;
+		}
+	}
+
 	/*
-	 * Do not stop the tick, if we are only one off (or less)
-	 * or if the cpu is required for RCU:
+	 * If this cpu is the one which updates jiffies, then give up
+	 * the assignment and let it be taken by the cpu which runs
+	 * the tick timer next, which might be this cpu as well. If we
+	 * don't drop this here the jiffies might be stale and
+	 * do_timer() never invoked. Keep track of the fact that it
+	 * was the one which had the do_timer() duty last. If this cpu
+	 * is the one which had the do_timer() duty last, we limit the
+	 * sleep time to the timekeeping max_deferement value which we
+	 * retrieved above. Otherwise we can sleep as long as we want.
 	 */
-	if (!ts->tick_stopped && delta_jiffies <= 1)
-		goto out;
-
-	/* Schedule the tick, if we are at least one jiffie off */
-	if ((long)delta_jiffies >= 1) {
-
-		/*
-		 * If this cpu is the one which updates jiffies, then
-		 * give up the assignment and let it be taken by the
-		 * cpu which runs the tick timer next, which might be
-		 * this cpu as well. If we don't drop this here the
-		 * jiffies might be stale and do_timer() never
-		 * invoked. Keep track of the fact that it was the one
-		 * which had the do_timer() duty last. If this cpu is
-		 * the one which had the do_timer() duty last, we
-		 * limit the sleep time to the timekeeping
-		 * max_deferement value which we retrieved
-		 * above. Otherwise we can sleep as long as we want.
-		 */
-		if (cpu == tick_do_timer_cpu) {
-			tick_do_timer_cpu = TICK_DO_TIMER_NONE;
-			ts->do_timer_last = 1;
-		} else if (tick_do_timer_cpu != TICK_DO_TIMER_NONE) {
-			time_delta = KTIME_MAX;
-			ts->do_timer_last = 0;
-		} else if (!ts->do_timer_last) {
-			time_delta = KTIME_MAX;
-		}
+	if (cpu == tick_do_timer_cpu) {
+		tick_do_timer_cpu = TICK_DO_TIMER_NONE;
+		ts->do_timer_last = 1;
+	} else if (tick_do_timer_cpu != TICK_DO_TIMER_NONE) {
+		time_delta = KTIME_MAX;
+		ts->do_timer_last = 0;
+	} else if (!ts->do_timer_last) {
+		time_delta = KTIME_MAX;
+	}
 
 #ifdef CONFIG_NO_HZ_FULL
-		if (!ts->inidle) {
-			time_delta = min(time_delta,
-					 scheduler_tick_max_deferment());
-		}
+	if (!ts->inidle)
+		time_delta = min(time_delta, scheduler_tick_max_deferment());
 #endif
 
+	/*
+	 * calculate the expiry time for the next timer wheel
+	 * timer. delta_jiffies >= NEXT_TIMER_MAX_DELTA signals that
+	 * there is no timer pending or at least extremely far into
+	 * the future (12 days for HZ=1000). In this case we set the
+	 * expiry to the end of time.
+	 */
+	if (likely(delta_jiffies < NEXT_TIMER_MAX_DELTA)) {
 		/*
-		 * calculate the expiry time for the next timer wheel
-		 * timer. delta_jiffies >= NEXT_TIMER_MAX_DELTA signals
-		 * that there is no timer pending or at least extremely
-		 * far into the future (12 days for HZ=1000). In this
-		 * case we set the expiry to the end of time.
+		 * Calculate the time delta for the next timer event.
+		 * If the time delta exceeds the maximum time delta
+		 * permitted by the current clocksource then adjust
+		 * the time delta accordingly to ensure the
+		 * clocksource does not wrap.
 		 */
-		if (likely(delta_jiffies < NEXT_TIMER_MAX_DELTA)) {
-			/*
-			 * Calculate the time delta for the next timer event.
-			 * If the time delta exceeds the maximum time delta
-			 * permitted by the current clocksource then adjust
-			 * the time delta accordingly to ensure the
-			 * clocksource does not wrap.
-			 */
-			time_delta = min_t(u64, time_delta,
-					   tick_period.tv64 * delta_jiffies);
-		}
-
-		if (time_delta < KTIME_MAX)
-			expires = ktime_add_ns(last_update, time_delta);
-		else
-			expires.tv64 = KTIME_MAX;
+		time_delta = min_t(u64, time_delta,
+				   tick_period.tv64 * delta_jiffies);
+	}
 
-		/* Skip reprogram of event if its not changed */
-		if (ts->tick_stopped && ktime_equal(expires, dev->next_event))
-			goto out;
+	if (time_delta < KTIME_MAX)
+		expires = ktime_add_ns(last_update, time_delta);
+	else
+		expires.tv64 = KTIME_MAX;
 
-		ret = expires;
+	/* Skip reprogram of event if its not changed */
+	if (ts->tick_stopped && ktime_equal(expires, dev->next_event))
+		goto out;
 
-		/*
-		 * nohz_stop_sched_tick can be called several times before
-		 * the nohz_restart_sched_tick is called. This happens when
-		 * interrupts arrive which do not cause a reschedule. In the
-		 * first call we save the current tick time, so we can restart
-		 * the scheduler tick in nohz_restart_sched_tick.
-		 */
-		if (!ts->tick_stopped) {
-			nohz_balance_enter_idle(cpu);
-			calc_load_enter_idle();
+	ret = expires;
 
-			ts->last_tick = hrtimer_get_expires(&ts->sched_timer);
-			ts->tick_stopped = 1;
-			trace_tick_stop(1, " ");
-		}
+	/*
+	 * nohz_stop_sched_tick can be called several times before
+	 * the nohz_restart_sched_tick is called. This happens when
+	 * interrupts arrive which do not cause a reschedule. In the
+	 * first call we save the current tick time, so we can restart
+	 * the scheduler tick in nohz_restart_sched_tick.
+	 */
+	if (!ts->tick_stopped) {
+		nohz_balance_enter_idle(cpu);
+		calc_load_enter_idle();
 
-		/*
-		 * If the expiration time == KTIME_MAX, then
-		 * in this case we simply stop the tick timer.
-		 */
-		 if (unlikely(expires.tv64 == KTIME_MAX)) {
-			if (ts->nohz_mode == NOHZ_MODE_HIGHRES)
-				hrtimer_cancel(&ts->sched_timer);
-			goto out;
-		 }
+		ts->last_tick = hrtimer_get_expires(&ts->sched_timer);
+		ts->tick_stopped = 1;
+		trace_tick_stop(1, " ");
+	}
 
-		 if (ts->nohz_mode == NOHZ_MODE_HIGHRES)
-			 hrtimer_start(&ts->sched_timer, expires,
-				       HRTIMER_MODE_ABS_PINNED);
-		 else
-			 tick_program_event(expires, 1);
-	} else {
-		/* Tick is stopped, but required now. Enforce it */
-		tick_nohz_restart(ts, now);
+	/*
+	 * If the expiration time == KTIME_MAX, then
+	 * in this case we simply stop the tick timer.
+	 */
+	if (unlikely(expires.tv64 == KTIME_MAX)) {
+		if (ts->nohz_mode == NOHZ_MODE_HIGHRES)
+			hrtimer_cancel(&ts->sched_timer);
+		goto out;
 	}
 
+	if (ts->nohz_mode == NOHZ_MODE_HIGHRES)
+		hrtimer_start(&ts->sched_timer, expires,
+			      HRTIMER_MODE_ABS_PINNED);
+	else
+		tick_program_event(expires, 1);
 out:
 	ts->next_jiffies = next_jiffies;
 	ts->last_jiffies = last_jiffies;
-- 
cgit v1.2.3


From c1ad348b452aacd784fb97403d03d71723c72ee1 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Tue, 14 Apr 2015 21:08:58 +0000
Subject: tick: Nohz: Rework next timer evaluation

The evaluation of the next timer in the nohz code is based on jiffies
while all the tick internals are nano seconds based. We have also to
convert hrtimer nanoseconds to jiffies in the !highres case. That's
just wrong and introduces interesting corner cases.

Turn it around and convert the next timer wheel timer expiry and the
rcu event to clock monotonic and base all calculations on
nanoseconds. That identifies the case where no timer is pending
clearly with an absolute expiry value of KTIME_MAX.

Makes the code more readable and gets rid of the jiffies magic in the
nohz code.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Preeti U Murthy <preeti@linux.vnet.ibm.com>
Cc: Viresh Kumar <viresh.kumar@linaro.org>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Josh Triplett <josh@joshtriplett.org>
Cc: Lai Jiangshan <laijs@cn.fujitsu.com>
Cc: John Stultz <john.stultz@linaro.org>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Link: http://lkml.kernel.org/r/20150414203502.184198593@linutronix.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

---
 include/linux/hrtimer.h     |   2 +-
 include/linux/rcupdate.h    |   6 ++-
 include/linux/rcutree.h     |   2 +-
 include/linux/timer.h       |   7 ---
 kernel/rcu/tree_plugin.h    |  14 +++---
 kernel/time/hrtimer.c       |  14 ++----
 kernel/time/tick-internal.h |   2 +
 kernel/time/tick-sched.c    | 109 ++++++++++++++++++++------------------------
 kernel/time/tick-sched.h    |   2 +-
 kernel/time/timer.c         |  71 ++++++++++++++---------------
 kernel/time/timer_list.c    |   4 +-
 11 files changed, 107 insertions(+), 126 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 048270a27bc5..2c68f71ffd24 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -386,7 +386,7 @@ static inline int hrtimer_restart(struct hrtimer *timer)
 /* Query timers: */
 extern ktime_t hrtimer_get_remaining(const struct hrtimer *timer);
 
-extern ktime_t hrtimer_get_next_event(void);
+extern u64 hrtimer_get_next_event(void);
 
 /*
  * A timer is active, when it is enqueued into the rbtree or the
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 573a5afd5ed8..0627a447c589 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -44,6 +44,8 @@
 #include <linux/debugobjects.h>
 #include <linux/bug.h>
 #include <linux/compiler.h>
+#include <linux/ktime.h>
+
 #include <asm/barrier.h>
 
 extern int rcu_expedited; /* for sysctl */
@@ -1154,9 +1156,9 @@ static inline notrace void rcu_read_unlock_sched_notrace(void)
 	__kfree_rcu(&((ptr)->rcu_head), offsetof(typeof(*(ptr)), rcu_head))
 
 #if defined(CONFIG_TINY_RCU) || defined(CONFIG_RCU_NOCB_CPU_ALL)
-static inline int rcu_needs_cpu(unsigned long *delta_jiffies)
+static inline int rcu_needs_cpu(u64 basemono, u64 *nextevt)
 {
-	*delta_jiffies = ULONG_MAX;
+	*nextevt = KTIME_MAX;
 	return 0;
 }
 #endif /* #if defined(CONFIG_TINY_RCU) || defined(CONFIG_RCU_NOCB_CPU_ALL) */
diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
index d2e583a6aaca..db2e31beaae7 100644
--- a/include/linux/rcutree.h
+++ b/include/linux/rcutree.h
@@ -32,7 +32,7 @@
 
 void rcu_note_context_switch(void);
 #ifndef CONFIG_RCU_NOCB_CPU_ALL
-int rcu_needs_cpu(unsigned long *delta_jiffies);
+int rcu_needs_cpu(u64 basem, u64 *nextevt);
 #endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */
 void rcu_cpu_stall_reset(void);
 
diff --git a/include/linux/timer.h b/include/linux/timer.h
index 8c5a197e1587..fbb80e0030bf 100644
--- a/include/linux/timer.h
+++ b/include/linux/timer.h
@@ -187,13 +187,6 @@ extern void set_timer_slack(struct timer_list *time, int slack_hz);
  */
 #define NEXT_TIMER_MAX_DELTA	((1UL << 30) - 1)
 
-/*
- * Return when the next timer-wheel timeout occurs (in absolute jiffies),
- * locks the timer base and does the comparison against the given
- * jiffie.
- */
-extern unsigned long get_next_timer_interrupt(unsigned long now);
-
 /*
  * Timer-statistics info:
  */
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 8c0ec0f5a027..0ef80a0bbabb 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -1368,9 +1368,9 @@ static void rcu_prepare_kthreads(int cpu)
  * any flavor of RCU.
  */
 #ifndef CONFIG_RCU_NOCB_CPU_ALL
-int rcu_needs_cpu(unsigned long *delta_jiffies)
+int rcu_needs_cpu(u64 basemono, u64 *nextevt)
 {
-	*delta_jiffies = ULONG_MAX;
+	*nextevt = KTIME_MAX;
 	return rcu_cpu_has_callbacks(NULL);
 }
 #endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */
@@ -1481,16 +1481,17 @@ static bool __maybe_unused rcu_try_advance_all_cbs(void)
  * The caller must have disabled interrupts.
  */
 #ifndef CONFIG_RCU_NOCB_CPU_ALL
-int rcu_needs_cpu(unsigned long *dj)
+int rcu_needs_cpu(u64 basemono, u64 *nextevt)
 {
 	struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
+	unsigned long dj;
 
 	/* Snapshot to detect later posting of non-lazy callback. */
 	rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted;
 
 	/* If no callbacks, RCU doesn't need the CPU. */
 	if (!rcu_cpu_has_callbacks(&rdtp->all_lazy)) {
-		*dj = ULONG_MAX;
+		*nextevt = KTIME_MAX;
 		return 0;
 	}
 
@@ -1504,11 +1505,12 @@ int rcu_needs_cpu(unsigned long *dj)
 
 	/* Request timer delay depending on laziness, and round. */
 	if (!rdtp->all_lazy) {
-		*dj = round_up(rcu_idle_gp_delay + jiffies,
+		dj = round_up(rcu_idle_gp_delay + jiffies,
 			       rcu_idle_gp_delay) - jiffies;
 	} else {
-		*dj = round_jiffies(rcu_idle_lazy_gp_delay + jiffies) - jiffies;
+		dj = round_jiffies(rcu_idle_lazy_gp_delay + jiffies) - jiffies;
 	}
+	*nextevt = basemono + dj * TICK_NSEC;
 	return 0;
 }
 #endif /* #ifndef CONFIG_RCU_NOCB_CPU_ALL */
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index fc6b6d25f93d..179b991cfdcb 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -1080,26 +1080,22 @@ EXPORT_SYMBOL_GPL(hrtimer_get_remaining);
 /**
  * hrtimer_get_next_event - get the time until next expiry event
  *
- * Returns the delta to the next expiry event or KTIME_MAX if no timer
- * is pending.
+ * Returns the next expiry time or KTIME_MAX if no timer is pending.
  */
-ktime_t hrtimer_get_next_event(void)
+u64 hrtimer_get_next_event(void)
 {
 	struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases);
-	ktime_t mindelta = { .tv64 = KTIME_MAX };
+	u64 expires = KTIME_MAX;
 	unsigned long flags;
 
 	raw_spin_lock_irqsave(&cpu_base->lock, flags);
 
 	if (!__hrtimer_hres_active(cpu_base))
-		mindelta = ktime_sub(__hrtimer_get_next_event(cpu_base),
-				     ktime_get());
+		expires = __hrtimer_get_next_event(cpu_base).tv64;
 
 	raw_spin_unlock_irqrestore(&cpu_base->lock, flags);
 
-	if (mindelta.tv64 < 0)
-		mindelta.tv64 = 0;
-	return mindelta;
+	return expires;
 }
 #endif
 
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index b64fdd8054c5..65273f0a11ed 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -137,3 +137,5 @@ extern void tick_nohz_init(void);
 # else
 static inline void tick_nohz_init(void) { }
 #endif
+
+extern u64 get_next_timer_interrupt(unsigned long basej, u64 basem);
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 4c5f4a9dcc0a..753c211f6195 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -582,39 +582,46 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)
 static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
 					 ktime_t now, int cpu)
 {
-	unsigned long seq, last_jiffies, next_jiffies, delta_jiffies;
-	ktime_t last_update, expires, ret = { .tv64 = 0 };
-	unsigned long rcu_delta_jiffies;
 	struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);
-	u64 time_delta;
-
-	time_delta = timekeeping_max_deferment();
+	u64 basemono, next_tick, next_tmr, next_rcu, delta, expires;
+	unsigned long seq, basejiff;
+	ktime_t	tick;
 
 	/* Read jiffies and the time when jiffies were updated last */
 	do {
 		seq = read_seqbegin(&jiffies_lock);
-		last_update = last_jiffies_update;
-		last_jiffies = jiffies;
+		basemono = last_jiffies_update.tv64;
+		basejiff = jiffies;
 	} while (read_seqretry(&jiffies_lock, seq));
+	ts->last_jiffies = basejiff;
 
-	if (rcu_needs_cpu(&rcu_delta_jiffies) ||
+	if (rcu_needs_cpu(basemono, &next_rcu) ||
 	    arch_needs_cpu() || irq_work_needs_cpu()) {
-		next_jiffies = last_jiffies + 1;
-		delta_jiffies = 1;
+		next_tick = basemono + TICK_NSEC;
 	} else {
-		/* Get the next timer wheel timer */
-		next_jiffies = get_next_timer_interrupt(last_jiffies);
-		delta_jiffies = next_jiffies - last_jiffies;
-		if (rcu_delta_jiffies < delta_jiffies) {
-			next_jiffies = last_jiffies + rcu_delta_jiffies;
-			delta_jiffies = rcu_delta_jiffies;
-		}
+		/*
+		 * Get the next pending timer. If high resolution
+		 * timers are enabled this only takes the timer wheel
+		 * timers into account. If high resolution timers are
+		 * disabled this also looks at the next expiring
+		 * hrtimer.
+		 */
+		next_tmr = get_next_timer_interrupt(basejiff, basemono);
+		ts->next_timer = next_tmr;
+		/* Take the next rcu event into account */
+		next_tick = next_rcu < next_tmr ? next_rcu : next_tmr;
 	}
 
-	if ((long)delta_jiffies <= 1) {
+	/*
+	 * If the tick is due in the next period, keep it ticking or
+	 * restart it proper.
+	 */
+	delta = next_tick - basemono;
+	if (delta <= (u64)TICK_NSEC) {
+		tick.tv64 = 0;
 		if (!ts->tick_stopped)
 			goto out;
-		if (delta_jiffies == 0) {
+		if (delta == 0) {
 			/* Tick is stopped, but required now. Enforce it */
 			tick_nohz_restart(ts, now);
 			goto out;
@@ -629,54 +636,39 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
 	 * do_timer() never invoked. Keep track of the fact that it
 	 * was the one which had the do_timer() duty last. If this cpu
 	 * is the one which had the do_timer() duty last, we limit the
-	 * sleep time to the timekeeping max_deferement value which we
-	 * retrieved above. Otherwise we can sleep as long as we want.
+	 * sleep time to the timekeeping max_deferement value.
+	 * Otherwise we can sleep as long as we want.
 	 */
+	delta = timekeeping_max_deferment();
 	if (cpu == tick_do_timer_cpu) {
 		tick_do_timer_cpu = TICK_DO_TIMER_NONE;
 		ts->do_timer_last = 1;
 	} else if (tick_do_timer_cpu != TICK_DO_TIMER_NONE) {
-		time_delta = KTIME_MAX;
+		delta = KTIME_MAX;
 		ts->do_timer_last = 0;
 	} else if (!ts->do_timer_last) {
-		time_delta = KTIME_MAX;
+		delta = KTIME_MAX;
 	}
 
 #ifdef CONFIG_NO_HZ_FULL
+	/* Limit the tick delta to the maximum scheduler deferment */
 	if (!ts->inidle)
-		time_delta = min(time_delta, scheduler_tick_max_deferment());
+		delta = min(delta, scheduler_tick_max_deferment());
 #endif
 
-	/*
-	 * calculate the expiry time for the next timer wheel
-	 * timer. delta_jiffies >= NEXT_TIMER_MAX_DELTA signals that
-	 * there is no timer pending or at least extremely far into
-	 * the future (12 days for HZ=1000). In this case we set the
-	 * expiry to the end of time.
-	 */
-	if (likely(delta_jiffies < NEXT_TIMER_MAX_DELTA)) {
-		/*
-		 * Calculate the time delta for the next timer event.
-		 * If the time delta exceeds the maximum time delta
-		 * permitted by the current clocksource then adjust
-		 * the time delta accordingly to ensure the
-		 * clocksource does not wrap.
-		 */
-		time_delta = min_t(u64, time_delta,
-				   tick_period.tv64 * delta_jiffies);
-	}
-
-	if (time_delta < KTIME_MAX)
-		expires = ktime_add_ns(last_update, time_delta);
+	/* Calculate the next expiry time */
+	if (delta < (KTIME_MAX - basemono))
+		expires = basemono + delta;
 	else
-		expires.tv64 = KTIME_MAX;
+		expires = KTIME_MAX;
+
+	expires = min_t(u64, expires, next_tick);
+	tick.tv64 = expires;
 
 	/* Skip reprogram of event if its not changed */
-	if (ts->tick_stopped && ktime_equal(expires, dev->next_event))
+	if (ts->tick_stopped && (expires == dev->next_event.tv64))
 		goto out;
 
-	ret = expires;
-
 	/*
 	 * nohz_stop_sched_tick can be called several times before
 	 * the nohz_restart_sched_tick is called. This happens when
@@ -694,26 +686,23 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
 	}
 
 	/*
-	 * If the expiration time == KTIME_MAX, then
-	 * in this case we simply stop the tick timer.
+	 * If the expiration time == KTIME_MAX, then we simply stop
+	 * the tick timer.
 	 */
-	if (unlikely(expires.tv64 == KTIME_MAX)) {
+	if (unlikely(expires == KTIME_MAX)) {
 		if (ts->nohz_mode == NOHZ_MODE_HIGHRES)
 			hrtimer_cancel(&ts->sched_timer);
 		goto out;
 	}
 
 	if (ts->nohz_mode == NOHZ_MODE_HIGHRES)
-		hrtimer_start(&ts->sched_timer, expires,
-			      HRTIMER_MODE_ABS_PINNED);
+		hrtimer_start(&ts->sched_timer, tick, HRTIMER_MODE_ABS_PINNED);
 	else
-		tick_program_event(expires, 1);
+		tick_program_event(tick, 1);
 out:
-	ts->next_jiffies = next_jiffies;
-	ts->last_jiffies = last_jiffies;
+	/* Update the estimated sleep length */
 	ts->sleep_length = ktime_sub(dev->next_event, now);
-
-	return ret;
+	return tick;
 }
 
 static void tick_nohz_full_stop_tick(struct tick_sched *ts)
diff --git a/kernel/time/tick-sched.h b/kernel/time/tick-sched.h
index 28b5da3e1a17..42fdf4958bcc 100644
--- a/kernel/time/tick-sched.h
+++ b/kernel/time/tick-sched.h
@@ -57,7 +57,7 @@ struct tick_sched {
 	ktime_t				iowait_sleeptime;
 	ktime_t				sleep_length;
 	unsigned long			last_jiffies;
-	unsigned long			next_jiffies;
+	u64				next_timer;
 	ktime_t				idle_expires;
 	int				do_timer_last;
 };
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index b31f13f4fe41..172b83cd2f8e 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -49,6 +49,8 @@
 #include <asm/timex.h>
 #include <asm/io.h>
 
+#include "tick-internal.h"
+
 #define CREATE_TRACE_POINTS
 #include <trace/events/timer.h>
 
@@ -1311,54 +1313,48 @@ cascade:
  * Check, if the next hrtimer event is before the next timer wheel
  * event:
  */
-static unsigned long cmp_next_hrtimer_event(unsigned long now,
-					    unsigned long expires)
+static u64 cmp_next_hrtimer_event(u64 basem, u64 expires)
 {
-	ktime_t hr_delta = hrtimer_get_next_event();
-	struct timespec tsdelta;
-	unsigned long delta;
-
-	if (hr_delta.tv64 == KTIME_MAX)
-		return expires;
+	u64 nextevt = hrtimer_get_next_event();
 
 	/*
-	 * Expired timer available, let it expire in the next tick
+	 * If high resolution timers are enabled
+	 * hrtimer_get_next_event() returns KTIME_MAX.
 	 */
-	if (hr_delta.tv64 <= 0)
-		return now + 1;
-
-	tsdelta = ktime_to_timespec(hr_delta);
-	delta = timespec_to_jiffies(&tsdelta);
+	if (expires <= nextevt)
+		return expires;
 
 	/*
-	 * Limit the delta to the max value, which is checked in
-	 * tick_nohz_stop_sched_tick():
+	 * If the next timer is already expired, return the tick base
+	 * time so the tick is fired immediately.
 	 */
-	if (delta > NEXT_TIMER_MAX_DELTA)
-		delta = NEXT_TIMER_MAX_DELTA;
+	if (nextevt <= basem)
+		return basem;
 
 	/*
-	 * Take rounding errors in to account and make sure, that it
-	 * expires in the next tick. Otherwise we go into an endless
-	 * ping pong due to tick_nohz_stop_sched_tick() retriggering
-	 * the timer softirq
+	 * Round up to the next jiffie. High resolution timers are
+	 * off, so the hrtimers are expired in the tick and we need to
+	 * make sure that this tick really expires the timer to avoid
+	 * a ping pong of the nohz stop code.
+	 *
+	 * Use DIV_ROUND_UP_ULL to prevent gcc calling __divdi3
 	 */
-	if (delta < 1)
-		delta = 1;
-	now += delta;
-	if (time_before(now, expires))
-		return now;
-	return expires;
+	return DIV_ROUND_UP_ULL(nextevt, TICK_NSEC) * TICK_NSEC;
 }
 
 /**
- * get_next_timer_interrupt - return the jiffy of the next pending timer
- * @now: current time (in jiffies)
+ * get_next_timer_interrupt - return the time (clock mono) of the next timer
+ * @basej:	base time jiffies
+ * @basem:	base time clock monotonic
+ *
+ * Returns the tick aligned clock monotonic time of the next pending
+ * timer or KTIME_MAX if no timer is pending.
  */
-unsigned long get_next_timer_interrupt(unsigned long now)
+u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
 {
 	struct tvec_base *base = __this_cpu_read(tvec_bases);
-	unsigned long expires = now + NEXT_TIMER_MAX_DELTA;
+	u64 expires = KTIME_MAX;
+	unsigned long nextevt;
 
 	/*
 	 * Pretend that there is no timer pending if the cpu is offline.
@@ -1371,14 +1367,15 @@ unsigned long get_next_timer_interrupt(unsigned long now)
 	if (base->active_timers) {
 		if (time_before_eq(base->next_timer, base->timer_jiffies))
 			base->next_timer = __next_timer_interrupt(base);
-		expires = base->next_timer;
+		nextevt = base->next_timer;
+		if (time_before_eq(nextevt, basej))
+			expires = basem;
+		else
+			expires = basem + (nextevt - basej) * TICK_NSEC;
 	}
 	spin_unlock(&base->lock);
 
-	if (time_before_eq(expires, now))
-		return now;
-
-	return cmp_next_hrtimer_event(now, expires);
+	return cmp_next_hrtimer_event(basem, expires);
 }
 #endif
 
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index 6232fc536185..66f39bba5353 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -191,7 +191,7 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now)
 		P_ns(idle_sleeptime);
 		P_ns(iowait_sleeptime);
 		P(last_jiffies);
-		P(next_jiffies);
+		P(next_timer);
 		P_ns(idle_expires);
 		SEQ_printf(m, "jiffies: %Lu\n",
 			   (unsigned long long)jiffies);
@@ -289,7 +289,7 @@ static void timer_list_show_tickdevices_header(struct seq_file *m)
 
 static inline void timer_list_header(struct seq_file *m, u64 now)
 {
-	SEQ_printf(m, "Timer List Version: v0.7\n");
+	SEQ_printf(m, "Timer List Version: v0.8\n");
 	SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES);
 	SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now);
 	SEQ_printf(m, "\n");
-- 
cgit v1.2.3


From 3497d206c4d9b266d2e56c8b20e51b2f0e6a3c72 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Tue, 14 Apr 2015 21:09:03 +0000
Subject: perf: core: Use hrtimer_start()

hrtimer_start() does not longer defer already expired timers to the
softirq. Get rid of the __hrtimer_start_range_ns() invocation.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Preeti U Murthy <preeti@linux.vnet.ibm.com>
Cc: Viresh Kumar <viresh.kumar@linaro.org>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Link: http://lkml.kernel.org/r/20150414203502.452104213@linutronix.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/events/core.c | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 81aa3a4ece9f..05309fdba2a4 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -834,9 +834,7 @@ static void perf_cpu_hrtimer_restart(struct perf_cpu_context *cpuctx)
 	if (hrtimer_active(hr))
 		return;
 
-	if (!hrtimer_callback_running(hr))
-		__hrtimer_start_range_ns(hr, cpuctx->hrtimer_interval,
-					 0, HRTIMER_MODE_REL_PINNED, 0);
+	hrtimer_start(hr, cpuctx->hrtimer_interval, HRTIMER_MODE_REL_PINNED);
 }
 
 void perf_pmu_disable(struct pmu *pmu)
@@ -6843,9 +6841,8 @@ static void perf_swevent_start_hrtimer(struct perf_event *event)
 	} else {
 		period = max_t(u64, 10000, hwc->sample_period);
 	}
-	__hrtimer_start_range_ns(&hwc->hrtimer,
-				ns_to_ktime(period), 0,
-				HRTIMER_MODE_REL_PINNED, 0);
+	hrtimer_start(&hwc->hrtimer, ns_to_ktime(period),
+		      HRTIMER_MODE_REL_PINNED);
 }
 
 static void perf_swevent_cancel_hrtimer(struct perf_event *event)
-- 
cgit v1.2.3


From 4961b6e11825c2b05b516374b1800fc5dfc2cb78 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Tue, 14 Apr 2015 21:09:05 +0000
Subject: sched: core: Use hrtimer_start[_expires]()

hrtimer_start() now enforces a timer interrupt when an already expired
timer is enqueued.

Get rid of the __hrtimer_start_range_ns() invocations and the loops
around it.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Preeti U Murthy <preeti@linux.vnet.ibm.com>
Cc: Viresh Kumar <viresh.kumar@linaro.org>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Link: http://lkml.kernel.org/r/20150414203502.531131739@linutronix.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/sched/core.c | 28 ++++++++--------------------
 kernel/sched/fair.c |  2 +-
 2 files changed, 9 insertions(+), 21 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index f9123a82cbb6..3026678113e7 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -92,22 +92,11 @@
 
 void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
 {
-	unsigned long delta;
-	ktime_t soft, hard, now;
-
-	for (;;) {
-		if (hrtimer_active(period_timer))
-			break;
-
-		now = hrtimer_cb_get_time(period_timer);
-		hrtimer_forward(period_timer, now, period);
+	if (hrtimer_active(period_timer))
+		return;
 
-		soft = hrtimer_get_softexpires(period_timer);
-		hard = hrtimer_get_expires(period_timer);
-		delta = ktime_to_ns(ktime_sub(hard, soft));
-		__hrtimer_start_range_ns(period_timer, soft, delta,
-					 HRTIMER_MODE_ABS_PINNED, 0);
-	}
+	hrtimer_forward_now(period_timer, period);
+	hrtimer_start_expires(period_timer, HRTIMER_MODE_ABS_PINNED);
 }
 
 DEFINE_MUTEX(sched_domains_mutex);
@@ -355,12 +344,11 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer)
 
 #ifdef CONFIG_SMP
 
-static int __hrtick_restart(struct rq *rq)
+static void __hrtick_restart(struct rq *rq)
 {
 	struct hrtimer *timer = &rq->hrtick_timer;
-	ktime_t time = hrtimer_get_softexpires(timer);
 
-	return __hrtimer_start_range_ns(timer, time, 0, HRTIMER_MODE_ABS_PINNED, 0);
+	hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED);
 }
 
 /*
@@ -440,8 +428,8 @@ void hrtick_start(struct rq *rq, u64 delay)
 	 * doesn't make sense. Rely on vruntime for fairness.
 	 */
 	delay = max_t(u64, delay, 10000LL);
-	__hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0,
-			HRTIMER_MODE_REL_PINNED, 0);
+	hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay),
+		      HRTIMER_MODE_REL_PINNED);
 }
 
 static inline void init_hrtick(void)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index ffeaa4105e48..854881b2526b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3850,7 +3850,7 @@ static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
  * Are we near the end of the current quota period?
  *
  * Requires cfs_b->lock for hrtimer_expires_remaining to be safe against the
- * hrtimer base being cleared by __hrtimer_start_range_ns. In the case of
+ * hrtimer base being cleared by hrtimer_start. In the case of
  * migrate_hrtimers, base is never cleared, so we are fine.
  */
 static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
-- 
cgit v1.2.3


From cc9684d3c1188ac5f1cf0ee9f8be7ba456099d7b Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Tue, 14 Apr 2015 21:09:06 +0000
Subject: sched: deadline: Use hrtimer_start()

hrtimer_start() does not longer defer already expired timers to the
softirq. Get rid of the __hrtimer_start_range_ns() invocation.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Preeti U Murthy <preeti@linux.vnet.ibm.com>
Cc: Viresh Kumar <viresh.kumar@linaro.org>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Link: http://lkml.kernel.org/r/20150414203502.627353666@linutronix.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/sched/deadline.c | 12 ++----------
 1 file changed, 2 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 5e95145088fd..21d6907d2b9f 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -503,8 +503,6 @@ static int start_dl_timer(struct sched_dl_entity *dl_se, bool boosted)
 	struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
 	struct rq *rq = rq_of_dl_rq(dl_rq);
 	ktime_t now, act;
-	ktime_t soft, hard;
-	unsigned long range;
 	s64 delta;
 
 	if (boosted)
@@ -527,15 +525,9 @@ static int start_dl_timer(struct sched_dl_entity *dl_se, bool boosted)
 	if (ktime_us_delta(act, now) < 0)
 		return 0;
 
-	hrtimer_set_expires(&dl_se->dl_timer, act);
+	hrtimer_start(&dl_se->dl_timer, act, HRTIMER_MODE_ABS);
 
-	soft = hrtimer_get_softexpires(&dl_se->dl_timer);
-	hard = hrtimer_get_expires(&dl_se->dl_timer);
-	range = ktime_to_ns(ktime_sub(hard, soft));
-	__hrtimer_start_range_ns(&dl_se->dl_timer, soft,
-				 range, HRTIMER_MODE_ABS, 0);
-
-	return hrtimer_active(&dl_se->dl_timer);
+	return 1;
 }
 
 /*
-- 
cgit v1.2.3


From 58f1f803f1d6ef9ab280de13246d65970a09cb95 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Tue, 14 Apr 2015 21:09:08 +0000
Subject: hrtimer: Get rid of __hrtimer_start_range_ns()

No more callers. Remove the leftovers.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Preeti U Murthy <preeti@linux.vnet.ibm.com>
Cc: Viresh Kumar <viresh.kumar@linaro.org>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Link: http://lkml.kernel.org/r/20150414203502.707871492@linutronix.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/hrtimer.h |  4 ----
 kernel/time/hrtimer.c   | 38 +++++++++++++++-----------------------
 2 files changed, 15 insertions(+), 27 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 2c68f71ffd24..a80baa86bb24 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -359,10 +359,6 @@ extern int hrtimer_start(struct hrtimer *timer, ktime_t tim,
 			 const enum hrtimer_mode mode);
 extern int hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
 			unsigned long range_ns, const enum hrtimer_mode mode);
-extern int
-__hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
-			 unsigned long delta_ns,
-			 const enum hrtimer_mode mode, int wakeup);
 
 extern int hrtimer_cancel(struct hrtimer *timer);
 extern int hrtimer_try_to_cancel(struct hrtimer *timer);
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 179b991cfdcb..88d6ea25dde4 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -916,9 +916,20 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base)
 	return 0;
 }
 
-int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
-		unsigned long delta_ns, const enum hrtimer_mode mode,
-		int wakeup)
+/**
+ * hrtimer_start_range_ns - (re)start an hrtimer on the current CPU
+ * @timer:	the timer to be added
+ * @tim:	expiry time
+ * @delta_ns:	"slack" range for the timer
+ * @mode:	expiry mode: absolute (HRTIMER_MODE_ABS) or
+ *		relative (HRTIMER_MODE_REL)
+ *
+ * Returns:
+ *  0 on success
+ *  1 when the timer was active
+ */
+int hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
+			   unsigned long delta_ns, const enum hrtimer_mode mode)
 {
 	struct hrtimer_clock_base *base, *new_base;
 	unsigned long flags;
@@ -971,25 +982,6 @@ int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
 
 	return ret;
 }
-EXPORT_SYMBOL_GPL(__hrtimer_start_range_ns);
-
-/**
- * hrtimer_start_range_ns - (re)start an hrtimer on the current CPU
- * @timer:	the timer to be added
- * @tim:	expiry time
- * @delta_ns:	"slack" range for the timer
- * @mode:	expiry mode: absolute (HRTIMER_MODE_ABS) or
- *		relative (HRTIMER_MODE_REL)
- *
- * Returns:
- *  0 on success
- *  1 when the timer was active
- */
-int hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
-		unsigned long delta_ns, const enum hrtimer_mode mode)
-{
-	return __hrtimer_start_range_ns(timer, tim, delta_ns, mode, 1);
-}
 EXPORT_SYMBOL_GPL(hrtimer_start_range_ns);
 
 /**
@@ -1006,7 +998,7 @@ EXPORT_SYMBOL_GPL(hrtimer_start_range_ns);
 int
 hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
 {
-	return __hrtimer_start_range_ns(timer, tim, 0, mode, 1);
+	return hrtimer_start_range_ns(timer, tim, 0, mode);
 }
 EXPORT_SYMBOL_GPL(hrtimer_start);
 
-- 
cgit v1.2.3


From 02a171af1a46966dcdb5b38cdc33e4f43e92c778 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Tue, 14 Apr 2015 21:09:10 +0000
Subject: hrtimer: Make hrtimer_start() a inline wrapper

No point for an extra export just to set the extra argument of
hrtimer_start_range_ns() to 0.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Preeti U Murthy <preeti@linux.vnet.ibm.com>
Cc: Viresh Kumar <viresh.kumar@linaro.org>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Link: http://lkml.kernel.org/r/20150414203502.808544539@linutronix.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/hrtimer.h | 19 +++++++++++++++++--
 kernel/time/hrtimer.c   | 19 -------------------
 2 files changed, 17 insertions(+), 21 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index a80baa86bb24..42074ab3d5c3 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -355,11 +355,26 @@ static inline void destroy_hrtimer_on_stack(struct hrtimer *timer) { }
 #endif
 
 /* Basic timer operations: */
-extern int hrtimer_start(struct hrtimer *timer, ktime_t tim,
-			 const enum hrtimer_mode mode);
 extern int hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
 			unsigned long range_ns, const enum hrtimer_mode mode);
 
+/**
+ * hrtimer_start - (re)start an hrtimer on the current CPU
+ * @timer:	the timer to be added
+ * @tim:	expiry time
+ * @mode:	expiry mode: absolute (HRTIMER_MODE_ABS) or
+ *		relative (HRTIMER_MODE_REL)
+ *
+ * Returns:
+ *  0 on success
+ *  1 when the timer was active
+ */
+static inline int hrtimer_start(struct hrtimer *timer, ktime_t tim,
+				const enum hrtimer_mode mode)
+{
+	return hrtimer_start_range_ns(timer, tim, 0, mode);
+}
+
 extern int hrtimer_cancel(struct hrtimer *timer);
 extern int hrtimer_try_to_cancel(struct hrtimer *timer);
 
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 88d6ea25dde4..e5cf71aa6d77 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -984,25 +984,6 @@ int hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
 }
 EXPORT_SYMBOL_GPL(hrtimer_start_range_ns);
 
-/**
- * hrtimer_start - (re)start an hrtimer on the current CPU
- * @timer:	the timer to be added
- * @tim:	expiry time
- * @mode:	expiry mode: absolute (HRTIMER_MODE_ABS) or
- *		relative (HRTIMER_MODE_REL)
- *
- * Returns:
- *  0 on success
- *  1 when the timer was active
- */
-int
-hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
-{
-	return hrtimer_start_range_ns(timer, tim, 0, mode);
-}
-EXPORT_SYMBOL_GPL(hrtimer_start);
-
-
 /**
  * hrtimer_try_to_cancel - try to deactivate a timer
  * @timer:	hrtimer to stop
-- 
cgit v1.2.3


From 3f7b349ac14885472a19c46840235114e5ad5e52 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Tue, 14 Apr 2015 21:09:11 +0000
Subject: hrtimer: Remove bogus hrtimer_active() check

The check for hrtimer_active() after starting the timer is
pointless. If the timer is inactive it has expired already and
therefor the task pointer is already NULL.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Preeti U Murthy <preeti@linux.vnet.ibm.com>
Cc: Viresh Kumar <viresh.kumar@linaro.org>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Link: http://lkml.kernel.org/r/20150414203502.907149271@linutronix.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/time/hrtimer.c | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index e5cf71aa6d77..c38f0b6024b4 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -1361,8 +1361,6 @@ static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mod
 	do {
 		set_current_state(TASK_INTERRUPTIBLE);
 		hrtimer_start_expires(&t->timer, mode);
-		if (!hrtimer_active(&t->timer))
-			t->task = NULL;
 
 		if (likely(t->task))
 			freezable_schedule();
@@ -1633,8 +1631,6 @@ schedule_hrtimeout_range_clock(ktime_t *expires, unsigned long delta,
 	hrtimer_init_sleeper(&t, current);
 
 	hrtimer_start_expires(&t.timer, mode);
-	if (!hrtimer_active(&t.timer))
-		t.task = NULL;
 
 	if (likely(t.task))
 		schedule();
-- 
cgit v1.2.3


From 2e4b0d3fe88bc2618fd5d081ace338a70f8c23da Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Tue, 14 Apr 2015 21:09:13 +0000
Subject: futex: Remove bogus hrtimer_active() check

The check for hrtimer_active() after starting the timer is
pointless. If the timer is inactive it has expired already and
therefor the task pointer is already NULL.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Preeti U Murthy <preeti@linux.vnet.ibm.com>
Cc: Viresh Kumar <viresh.kumar@linaro.org>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Link: http://lkml.kernel.org/r/20150414203502.985825453@linutronix.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/futex.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/futex.c b/kernel/futex.c
index 2579e407ff67..720eacff6b58 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -2063,11 +2063,8 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
 	queue_me(q, hb);
 
 	/* Arm the timer */
-	if (timeout) {
+	if (timeout)
 		hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS);
-		if (!hrtimer_active(&timeout->timer))
-			timeout->task = NULL;
-	}
 
 	/*
 	 * If we have been removed from the hash list, then another task
-- 
cgit v1.2.3


From ccdd92c17e144c8494f4c94ab85b48d297545cec Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Tue, 14 Apr 2015 21:09:15 +0000
Subject: rtmutex: Remove bogus hrtimer_active() check

The check for hrtimer_active() after starting the timer is
pointless. If the timer is inactive it has expired already and
therefor the task pointer is already NULL.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Preeti U Murthy <preeti@linux.vnet.ibm.com>
Cc: Viresh Kumar <viresh.kumar@linaro.org>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Link: http://lkml.kernel.org/r/20150414203503.081830481@linutronix.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/locking/rtmutex.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index b73279367087..8626437acf0c 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -1180,11 +1180,8 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
 	set_current_state(state);
 
 	/* Setup the timer, when timeout != NULL */
-	if (unlikely(timeout)) {
+	if (unlikely(timeout))
 		hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS);
-		if (!hrtimer_active(&timeout->timer))
-			timeout->task = NULL;
-	}
 
 	ret = task_blocks_on_rt_mutex(lock, &waiter, current, chwalk);
 
-- 
cgit v1.2.3


From b193217e6dc3f88b599b573b53e0e0f6671d969a Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Tue, 14 Apr 2015 21:09:18 +0000
Subject: alarmtimer: Get rid of unused return value

We want to get rid of the hrtimer_start() return value and the alarm
timer return value is nowhere used. Remove it.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Preeti U Murthy <preeti@linux.vnet.ibm.com>
Cc: Viresh Kumar <viresh.kumar@linaro.org>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: John Stultz <john.stultz@linaro.org>
Link: http://lkml.kernel.org/r/20150414203503.243910615@linutronix.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/alarmtimer.h |  4 ++--
 kernel/time/alarmtimer.c   | 11 ++++-------
 2 files changed, 6 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/alarmtimer.h b/include/linux/alarmtimer.h
index a899402a5a0e..52f3b7da4f2d 100644
--- a/include/linux/alarmtimer.h
+++ b/include/linux/alarmtimer.h
@@ -43,8 +43,8 @@ struct alarm {
 
 void alarm_init(struct alarm *alarm, enum alarmtimer_type type,
 		enum alarmtimer_restart (*function)(struct alarm *, ktime_t));
-int alarm_start(struct alarm *alarm, ktime_t start);
-int alarm_start_relative(struct alarm *alarm, ktime_t start);
+void alarm_start(struct alarm *alarm, ktime_t start);
+void alarm_start_relative(struct alarm *alarm, ktime_t start);
 void alarm_restart(struct alarm *alarm);
 int alarm_try_to_cancel(struct alarm *alarm);
 int alarm_cancel(struct alarm *alarm);
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 0b55a7570c90..7fbba635a549 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -317,19 +317,16 @@ EXPORT_SYMBOL_GPL(alarm_init);
  * @alarm: ptr to alarm to set
  * @start: time to run the alarm
  */
-int alarm_start(struct alarm *alarm, ktime_t start)
+void alarm_start(struct alarm *alarm, ktime_t start)
 {
 	struct alarm_base *base = &alarm_bases[alarm->type];
 	unsigned long flags;
-	int ret;
 
 	spin_lock_irqsave(&base->lock, flags);
 	alarm->node.expires = start;
 	alarmtimer_enqueue(base, alarm);
-	ret = hrtimer_start(&alarm->timer, alarm->node.expires,
-				HRTIMER_MODE_ABS);
+	hrtimer_start(&alarm->timer, alarm->node.expires, HRTIMER_MODE_ABS);
 	spin_unlock_irqrestore(&base->lock, flags);
-	return ret;
 }
 EXPORT_SYMBOL_GPL(alarm_start);
 
@@ -338,12 +335,12 @@ EXPORT_SYMBOL_GPL(alarm_start);
  * @alarm: ptr to alarm to set
  * @start: time relative to now to run the alarm
  */
-int alarm_start_relative(struct alarm *alarm, ktime_t start)
+void alarm_start_relative(struct alarm *alarm, ktime_t start)
 {
 	struct alarm_base *base = &alarm_bases[alarm->type];
 
 	start = ktime_add(start, base->gettime());
-	return alarm_start(alarm, start);
+	alarm_start(alarm, start);
 }
 EXPORT_SYMBOL_GPL(alarm_start_relative);
 
-- 
cgit v1.2.3


From b8a62f1ff0ccb18fdc25c6150d1cd394610f4753 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Tue, 14 Apr 2015 21:09:22 +0000
Subject: tick: broadcast-hrtimer: Remove overly clever return value abuse

The assignment of bc_moved in the conditional construct relies on the
fact that in the case of hrtimer_start() invocation the return value
is always 0. It took me a while to understand it.

We want to get rid of the hrtimer_start() return value. Open code the
logic which makes it readable as well.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Preeti U Murthy <preeti@linux.vnet.ibm.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Viresh Kumar <viresh.kumar@linaro.org>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Link: http://lkml.kernel.org/r/20150414203503.404751457@linutronix.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/time/tick-broadcast-hrtimer.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c
index 6aac4beedbbe..96428d706b16 100644
--- a/kernel/time/tick-broadcast-hrtimer.c
+++ b/kernel/time/tick-broadcast-hrtimer.c
@@ -66,9 +66,11 @@ static int bc_set_next(ktime_t expires, struct clock_event_device *bc)
 	 * hrtimer_{start/cancel} functions call into tracing,
 	 * calls to these functions must be bound within RCU_NONIDLE.
 	 */
-	RCU_NONIDLE(bc_moved = (hrtimer_try_to_cancel(&bctimer) >= 0) ?
-		!hrtimer_start(&bctimer, expires, HRTIMER_MODE_ABS_PINNED) :
-			0);
+	RCU_NONIDLE({
+			bc_moved = hrtimer_try_to_cancel(&bctimer) >= 0;
+			if (bc_moved)
+				hrtimer_start(&bctimer, expires,
+					      HRTIMER_MODE_ABS_PINNED);});
 	if (bc_moved) {
 		/* Bind the "device" to the cpu */
 		bc->bound_on = smp_processor_id();
-- 
cgit v1.2.3


From 61699e13072a89880aa584dcc64c6da465fb2ccc Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Tue, 14 Apr 2015 21:09:23 +0000
Subject: hrtimer: Remove hrtimer_start() return value

No user was ever interested whether the timer was active or not when
it was started. All abusers of the return value are gone, so get rid
of it.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Preeti U Murthy <preeti@linux.vnet.ibm.com>
Cc: Viresh Kumar <viresh.kumar@linaro.org>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Link: http://lkml.kernel.org/r/20150414203503.483556394@linutronix.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/hrtimer.h   | 22 +++++++++-------------
 include/linux/interrupt.h |  6 +++---
 kernel/time/hrtimer.c     | 23 +++++++----------------
 3 files changed, 19 insertions(+), 32 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 42074ab3d5c3..470d876c2eda 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -355,7 +355,7 @@ static inline void destroy_hrtimer_on_stack(struct hrtimer *timer) { }
 #endif
 
 /* Basic timer operations: */
-extern int hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
+extern void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
 			unsigned long range_ns, const enum hrtimer_mode mode);
 
 /**
@@ -364,34 +364,30 @@ extern int hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
  * @tim:	expiry time
  * @mode:	expiry mode: absolute (HRTIMER_MODE_ABS) or
  *		relative (HRTIMER_MODE_REL)
- *
- * Returns:
- *  0 on success
- *  1 when the timer was active
  */
-static inline int hrtimer_start(struct hrtimer *timer, ktime_t tim,
-				const enum hrtimer_mode mode)
+static inline void hrtimer_start(struct hrtimer *timer, ktime_t tim,
+				 const enum hrtimer_mode mode)
 {
-	return hrtimer_start_range_ns(timer, tim, 0, mode);
+	hrtimer_start_range_ns(timer, tim, 0, mode);
 }
 
 extern int hrtimer_cancel(struct hrtimer *timer);
 extern int hrtimer_try_to_cancel(struct hrtimer *timer);
 
-static inline int hrtimer_start_expires(struct hrtimer *timer,
-						enum hrtimer_mode mode)
+static inline void hrtimer_start_expires(struct hrtimer *timer,
+					 enum hrtimer_mode mode)
 {
 	unsigned long delta;
 	ktime_t soft, hard;
 	soft = hrtimer_get_softexpires(timer);
 	hard = hrtimer_get_expires(timer);
 	delta = ktime_to_ns(ktime_sub(hard, soft));
-	return hrtimer_start_range_ns(timer, soft, delta, mode);
+	hrtimer_start_range_ns(timer, soft, delta, mode);
 }
 
-static inline int hrtimer_restart(struct hrtimer *timer)
+static inline void hrtimer_restart(struct hrtimer *timer)
 {
-	return hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
+	hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
 }
 
 /* Query timers: */
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 6bf15a66bce7..be7e75c945e9 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -593,10 +593,10 @@ tasklet_hrtimer_init(struct tasklet_hrtimer *ttimer,
 		     clockid_t which_clock, enum hrtimer_mode mode);
 
 static inline
-int tasklet_hrtimer_start(struct tasklet_hrtimer *ttimer, ktime_t time,
-			  const enum hrtimer_mode mode)
+void tasklet_hrtimer_start(struct tasklet_hrtimer *ttimer, ktime_t time,
+			   const enum hrtimer_mode mode)
 {
-	return hrtimer_start(&ttimer->timer, time, mode);
+	hrtimer_start(&ttimer->timer, time, mode);
 }
 
 static inline
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index c38f0b6024b4..beab02d3ff1e 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -923,22 +923,18 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base)
  * @delta_ns:	"slack" range for the timer
  * @mode:	expiry mode: absolute (HRTIMER_MODE_ABS) or
  *		relative (HRTIMER_MODE_REL)
- *
- * Returns:
- *  0 on success
- *  1 when the timer was active
  */
-int hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
-			   unsigned long delta_ns, const enum hrtimer_mode mode)
+void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
+			    unsigned long delta_ns, const enum hrtimer_mode mode)
 {
 	struct hrtimer_clock_base *base, *new_base;
 	unsigned long flags;
-	int ret, leftmost;
+	int leftmost;
 
 	base = lock_hrtimer_base(timer, &flags);
 
 	/* Remove an active timer from the queue: */
-	ret = remove_hrtimer(timer, base);
+	remove_hrtimer(timer, base);
 
 	if (mode & HRTIMER_MODE_REL) {
 		tim = ktime_add_safe(tim, base->get_time());
@@ -962,11 +958,8 @@ int hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
 	timer_stats_hrtimer_set_start_info(timer);
 
 	leftmost = enqueue_hrtimer(timer, new_base);
-
-	if (!leftmost) {
-		unlock_hrtimer_base(timer, &flags);
-		return ret;
-	}
+	if (!leftmost)
+		goto unlock;
 
 	if (!hrtimer_is_hres_active(timer)) {
 		/*
@@ -977,10 +970,8 @@ int hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
 	} else {
 		hrtimer_reprogram(timer, new_base);
 	}
-
+unlock:
 	unlock_hrtimer_base(timer, &flags);
-
-	return ret;
 }
 EXPORT_SYMBOL_GPL(hrtimer_start_range_ns);
 
-- 
cgit v1.2.3


From 19d9f4225dd6a47fca430f15eeae345ceb95c301 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Tue, 14 Apr 2015 21:09:25 +0000
Subject: hrtimer: Avoid locking in hrtimer_cancel() if timer not active

We can do a lockless check for hrtimer_active before actually taking
the lock in hrtimer[_try_to]_cancel. This is useful for hotpath users
like nanosleep as they avoid the lock dance when the timer has
expired.

This is safe because active is true when the timer is enqueued or the
callback is running. Taking the hrtimer base lock does not protect
against concurrent hrtimer_start calls, the callsite has to do the
proper serialization itself.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Preeti U Murthy <preeti@linux.vnet.ibm.com>
Cc: Viresh Kumar <viresh.kumar@linaro.org>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Link: http://lkml.kernel.org/r/20150414203503.580273114@linutronix.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/time/hrtimer.c | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'kernel')

diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index beab02d3ff1e..3bac94269a98 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -991,6 +991,15 @@ int hrtimer_try_to_cancel(struct hrtimer *timer)
 	unsigned long flags;
 	int ret = -1;
 
+	/*
+	 * Check lockless first. If the timer is not active (neither
+	 * enqueued nor running the callback, nothing to do here.  The
+	 * base lock does not serialize against a concurrent enqueue,
+	 * so we can avoid taking it.
+	 */
+	if (!hrtimer_active(timer))
+		return 0;
+
 	base = lock_hrtimer_base(timer, &flags);
 
 	if (!hrtimer_callback_running(timer))
-- 
cgit v1.2.3


From 6deba083e1de3f92f65c9849254e92a1ef001b73 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Tue, 14 Apr 2015 21:09:28 +0000
Subject: timer: Remove pointless return value of do_usleep_range()

The only user ignores it anyway and rightfully so.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Preeti U Murthy <preeti@linux.vnet.ibm.com>
Cc: Viresh Kumar <viresh.kumar@linaro.org>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Link: http://lkml.kernel.org/r/20150414203503.756060258@linutronix.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/time/timer.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 172b83cd2f8e..e9cc7e0642f2 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -1692,14 +1692,14 @@ unsigned long msleep_interruptible(unsigned int msecs)
 
 EXPORT_SYMBOL(msleep_interruptible);
 
-static int __sched do_usleep_range(unsigned long min, unsigned long max)
+static void __sched do_usleep_range(unsigned long min, unsigned long max)
 {
 	ktime_t kmin;
 	unsigned long delta;
 
 	kmin = ktime_set(0, min * NSEC_PER_USEC);
 	delta = (max - min) * NSEC_PER_USEC;
-	return schedule_hrtimeout_range(&kmin, delta, HRTIMER_MODE_REL);
+	schedule_hrtimeout_range(&kmin, delta, HRTIMER_MODE_REL);
 }
 
 /**
-- 
cgit v1.2.3


From 2ad5d3272d8e20e24d8242ebac9f3007f1ea56bc Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Tue, 14 Apr 2015 21:09:30 +0000
Subject: timer: Put usleep_range into the __sched section

do_usleep_range() and schedule_hrtimeout_range() are __sched as
well. So it makes no sense to have the exported function in a
different section.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Preeti U Murthy <preeti@linux.vnet.ibm.com>
Cc: Viresh Kumar <viresh.kumar@linaro.org>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Link: http://lkml.kernel.org/r/20150414203503.833709502@linutronix.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/time/timer.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index e9cc7e0642f2..03f926c7a8ee 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -1707,7 +1707,7 @@ static void __sched do_usleep_range(unsigned long min, unsigned long max)
  * @min: Minimum time in usecs to sleep
  * @max: Maximum time in usecs to sleep
  */
-void usleep_range(unsigned long min, unsigned long max)
+void __sched usleep_range(unsigned long min, unsigned long max)
 {
 	__set_current_state(TASK_UNINTERRUPTIBLE);
 	do_usleep_range(min, max);
-- 
cgit v1.2.3


From 5de2755c8c8b3a6b8414870e2c284914a2b42e4d Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Tue, 20 May 2014 15:49:48 +0200
Subject: hrtimer: Allow concurrent hrtimer_start() for self restarting timers

Because we drop cpu_base->lock around calling hrtimer::function, it is
possible for hrtimer_start() to come in between and enqueue the timer.

If hrtimer::function then returns HRTIMER_RESTART we'll hit the BUG_ON
because HRTIMER_STATE_ENQUEUED will be set.

Since the above is a perfectly valid scenario, remove the BUG_ON and
make the enqueue_hrtimer() call conditional on the timer not being
enqueued already.

NOTE: in that concurrent scenario its entirely common for both sites
to want to modify the hrtimer, since hrtimers don't provide
serialization themselves be sure to provide some such that the
hrtimer::function and the hrtimer_start() caller don't both try and
fudge the expiration state at the same time.

To that effect, add a WARN when someone tries to forward an already
enqueued timer, the most common way to change the expiry of self
restarting timers. Ideally we'd put the WARN in everything modifying
the expiry but most of that is inlines and we don't need the bloat.

Fixes: 2d44ae4d7135 ("hrtimer: clean up cpu->base locking tricks")
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Ben Segall <bsegall@google.com>
Cc: Roman Gushchin <klamm@yandex-team.ru>
Cc: Paul Turner <pjt@google.com>
Link: http://lkml.kernel.org/r/20150415113105.GT5029@twins.programming.kicks-ass.net
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/time/hrtimer.c | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 3bac94269a98..4adf32067862 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -799,6 +799,9 @@ u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval)
 	if (delta.tv64 < 0)
 		return 0;
 
+	if (WARN_ON(timer->state & HRTIMER_STATE_ENQUEUED))
+		return 0;
+
 	if (interval.tv64 < hrtimer_resolution)
 		interval.tv64 = hrtimer_resolution;
 
@@ -1139,11 +1142,14 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
 	 * Note: We clear the CALLBACK bit after enqueue_hrtimer and
 	 * we do not reprogramm the event hardware. Happens either in
 	 * hrtimer_start_range_ns() or in hrtimer_interrupt()
+	 *
+	 * Note: Because we dropped the cpu_base->lock above,
+	 * hrtimer_start_range_ns() can have popped in and enqueued the timer
+	 * for us already.
 	 */
-	if (restart != HRTIMER_NORESTART) {
-		BUG_ON(timer->state != HRTIMER_STATE_CALLBACK);
+	if (restart != HRTIMER_NORESTART &&
+	    !(timer->state & HRTIMER_STATE_ENQUEUED))
 		enqueue_hrtimer(timer, base);
-	}
 
 	WARN_ON_ONCE(!(timer->state & HRTIMER_STATE_CALLBACK));
 
-- 
cgit v1.2.3


From 77a4d1a1b9a122ca1fa3507bd30aec1520d7a8a4 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Wed, 15 Apr 2015 11:41:57 +0200
Subject: sched: Cleanup bandwidth timers

Roman reported a 3 cpu lockup scenario involving __start_cfs_bandwidth().

The more I look at that code the more I'm convinced its crack, that
entire __start_cfs_bandwidth() thing is brain melting, we don't need to
cancel a timer before starting it, *hrtimer_start*() will happily remove
the timer for you if its still enqueued.

Removing that, removes a big part of the problem, no more ugly cancel
loop to get stuck in.

So now, if I understand things right, the entire reason you have this
cfs_b->lock guarded ->timer_active nonsense is to make sure we don't
accidentally lose the timer.

It appears to me that it should be possible to guarantee that same by
unconditionally (re)starting the timer when !queued. Because regardless
what hrtimer::function will return, if we beat it to (re)enqueue the
timer, it doesn't matter.

Now, because hrtimers don't come with any serialization guarantees we
must ensure both handler and (re)start loop serialize their access to
the hrtimer to avoid both trying to forward the timer at the same
time.

Update the rt bandwidth timer to match.

This effectively reverts: 09dc4ab03936 ("sched/fair: Fix
tg_set_cfs_bandwidth() deadlock on rq->lock").

Reported-by: Roman Gushchin <klamm@yandex-team.ru>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Ben Segall <bsegall@google.com>
Cc: Paul Turner <pjt@google.com>
Link: http://lkml.kernel.org/r/20150415095011.804589208@infradead.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/sched/core.c  | 15 ++++++-------
 kernel/sched/fair.c  | 59 +++++++++++++---------------------------------------
 kernel/sched/rt.c    | 14 ++++++-------
 kernel/sched/sched.h |  4 ++--
 4 files changed, 31 insertions(+), 61 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 3026678113e7..d8a6196465d5 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -92,10 +92,13 @@
 
 void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
 {
-	if (hrtimer_active(period_timer))
-		return;
+	/*
+	 * Do not forward the expiration time of active timers;
+	 * we do not want to loose an overrun.
+	 */
+	if (!hrtimer_active(period_timer))
+		hrtimer_forward_now(period_timer, period);
 
-	hrtimer_forward_now(period_timer, period);
 	hrtimer_start_expires(period_timer, HRTIMER_MODE_ABS_PINNED);
 }
 
@@ -8113,10 +8116,8 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
 
 	__refill_cfs_bandwidth_runtime(cfs_b);
 	/* restart the period timer (if active) to handle new period expiry */
-	if (runtime_enabled && cfs_b->timer_active) {
-		/* force a reprogram */
-		__start_cfs_bandwidth(cfs_b, true);
-	}
+	if (runtime_enabled)
+		start_cfs_bandwidth(cfs_b);
 	raw_spin_unlock_irq(&cfs_b->lock);
 
 	for_each_online_cpu(i) {
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 854881b2526b..e3b32ebfe421 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3476,16 +3476,7 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 	if (cfs_b->quota == RUNTIME_INF)
 		amount = min_amount;
 	else {
-		/*
-		 * If the bandwidth pool has become inactive, then at least one
-		 * period must have elapsed since the last consumption.
-		 * Refresh the global state and ensure bandwidth timer becomes
-		 * active.
-		 */
-		if (!cfs_b->timer_active) {
-			__refill_cfs_bandwidth_runtime(cfs_b);
-			__start_cfs_bandwidth(cfs_b, false);
-		}
+		start_cfs_bandwidth(cfs_b);
 
 		if (cfs_b->runtime > 0) {
 			amount = min(cfs_b->runtime, min_amount);
@@ -3634,6 +3625,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
 	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
 	struct sched_entity *se;
 	long task_delta, dequeue = 1;
+	bool empty;
 
 	se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
 
@@ -3663,13 +3655,21 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
 	cfs_rq->throttled = 1;
 	cfs_rq->throttled_clock = rq_clock(rq);
 	raw_spin_lock(&cfs_b->lock);
+	empty = list_empty(&cfs_rq->throttled_list);
+
 	/*
 	 * Add to the _head_ of the list, so that an already-started
 	 * distribute_cfs_runtime will not see us
 	 */
 	list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
-	if (!cfs_b->timer_active)
-		__start_cfs_bandwidth(cfs_b, false);
+
+	/*
+	 * If we're the first throttled task, make sure the bandwidth
+	 * timer is running.
+	 */
+	if (empty)
+		start_cfs_bandwidth(cfs_b);
+
 	raw_spin_unlock(&cfs_b->lock);
 }
 
@@ -3784,13 +3784,6 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
 	if (cfs_b->idle && !throttled)
 		goto out_deactivate;
 
-	/*
-	 * if we have relooped after returning idle once, we need to update our
-	 * status as actually running, so that other cpus doing
-	 * __start_cfs_bandwidth will stop trying to cancel us.
-	 */
-	cfs_b->timer_active = 1;
-
 	__refill_cfs_bandwidth_runtime(cfs_b);
 
 	if (!throttled) {
@@ -3835,7 +3828,6 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
 	return 0;
 
 out_deactivate:
-	cfs_b->timer_active = 0;
 	return 1;
 }
 
@@ -3999,6 +3991,7 @@ static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
 {
 	struct cfs_bandwidth *cfs_b =
 		container_of(timer, struct cfs_bandwidth, slack_timer);
+
 	do_sched_cfs_slack_timer(cfs_b);
 
 	return HRTIMER_NORESTART;
@@ -4008,15 +4001,12 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
 {
 	struct cfs_bandwidth *cfs_b =
 		container_of(timer, struct cfs_bandwidth, period_timer);
-	ktime_t now;
 	int overrun;
 	int idle = 0;
 
 	raw_spin_lock(&cfs_b->lock);
 	for (;;) {
-		now = hrtimer_cb_get_time(timer);
-		overrun = hrtimer_forward(timer, now, cfs_b->period);
-
+		overrun = hrtimer_forward_now(timer, cfs_b->period);
 		if (!overrun)
 			break;
 
@@ -4047,27 +4037,8 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 	INIT_LIST_HEAD(&cfs_rq->throttled_list);
 }
 
-/* requires cfs_b->lock, may release to reprogram timer */
-void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b, bool force)
+void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
 {
-	/*
-	 * The timer may be active because we're trying to set a new bandwidth
-	 * period or because we're racing with the tear-down path
-	 * (timer_active==0 becomes visible before the hrtimer call-back
-	 * terminates).  In either case we ensure that it's re-programmed
-	 */
-	while (unlikely(hrtimer_active(&cfs_b->period_timer)) &&
-	       hrtimer_try_to_cancel(&cfs_b->period_timer) < 0) {
-		/* bounce the lock to allow do_sched_cfs_period_timer to run */
-		raw_spin_unlock(&cfs_b->lock);
-		cpu_relax();
-		raw_spin_lock(&cfs_b->lock);
-		/* if someone else restarted the timer then we're done */
-		if (!force && cfs_b->timer_active)
-			return;
-	}
-
-	cfs_b->timer_active = 1;
 	start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period);
 }
 
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 575da76a3874..b0febf25d8f1 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -18,19 +18,20 @@ static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
 {
 	struct rt_bandwidth *rt_b =
 		container_of(timer, struct rt_bandwidth, rt_period_timer);
-	ktime_t now;
-	int overrun;
 	int idle = 0;
+	int overrun;
 
+	raw_spin_lock(&rt_b->rt_runtime_lock);
 	for (;;) {
-		now = hrtimer_cb_get_time(timer);
-		overrun = hrtimer_forward(timer, now, rt_b->rt_period);
-
+		overrun = hrtimer_forward_now(timer, rt_b->rt_period);
 		if (!overrun)
 			break;
 
+		raw_spin_unlock(&rt_b->rt_runtime_lock);
 		idle = do_sched_rt_period_timer(rt_b, overrun);
+		raw_spin_lock(&rt_b->rt_runtime_lock);
 	}
+	raw_spin_unlock(&rt_b->rt_runtime_lock);
 
 	return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
 }
@@ -52,9 +53,6 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
 	if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
 		return;
 
-	if (hrtimer_active(&rt_b->rt_period_timer))
-		return;
-
 	raw_spin_lock(&rt_b->rt_runtime_lock);
 	start_bandwidth_timer(&rt_b->rt_period_timer, rt_b->rt_period);
 	raw_spin_unlock(&rt_b->rt_runtime_lock);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index e0e129993958..08606a1f8c4d 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -215,7 +215,7 @@ struct cfs_bandwidth {
 	s64 hierarchical_quota;
 	u64 runtime_expires;
 
-	int idle, timer_active;
+	int idle;
 	struct hrtimer period_timer, slack_timer;
 	struct list_head throttled_cfs_rq;
 
@@ -306,7 +306,7 @@ extern void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b);
 extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
 
 extern void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b);
-extern void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b, bool force);
+extern void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b);
 extern void unthrottle_cfs_rq(struct cfs_rq *cfs_rq);
 
 extern void free_rt_sched_group(struct task_group *tg);
-- 
cgit v1.2.3


From 272325c4821f052092c41feac21f4a1a46f0ad48 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Wed, 15 Apr 2015 11:41:58 +0200
Subject: perf: Fix mux_interval hrtimer wreckage

Thomas stumbled over the hrtimer_forward_now() in
perf_event_mux_interval_ms_store() and noticed its broken-ness.

You cannot just change the expiry time of an active timer, it will
destroy the red-black tree order and cause havoc.

Change it to (re)start the timer instead, (re)starting a timer will
dequeue and enqueue a timer and therefore preserve rb-tree order.

Since we cannot enqueue remotely, wrap the thing in
cpu_function_call(), this however mandates that we restrict ourselves
to online cpus. Also serialize the entire setting so we don't get
multiple concurrent threads trying to update to different values.

Also fix a problem in perf_mux_hrtimer_restart(), checking against
hrtimer_active() can actually loose us the timer when timer->state ==
HRTIMER_STATE_CALLBACK and the callback has already decided NORESTART.

Furthermore it doesn't make any sense to test
hrtimer_callback_running() when we already tested hrtimer_active(),
but with the above change, we explicitly must call it when
callback_running.

Lastly, rename a few functions:

  s/perf_cpu_hrtimer_/perf_mux_hrtimer_/ -- because I could not find
                                            the mux timer function

  s/\<hr\>/timer/ -- because that's the normal way of calling things.

Fixes: 62b856397927 ("perf: Add sysfs entry to adjust multiplexing interval per PMU")
Reported-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: http://lkml.kernel.org/r/20150415095011.863052571@infradead.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/events/core.c | 63 ++++++++++++++++++++++++++++++----------------------
 1 file changed, 36 insertions(+), 27 deletions(-)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 05309fdba2a4..e7ed00b4a6ed 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -51,9 +51,11 @@
 
 static struct workqueue_struct *perf_wq;
 
+typedef int (*remote_function_f)(void *);
+
 struct remote_function_call {
 	struct task_struct	*p;
-	int			(*func)(void *info);
+	remote_function_f	func;
 	void			*info;
 	int			ret;
 };
@@ -86,7 +88,7 @@ static void remote_function(void *data)
  *	    -EAGAIN - when the process moved away
  */
 static int
-task_function_call(struct task_struct *p, int (*func) (void *info), void *info)
+task_function_call(struct task_struct *p, remote_function_f func, void *info)
 {
 	struct remote_function_call data = {
 		.p	= p,
@@ -110,7 +112,7 @@ task_function_call(struct task_struct *p, int (*func) (void *info), void *info)
  *
  * returns: @func return value or -ENXIO when the cpu is offline
  */
-static int cpu_function_call(int cpu, int (*func) (void *info), void *info)
+static int cpu_function_call(int cpu, remote_function_f func, void *info)
 {
 	struct remote_function_call data = {
 		.p	= NULL,
@@ -747,7 +749,7 @@ perf_cgroup_mark_enabled(struct perf_event *event,
 /*
  * function must be called with interrupts disbled
  */
-static enum hrtimer_restart perf_cpu_hrtimer_handler(struct hrtimer *hr)
+static enum hrtimer_restart perf_mux_hrtimer_handler(struct hrtimer *hr)
 {
 	struct perf_cpu_context *cpuctx;
 	enum hrtimer_restart ret = HRTIMER_NORESTART;
@@ -771,7 +773,7 @@ static enum hrtimer_restart perf_cpu_hrtimer_handler(struct hrtimer *hr)
 }
 
 /* CPU is going down */
-void perf_cpu_hrtimer_cancel(int cpu)
+void perf_mux_hrtimer_cancel(int cpu)
 {
 	struct perf_cpu_context *cpuctx;
 	struct pmu *pmu;
@@ -798,11 +800,11 @@ void perf_cpu_hrtimer_cancel(int cpu)
 	local_irq_restore(flags);
 }
 
-static void __perf_cpu_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
+static void __perf_mux_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
 {
-	struct hrtimer *hr = &cpuctx->hrtimer;
+	struct hrtimer *timer = &cpuctx->hrtimer;
 	struct pmu *pmu = cpuctx->ctx.pmu;
-	int timer;
+	u64 interval;
 
 	/* no multiplexing needed for SW PMU */
 	if (pmu->task_ctx_nr == perf_sw_context)
@@ -812,29 +814,30 @@ static void __perf_cpu_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
 	 * check default is sane, if not set then force to
 	 * default interval (1/tick)
 	 */
-	timer = pmu->hrtimer_interval_ms;
-	if (timer < 1)
-		timer = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER;
+	interval = pmu->hrtimer_interval_ms;
+	if (interval < 1)
+		interval = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER;
 
-	cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
+	cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval);
 
-	hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
-	hr->function = perf_cpu_hrtimer_handler;
+	hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
+	timer->function = perf_mux_hrtimer_handler;
 }
 
-static void perf_cpu_hrtimer_restart(struct perf_cpu_context *cpuctx)
+static int perf_mux_hrtimer_restart(struct perf_cpu_context *cpuctx)
 {
-	struct hrtimer *hr = &cpuctx->hrtimer;
+	struct hrtimer *timer = &cpuctx->hrtimer;
 	struct pmu *pmu = cpuctx->ctx.pmu;
 
 	/* not for SW PMU */
 	if (pmu->task_ctx_nr == perf_sw_context)
-		return;
+		return 0;
 
-	if (hrtimer_active(hr))
-		return;
+	if (hrtimer_is_queued(timer))
+		return 0;
 
-	hrtimer_start(hr, cpuctx->hrtimer_interval, HRTIMER_MODE_REL_PINNED);
+	hrtimer_start(timer, cpuctx->hrtimer_interval, HRTIMER_MODE_REL_PINNED);
+	return 0;
 }
 
 void perf_pmu_disable(struct pmu *pmu)
@@ -1913,7 +1916,7 @@ group_sched_in(struct perf_event *group_event,
 
 	if (event_sched_in(group_event, cpuctx, ctx)) {
 		pmu->cancel_txn(pmu);
-		perf_cpu_hrtimer_restart(cpuctx);
+		perf_mux_hrtimer_restart(cpuctx);
 		return -EAGAIN;
 	}
 
@@ -1960,7 +1963,7 @@ group_error:
 
 	pmu->cancel_txn(pmu);
 
-	perf_cpu_hrtimer_restart(cpuctx);
+	perf_mux_hrtimer_restart(cpuctx);
 
 	return -EAGAIN;
 }
@@ -2233,7 +2236,7 @@ static int __perf_event_enable(void *info)
 		 */
 		if (leader != event) {
 			group_sched_out(leader, cpuctx, ctx);
-			perf_cpu_hrtimer_restart(cpuctx);
+			perf_mux_hrtimer_restart(cpuctx);
 		}
 		if (leader->attr.pinned) {
 			update_group_times(leader);
@@ -7143,6 +7146,8 @@ perf_event_mux_interval_ms_show(struct device *dev,
 	return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->hrtimer_interval_ms);
 }
 
+static DEFINE_MUTEX(mux_interval_mutex);
+
 static ssize_t
 perf_event_mux_interval_ms_store(struct device *dev,
 				 struct device_attribute *attr,
@@ -7162,17 +7167,21 @@ perf_event_mux_interval_ms_store(struct device *dev,
 	if (timer == pmu->hrtimer_interval_ms)
 		return count;
 
+	mutex_lock(&mux_interval_mutex);
 	pmu->hrtimer_interval_ms = timer;
 
 	/* update all cpuctx for this PMU */
-	for_each_possible_cpu(cpu) {
+	get_online_cpus();
+	for_each_online_cpu(cpu) {
 		struct perf_cpu_context *cpuctx;
 		cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
 		cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
 
-		if (hrtimer_active(&cpuctx->hrtimer))
-			hrtimer_forward_now(&cpuctx->hrtimer, cpuctx->hrtimer_interval);
+		cpu_function_call(cpu,
+			(remote_function_f)perf_mux_hrtimer_restart, cpuctx);
 	}
+	put_online_cpus();
+	mutex_unlock(&mux_interval_mutex);
 
 	return count;
 }
@@ -7277,7 +7286,7 @@ skip_type:
 		lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock);
 		cpuctx->ctx.pmu = pmu;
 
-		__perf_cpu_hrtimer_init(cpuctx, cpu);
+		__perf_mux_hrtimer_init(cpuctx, cpu);
 
 		cpuctx->unique_pmu = pmu;
 	}
-- 
cgit v1.2.3


From 9183034879a1196714836c8142340c850c747323 Mon Sep 17 00:00:00 2001
From: kbuild test robot
Date: Thu, 23 Apr 2015 04:00:00 +0800
Subject: perf: perf_mux_hrtimer_cancel() can be static

Signed-off-by: Fengguang Wu <fengguang.wu@intel.com>
Cc: kbuild-all@01.org
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/20150422200000.GA122603@lkp-sb04
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/events/core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index e7ed00b4a6ed..598182dcc260 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -773,7 +773,7 @@ static enum hrtimer_restart perf_mux_hrtimer_handler(struct hrtimer *hr)
 }
 
 /* CPU is going down */
-void perf_mux_hrtimer_cancel(int cpu)
+static void perf_mux_hrtimer_cancel(int cpu)
 {
 	struct perf_cpu_context *cpuctx;
 	struct pmu *pmu;
-- 
cgit v1.2.3


From b484403b9abe5f444ae2fee6a249759bb3c35bcf Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Thu, 23 Apr 2015 13:58:09 +0200
Subject: sched: debug: Remove the cfs bandwidth timer_active printout

The struct member is gone.

Reported-by: fengguang.wu@intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
---
 kernel/sched/debug.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index a245c1fc6f0a..f94724eda407 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -230,8 +230,6 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 #endif
 #endif
 #ifdef CONFIG_CFS_BANDWIDTH
-	SEQ_printf(m, "  .%-30s: %d\n", "tg->cfs_bandwidth.timer_active",
-			cfs_rq->tg->cfs_bandwidth.timer_active);
 	SEQ_printf(m, "  .%-30s: %d\n", "throttled",
 			cfs_rq->throttled);
 	SEQ_printf(m, "  .%-30s: %d\n", "throttle_count",
-- 
cgit v1.2.3


From 30fbd59057004f97f45467124693f22e8b6f3e16 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Mon, 4 May 2015 13:51:12 +0200
Subject: perf: Remove unused function perf_mux_hrtimer_cancel()

Reported-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/events/core.c | 28 ----------------------------
 1 file changed, 28 deletions(-)

(limited to 'kernel')

diff --git a/kernel/events/core.c b/kernel/events/core.c
index 598182dcc260..f5288293d667 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -772,34 +772,6 @@ static enum hrtimer_restart perf_mux_hrtimer_handler(struct hrtimer *hr)
 	return ret;
 }
 
-/* CPU is going down */
-static void perf_mux_hrtimer_cancel(int cpu)
-{
-	struct perf_cpu_context *cpuctx;
-	struct pmu *pmu;
-	unsigned long flags;
-
-	if (WARN_ON(cpu != smp_processor_id()))
-		return;
-
-	local_irq_save(flags);
-
-	rcu_read_lock();
-
-	list_for_each_entry_rcu(pmu, &pmus, entry) {
-		cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
-
-		if (pmu->task_ctx_nr == perf_sw_context)
-			continue;
-
-		hrtimer_cancel(&cpuctx->hrtimer);
-	}
-
-	rcu_read_unlock();
-
-	local_irq_restore(flags);
-}
-
 static void __perf_mux_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
 {
 	struct hrtimer *timer = &cpuctx->hrtimer;
-- 
cgit v1.2.3


From 2951d5c031a3aaefa31b688fbf229e75692f4786 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Tue, 5 May 2015 10:00:13 +0200
Subject: tick: broadcast: Prevent livelock from event handler

With the removal of the hrtimer softirq the switch to highres/nohz
mode happens in the tick interrupt. That leads to a livelock when the
per cpu event handler is directly called from the broadcast handler
under broadcast lock because broadcast lock needs to be taken for the
highres/nohz switch as well.

Solve this by calling the cpu local handler outside the broadcast_lock
held region.

Fixes: c6eb3f70d448 "hrtimer: Get rid of hrtimer softirq"
Reported-and-tested-by: Borislav Petkov <bp@alien8.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/time/tick-broadcast.c | 53 +++++++++++++++++++++-----------------------
 1 file changed, 25 insertions(+), 28 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 7e8ca4f448a8..5d9e4aab9797 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -255,18 +255,18 @@ int tick_receive_broadcast(void)
 /*
  * Broadcast the event to the cpus, which are set in the mask (mangled).
  */
-static void tick_do_broadcast(struct cpumask *mask)
+static bool tick_do_broadcast(struct cpumask *mask)
 {
 	int cpu = smp_processor_id();
 	struct tick_device *td;
+	bool local = false;
 
 	/*
 	 * Check, if the current cpu is in the mask
 	 */
 	if (cpumask_test_cpu(cpu, mask)) {
 		cpumask_clear_cpu(cpu, mask);
-		td = &per_cpu(tick_cpu_device, cpu);
-		td->evtdev->event_handler(td->evtdev);
+		local = true;
 	}
 
 	if (!cpumask_empty(mask)) {
@@ -279,16 +279,17 @@ static void tick_do_broadcast(struct cpumask *mask)
 		td = &per_cpu(tick_cpu_device, cpumask_first(mask));
 		td->evtdev->broadcast(mask);
 	}
+	return local;
 }
 
 /*
  * Periodic broadcast:
  * - invoke the broadcast handlers
  */
-static void tick_do_periodic_broadcast(void)
+static bool tick_do_periodic_broadcast(void)
 {
 	cpumask_and(tmpmask, cpu_online_mask, tick_broadcast_mask);
-	tick_do_broadcast(tmpmask);
+	return tick_do_broadcast(tmpmask);
 }
 
 /*
@@ -296,34 +297,26 @@ static void tick_do_periodic_broadcast(void)
  */
 static void tick_handle_periodic_broadcast(struct clock_event_device *dev)
 {
-	ktime_t next;
+	struct tick_device *td = this_cpu_ptr(&tick_cpu_device);
+	bool bc_local;
 
 	raw_spin_lock(&tick_broadcast_lock);
+	bc_local = tick_do_periodic_broadcast();
 
-	tick_do_periodic_broadcast();
+	if (dev->state == CLOCK_EVT_STATE_ONESHOT) {
+		ktime_t next = ktime_add(dev->next_event, tick_period);
 
-	/*
-	 * The device is in periodic mode. No reprogramming necessary:
-	 */
-	if (dev->state == CLOCK_EVT_STATE_PERIODIC)
-		goto unlock;
+		clockevents_program_event(dev, next, true);
+	}
+	raw_spin_unlock(&tick_broadcast_lock);
 
 	/*
-	 * Setup the next period for devices, which do not have
-	 * periodic mode. We read dev->next_event first and add to it
-	 * when the event already expired. clockevents_program_event()
-	 * sets dev->next_event only when the event is really
-	 * programmed to the device.
+	 * We run the handler of the local cpu after dropping
+	 * tick_broadcast_lock because the handler might deadlock when
+	 * trying to switch to oneshot mode.
 	 */
-	for (next = dev->next_event; ;) {
-		next = ktime_add(next, tick_period);
-
-		if (!clockevents_program_event(dev, next, false))
-			goto unlock;
-		tick_do_periodic_broadcast();
-	}
-unlock:
-	raw_spin_unlock(&tick_broadcast_lock);
+	if (bc_local)
+		td->evtdev->event_handler(td->evtdev);
 }
 
 /**
@@ -622,9 +615,13 @@ again:
 		cpumask_and(tmpmask, tmpmask, cpu_online_mask);
 
 	/*
-	 * Wakeup the cpus which have an expired event.
+	 * Wakeup the cpus which have an expired event and handle the
+	 * broadcast event of the local cpu.
 	 */
-	tick_do_broadcast(tmpmask);
+	if (tick_do_broadcast(tmpmask)) {
+		td = this_cpu_ptr(&tick_cpu_device);
+		td->evtdev->event_handler(td->evtdev);
+	}
 
 	/*
 	 * Two reasons for reprogram:
-- 
cgit v1.2.3


From 298dbd1c5cd66f0ac85981b83b7d519a5d88d1b8 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Tue, 5 May 2015 09:44:24 +0200
Subject: tick: broadcast: Simplify oneshot logic and shorten lock region

Simplify the oneshot logic by avoiding the reprogramming loops. That
also allows to call the cpu local handler outside of the
broadcast_lock held region.

Tested-by: Borislav Petkov <bp@alien8.de>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/time/tick-broadcast.c | 42 +++++++++++++++++-------------------------
 1 file changed, 17 insertions(+), 25 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 5d9e4aab9797..12fcc55d607a 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -525,18 +525,14 @@ static void tick_broadcast_set_affinity(struct clock_event_device *bc,
 	irq_set_affinity(bc->irq, bc->cpumask);
 }
 
-static int tick_broadcast_set_event(struct clock_event_device *bc, int cpu,
-				    ktime_t expires, int force)
+static void tick_broadcast_set_event(struct clock_event_device *bc, int cpu,
+				     ktime_t expires)
 {
-	int ret;
-
 	if (bc->state != CLOCK_EVT_STATE_ONESHOT)
 		clockevents_set_state(bc, CLOCK_EVT_STATE_ONESHOT);
 
-	ret = clockevents_program_event(bc, expires, force);
-	if (!ret)
-		tick_broadcast_set_affinity(bc, cpumask_of(cpu));
-	return ret;
+	clockevents_program_event(bc, expires, 1);
+	tick_broadcast_set_affinity(bc, cpumask_of(cpu));
 }
 
 static void tick_resume_broadcast_oneshot(struct clock_event_device *bc)
@@ -573,9 +569,9 @@ static void tick_handle_oneshot_broadcast(struct clock_event_device *dev)
 	struct tick_device *td;
 	ktime_t now, next_event;
 	int cpu, next_cpu = 0;
+	bool bc_local;
 
 	raw_spin_lock(&tick_broadcast_lock);
-again:
 	dev->next_event.tv64 = KTIME_MAX;
 	next_event.tv64 = KTIME_MAX;
 	cpumask_clear(tmpmask);
@@ -615,13 +611,9 @@ again:
 		cpumask_and(tmpmask, tmpmask, cpu_online_mask);
 
 	/*
-	 * Wakeup the cpus which have an expired event and handle the
-	 * broadcast event of the local cpu.
+	 * Wakeup the cpus which have an expired event.
 	 */
-	if (tick_do_broadcast(tmpmask)) {
-		td = this_cpu_ptr(&tick_cpu_device);
-		td->evtdev->event_handler(td->evtdev);
-	}
+	bc_local = tick_do_broadcast(tmpmask);
 
 	/*
 	 * Two reasons for reprogram:
@@ -633,15 +625,15 @@ again:
 	 * - There are pending events on sleeping CPUs which were not
 	 * in the event mask
 	 */
-	if (next_event.tv64 != KTIME_MAX) {
-		/*
-		 * Rearm the broadcast device. If event expired,
-		 * repeat the above
-		 */
-		if (tick_broadcast_set_event(dev, next_cpu, next_event, 0))
-			goto again;
-	}
+	if (next_event.tv64 != KTIME_MAX)
+		tick_broadcast_set_event(dev, next_cpu, next_event);
+
 	raw_spin_unlock(&tick_broadcast_lock);
+
+	if (bc_local) {
+		td = this_cpu_ptr(&tick_cpu_device);
+		td->evtdev->event_handler(td->evtdev);
+	}
 }
 
 static int broadcast_needs_cpu(struct clock_event_device *bc, int cpu)
@@ -723,7 +715,7 @@ int tick_broadcast_oneshot_control(enum tick_broadcast_state state)
 			 */
 			if (!cpumask_test_cpu(cpu, tick_broadcast_force_mask) &&
 			    dev->next_event.tv64 < bc->next_event.tv64)
-				tick_broadcast_set_event(bc, cpu, dev->next_event, 1);
+				tick_broadcast_set_event(bc, cpu, dev->next_event);
 		}
 		/*
 		 * If the current CPU owns the hrtimer broadcast
@@ -858,7 +850,7 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
 			clockevents_set_state(bc, CLOCK_EVT_STATE_ONESHOT);
 			tick_broadcast_init_next_event(tmpmask,
 						       tick_next_period);
-			tick_broadcast_set_event(bc, cpu, tick_next_period, 1);
+			tick_broadcast_set_event(bc, cpu, tick_next_period);
 		} else
 			bc->next_event.tv64 = KTIME_MAX;
 	} else {
-- 
cgit v1.2.3


From 1ef09cd713c90781b683a0b4e0a874803c172b1d Mon Sep 17 00:00:00 2001
From: Preeti U Murthy
Date: Tue, 28 Apr 2015 14:15:20 +0530
Subject: tick-broadcast: Fix the printing of broadcast masks

Today the number of bits of the broadcast masks that is output into
/proc/timer_list is sizeof(unsigned long). This means that on machines
with a larger number of CPUs, the bitmasks of CPUs beyond this range do
not appear.

Fix this by using bitmap printing through "%*pb" instead, so as to
output the broadcast masks for the range of nr_cpu_ids into
/proc/timer_list.

Signed-off-by: Preeti U Murthy <preeti@linux.vnet.ibm.com>
Cc: peterz@infradead.org
Cc: linuxppc-dev@ozlabs.org
Cc: john.stultz@linaro.org
Link: http://lkml.kernel.org/r/20150428084520.3314.62668.stgit@preeti.in.ibm.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/time/timer_list.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index 66f39bba5353..18b074b215b0 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -276,11 +276,11 @@ static void timer_list_show_tickdevices_header(struct seq_file *m)
 {
 #ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
 	print_tickdevice(m, tick_get_broadcast_device(), -1);
-	SEQ_printf(m, "tick_broadcast_mask: %08lx\n",
-		   cpumask_bits(tick_get_broadcast_mask())[0]);
+	SEQ_printf(m, "tick_broadcast_mask: %*pb\n",
+		   cpumask_pr_args(tick_get_broadcast_mask()));
 #ifdef CONFIG_TICK_ONESHOT
-	SEQ_printf(m, "tick_broadcast_oneshot_mask: %08lx\n",
-		   cpumask_bits(tick_get_broadcast_oneshot_mask())[0]);
+	SEQ_printf(m, "tick_broadcast_oneshot_mask: %*pb\n",
+		   cpumask_pr_args(tick_get_broadcast_oneshot_mask()));
 #endif
 	SEQ_printf(m, "\n");
 #endif
-- 
cgit v1.2.3


From 781978e6e156101209f62b9ebc8783b70ef248de Mon Sep 17 00:00:00 2001
From: Joonwoo Park
Date: Mon, 27 Apr 2015 19:21:49 -0700
Subject: timer: Use timer->base for flag checks

At present, internal_add_timer() examines flags with 'base' which doesn't
contain flags.  Examine with 'timer->base' to avoid unnecessary waking up
of nohz CPU when timer base has TIMER_DEFERRABLE set.

Signed-off-by: Joonwoo Park <joonwoop@codeaurora.org>
Cc: sboyd@codeaurora.org
Cc: skannan@codeaurora.org
Cc: John Stultz <john.stultz@linaro.org>
Link: http://lkml.kernel.org/r/1430187709-21087-1-git-send-email-joonwoop@codeaurora.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/time/timer.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 03f926c7a8ee..d4af7c56c95d 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -436,7 +436,7 @@ static void internal_add_timer(struct tvec_base *base, struct timer_list *timer)
 	 * require special care against races with idle_cpu(), lets deal
 	 * with that later.
 	 */
-	if (!tbase_get_deferrable(base) || tick_nohz_full_cpu(base->cpu))
+	if (!tbase_get_deferrable(timer->base) || tick_nohz_full_cpu(base->cpu))
 		wake_up_nohz_cpu(base->cpu);
 }
 
-- 
cgit v1.2.3


From 38d23a6cc16c02f7b0c920266053f340b5601735 Mon Sep 17 00:00:00 2001
From: Andreas Sandberg
Date: Fri, 24 Apr 2015 13:06:05 +0000
Subject: tick: hrtimer-broadcast: Prevent endless restarting when broadcast
 device is unused

The hrtimer callback in the hrtimer's tick broadcast code sometimes
incorrectly ends up scheduling events at the current tick causing the
kernel to hang servicing the same hrtimer forever. This typically
happens when a device is swapped out by
tick_install_broadcast_device(), which replaces the event handler with
clock_events_handle_noop() and sets the device mode to
CLOCK_EVT_MODE_UNUSED. If the timer is scheduled when this happens,
the next_event field will not be updated and the hrtimer ends up being
restarted at the current tick. To prevent this from happening, only
try to restart the hrtimer if the broadcast clock event device is in
one of the active modes and try to cancel the timer when entering the
CLOCK_EVT_MODE_UNUSED mode.

Signed-off-by: Andreas Sandberg <andreas.sandberg@arm.com>
Tested-by: Catalin Marinas <catalin.marinas@arm.com>
Acked-by: Mark Rutland <mark.rutland@arm.com>
Reviewed-by: Preeti U Murthy <preeti@linux.vnet.ibm.com>
Link: http://lkml.kernel.org/r/1429880765-5558-1-git-send-email-andreas.sandberg@arm.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/time/tick-broadcast-hrtimer.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c
index 96428d706b16..3e7db49a2381 100644
--- a/kernel/time/tick-broadcast-hrtimer.c
+++ b/kernel/time/tick-broadcast-hrtimer.c
@@ -22,6 +22,7 @@ static void bc_set_mode(enum clock_event_mode mode,
 			struct clock_event_device *bc)
 {
 	switch (mode) {
+	case CLOCK_EVT_MODE_UNUSED:
 	case CLOCK_EVT_MODE_SHUTDOWN:
 		/*
 		 * Note, we cannot cancel the timer here as we might
@@ -101,10 +102,13 @@ static enum hrtimer_restart bc_handler(struct hrtimer *t)
 {
 	ce_broadcast_hrtimer.event_handler(&ce_broadcast_hrtimer);
 
-	if (ce_broadcast_hrtimer.next_event.tv64 == KTIME_MAX)
+	switch (ce_broadcast_hrtimer.mode) {
+	case CLOCK_EVT_MODE_ONESHOT:
+		if (ce_broadcast_hrtimer.next_event.tv64 != KTIME_MAX)
+			return HRTIMER_RESTART;
+	default:
 		return HRTIMER_NORESTART;
-
-	return HRTIMER_RESTART;
+	}
 }
 
 void tick_setup_hrtimer_broadcast(void)
-- 
cgit v1.2.3


From 6b442bc81337913eb775965a67ffdb8a36935422 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Thu, 7 May 2015 14:35:59 +0200
Subject: nohz: Fix !HIGH_RES_TIMERS hang

Simon Horman reported this crash on a system with
high-res timers disabled but nohz enabled:

  > ------------[ cut here ]------------
  > kernel BUG at kernel/irq_work.c:135!

    BUG_ON(!irqs_disabled());

So something enabled interrupts in the periodic tick handling machinery,
and that code path indeed has a local_irq_disable()/enable pair in
tick_nohz_switch_to_nohz() which causes havoc. Fix it.

This patch also fixes a +nohz -hrtimers hang reported by Ingo Molnar.

Reported-by: Simon Horman <horms@verge.net.au>
Reported-by: Ingo Molnar <mingo@kernel.org>
Tested-by: Simon Horman <horms@verge.net.au>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Borislav Petkov <bp@alien8.de>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: LAK <linux-arm-kernel@lists.infradead.org>
Cc: Magnus Damm <magnus.damm@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/alpine.DEB.2.11.1505071425520.4225@nanos
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/time/tick-sched.c | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 753c211f6195..812f7a3b9898 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -967,11 +967,9 @@ static void tick_nohz_switch_to_nohz(void)
 	if (!tick_nohz_enabled)
 		return;
 
-	local_irq_disable();
-	if (tick_switch_to_oneshot(tick_nohz_handler)) {
-		local_irq_enable();
+	if (tick_switch_to_oneshot(tick_nohz_handler))
 		return;
-	}
+
 	tick_nohz_active = 1;
 	ts->nohz_mode = NOHZ_MODE_LOWRES;
 
@@ -986,7 +984,6 @@ static void tick_nohz_switch_to_nohz(void)
 	hrtimer_forward_now(&ts->sched_timer, tick_period);
 	hrtimer_set_expires(&ts->sched_timer, next);
 	tick_program_event(next, 1);
-	local_irq_enable();
 }
 
 /*
@@ -1171,7 +1168,7 @@ void tick_oneshot_notify(void)
  * Called cyclic from the hrtimer softirq (driven by the timer
  * softirq) allow_nohz signals, that we can switch into low-res nohz
  * mode, because high resolution timers are disabled (either compile
- * or runtime).
+ * or runtime). Called with interrupts disabled.
  */
 int tick_check_oneshot_change(int allow_nohz)
 {
-- 
cgit v1.2.3


From 4cfafd3082afc707653aeb82e9f8e7b596fbbfd6 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Thu, 14 May 2015 12:23:11 +0200
Subject: sched,perf: Fix periodic timers

In the below two commits (see Fixes) we have periodic timers that can
stop themselves when they're no longer required, but need to be
(re)-started when their idle condition changes.

Further complications is that we want the timer handler to always do
the forward such that it will always correctly deal with the overruns,
and we do not want to race such that the handler has already decided
to stop, but the (external) restart sees the timer still active and we
end up with a 'lost' timer.

The problem with the current code is that the re-start can come before
the callback does the forward, at which point the forward from the
callback will WARN about forwarding an enqueued timer.

Now, conceptually its easy to detect if you're before or after the fwd
by comparing the expiration time against the current time. Of course,
that's expensive (and racy) because we don't have the current time.

Alternatively one could cache this state inside the timer, but then
everybody pays the overhead of maintaining this extra state, and that
is undesired.

The only other option that I could see is the external timer_active
variable, which I tried to kill before. I would love a nicer interface
for this seemingly simple 'problem' but alas.

Fixes: 272325c4821f ("perf: Fix mux_interval hrtimer wreckage")
Fixes: 77a4d1a1b9a1 ("sched: Cleanup bandwidth timers")
Cc: pjt@google.com
Cc: tglx@linutronix.de
Cc: klamm@yandex-team.ru
Cc: mingo@kernel.org
Cc: bsegall@google.com
Cc: hpa@zytor.com
Cc: Sasha Levin <sasha.levin@oracle.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: http://lkml.kernel.org/r/20150514102311.GX21418@twins.programming.kicks-ass.net
---
 include/linux/perf_event.h |  4 ++++
 kernel/events/core.c       | 29 ++++++++++++++++-------------
 kernel/sched/core.c        | 12 ------------
 kernel/sched/fair.c        | 17 +++++++++++++----
 kernel/sched/rt.c          |  8 +++++++-
 kernel/sched/sched.h       |  5 ++---
 6 files changed, 42 insertions(+), 33 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 61992cf2e977..cf3342a8ad80 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -566,8 +566,12 @@ struct perf_cpu_context {
 	struct perf_event_context	*task_ctx;
 	int				active_oncpu;
 	int				exclusive;
+
+	raw_spinlock_t			hrtimer_lock;
 	struct hrtimer			hrtimer;
 	ktime_t				hrtimer_interval;
+	unsigned int			hrtimer_active;
+
 	struct pmu			*unique_pmu;
 	struct perf_cgroup		*cgrp;
 };
diff --git a/kernel/events/core.c b/kernel/events/core.c
index f5288293d667..d9c93f36e379 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -752,24 +752,21 @@ perf_cgroup_mark_enabled(struct perf_event *event,
 static enum hrtimer_restart perf_mux_hrtimer_handler(struct hrtimer *hr)
 {
 	struct perf_cpu_context *cpuctx;
-	enum hrtimer_restart ret = HRTIMER_NORESTART;
 	int rotations = 0;
 
 	WARN_ON(!irqs_disabled());
 
 	cpuctx = container_of(hr, struct perf_cpu_context, hrtimer);
-
 	rotations = perf_rotate_context(cpuctx);
 
-	/*
-	 * arm timer if needed
-	 */
-	if (rotations) {
+	raw_spin_lock(&cpuctx->hrtimer_lock);
+	if (rotations)
 		hrtimer_forward_now(hr, cpuctx->hrtimer_interval);
-		ret = HRTIMER_RESTART;
-	}
+	else
+		cpuctx->hrtimer_active = 0;
+	raw_spin_unlock(&cpuctx->hrtimer_lock);
 
-	return ret;
+	return rotations ? HRTIMER_RESTART : HRTIMER_NORESTART;
 }
 
 static void __perf_mux_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
@@ -792,7 +789,8 @@ static void __perf_mux_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
 
 	cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval);
 
-	hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
+	raw_spin_lock_init(&cpuctx->hrtimer_lock);
+	hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
 	timer->function = perf_mux_hrtimer_handler;
 }
 
@@ -800,15 +798,20 @@ static int perf_mux_hrtimer_restart(struct perf_cpu_context *cpuctx)
 {
 	struct hrtimer *timer = &cpuctx->hrtimer;
 	struct pmu *pmu = cpuctx->ctx.pmu;
+	unsigned long flags;
 
 	/* not for SW PMU */
 	if (pmu->task_ctx_nr == perf_sw_context)
 		return 0;
 
-	if (hrtimer_is_queued(timer))
-		return 0;
+	raw_spin_lock_irqsave(&cpuctx->hrtimer_lock, flags);
+	if (!cpuctx->hrtimer_active) {
+		cpuctx->hrtimer_active = 1;
+		hrtimer_forward_now(timer, cpuctx->hrtimer_interval);
+		hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED);
+	}
+	raw_spin_unlock_irqrestore(&cpuctx->hrtimer_lock, flags);
 
-	hrtimer_start(timer, cpuctx->hrtimer_interval, HRTIMER_MODE_REL_PINNED);
 	return 0;
 }
 
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d8a6196465d5..e84aeb280777 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -90,18 +90,6 @@
 #define CREATE_TRACE_POINTS
 #include <trace/events/sched.h>
 
-void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
-{
-	/*
-	 * Do not forward the expiration time of active timers;
-	 * we do not want to loose an overrun.
-	 */
-	if (!hrtimer_active(period_timer))
-		hrtimer_forward_now(period_timer, period);
-
-	hrtimer_start_expires(period_timer, HRTIMER_MODE_ABS_PINNED);
-}
-
 DEFINE_MUTEX(sched_domains_mutex);
 DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
 
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index e3b32ebfe421..69be2825262d 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3870,8 +3870,9 @@ static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)
 	if (runtime_refresh_within(cfs_b, min_left))
 		return;
 
-	start_bandwidth_timer(&cfs_b->slack_timer,
-				ns_to_ktime(cfs_bandwidth_slack_period));
+	hrtimer_start(&cfs_b->slack_timer,
+			ns_to_ktime(cfs_bandwidth_slack_period),
+			HRTIMER_MODE_REL);
 }
 
 /* we know any runtime found here is valid as update_curr() precedes return */
@@ -4012,6 +4013,8 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
 
 		idle = do_sched_cfs_period_timer(cfs_b, overrun);
 	}
+	if (idle)
+		cfs_b->period_active = 0;
 	raw_spin_unlock(&cfs_b->lock);
 
 	return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
@@ -4025,7 +4028,7 @@ void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
 	cfs_b->period = ns_to_ktime(default_cfs_period());
 
 	INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
-	hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+	hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
 	cfs_b->period_timer.function = sched_cfs_period_timer;
 	hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
 	cfs_b->slack_timer.function = sched_cfs_slack_timer;
@@ -4039,7 +4042,13 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 
 void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
 {
-	start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period);
+	lockdep_assert_held(&cfs_b->lock);
+
+	if (!cfs_b->period_active) {
+		cfs_b->period_active = 1;
+		hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period);
+		hrtimer_start_expires(&cfs_b->period_timer, HRTIMER_MODE_ABS_PINNED);
+	}
 }
 
 static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index b0febf25d8f1..e43da5391dcd 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -31,6 +31,8 @@ static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
 		idle = do_sched_rt_period_timer(rt_b, overrun);
 		raw_spin_lock(&rt_b->rt_runtime_lock);
 	}
+	if (idle)
+		rt_b->rt_period_active = 0;
 	raw_spin_unlock(&rt_b->rt_runtime_lock);
 
 	return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
@@ -54,7 +56,11 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
 		return;
 
 	raw_spin_lock(&rt_b->rt_runtime_lock);
-	start_bandwidth_timer(&rt_b->rt_period_timer, rt_b->rt_period);
+	if (!rt_b->rt_period_active) {
+		rt_b->rt_period_active = 1;
+		hrtimer_forward_now(&rt_b->rt_period_timer, rt_b->rt_period);
+		hrtimer_start_expires(&rt_b->rt_period_timer, HRTIMER_MODE_ABS_PINNED);
+	}
 	raw_spin_unlock(&rt_b->rt_runtime_lock);
 }
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 08606a1f8c4d..f9a58ef373b4 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -131,6 +131,7 @@ struct rt_bandwidth {
 	ktime_t			rt_period;
 	u64			rt_runtime;
 	struct hrtimer		rt_period_timer;
+	unsigned int		rt_period_active;
 };
 
 void __dl_clear_params(struct task_struct *p);
@@ -215,7 +216,7 @@ struct cfs_bandwidth {
 	s64 hierarchical_quota;
 	u64 runtime_expires;
 
-	int idle;
+	int idle, period_active;
 	struct hrtimer period_timer, slack_timer;
 	struct list_head throttled_cfs_rq;
 
@@ -1406,8 +1407,6 @@ static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { }
 static inline void sched_avg_update(struct rq *rq) { }
 #endif
 
-extern void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period);
-
 /*
  * __task_rq_lock - lock the rq @p resides on.
  */
-- 
cgit v1.2.3


From 0a227985d4a993a322ff72ecbaeee2611d624216 Mon Sep 17 00:00:00 2001
From: Nicholas Mc Guire
Date: Mon, 18 May 2015 14:19:12 +0200
Subject: time: Move timeconst.h into include/generated

kernel/time/timeconst.h is moved to include/generated/ and generated 
by the top level Kbuild. This allows using timeconst.h in an earlier
build stage.

Signed-off-by: Nicholas Mc Guire <hofrat@osadl.org>
Cc: Masahiro Yamada <yamada.m@jp.panasonic.com>
Cc: Sam Ravnborg <sam@ravnborg.org>
Cc: Joe Perches <joe@perches.com>
Cc: John Stultz <john.stultz@linaro.org>
Cc: Andrew Hunter <ahh@google.com>
Cc: Paul Turner <pjt@google.com>
Cc: Michal Marek <mmarek@suse.cz>
Link: http://lkml.kernel.org/r/1431951554-5563-1-git-send-email-hofrat@osadl.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 Kbuild                   | 34 +++++++++++++++++++++++++++-------
 kernel/time/Makefile     | 17 +----------------
 kernel/time/time.c       |  2 +-
 kernel/time/timeconst.bc |  3 ++-
 4 files changed, 31 insertions(+), 25 deletions(-)

(limited to 'kernel')

diff --git a/Kbuild b/Kbuild
index 6f0d82a9245d..df99a5f53beb 100644
--- a/Kbuild
+++ b/Kbuild
@@ -2,8 +2,9 @@
 # Kbuild for top-level directory of the kernel
 # This file takes care of the following:
 # 1) Generate bounds.h
-# 2) Generate asm-offsets.h (may need bounds.h)
-# 3) Check for missing system calls
+# 2) Generate timeconst.h
+# 3) Generate asm-offsets.h (may need bounds.h and timeconst.h)
+# 4) Check for missing system calls
 
 # Default sed regexp - multiline due to syntax constraints
 define sed-y
@@ -47,7 +48,26 @@ $(obj)/$(bounds-file): kernel/bounds.s FORCE
 	$(call filechk,offsets,__LINUX_BOUNDS_H__)
 
 #####
-# 2) Generate asm-offsets.h
+# 2) Generate timeconst.h
+
+timeconst-file := include/generated/timeconst.h
+
+#always  += $(timeconst-file)
+targets += $(timeconst-file)
+
+quiet_cmd_gentimeconst = GEN     $@
+define cmd_gentimeconst
+	(echo $(CONFIG_HZ) | bc -q $< ) > $@
+endef
+define filechk_gentimeconst
+	(echo $(CONFIG_HZ) | bc -q $< )
+endef
+
+$(obj)/$(timeconst-file): kernel/time/timeconst.bc FORCE
+	$(call filechk,gentimeconst)
+
+#####
+# 3) Generate asm-offsets.h
 #
 
 offsets-file := include/generated/asm-offsets.h
@@ -57,7 +77,7 @@ targets += arch/$(SRCARCH)/kernel/asm-offsets.s
 
 # We use internal kbuild rules to avoid the "is up to date" message from make
 arch/$(SRCARCH)/kernel/asm-offsets.s: arch/$(SRCARCH)/kernel/asm-offsets.c \
-                                      $(obj)/$(bounds-file) FORCE
+                                      $(obj)/$(timeconst-file) $(obj)/$(bounds-file) FORCE
 	$(Q)mkdir -p $(dir $@)
 	$(call if_changed_dep,cc_s_c)
 
@@ -65,7 +85,7 @@ $(obj)/$(offsets-file): arch/$(SRCARCH)/kernel/asm-offsets.s FORCE
 	$(call filechk,offsets,__ASM_OFFSETS_H__)
 
 #####
-# 3) Check for missing system calls
+# 4) Check for missing system calls
 #
 
 always += missing-syscalls
@@ -77,5 +97,5 @@ quiet_cmd_syscalls = CALL    $<
 missing-syscalls: scripts/checksyscalls.sh $(offsets-file) FORCE
 	$(call cmd,syscalls)
 
-# Keep these two files during make clean
-no-clean-files := $(bounds-file) $(offsets-file)
+# Keep these three files during make clean
+no-clean-files := $(bounds-file) $(offsets-file) $(timeconst-file)
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
index 01f0312419b3..ffc4cc3dcd47 100644
--- a/kernel/time/Makefile
+++ b/kernel/time/Makefile
@@ -13,19 +13,4 @@ obj-$(CONFIG_TIMER_STATS)			+= timer_stats.o
 obj-$(CONFIG_DEBUG_FS)				+= timekeeping_debug.o
 obj-$(CONFIG_TEST_UDELAY)			+= test_udelay.o
 
-$(obj)/time.o: $(obj)/timeconst.h
-
-quiet_cmd_hzfile = HZFILE  $@
-      cmd_hzfile = echo "hz=$(CONFIG_HZ)" > $@
-
-targets += hz.bc
-$(obj)/hz.bc: $(objtree)/include/config/hz.h FORCE
-	$(call if_changed,hzfile)
-
-quiet_cmd_bc  = BC      $@
-      cmd_bc  = bc -q $(filter-out FORCE,$^) > $@
-
-targets += timeconst.h
-$(obj)/timeconst.h: $(obj)/hz.bc $(src)/timeconst.bc FORCE
-	$(call if_changed,bc)
-
+$(obj)/time.o: $(objtree)/include/config/
diff --git a/kernel/time/time.c b/kernel/time/time.c
index 2c85b7724af4..4fa1d26a9843 100644
--- a/kernel/time/time.c
+++ b/kernel/time/time.c
@@ -41,7 +41,7 @@
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
 
-#include "timeconst.h"
+#include <generated/timeconst.h>
 #include "timekeeping.h"
 
 /*
diff --git a/kernel/time/timeconst.bc b/kernel/time/timeconst.bc
index 511bdf2cafda..c7388dee8635 100644
--- a/kernel/time/timeconst.bc
+++ b/kernel/time/timeconst.bc
@@ -50,7 +50,7 @@ define timeconst(hz) {
 	print "#include <linux/types.h>\n\n"
 
 	print "#if HZ != ", hz, "\n"
-	print "#error \qkernel/timeconst.h has the wrong HZ value!\q\n"
+	print "#error \qinclude/generated/timeconst.h has the wrong HZ value!\q\n"
 	print "#endif\n\n"
 
 	if (hz < 2) {
@@ -105,4 +105,5 @@ define timeconst(hz) {
 	halt
 }
 
+hz = read();
 timeconst(hz)
-- 
cgit v1.2.3


From ca42aaf0c8616cde6161ea4391dff364efeee46a Mon Sep 17 00:00:00 2001
From: Nicholas Mc Guire
Date: Mon, 18 May 2015 14:19:13 +0200
Subject: time: Refactor msecs_to_jiffies

Refactor the msecs_to_jiffies conditional code part in time.c and 
jiffies.h putting it into conditional functions rather than #ifdefs
to improve readability.

[ tglx: Verified that there is no binary code change ]

Signed-off-by: Nicholas Mc Guire <hofrat@osadl.org>
Cc: Masahiro Yamada <yamada.m@jp.panasonic.com>
Cc: Sam Ravnborg <sam@ravnborg.org>
Cc: Joe Perches <joe@perches.com>
Cc: John Stultz <john.stultz@linaro.org>
Cc: Andrew Hunter <ahh@google.com>
Cc: Paul Turner <pjt@google.com>
Cc: Michal Marek <mmarek@suse.cz>
Link: http://lkml.kernel.org/r/1431951554-5563-2-git-send-email-hofrat@osadl.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/jiffies.h | 64 ++++++++++++++++++++++++++++++++++++++++++++++++-
 kernel/time/time.c      | 59 +++++++++++++++------------------------------
 2 files changed, 82 insertions(+), 41 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h
index c367cbdf73ab..9527ddbb0f1b 100644
--- a/include/linux/jiffies.h
+++ b/include/linux/jiffies.h
@@ -7,6 +7,7 @@
 #include <linux/time.h>
 #include <linux/timex.h>
 #include <asm/param.h>			/* for HZ */
+#include <generated/timeconst.h>
 
 /*
  * The following defines establish the engineering parameters of the PLL
@@ -288,7 +289,68 @@ static inline u64 jiffies_to_nsecs(const unsigned long j)
 	return (u64)jiffies_to_usecs(j) * NSEC_PER_USEC;
 }
 
-extern unsigned long msecs_to_jiffies(const unsigned int m);
+extern unsigned long __msecs_to_jiffies(const unsigned int m);
+#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ)
+/*
+ * HZ is equal to or smaller than 1000, and 1000 is a nice round
+ * multiple of HZ, divide with the factor between them, but round
+ * upwards:
+ */
+static inline unsigned long _msecs_to_jiffies(const unsigned int m)
+{
+		return (m + (MSEC_PER_SEC / HZ) - 1) / (MSEC_PER_SEC / HZ);
+}
+#elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC)
+/*
+ * HZ is larger than 1000, and HZ is a nice round multiple of 1000 -
+ * simply multiply with the factor between them.
+ *
+ * But first make sure the multiplication result cannot overflow:
+ */
+static inline unsigned long _msecs_to_jiffies(const unsigned int m)
+{
+		if (m > jiffies_to_msecs(MAX_JIFFY_OFFSET))
+			return MAX_JIFFY_OFFSET;
+		return m * (HZ / MSEC_PER_SEC);
+}
+#else
+/*
+ * Generic case - multiply, round and divide. But first check that if
+ * we are doing a net multiplication, that we wouldn't overflow:
+ */
+static inline unsigned long _msecs_to_jiffies(const unsigned int m)
+{
+		if (HZ > MSEC_PER_SEC && m > jiffies_to_msecs(MAX_JIFFY_OFFSET))
+			return MAX_JIFFY_OFFSET;
+
+		return (MSEC_TO_HZ_MUL32 * m + MSEC_TO_HZ_ADJ32)
+			>> MSEC_TO_HZ_SHR32;
+}
+#endif
+/**
+ * msecs_to_jiffies: - convert milliseconds to jiffies
+ * @m:	time in milliseconds
+ *
+ * conversion is done as follows:
+ *
+ * - negative values mean 'infinite timeout' (MAX_JIFFY_OFFSET)
+ *
+ * - 'too large' values [that would result in larger than
+ *   MAX_JIFFY_OFFSET values] mean 'infinite timeout' too.
+ *
+ * - all other values are converted to jiffies by either multiplying
+ *   the input value by a factor or dividing it with a factor and
+ *   handling any 32-bit overflows.
+ *   for the details see __msecs_to_jiffies()
+ *
+ * the HZ range specific helpers _msecs_to_jiffies() are called from
+ * __msecs_to_jiffies().
+ */
+static inline unsigned long msecs_to_jiffies(const unsigned int m)
+{
+	return __msecs_to_jiffies(m);
+}
+
 extern unsigned long usecs_to_jiffies(const unsigned int u);
 extern unsigned long timespec_to_jiffies(const struct timespec *value);
 extern void jiffies_to_timespec(const unsigned long jiffies,
diff --git a/kernel/time/time.c b/kernel/time/time.c
index 4fa1d26a9843..c42c2c3214fe 100644
--- a/kernel/time/time.c
+++ b/kernel/time/time.c
@@ -483,9 +483,11 @@ struct timespec64 ns_to_timespec64(const s64 nsec)
 }
 EXPORT_SYMBOL(ns_to_timespec64);
 #endif
-/*
- * When we convert to jiffies then we interpret incoming values
- * the following way:
+/**
+ * msecs_to_jiffies: - convert milliseconds to jiffies
+ * @m:	time in milliseconds
+ *
+ * conversion is done as follows:
  *
  * - negative values mean 'infinite timeout' (MAX_JIFFY_OFFSET)
  *
@@ -493,51 +495,28 @@ EXPORT_SYMBOL(ns_to_timespec64);
  *   MAX_JIFFY_OFFSET values] mean 'infinite timeout' too.
  *
  * - all other values are converted to jiffies by either multiplying
- *   the input value by a factor or dividing it with a factor
- *
- * We must also be careful about 32-bit overflows.
+ *   the input value by a factor or dividing it with a factor and
+ *   handling any 32-bit overflows.
+ *   for the details see __msecs_to_jiffies()
+ *
+ * msecs_to_jiffies() checks for the passed in value being a constant
+ * via __builtin_constant_p() allowing gcc to eliminate most of the
+ * code, __msecs_to_jiffies() is called if the value passed does not
+ * allow constant folding and the actual conversion must be done at
+ * runtime.
+ * the _msecs_to_jiffies helpers are the HZ dependent conversion
+ * routines found in include/linux/jiffies.h
  */
-unsigned long msecs_to_jiffies(const unsigned int m)
+unsigned long __msecs_to_jiffies(const unsigned int m)
 {
 	/*
 	 * Negative value, means infinite timeout:
 	 */
 	if ((int)m < 0)
 		return MAX_JIFFY_OFFSET;
-
-#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ)
-	/*
-	 * HZ is equal to or smaller than 1000, and 1000 is a nice
-	 * round multiple of HZ, divide with the factor between them,
-	 * but round upwards:
-	 */
-	return (m + (MSEC_PER_SEC / HZ) - 1) / (MSEC_PER_SEC / HZ);
-#elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC)
-	/*
-	 * HZ is larger than 1000, and HZ is a nice round multiple of
-	 * 1000 - simply multiply with the factor between them.
-	 *
-	 * But first make sure the multiplication result cannot
-	 * overflow:
-	 */
-	if (m > jiffies_to_msecs(MAX_JIFFY_OFFSET))
-		return MAX_JIFFY_OFFSET;
-
-	return m * (HZ / MSEC_PER_SEC);
-#else
-	/*
-	 * Generic case - multiply, round and divide. But first
-	 * check that if we are doing a net multiplication, that
-	 * we wouldn't overflow:
-	 */
-	if (HZ > MSEC_PER_SEC && m > jiffies_to_msecs(MAX_JIFFY_OFFSET))
-		return MAX_JIFFY_OFFSET;
-
-	return (MSEC_TO_HZ_MUL32 * m + MSEC_TO_HZ_ADJ32)
-		>> MSEC_TO_HZ_SHR32;
-#endif
+	return _msecs_to_jiffies(m);
 }
-EXPORT_SYMBOL(msecs_to_jiffies);
+EXPORT_SYMBOL(__msecs_to_jiffies);
 
 unsigned long usecs_to_jiffies(const unsigned int u)
 {
-- 
cgit v1.2.3


From 8fff52fd50934580c5108afed12043a774edf728 Mon Sep 17 00:00:00 2001
From: Viresh Kumar
Date: Fri, 3 Apr 2015 09:04:04 +0530
Subject: clockevents: Introduce CLOCK_EVT_STATE_ONESHOT_STOPPED state

When no timers/hrtimers are pending, the expiry time is set to a
special value: 'KTIME_MAX'. This normally happens with
NO_HZ_{IDLE|FULL} in both LOWRES/HIGHRES modes.

When 'expiry == KTIME_MAX', we either cancel the 'tick-sched' hrtimer
(NOHZ_MODE_HIGHRES) or skip reprogramming clockevent device
(NOHZ_MODE_LOWRES).  But, the clockevent device is already
reprogrammed from the tick-handler for next tick.

As the clock event device is programmed in ONESHOT mode it will at
least fire one more time (unnecessarily). Timers on few
implementations (like arm_arch_timer, etc.) only support PERIODIC mode
and their drivers emulate ONESHOT over that. Which means that on these
platforms we will get spurious interrupts periodically (at last
programmed interval rate, normally tick rate).

In order to avoid spurious interrupts, the clockevent device should be
stopped or its interrupts should be masked.

A simple (yet hacky) solution to get this fixed could be: update
hrtimer_force_reprogram() to always reprogram clockevent device and
update clockevent drivers to STOP generating events (or delay it to
max time) when 'expires' is set to KTIME_MAX. But the drawback here is
that every clockevent driver has to be hacked for this particular case
and its very easy for new ones to miss this.

However, Thomas suggested to add an optional state ONESHOT_STOPPED to
solve this problem: lkml.org/lkml/2014/5/9/508.

This patch adds support for ONESHOT_STOPPED state in clockevents
core. It will only be available to drivers that implement the
state-specific callbacks instead of the legacy ->set_mode() callback.

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Reviewed-by: Preeti U. Murthy <preeti@linux.vnet.ibm.com>
Cc: linaro-kernel@lists.linaro.org
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Kevin Hilman <khilman@linaro.org>
Cc: Daniel Lezcano <daniel.lezcano@linaro.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/b8b383a03ac07b13312c16850b5106b82e4245b5.1428031396.git.viresh.kumar@linaro.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/clockchips.h |  7 ++++++-
 kernel/time/clockevents.c  | 14 +++++++++++++-
 kernel/time/timer_list.c   |  6 ++++++
 3 files changed, 25 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/clockchips.h b/include/linux/clockchips.h
index 96c280b2c263..271fa4c8eb29 100644
--- a/include/linux/clockchips.h
+++ b/include/linux/clockchips.h
@@ -37,12 +37,15 @@ enum clock_event_mode {
  *		reached from DETACHED or SHUTDOWN.
  * ONESHOT:	Device is programmed to generate event only once. Can be reached
  *		from DETACHED or SHUTDOWN.
+ * ONESHOT_STOPPED: Device was programmed in ONESHOT mode and is temporarily
+ *		    stopped.
  */
 enum clock_event_state {
 	CLOCK_EVT_STATE_DETACHED,
 	CLOCK_EVT_STATE_SHUTDOWN,
 	CLOCK_EVT_STATE_PERIODIC,
 	CLOCK_EVT_STATE_ONESHOT,
+	CLOCK_EVT_STATE_ONESHOT_STOPPED,
 };
 
 /*
@@ -90,6 +93,7 @@ enum clock_event_state {
  * @set_mode:		legacy set mode function, only for modes <= CLOCK_EVT_MODE_RESUME.
  * @set_state_periodic:	switch state to periodic, if !set_mode
  * @set_state_oneshot:	switch state to oneshot, if !set_mode
+ * @set_state_oneshot_stopped: switch state to oneshot_stopped, if !set_mode
  * @set_state_shutdown:	switch state to shutdown, if !set_mode
  * @tick_resume:	resume clkevt device, if !set_mode
  * @broadcast:		function to broadcast events
@@ -121,11 +125,12 @@ struct clock_event_device {
 	 * State transition callback(s): Only one of the two groups should be
 	 * defined:
 	 * - set_mode(), only for modes <= CLOCK_EVT_MODE_RESUME.
-	 * - set_state_{shutdown|periodic|oneshot}(), tick_resume().
+	 * - set_state_{shutdown|periodic|oneshot|oneshot_stopped}(), tick_resume().
 	 */
 	void			(*set_mode)(enum clock_event_mode mode, struct clock_event_device *);
 	int			(*set_state_periodic)(struct clock_event_device *);
 	int			(*set_state_oneshot)(struct clock_event_device *);
+	int			(*set_state_oneshot_stopped)(struct clock_event_device *);
 	int			(*set_state_shutdown)(struct clock_event_device *);
 	int			(*tick_resume)(struct clock_event_device *);
 
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 637a09461c1d..dc6afb485027 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -134,6 +134,17 @@ static int __clockevents_set_state(struct clock_event_device *dev,
 			return -ENOSYS;
 		return dev->set_state_oneshot(dev);
 
+	case CLOCK_EVT_STATE_ONESHOT_STOPPED:
+		/* Core internal bug */
+		if (WARN_ONCE(dev->state != CLOCK_EVT_STATE_ONESHOT,
+			      "Current state: %d\n", dev->state))
+			return -EINVAL;
+
+		if (dev->set_state_oneshot_stopped)
+			return dev->set_state_oneshot_stopped(dev);
+		else
+			return -ENOSYS;
+
 	default:
 		return -ENOSYS;
 	}
@@ -445,7 +456,8 @@ static int clockevents_sanity_check(struct clock_event_device *dev)
 	if (dev->set_mode) {
 		/* We shouldn't be supporting new modes now */
 		WARN_ON(dev->set_state_periodic || dev->set_state_oneshot ||
-			dev->set_state_shutdown || dev->tick_resume);
+			dev->set_state_shutdown || dev->tick_resume ||
+			dev->set_state_oneshot_stopped);
 
 		BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED);
 		return 0;
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index 18b074b215b0..1327004429be 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -258,6 +258,12 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu)
 			SEQ_printf(m, "\n");
 		}
 
+		if (dev->set_state_oneshot_stopped) {
+			SEQ_printf(m, " oneshot stopped: ");
+			print_name_offset(m, dev->set_state_oneshot_stopped);
+			SEQ_printf(m, "\n");
+		}
+
 		if (dev->tick_resume) {
 			SEQ_printf(m, " resume:   ");
 			print_name_offset(m, dev->tick_resume);
-- 
cgit v1.2.3


From d25408756accbd2171abaa0678f986adae139e6f Mon Sep 17 00:00:00 2001
From: Viresh Kumar
Date: Fri, 3 Apr 2015 09:04:05 +0530
Subject: clockevents: Stop unused clockevent devices

To avoid getting spurious interrupts on a tickless CPU, clockevent
device can now be stopped by switching to ONESHOT_STOPPED state.

The natural place for handling this transition is tick_program_event().

On 'expires == KTIME_MAX', we skip programming the event and so we need
to fix such call sites as well, to always call tick_program_event()
irrespective of the expires value.

Once the clockevent device is required again, check if it was earlier
put into ONESHOT_STOPPED state. If yes, switch its state to ONESHOT
before programming its event.

To make sure we haven't missed any corner case, add a WARN() for the
case where we try to reprogram clockevent device while we aren't
configured in ONESHOT_STOPPED state.

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Cc: linaro-kernel@lists.linaro.org
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Kevin Hilman <khilman@linaro.org>
Cc: Daniel Lezcano <daniel.lezcano@linaro.org>
Cc: Preeti U Murthy <preeti@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/5146b07be7f0bc497e0ebae036590ec2fa73e540.1428031396.git.viresh.kumar@linaro.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/time/clockevents.c  |  4 ++++
 kernel/time/hrtimer.c      |  6 ++----
 kernel/time/tick-oneshot.c | 16 ++++++++++++++++
 3 files changed, 22 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index dc6afb485027..4922f1b805ea 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -331,6 +331,10 @@ int clockevents_program_event(struct clock_event_device *dev, ktime_t expires,
 	if (dev->state == CLOCK_EVT_STATE_SHUTDOWN)
 		return 0;
 
+	/* We must be in ONESHOT state here */
+	WARN_ONCE(dev->state != CLOCK_EVT_STATE_ONESHOT, "Current state: %d\n",
+		  dev->state);
+
 	/* Shortcut for clockevent devices that can deal with ktime. */
 	if (dev->features & CLOCK_EVT_FEAT_KTIME)
 		return dev->set_next_ktime(expires, dev);
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 4adf32067862..278d4b36fd94 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -550,8 +550,7 @@ hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal)
 	if (cpu_base->hang_detected)
 		return;
 
-	if (cpu_base->expires_next.tv64 != KTIME_MAX)
-		tick_program_event(cpu_base->expires_next, 1);
+	tick_program_event(cpu_base->expires_next, 1);
 }
 
 /*
@@ -1237,8 +1236,7 @@ retry:
 	raw_spin_unlock(&cpu_base->lock);
 
 	/* Reprogramming necessary ? */
-	if (expires_next.tv64 == KTIME_MAX ||
-	    !tick_program_event(expires_next, 0)) {
+	if (!tick_program_event(expires_next, 0)) {
 		cpu_base->hang_detected = 0;
 		return;
 	}
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c
index 67a64b1670bf..f8de75715c2f 100644
--- a/kernel/time/tick-oneshot.c
+++ b/kernel/time/tick-oneshot.c
@@ -28,6 +28,22 @@ int tick_program_event(ktime_t expires, int force)
 {
 	struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);
 
+	if (unlikely(expires.tv64 == KTIME_MAX)) {
+		/*
+		 * We don't need the clock event device any more, stop it.
+		 */
+		clockevents_set_state(dev, CLOCK_EVT_STATE_ONESHOT_STOPPED);
+		return 0;
+	}
+
+	if (unlikely(dev->state == CLOCK_EVT_STATE_ONESHOT_STOPPED)) {
+		/*
+		 * We need the clock event again, configure it in ONESHOT mode
+		 * before using it.
+		 */
+		clockevents_set_state(dev, CLOCK_EVT_STATE_ONESHOT);
+	}
+
 	return clockevents_program_event(dev, expires, force);
 }
 
-- 
cgit v1.2.3


From 6f7d79849a00bba82d3139ff91ff2aaabd12841e Mon Sep 17 00:00:00 2001
From: Sasha Levin
Date: Mon, 1 Dec 2014 23:04:06 -0500
Subject: time: Make sure tz_minuteswest is set to a valid value when setting
 time

Invalid values may overflow later, leading to undefined behaviour when
multiplied by 60 to get the amount of seconds.

Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Sasha Levin <sasha.levin@oracle.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>
---
 kernel/time/time.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'kernel')

diff --git a/kernel/time/time.c b/kernel/time/time.c
index c42c2c3214fe..972e3bbac963 100644
--- a/kernel/time/time.c
+++ b/kernel/time/time.c
@@ -173,6 +173,10 @@ int do_sys_settimeofday(const struct timespec *tv, const struct timezone *tz)
 		return error;
 
 	if (tz) {
+		/* Verify we're witin the +-15 hrs range */
+		if (tz->tz_minuteswest > 15*60 || tz->tz_minuteswest < -15*60)
+			return -EINVAL;
+
 		sys_tz = *tz;
 		update_vsyscall_tz();
 		if (firsttime) {
-- 
cgit v1.2.3


From 6374f9124efea5fae9cba263108583c39e22f86b Mon Sep 17 00:00:00 2001
From: Harald Geyer
Date: Tue, 7 Apr 2015 11:12:35 +0000
Subject: timekeeping: Provide new API to get the current time resolution

This patch series introduces a new function
u32 ktime_get_resolution_ns(void)
which allows to clean up some driver code.

In particular the IIO subsystem has a function to provide timestamps for
events but no means to get their resolution. So currently the dht11 driver
tries to guess the resolution in a rather messy and convoluted way. We
can do much better with the new code.

This API is not designed to be exposed to user space.

This has been tested on i386, sunxi and mxs.

Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Harald Geyer <harald@ccbib.org>
[jstultz: Tweaked to make it build after upstream changes]
Signed-off-by: John Stultz <john.stultz@linaro.org>
---
 include/linux/timekeeping.h |  1 +
 kernel/time/timekeeping.c   | 17 +++++++++++++++++
 2 files changed, 18 insertions(+)

(limited to 'kernel')

diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h
index 99176af216af..9af5c1214682 100644
--- a/include/linux/timekeeping.h
+++ b/include/linux/timekeeping.h
@@ -163,6 +163,7 @@ extern ktime_t ktime_get(void);
 extern ktime_t ktime_get_with_offset(enum tk_offsets offs);
 extern ktime_t ktime_mono_to_any(ktime_t tmono, enum tk_offsets offs);
 extern ktime_t ktime_get_raw(void);
+extern u32 ktime_get_resolution_ns(void);
 
 /**
  * ktime_get_real - get the real (wall-) time in ktime_t format
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 3365e32dc208..85d376396313 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -702,6 +702,23 @@ ktime_t ktime_get(void)
 }
 EXPORT_SYMBOL_GPL(ktime_get);
 
+u32 ktime_get_resolution_ns(void)
+{
+	struct timekeeper *tk = &tk_core.timekeeper;
+	unsigned int seq;
+	u32 nsecs;
+
+	WARN_ON(timekeeping_suspended);
+
+	do {
+		seq = read_seqcount_begin(&tk_core.seq);
+		nsecs = tk->tkr_mono.mult >> tk->tkr_mono.shift;
+	} while (read_seqcount_retry(&tk_core.seq, seq));
+
+	return nsecs;
+}
+EXPORT_SYMBOL_GPL(ktime_get_resolution_ns);
+
 static ktime_t *offsets[TK_OFFS_MAX] = {
 	[TK_OFFS_REAL]	= &tk_core.timekeeper.offs_real,
 	[TK_OFFS_BOOT]	= &tk_core.timekeeper.offs_boot,
-- 
cgit v1.2.3


From 57d05a93ada77c4f8a6112cbc867a2948dce7991 Mon Sep 17 00:00:00 2001
From: John Stultz
Date: Wed, 13 May 2015 16:04:47 -0700
Subject: time: Rework debugging variables so they aren't global

Ingo suggested that the timekeeping debugging variables
recently added should not be global, and should be tied
to the timekeeper's read_base.

Thus this patch implements that suggestion.

This version is different from the earlier versions
as it keeps the variables in the timekeeper structure
rather then in the tkr.

Cc: Ingo Molnar <mingo@kernel.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Prarit Bhargava <prarit@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>
---
 include/linux/timekeeper_internal.h | 15 +++++++++++++++
 kernel/time/timekeeping.c           | 33 +++++++++++----------------------
 2 files changed, 26 insertions(+), 22 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/timekeeper_internal.h b/include/linux/timekeeper_internal.h
index 6f8276ae579c..e1f5a1136554 100644
--- a/include/linux/timekeeper_internal.h
+++ b/include/linux/timekeeper_internal.h
@@ -61,6 +61,9 @@ struct tk_read_base {
  *			shifted nano seconds.
  * @ntp_error_shift:	Shift conversion between clock shifted nano seconds and
  *			ntp shifted nano seconds.
+ * @last_warning:	Warning ratelimiter (DEBUG_TIMEKEEPING)
+ * @underflow_seen:	Underflow warning flag (DEBUG_TIMEKEEPING)
+ * @overflow_seen:	Overflow warning flag (DEBUG_TIMEKEEPING)
  *
  * Note: For timespec(64) based interfaces wall_to_monotonic is what
  * we need to add to xtime (or xtime corrected for sub jiffie times)
@@ -106,6 +109,18 @@ struct timekeeper {
 	s64			ntp_error;
 	u32			ntp_error_shift;
 	u32			ntp_err_mult;
+#ifdef CONFIG_DEBUG_TIMEKEEPING
+	long			last_warning;
+	/*
+	 * These simple flag variables are managed
+	 * without locks, which is racy, but they are
+	 * ok since we don't really care about being
+	 * super precise about how many events were
+	 * seen, just that a problem was observed.
+	 */
+	int			underflow_seen;
+	int			overflow_seen;
+#endif
 };
 
 #ifdef CONFIG_GENERIC_TIME_VSYSCALL
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 85d376396313..2f10b6557a1c 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -118,18 +118,6 @@ static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta)
 
 #ifdef CONFIG_DEBUG_TIMEKEEPING
 #define WARNING_FREQ (HZ*300) /* 5 minute rate-limiting */
-/*
- * These simple flag variables are managed
- * without locks, which is racy, but ok since
- * we don't really care about being super
- * precise about how many events were seen,
- * just that a problem was observed.
- */
-static int timekeeping_underflow_seen;
-static int timekeeping_overflow_seen;
-
-/* last_warning is only modified under the timekeeping lock */
-static long timekeeping_last_warning;
 
 static void timekeeping_check_update(struct timekeeper *tk, cycle_t offset)
 {
@@ -149,29 +137,30 @@ static void timekeeping_check_update(struct timekeeper *tk, cycle_t offset)
 		}
 	}
 
-	if (timekeeping_underflow_seen) {
-		if (jiffies - timekeeping_last_warning > WARNING_FREQ) {
+	if (tk->underflow_seen) {
+		if (jiffies - tk->last_warning > WARNING_FREQ) {
 			printk_deferred("WARNING: Underflow in clocksource '%s' observed, time update ignored.\n", name);
 			printk_deferred("         Please report this, consider using a different clocksource, if possible.\n");
 			printk_deferred("         Your kernel is probably still fine.\n");
-			timekeeping_last_warning = jiffies;
+			tk->last_warning = jiffies;
 		}
-		timekeeping_underflow_seen = 0;
+		tk->underflow_seen = 0;
 	}
 
-	if (timekeeping_overflow_seen) {
-		if (jiffies - timekeeping_last_warning > WARNING_FREQ) {
+	if (tk->overflow_seen) {
+		if (jiffies - tk->last_warning > WARNING_FREQ) {
 			printk_deferred("WARNING: Overflow in clocksource '%s' observed, time update capped.\n", name);
 			printk_deferred("         Please report this, consider using a different clocksource, if possible.\n");
 			printk_deferred("         Your kernel is probably still fine.\n");
-			timekeeping_last_warning = jiffies;
+			tk->last_warning = jiffies;
 		}
-		timekeeping_overflow_seen = 0;
+		tk->overflow_seen = 0;
 	}
 }
 
 static inline cycle_t timekeeping_get_delta(struct tk_read_base *tkr)
 {
+	struct timekeeper *tk = &tk_core.timekeeper;
 	cycle_t now, last, mask, max, delta;
 	unsigned int seq;
 
@@ -197,13 +186,13 @@ static inline cycle_t timekeeping_get_delta(struct tk_read_base *tkr)
 	 * mask-relative negative values.
 	 */
 	if (unlikely((~delta & mask) < (mask >> 3))) {
-		timekeeping_underflow_seen = 1;
+		tk->underflow_seen = 1;
 		delta = 0;
 	}
 
 	/* Cap delta value to the max_cycles values to avoid mult overflows */
 	if (unlikely(delta > max)) {
-		timekeeping_overflow_seen = 1;
+		tk->overflow_seen = 1;
 		delta = tkr->clock->max_cycles;
 	}
 
-- 
cgit v1.2.3


From 4e413e8526aa53393d0b3d9ecbdb0436203586ee Mon Sep 17 00:00:00 2001
From: Badhri Jagan Sridharan
Date: Thu, 7 May 2015 16:20:34 -0700
Subject: tracing: timer: Add deferrable flag to timer_start

The timer_start event now shows whether the timer is
deferrable in case of a low-res timer. The debug_activate
function now includes a deferrable flag while calling
the trace_timer_start event.

Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@kernel.org>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Badhri Jagan Sridharan <Badhri@google.com>
[jstultz: Fixed minor whitespace and grammer tweaks
 pointed out by Ingo]
Signed-off-by: John Stultz <john.stultz@linaro.org>
---
 include/trace/events/timer.h | 13 +++++++++----
 kernel/time/timer.c          |  2 +-
 2 files changed, 10 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/include/trace/events/timer.h b/include/trace/events/timer.h
index 68c2c2000f02..d7abef1fe6e0 100644
--- a/include/trace/events/timer.h
+++ b/include/trace/events/timer.h
@@ -43,15 +43,18 @@ DEFINE_EVENT(timer_class, timer_init,
  */
 TRACE_EVENT(timer_start,
 
-	TP_PROTO(struct timer_list *timer, unsigned long expires),
+	TP_PROTO(struct timer_list *timer,
+		unsigned long expires,
+		unsigned int deferrable),
 
-	TP_ARGS(timer, expires),
+	TP_ARGS(timer, expires, deferrable),
 
 	TP_STRUCT__entry(
 		__field( void *,	timer		)
 		__field( void *,	function	)
 		__field( unsigned long,	expires		)
 		__field( unsigned long,	now		)
+		__field( unsigned int,	deferrable	)
 	),
 
 	TP_fast_assign(
@@ -59,11 +62,13 @@ TRACE_EVENT(timer_start,
 		__entry->function	= timer->function;
 		__entry->expires	= expires;
 		__entry->now		= jiffies;
+		__entry->deferrable     = deferrable;
 	),
 
-	TP_printk("timer=%p function=%pf expires=%lu [timeout=%ld]",
+	TP_printk("timer=%p function=%pf expires=%lu [timeout=%ld] defer=%c",
 		  __entry->timer, __entry->function, __entry->expires,
-		  (long)__entry->expires - __entry->now)
+		  (long)__entry->expires - __entry->now,
+		  __entry->deferrable > 0 ? 'y':'n')
 );
 
 /**
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index d4af7c56c95d..7775d454a204 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -650,7 +650,7 @@ static inline void
 debug_activate(struct timer_list *timer, unsigned long expires)
 {
 	debug_timer_activate(timer);
-	trace_timer_start(timer, expires);
+	trace_timer_start(timer, expires, tbase_get_deferrable(timer->base));
 }
 
 static inline void debug_deactivate(struct timer_list *timer)
-- 
cgit v1.2.3


From e83d0a4106d81dd08b70318f078f3bad6acdc110 Mon Sep 17 00:00:00 2001
From: Xunlei Pang
Date: Thu, 9 Apr 2015 09:04:42 +0800
Subject: time: Remove read_boot_clock()

Now that we have a read_boot_clock64() function available on every
architecture, and converted all the users to it, it's time to remove
the (now unused) read_boot_clock() completely from the kernel.

Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Xunlei Pang <pang.xunlei@linaro.org>
[jstultz: Minor commit message tweak suggested by Ingo]
Signed-off-by: John Stultz <john.stultz@linaro.org>
---
 include/linux/timekeeping.h |  1 -
 kernel/time/timekeeping.c   | 14 +++-----------
 2 files changed, 3 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h
index 9af5c1214682..3aa72e648650 100644
--- a/include/linux/timekeeping.h
+++ b/include/linux/timekeeping.h
@@ -267,7 +267,6 @@ extern int persistent_clock_is_local;
 
 extern void read_persistent_clock(struct timespec *ts);
 extern void read_persistent_clock64(struct timespec64 *ts);
-extern void read_boot_clock(struct timespec *ts);
 extern void read_boot_clock64(struct timespec64 *ts);
 extern int update_persistent_clock(struct timespec now);
 extern int update_persistent_clock64(struct timespec64 now);
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 2f10b6557a1c..90ed5db67c1d 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -1188,28 +1188,20 @@ void __weak read_persistent_clock64(struct timespec64 *ts64)
 }
 
 /**
- * read_boot_clock -  Return time of the system start.
+ * read_boot_clock64 -  Return time of the system start.
  *
  * Weak dummy function for arches that do not yet support it.
  * Function to read the exact time the system has been started.
- * Returns a timespec with tv_sec=0 and tv_nsec=0 if unsupported.
+ * Returns a timespec64 with tv_sec=0 and tv_nsec=0 if unsupported.
  *
  *  XXX - Do be sure to remove it once all arches implement it.
  */
-void __weak read_boot_clock(struct timespec *ts)
+void __weak read_boot_clock64(struct timespec64 *ts)
 {
 	ts->tv_sec = 0;
 	ts->tv_nsec = 0;
 }
 
-void __weak read_boot_clock64(struct timespec64 *ts64)
-{
-	struct timespec ts;
-
-	read_boot_clock(&ts);
-	*ts64 = timespec_to_timespec64(ts);
-}
-
 /* Flag for if timekeeping_resume() has injected sleeptime */
 static bool sleeptime_injected;
 
-- 
cgit v1.2.3


From ac34ad27fc160b5bd31c731cdaaf6e1d1890ccb2 Mon Sep 17 00:00:00 2001
From: Alexandre Belloni
Date: Fri, 16 Jan 2015 10:05:51 +0100
Subject: clockevents: Do not suspend/resume if unused

There is no point in calling suspend/resume for unused clockevents as
they are already stopped and disabled.

This is really important for AT91 as the hardware is a trainwreck and
takes ages to synchronize.

Reported-by: Sylvain Rochet <sylvain.rochet@finsecur.com>
Signed-off-by: Alexandre Belloni <alexandre.belloni@free-electrons.com>
Cc: Daniel Lezcano <daniel.lezcano@linaro.org>
Cc: Nicolas Ferre <nicolas.ferre@atmel.com>
Cc: Boris Brezillon <boris.brezillon@free-electrons.com>
Cc: Maxime Ripard <maxime.ripard@free-electrons.com>
Cc: linux-arm-kernel@lists.infradead.org
Link: http://lkml.kernel.org/r/1421399151-26800-1-git-send-email-alexandre.belloni@free-electrons.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/time/clockevents.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 4922f1b805ea..2a5c369e50ab 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -638,7 +638,7 @@ void clockevents_suspend(void)
 	struct clock_event_device *dev;
 
 	list_for_each_entry_reverse(dev, &clockevent_devices, list)
-		if (dev->suspend)
+		if (dev->suspend && dev->mode != CLOCK_EVT_MODE_UNUSED)
 			dev->suspend(dev);
 }
 
@@ -650,7 +650,7 @@ void clockevents_resume(void)
 	struct clock_event_device *dev;
 
 	list_for_each_entry(dev, &clockevent_devices, list)
-		if (dev->resume)
+		if (dev->resume && dev->mode != CLOCK_EVT_MODE_UNUSED)
 			dev->resume(dev);
 }
 
-- 
cgit v1.2.3


From 472c4a9437d3c6a0b1e59df7c5aa14075946aa70 Mon Sep 17 00:00:00 2001
From: Viresh Kumar
Date: Thu, 21 May 2015 13:33:46 +0530
Subject: clockevents: Use helpers to check the state of a clockevent device

Use accessor functions to check the state of clockevent devices in
core code.

Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Cc: linaro-kernel@lists.linaro.org
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/fa2b9869fd17f210eaa156ec2b594efd0230b6c7.1432192527.git.viresh.kumar@linaro.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/time/clockevents.c    | 24 ++++++++++++------------
 kernel/time/tick-broadcast.c |  6 +++---
 kernel/time/tick-common.c    |  2 +-
 kernel/time/tick-oneshot.c   |  2 +-
 4 files changed, 17 insertions(+), 17 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 2a5c369e50ab..e568ec8c320b 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -136,7 +136,7 @@ static int __clockevents_set_state(struct clock_event_device *dev,
 
 	case CLOCK_EVT_STATE_ONESHOT_STOPPED:
 		/* Core internal bug */
-		if (WARN_ONCE(dev->state != CLOCK_EVT_STATE_ONESHOT,
+		if (WARN_ONCE(!clockevent_state_oneshot(dev),
 			      "Current state: %d\n", dev->state))
 			return -EINVAL;
 
@@ -170,7 +170,7 @@ void clockevents_set_state(struct clock_event_device *dev,
 		 * A nsec2cyc multiplicator of 0 is invalid and we'd crash
 		 * on it, so fix it up and emit a warning:
 		 */
-		if (state == CLOCK_EVT_STATE_ONESHOT) {
+		if (clockevent_state_oneshot(dev)) {
 			if (unlikely(!dev->mult)) {
 				dev->mult = 1;
 				WARN_ON(1);
@@ -259,7 +259,7 @@ static int clockevents_program_min_delta(struct clock_event_device *dev)
 		delta = dev->min_delta_ns;
 		dev->next_event = ktime_add_ns(ktime_get(), delta);
 
-		if (dev->state == CLOCK_EVT_STATE_SHUTDOWN)
+		if (clockevent_state_shutdown(dev))
 			return 0;
 
 		dev->retries++;
@@ -296,7 +296,7 @@ static int clockevents_program_min_delta(struct clock_event_device *dev)
 	delta = dev->min_delta_ns;
 	dev->next_event = ktime_add_ns(ktime_get(), delta);
 
-	if (dev->state == CLOCK_EVT_STATE_SHUTDOWN)
+	if (clockevent_state_shutdown(dev))
 		return 0;
 
 	dev->retries++;
@@ -328,11 +328,11 @@ int clockevents_program_event(struct clock_event_device *dev, ktime_t expires,
 
 	dev->next_event = expires;
 
-	if (dev->state == CLOCK_EVT_STATE_SHUTDOWN)
+	if (clockevent_state_shutdown(dev))
 		return 0;
 
 	/* We must be in ONESHOT state here */
-	WARN_ONCE(dev->state != CLOCK_EVT_STATE_ONESHOT, "Current state: %d\n",
+	WARN_ONCE(!clockevent_state_oneshot(dev), "Current state: %d\n",
 		  dev->state);
 
 	/* Shortcut for clockevent devices that can deal with ktime. */
@@ -377,7 +377,7 @@ static int clockevents_replace(struct clock_event_device *ced)
 	struct clock_event_device *dev, *newdev = NULL;
 
 	list_for_each_entry(dev, &clockevent_devices, list) {
-		if (dev == ced || dev->state != CLOCK_EVT_STATE_DETACHED)
+		if (dev == ced || !clockevent_state_detached(dev))
 			continue;
 
 		if (!tick_check_replacement(newdev, dev))
@@ -403,7 +403,7 @@ static int clockevents_replace(struct clock_event_device *ced)
 static int __clockevents_try_unbind(struct clock_event_device *ced, int cpu)
 {
 	/* Fast track. Device is unused */
-	if (ced->state == CLOCK_EVT_STATE_DETACHED) {
+	if (clockevent_state_detached(ced)) {
 		list_del_init(&ced->list);
 		return 0;
 	}
@@ -561,10 +561,10 @@ int __clockevents_update_freq(struct clock_event_device *dev, u32 freq)
 {
 	clockevents_config(dev, freq);
 
-	if (dev->state == CLOCK_EVT_STATE_ONESHOT)
+	if (clockevent_state_oneshot(dev))
 		return clockevents_program_event(dev, dev->next_event, false);
 
-	if (dev->state == CLOCK_EVT_STATE_PERIODIC)
+	if (clockevent_state_periodic(dev))
 		return __clockevents_set_state(dev, CLOCK_EVT_STATE_PERIODIC);
 
 	return 0;
@@ -625,7 +625,7 @@ void clockevents_exchange_device(struct clock_event_device *old,
 	}
 
 	if (new) {
-		BUG_ON(new->state != CLOCK_EVT_STATE_DETACHED);
+		BUG_ON(!clockevent_state_detached(new));
 		clockevents_shutdown(new);
 	}
 }
@@ -681,7 +681,7 @@ void tick_cleanup_dead_cpu(int cpu)
 		if (cpumask_test_cpu(cpu, dev->cpumask) &&
 		    cpumask_weight(dev->cpumask) == 1 &&
 		    !tick_is_broadcast_device(dev)) {
-			BUG_ON(dev->state != CLOCK_EVT_STATE_DETACHED);
+			BUG_ON(!clockevent_state_detached(dev));
 			list_del(&dev->list);
 		}
 	}
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 12fcc55d607a..132f819fdcdf 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -303,7 +303,7 @@ static void tick_handle_periodic_broadcast(struct clock_event_device *dev)
 	raw_spin_lock(&tick_broadcast_lock);
 	bc_local = tick_do_periodic_broadcast();
 
-	if (dev->state == CLOCK_EVT_STATE_ONESHOT) {
+	if (clockevent_state_oneshot(dev)) {
 		ktime_t next = ktime_add(dev->next_event, tick_period);
 
 		clockevents_program_event(dev, next, true);
@@ -528,7 +528,7 @@ static void tick_broadcast_set_affinity(struct clock_event_device *bc,
 static void tick_broadcast_set_event(struct clock_event_device *bc, int cpu,
 				     ktime_t expires)
 {
-	if (bc->state != CLOCK_EVT_STATE_ONESHOT)
+	if (!clockevent_state_oneshot(bc))
 		clockevents_set_state(bc, CLOCK_EVT_STATE_ONESHOT);
 
 	clockevents_program_event(bc, expires, 1);
@@ -831,7 +831,7 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
 
 	/* Set it up only once ! */
 	if (bc->event_handler != tick_handle_oneshot_broadcast) {
-		int was_periodic = bc->state == CLOCK_EVT_STATE_PERIODIC;
+		int was_periodic = clockevent_state_periodic(bc);
 
 		bc->event_handler = tick_handle_oneshot_broadcast;
 
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index ea5f9eae8f74..cf881c62c3c5 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -112,7 +112,7 @@ void tick_handle_periodic(struct clock_event_device *dev)
 		return;
 #endif
 
-	if (dev->state != CLOCK_EVT_STATE_ONESHOT)
+	if (!clockevent_state_oneshot(dev))
 		return;
 	for (;;) {
 		/*
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c
index f8de75715c2f..3f9715bec291 100644
--- a/kernel/time/tick-oneshot.c
+++ b/kernel/time/tick-oneshot.c
@@ -36,7 +36,7 @@ int tick_program_event(ktime_t expires, int force)
 		return 0;
 	}
 
-	if (unlikely(dev->state == CLOCK_EVT_STATE_ONESHOT_STOPPED)) {
+	if (unlikely(clockevent_state_oneshot_stopped(dev))) {
 		/*
 		 * We need the clock event again, configure it in ONESHOT mode
 		 * before using it.
-- 
cgit v1.2.3


From d7eb231c71420bc34ac3d35403115600f920cfc2 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Tue, 2 Jun 2015 14:08:46 +0200
Subject: clockevents: Provide functions to set and get the state

We want to rename dev->state, so provide proper get and set
functions. Rename clockevents_set_state() to
clockevents_switch_state() to avoid confusion.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Viresh Kumar <viresh.kumar@linaro.org>
Cc: Peter Zijlstra <peterz@infradead.org>
---
 kernel/time/clockevents.c    | 18 +++++++++---------
 kernel/time/tick-broadcast.c | 12 ++++++------
 kernel/time/tick-common.c    |  4 ++--
 kernel/time/tick-internal.h  | 15 +++++++++++++--
 kernel/time/tick-oneshot.c   | 10 +++++-----
 5 files changed, 35 insertions(+), 24 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index e568ec8c320b..a45f90c4b2d5 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -94,8 +94,8 @@ u64 clockevent_delta2ns(unsigned long latch, struct clock_event_device *evt)
 }
 EXPORT_SYMBOL_GPL(clockevent_delta2ns);
 
-static int __clockevents_set_state(struct clock_event_device *dev,
-				   enum clock_event_state state)
+static int __clockevents_switch_state(struct clock_event_device *dev,
+				      enum clock_event_state state)
 {
 	/* Transition with legacy set_mode() callback */
 	if (dev->set_mode) {
@@ -151,17 +151,17 @@ static int __clockevents_set_state(struct clock_event_device *dev,
 }
 
 /**
- * clockevents_set_state - set the operating state of a clock event device
+ * clockevents_switch_state - set the operating state of a clock event device
  * @dev:	device to modify
  * @state:	new state
  *
  * Must be called with interrupts disabled !
  */
-void clockevents_set_state(struct clock_event_device *dev,
-			   enum clock_event_state state)
+void clockevents_switch_state(struct clock_event_device *dev,
+			      enum clock_event_state state)
 {
 	if (dev->state != state) {
-		if (__clockevents_set_state(dev, state))
+		if (__clockevents_switch_state(dev, state))
 			return;
 
 		dev->state = state;
@@ -185,7 +185,7 @@ void clockevents_set_state(struct clock_event_device *dev,
  */
 void clockevents_shutdown(struct clock_event_device *dev)
 {
-	clockevents_set_state(dev, CLOCK_EVT_STATE_SHUTDOWN);
+	clockevents_switch_state(dev, CLOCK_EVT_STATE_SHUTDOWN);
 	dev->next_event.tv64 = KTIME_MAX;
 }
 
@@ -565,7 +565,7 @@ int __clockevents_update_freq(struct clock_event_device *dev, u32 freq)
 		return clockevents_program_event(dev, dev->next_event, false);
 
 	if (clockevent_state_periodic(dev))
-		return __clockevents_set_state(dev, CLOCK_EVT_STATE_PERIODIC);
+		return __clockevents_switch_state(dev, CLOCK_EVT_STATE_PERIODIC);
 
 	return 0;
 }
@@ -619,7 +619,7 @@ void clockevents_exchange_device(struct clock_event_device *old,
 	 */
 	if (old) {
 		module_put(old->owner);
-		clockevents_set_state(old, CLOCK_EVT_STATE_DETACHED);
+		clockevents_switch_state(old, CLOCK_EVT_STATE_DETACHED);
 		list_del(&old->list);
 		list_add(&old->list, &clockevents_released);
 	}
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 132f819fdcdf..d39f32cdd1b5 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -529,7 +529,7 @@ static void tick_broadcast_set_event(struct clock_event_device *bc, int cpu,
 				     ktime_t expires)
 {
 	if (!clockevent_state_oneshot(bc))
-		clockevents_set_state(bc, CLOCK_EVT_STATE_ONESHOT);
+		clockevents_switch_state(bc, CLOCK_EVT_STATE_ONESHOT);
 
 	clockevents_program_event(bc, expires, 1);
 	tick_broadcast_set_affinity(bc, cpumask_of(cpu));
@@ -537,7 +537,7 @@ static void tick_broadcast_set_event(struct clock_event_device *bc, int cpu,
 
 static void tick_resume_broadcast_oneshot(struct clock_event_device *bc)
 {
-	clockevents_set_state(bc, CLOCK_EVT_STATE_ONESHOT);
+	clockevents_switch_state(bc, CLOCK_EVT_STATE_ONESHOT);
 }
 
 /*
@@ -555,7 +555,7 @@ void tick_check_oneshot_broadcast_this_cpu(void)
 		 * switched over, leave the device alone.
 		 */
 		if (td->mode == TICKDEV_MODE_ONESHOT) {
-			clockevents_set_state(td->evtdev,
+			clockevents_switch_state(td->evtdev,
 					      CLOCK_EVT_STATE_ONESHOT);
 		}
 	}
@@ -659,7 +659,7 @@ static void broadcast_shutdown_local(struct clock_event_device *bc,
 		if (dev->next_event.tv64 < bc->next_event.tv64)
 			return;
 	}
-	clockevents_set_state(dev, CLOCK_EVT_STATE_SHUTDOWN);
+	clockevents_switch_state(dev, CLOCK_EVT_STATE_SHUTDOWN);
 }
 
 /**
@@ -729,7 +729,7 @@ int tick_broadcast_oneshot_control(enum tick_broadcast_state state)
 			cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask);
 	} else {
 		if (cpumask_test_and_clear_cpu(cpu, tick_broadcast_oneshot_mask)) {
-			clockevents_set_state(dev, CLOCK_EVT_STATE_ONESHOT);
+			clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT);
 			/*
 			 * The cpu which was handling the broadcast
 			 * timer marked this cpu in the broadcast
@@ -847,7 +847,7 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
 			   tick_broadcast_oneshot_mask, tmpmask);
 
 		if (was_periodic && !cpumask_empty(tmpmask)) {
-			clockevents_set_state(bc, CLOCK_EVT_STATE_ONESHOT);
+			clockevents_switch_state(bc, CLOCK_EVT_STATE_ONESHOT);
 			tick_broadcast_init_next_event(tmpmask,
 						       tick_next_period);
 			tick_broadcast_set_event(bc, cpu, tick_next_period);
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index cf881c62c3c5..311e2e133517 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -150,7 +150,7 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast)
 
 	if ((dev->features & CLOCK_EVT_FEAT_PERIODIC) &&
 	    !tick_broadcast_oneshot_active()) {
-		clockevents_set_state(dev, CLOCK_EVT_STATE_PERIODIC);
+		clockevents_switch_state(dev, CLOCK_EVT_STATE_PERIODIC);
 	} else {
 		unsigned long seq;
 		ktime_t next;
@@ -160,7 +160,7 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast)
 			next = tick_next_period;
 		} while (read_seqretry(&jiffies_lock, seq));
 
-		clockevents_set_state(dev, CLOCK_EVT_STATE_ONESHOT);
+		clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT);
 
 		for (;;) {
 			if (!clockevents_program_event(dev, next, false))
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index 65273f0a11ed..4461de9bb4b8 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -36,11 +36,22 @@ static inline int tick_device_is_functional(struct clock_event_device *dev)
 	return !(dev->features & CLOCK_EVT_FEAT_DUMMY);
 }
 
+static inline enum clock_event_state clockevent_get_state(struct clock_event_device *dev)
+{
+	return dev->state;
+}
+
+static inline void clockevent_set_state(struct clock_event_device *dev,
+					enum clock_event_state state)
+{
+	dev->state = state;
+}
+
 extern void clockevents_shutdown(struct clock_event_device *dev);
 extern void clockevents_exchange_device(struct clock_event_device *old,
 					struct clock_event_device *new);
-extern void clockevents_set_state(struct clock_event_device *dev,
-				 enum clock_event_state state);
+extern void clockevents_switch_state(struct clock_event_device *dev,
+				     enum clock_event_state state);
 extern int clockevents_program_event(struct clock_event_device *dev,
 				     ktime_t expires, bool force);
 extern void clockevents_handle_noop(struct clock_event_device *dev);
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c
index 3f9715bec291..b51344652330 100644
--- a/kernel/time/tick-oneshot.c
+++ b/kernel/time/tick-oneshot.c
@@ -32,7 +32,7 @@ int tick_program_event(ktime_t expires, int force)
 		/*
 		 * We don't need the clock event device any more, stop it.
 		 */
-		clockevents_set_state(dev, CLOCK_EVT_STATE_ONESHOT_STOPPED);
+		clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT_STOPPED);
 		return 0;
 	}
 
@@ -41,7 +41,7 @@ int tick_program_event(ktime_t expires, int force)
 		 * We need the clock event again, configure it in ONESHOT mode
 		 * before using it.
 		 */
-		clockevents_set_state(dev, CLOCK_EVT_STATE_ONESHOT);
+		clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT);
 	}
 
 	return clockevents_program_event(dev, expires, force);
@@ -54,7 +54,7 @@ void tick_resume_oneshot(void)
 {
 	struct clock_event_device *dev = __this_cpu_read(tick_cpu_device.evtdev);
 
-	clockevents_set_state(dev, CLOCK_EVT_STATE_ONESHOT);
+	clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT);
 	clockevents_program_event(dev, ktime_get(), true);
 }
 
@@ -66,7 +66,7 @@ void tick_setup_oneshot(struct clock_event_device *newdev,
 			ktime_t next_event)
 {
 	newdev->event_handler = handler;
-	clockevents_set_state(newdev, CLOCK_EVT_STATE_ONESHOT);
+	clockevents_switch_state(newdev, CLOCK_EVT_STATE_ONESHOT);
 	clockevents_program_event(newdev, next_event, true);
 }
 
@@ -97,7 +97,7 @@ int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *))
 
 	td->mode = TICKDEV_MODE_ONESHOT;
 	dev->event_handler = handler;
-	clockevents_set_state(dev, CLOCK_EVT_STATE_ONESHOT);
+	clockevents_switch_state(dev, CLOCK_EVT_STATE_ONESHOT);
 	tick_broadcast_switch_to_oneshot();
 	return 0;
 }
-- 
cgit v1.2.3


From 051ebd101b05c09d9b5b673e19fb0586e9bfec56 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Tue, 2 Jun 2015 14:13:46 +0200
Subject: clockevents: Use set/get state helper functions

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Viresh Kumar <viresh.kumar@linaro.org>
Cc: Peter Zijlstra <peterz@infradead.org>
---
 kernel/time/clockevents.c | 11 ++++++-----
 kernel/time/tick-common.c |  2 +-
 2 files changed, 7 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index a45f90c4b2d5..2397b97320d8 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -137,7 +137,8 @@ static int __clockevents_switch_state(struct clock_event_device *dev,
 	case CLOCK_EVT_STATE_ONESHOT_STOPPED:
 		/* Core internal bug */
 		if (WARN_ONCE(!clockevent_state_oneshot(dev),
-			      "Current state: %d\n", dev->state))
+			      "Current state: %d\n",
+			      clockevent_get_state(dev)))
 			return -EINVAL;
 
 		if (dev->set_state_oneshot_stopped)
@@ -160,11 +161,11 @@ static int __clockevents_switch_state(struct clock_event_device *dev,
 void clockevents_switch_state(struct clock_event_device *dev,
 			      enum clock_event_state state)
 {
-	if (dev->state != state) {
+	if (clockevent_get_state(dev) != state) {
 		if (__clockevents_switch_state(dev, state))
 			return;
 
-		dev->state = state;
+		clockevent_set_state(dev, state);
 
 		/*
 		 * A nsec2cyc multiplicator of 0 is invalid and we'd crash
@@ -333,7 +334,7 @@ int clockevents_program_event(struct clock_event_device *dev, ktime_t expires,
 
 	/* We must be in ONESHOT state here */
 	WARN_ONCE(!clockevent_state_oneshot(dev), "Current state: %d\n",
-		  dev->state);
+		  clockevent_get_state(dev));
 
 	/* Shortcut for clockevent devices that can deal with ktime. */
 	if (dev->features & CLOCK_EVT_FEAT_KTIME)
@@ -496,7 +497,7 @@ void clockevents_register_device(struct clock_event_device *dev)
 	BUG_ON(clockevents_sanity_check(dev));
 
 	/* Initialize state to DETACHED */
-	dev->state = CLOCK_EVT_STATE_DETACHED;
+	clockevent_set_state(dev, CLOCK_EVT_STATE_DETACHED);
 
 	if (!dev->cpumask) {
 		WARN_ON(num_possible_cpus() > 1);
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 311e2e133517..17f144450050 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -377,7 +377,7 @@ void tick_shutdown(unsigned int cpu)
 		 * Prevent that the clock events layer tries to call
 		 * the set mode function!
 		 */
-		dev->state = CLOCK_EVT_STATE_DETACHED;
+		clockevent_set_state(dev, CLOCK_EVT_STATE_DETACHED);
 		dev->mode = CLOCK_EVT_MODE_UNUSED;
 		clockevents_exchange_device(dev, NULL);
 		dev->event_handler = clockevents_handle_noop;
-- 
cgit v1.2.3


From be3ef76e9d9b97962c70bd6351787d29071ae481 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Tue, 2 Jun 2015 14:30:11 +0200
Subject: clockevents: Rename state to state_use_accessors

The only sensible way to make abuse of core internal fields obvious
and easy to grep for.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Viresh Kumar <viresh.kumar@linaro.org>
Cc: Peter Zijlstra <peterz@infradead.org>
---
 include/linux/clockchips.h  | 14 +++++++-------
 kernel/time/tick-internal.h |  4 ++--
 2 files changed, 9 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/clockchips.h b/include/linux/clockchips.h
index 64214ad85af9..597a1e836f22 100644
--- a/include/linux/clockchips.h
+++ b/include/linux/clockchips.h
@@ -87,7 +87,7 @@ enum clock_event_state {
  * @mult:		nanosecond to cycles multiplier
  * @shift:		nanoseconds to cycles divisor (power of two)
  * @mode:		operating mode, relevant only to ->set_mode(), OBSOLETE
- * @state:		current state of the device, assigned by the core code
+ * @state_use_accessors:current state of the device, assigned by the core code
  * @features:		features
  * @retries:		number of forced programming retries
  * @set_mode:		legacy set mode function, only for modes <= CLOCK_EVT_MODE_RESUME.
@@ -117,7 +117,7 @@ struct clock_event_device {
 	u32			mult;
 	u32			shift;
 	enum clock_event_mode	mode;
-	enum clock_event_state	state;
+	enum clock_event_state	state_use_accessors;
 	unsigned int		features;
 	unsigned long		retries;
 
@@ -152,27 +152,27 @@ struct clock_event_device {
 /* Helpers to verify state of a clockevent device */
 static inline bool clockevent_state_detached(struct clock_event_device *dev)
 {
-	return dev->state == CLOCK_EVT_STATE_DETACHED;
+	return dev->state_use_accessors == CLOCK_EVT_STATE_DETACHED;
 }
 
 static inline bool clockevent_state_shutdown(struct clock_event_device *dev)
 {
-	return dev->state == CLOCK_EVT_STATE_SHUTDOWN;
+	return dev->state_use_accessors == CLOCK_EVT_STATE_SHUTDOWN;
 }
 
 static inline bool clockevent_state_periodic(struct clock_event_device *dev)
 {
-	return dev->state == CLOCK_EVT_STATE_PERIODIC;
+	return dev->state_use_accessors == CLOCK_EVT_STATE_PERIODIC;
 }
 
 static inline bool clockevent_state_oneshot(struct clock_event_device *dev)
 {
-	return dev->state == CLOCK_EVT_STATE_ONESHOT;
+	return dev->state_use_accessors == CLOCK_EVT_STATE_ONESHOT;
 }
 
 static inline bool clockevent_state_oneshot_stopped(struct clock_event_device *dev)
 {
-	return dev->state == CLOCK_EVT_STATE_ONESHOT_STOPPED;
+	return dev->state_use_accessors == CLOCK_EVT_STATE_ONESHOT_STOPPED;
 }
 
 /*
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index 4461de9bb4b8..ec2208aabdd1 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -38,13 +38,13 @@ static inline int tick_device_is_functional(struct clock_event_device *dev)
 
 static inline enum clock_event_state clockevent_get_state(struct clock_event_device *dev)
 {
-	return dev->state;
+	return dev->state_use_accessors;
 }
 
 static inline void clockevent_set_state(struct clock_event_device *dev,
 					enum clock_event_state state)
 {
-	dev->state = state;
+	dev->state_use_accessors = state;
 }
 
 extern void clockevents_shutdown(struct clock_event_device *dev);
-- 
cgit v1.2.3


From ae60d6a0e3a9197d37f8c8c4584a8ecd18518cd6 Mon Sep 17 00:00:00 2001
From: Nicholas Mc Guire
Date: Thu, 28 May 2015 19:09:55 +0200
Subject: time: Refactor usecs_to_jiffies

Refactor the usecs_to_jiffies conditional code part in time.c and
jiffies.h putting it into conditional functions rather than #ifdefs
to improve readability. This is analogous to the msecs_to_jiffies()
cleanup in commit ca42aaf0c861 ("time: Refactor msecs_to_jiffies")

Signed-off-by: Nicholas Mc Guire <hofrat@osadl.org>
Cc: Masahiro Yamada <yamada.m@jp.panasonic.com>
Cc: Sam Ravnborg <sam@ravnborg.org>
Cc: Joe Perches <joe@perches.com>
Cc: John Stultz <john.stultz@linaro.org>
Cc: Andrew Hunter <ahh@google.com>
Cc: Paul Turner <pjt@google.com>
Cc: Michal Marek <mmarek@suse.cz>
Link: http://lkml.kernel.org/r/1432832996-12129-1-git-send-email-hofrat@osadl.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/jiffies.h | 27 ++++++++++++++++++++++++++-
 kernel/time/time.c      | 13 +++----------
 2 files changed, 29 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h
index 3bde5eb8568b..a316ebea0a89 100644
--- a/include/linux/jiffies.h
+++ b/include/linux/jiffies.h
@@ -362,7 +362,32 @@ static inline unsigned long msecs_to_jiffies(const unsigned int m)
 	}
 }
 
-extern unsigned long usecs_to_jiffies(const unsigned int u);
+extern unsigned long __usecs_to_jiffies(const unsigned int u);
+#if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ)
+static inline unsigned long _usecs_to_jiffies(const unsigned int u)
+{
+	return (u + (USEC_PER_SEC / HZ) - 1) / (USEC_PER_SEC / HZ);
+}
+#elif HZ > USEC_PER_SEC && !(HZ % USEC_PER_SEC)
+static inline unsigned long _usecs_to_jiffies(const unsigned int u)
+{
+	return u * (HZ / USEC_PER_SEC);
+}
+static inline unsigned long _usecs_to_jiffies(const unsigned int u)
+{
+#else
+static inline unsigned long _usecs_to_jiffies(const unsigned int u)
+{
+	return (USEC_TO_HZ_MUL32 * u + USEC_TO_HZ_ADJ32)
+		>> USEC_TO_HZ_SHR32;
+}
+#endif
+
+static inline unsigned long usecs_to_jiffies(const unsigned int u)
+{
+	return __usecs_to_jiffies(u);
+}
+
 extern unsigned long timespec_to_jiffies(const struct timespec *value);
 extern void jiffies_to_timespec(const unsigned long jiffies,
 				struct timespec *value);
diff --git a/kernel/time/time.c b/kernel/time/time.c
index 972e3bbac963..85d5bb1d67eb 100644
--- a/kernel/time/time.c
+++ b/kernel/time/time.c
@@ -522,20 +522,13 @@ unsigned long __msecs_to_jiffies(const unsigned int m)
 }
 EXPORT_SYMBOL(__msecs_to_jiffies);
 
-unsigned long usecs_to_jiffies(const unsigned int u)
+unsigned long __usecs_to_jiffies(const unsigned int u)
 {
 	if (u > jiffies_to_usecs(MAX_JIFFY_OFFSET))
 		return MAX_JIFFY_OFFSET;
-#if HZ <= USEC_PER_SEC && !(USEC_PER_SEC % HZ)
-	return (u + (USEC_PER_SEC / HZ) - 1) / (USEC_PER_SEC / HZ);
-#elif HZ > USEC_PER_SEC && !(HZ % USEC_PER_SEC)
-	return u * (HZ / USEC_PER_SEC);
-#else
-	return (USEC_TO_HZ_MUL32 * u + USEC_TO_HZ_ADJ32)
-		>> USEC_TO_HZ_SHR32;
-#endif
+	return _usecs_to_jiffies(u);
 }
-EXPORT_SYMBOL(usecs_to_jiffies);
+EXPORT_SYMBOL(__usecs_to_jiffies);
 
 /*
  * The TICK_NSEC - 1 rounds up the value to the next resolution.  Note
-- 
cgit v1.2.3


From 45bbfe64ea564a69e56ab6754006eee506224f46 Mon Sep 17 00:00:00 2001
From: Joe Perches
Date: Mon, 25 May 2015 11:49:55 -0700
Subject: clocksource: Use current logging style

clocksource messages aren't prefixed in dmesg so it's a bit unclear
what subsystem emits the messages.

Use pr_fmt and pr_<level> to auto-prefix the messages appropriately.

Miscellanea:

o Remove "Warning" from KERN_WARNING level messages
o Align "timekeeping watchdog: " messages
o Coalesce formats
o Align multiline arguments

Signed-off-by: Joe Perches <joe@perches.com>
Cc: John Stultz <john.stultz@linaro.org>
Link: http://lkml.kernel.org/r/1432579795.2846.75.camel@perches.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/time/clocksource.c | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 15facb1b9c60..841b72f720e8 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -23,6 +23,8 @@
  *   o Allow clocksource drivers to be unregistered
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
 #include <linux/device.h>
 #include <linux/clocksource.h>
 #include <linux/init.h>
@@ -216,10 +218,11 @@ static void clocksource_watchdog(unsigned long data)
 
 		/* Check the deviation from the watchdog clocksource. */
 		if ((abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD)) {
-			pr_warn("timekeeping watchdog: Marking clocksource '%s' as unstable, because the skew is too large:\n", cs->name);
-			pr_warn("	'%s' wd_now: %llx wd_last: %llx mask: %llx\n",
+			pr_warn("timekeeping watchdog: Marking clocksource '%s' as unstable because the skew is too large:\n",
+				cs->name);
+			pr_warn("                      '%s' wd_now: %llx wd_last: %llx mask: %llx\n",
 				watchdog->name, wdnow, wdlast, watchdog->mask);
-			pr_warn("	'%s' cs_now: %llx cs_last: %llx mask: %llx\n",
+			pr_warn("                      '%s' cs_now: %llx cs_last: %llx mask: %llx\n",
 				cs->name, csnow, cslast, cs->mask);
 			__clocksource_unstable(cs);
 			continue;
@@ -567,9 +570,8 @@ static void __clocksource_select(bool skipcur)
 		 */
 		if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) && oneshot) {
 			/* Override clocksource cannot be used. */
-			printk(KERN_WARNING "Override clocksource %s is not "
-			       "HRT compatible. Cannot switch while in "
-			       "HRT/NOHZ mode\n", cs->name);
+			pr_warn("Override clocksource %s is not HRT compatible - cannot switch while in HRT/NOHZ mode\n",
+				cs->name);
 			override_name[0] = 0;
 		} else
 			/* Override clocksource can be used. */
@@ -708,8 +710,8 @@ void __clocksource_update_freq_scale(struct clocksource *cs, u32 scale, u32 freq
 
 	clocksource_update_max_deferment(cs);
 
-	pr_info("clocksource %s: mask: 0x%llx max_cycles: 0x%llx, max_idle_ns: %lld ns\n",
-			cs->name, cs->mask, cs->max_cycles, cs->max_idle_ns);
+	pr_info("%s: mask: 0x%llx max_cycles: 0x%llx, max_idle_ns: %lld ns\n",
+		cs->name, cs->mask, cs->max_cycles, cs->max_idle_ns);
 }
 EXPORT_SYMBOL_GPL(__clocksource_update_freq_scale);
 
@@ -1008,12 +1010,10 @@ __setup("clocksource=", boot_override_clocksource);
 static int __init boot_override_clock(char* str)
 {
 	if (!strcmp(str, "pmtmr")) {
-		printk("Warning: clock=pmtmr is deprecated. "
-			"Use clocksource=acpi_pm.\n");
+		pr_warn("clock=pmtmr is deprecated - use clocksource=acpi_pm\n");
 		return boot_override_clocksource("acpi_pm");
 	}
-	printk("Warning! clock= boot option is deprecated. "
-		"Use clocksource=xyz\n");
+	pr_warn("clock= boot option is deprecated - use clocksource=xyz\n");
 	return boot_override_clocksource(str);
 }
 
-- 
cgit v1.2.3


From d151832650ed98961a5650e73e85c349ad7839cb Mon Sep 17 00:00:00 2001
From: John Stultz
Date: Thu, 11 Jun 2015 15:54:53 -0700
Subject: time: Move clock_was_set_seq update before updating shadow-timekeeper

It was reported that 868a3e915f7f5eba (hrtimer: Make offset
update smarter) was causing timer problems after suspend/resume.

The problem with that change is the modification to
clock_was_set_seq in timekeeping_update is done prior to
mirroring the time state to the shadow-timekeeper. Thus the
next time we do update_wall_time() the updated sequence is
overwritten by whats in the shadow copy.

This patch moves the shadow-timekeeper mirroring to the end
of the function, after all updates have been made, so all data
is kept in sync.

(This patch also affects the update_fast_timekeeper calls which
were also problematically done prior to the mirroring).

Reported-and-tested-by: Jeremiah Mahler <jmmahler@gmail.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>
Cc: Preeti U Murthy <preeti@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Viresh Kumar <viresh.kumar@linaro.org>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Link: http://lkml.kernel.org/r/1434063297-28657-2-git-send-email-john.stultz@linaro.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/time/timekeeping.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 90ed5db67c1d..849b93265904 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -585,15 +585,19 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action)
 	update_vsyscall(tk);
 	update_pvclock_gtod(tk, action & TK_CLOCK_WAS_SET);
 
-	if (action & TK_MIRROR)
-		memcpy(&shadow_timekeeper, &tk_core.timekeeper,
-		       sizeof(tk_core.timekeeper));
-
 	update_fast_timekeeper(&tk->tkr_mono, &tk_fast_mono);
 	update_fast_timekeeper(&tk->tkr_raw,  &tk_fast_raw);
 
 	if (action & TK_CLOCK_WAS_SET)
 		tk->clock_was_set_seq++;
+	/*
+	 * The mirroring of the data to the shadow-timekeeper needs
+	 * to happen last here to ensure we don't over-write the
+	 * timekeeper structure on the next update with stale data
+	 */
+	if (action & TK_MIRROR)
+		memcpy(&shadow_timekeeper, &tk_core.timekeeper,
+		       sizeof(tk_core.timekeeper));
 }
 
 /**
-- 
cgit v1.2.3


From 90bf361ceae28dee50a584c3dd4c1a96178d982c Mon Sep 17 00:00:00 2001
From: John Stultz
Date: Thu, 11 Jun 2015 15:54:54 -0700
Subject: ntp: Introduce and use SECS_PER_DAY macro instead of 86400

Currently the leapsecond logic uses what looks like magic values.

Improve this by defining SECS_PER_DAY and using that macro
to make the logic more clear.

Signed-off-by: John Stultz <john.stultz@linaro.org>
Cc: Prarit Bhargava <prarit@redhat.com>
Cc: Daniel Bristot de Oliveira <bristot@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jiri Bohac <jbohac@suse.cz>
Cc: Ingo Molnar <mingo@kernel.org>
Link: http://lkml.kernel.org/r/1434063297-28657-3-git-send-email-john.stultz@linaro.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/time/ntp.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 7a681003001c..7aa216188450 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -35,6 +35,7 @@ unsigned long			tick_nsec;
 static u64			tick_length;
 static u64			tick_length_base;
 
+#define SECS_PER_DAY		86400
 #define MAX_TICKADJ		500LL		/* usecs */
 #define MAX_TICKADJ_SCALED \
 	(((MAX_TICKADJ * NSEC_PER_USEC) << NTP_SCALE_SHIFT) / NTP_INTERVAL_FREQ)
@@ -390,7 +391,7 @@ int second_overflow(unsigned long secs)
 	case TIME_INS:
 		if (!(time_status & STA_INS))
 			time_state = TIME_OK;
-		else if (secs % 86400 == 0) {
+		else if (secs % SECS_PER_DAY == 0) {
 			leap = -1;
 			time_state = TIME_OOP;
 			printk(KERN_NOTICE
@@ -400,7 +401,7 @@ int second_overflow(unsigned long secs)
 	case TIME_DEL:
 		if (!(time_status & STA_DEL))
 			time_state = TIME_OK;
-		else if ((secs + 1) % 86400 == 0) {
+		else if ((secs + 1) % SECS_PER_DAY == 0) {
 			leap = 1;
 			time_state = TIME_WAIT;
 			printk(KERN_NOTICE
-- 
cgit v1.2.3


From 833f32d763028c1bb371c64f457788b933773b3e Mon Sep 17 00:00:00 2001
From: John Stultz
Date: Thu, 11 Jun 2015 15:54:55 -0700
Subject: time: Prevent early expiry of hrtimers[CLOCK_REALTIME] at the leap
 second edge

Currently, leapsecond adjustments are done at tick time. As a result,
the leapsecond was applied at the first timer tick *after* the
leapsecond (~1-10ms late depending on HZ), rather then exactly on the
second edge.

This was in part historical from back when we were always tick based,
but correcting this since has been avoided since it adds extra
conditional checks in the gettime fastpath, which has performance
overhead.

However, it was recently pointed out that ABS_TIME CLOCK_REALTIME
timers set for right after the leapsecond could fire a second early,
since some timers may be expired before we trigger the timekeeping
timer, which then applies the leapsecond.

This isn't quite as bad as it sounds, since behaviorally it is similar
to what is possible w/ ntpd made leapsecond adjustments done w/o using
the kernel discipline. Where due to latencies, timers may fire just
prior to the settimeofday call. (Also, one should note that all
applications using CLOCK_REALTIME timers should always be careful,
since they are prone to quirks from settimeofday() disturbances.)

However, the purpose of having the kernel do the leap adjustment is to
avoid such latencies, so I think this is worth fixing.

So in order to properly keep those timers from firing a second early,
this patch modifies the ntp and timekeeping logic so that we keep
enough state so that the update_base_offsets_now accessor, which
provides the hrtimer core the current time, can check and apply the
leapsecond adjustment on the second edge. This prevents the hrtimer
core from expiring timers too early.

This patch does not modify any other time read path, so no additional
overhead is incurred. However, this also means that the leap-second
continues to be applied at tick time for all other read-paths.

Apologies to Richard Cochran, who pushed for similar changes years
ago, which I resisted due to the concerns about the performance
overhead.

While I suspect this isn't extremely critical, folks who care about
strict leap-second correctness will likely want to watch
this. Potentially a -stable candidate eventually.

Originally-suggested-by: Richard Cochran <richardcochran@gmail.com>
Reported-by: Daniel Bristot de Oliveira <bristot@redhat.com>
Reported-by: Prarit Bhargava <prarit@redhat.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jiri Bohac <jbohac@suse.cz>
Cc: Shuah Khan <shuahkh@osg.samsung.com>
Cc: Ingo Molnar <mingo@kernel.org>
Link: http://lkml.kernel.org/r/1434063297-28657-4-git-send-email-john.stultz@linaro.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/time64.h              |  1 +
 include/linux/timekeeper_internal.h |  2 ++
 kernel/time/ntp.c                   | 42 ++++++++++++++++++++++++++++++-------
 kernel/time/ntp_internal.h          |  1 +
 kernel/time/timekeeping.c           | 23 +++++++++++++++++++-
 5 files changed, 61 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/time64.h b/include/linux/time64.h
index 12d4e82b0276..77b5df2acd2a 100644
--- a/include/linux/time64.h
+++ b/include/linux/time64.h
@@ -29,6 +29,7 @@ struct timespec64 {
 #define FSEC_PER_SEC	1000000000000000LL
 
 /* Located here for timespec[64]_valid_strict */
+#define TIME64_MAX			((s64)~((u64)1 << 63))
 #define KTIME_MAX			((s64)~((u64)1 << 63))
 #define KTIME_SEC_MAX			(KTIME_MAX / NSEC_PER_SEC)
 
diff --git a/include/linux/timekeeper_internal.h b/include/linux/timekeeper_internal.h
index e1f5a1136554..25247220b4b7 100644
--- a/include/linux/timekeeper_internal.h
+++ b/include/linux/timekeeper_internal.h
@@ -50,6 +50,7 @@ struct tk_read_base {
  * @offs_tai:		Offset clock monotonic -> clock tai
  * @tai_offset:		The current UTC to TAI offset in seconds
  * @clock_was_set_seq:	The sequence number of clock was set events
+ * @next_leap_ktime:	CLOCK_MONOTONIC time value of a pending leap-second
  * @raw_time:		Monotonic raw base time in timespec64 format
  * @cycle_interval:	Number of clock cycles in one NTP interval
  * @xtime_interval:	Number of clock shifted nano seconds in one NTP
@@ -90,6 +91,7 @@ struct timekeeper {
 	ktime_t			offs_tai;
 	s32			tai_offset;
 	unsigned int		clock_was_set_seq;
+	ktime_t			next_leap_ktime;
 	struct timespec64	raw_time;
 
 	/* The following members are for timekeeping internal use */
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 7aa216188450..033743e3647a 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -77,6 +77,9 @@ static long			time_adjust;
 /* constant (boot-param configurable) NTP tick adjustment (upscaled)	*/
 static s64			ntp_tick_adj;
 
+/* second value of the next pending leapsecond, or TIME64_MAX if no leap */
+static time64_t			ntp_next_leap_sec = TIME64_MAX;
+
 #ifdef CONFIG_NTP_PPS
 
 /*
@@ -350,6 +353,7 @@ void ntp_clear(void)
 	tick_length	= tick_length_base;
 	time_offset	= 0;
 
+	ntp_next_leap_sec = TIME64_MAX;
 	/* Clear PPS state variables */
 	pps_clear();
 }
@@ -360,6 +364,21 @@ u64 ntp_tick_length(void)
 	return tick_length;
 }
 
+/**
+ * ntp_get_next_leap - Returns the next leapsecond in CLOCK_REALTIME ktime_t
+ *
+ * Provides the time of the next leapsecond against CLOCK_REALTIME in
+ * a ktime_t format. Returns KTIME_MAX if no leapsecond is pending.
+ */
+ktime_t ntp_get_next_leap(void)
+{
+	ktime_t ret;
+
+	if ((time_state == TIME_INS) && (time_status & STA_INS))
+		return ktime_set(ntp_next_leap_sec, 0);
+	ret.tv64 = KTIME_MAX;
+	return ret;
+}
 
 /*
  * this routine handles the overflow of the microsecond field
@@ -383,15 +402,21 @@ int second_overflow(unsigned long secs)
 	 */
 	switch (time_state) {
 	case TIME_OK:
-		if (time_status & STA_INS)
+		if (time_status & STA_INS) {
 			time_state = TIME_INS;
-		else if (time_status & STA_DEL)
+			ntp_next_leap_sec = secs + SECS_PER_DAY -
+						(secs % SECS_PER_DAY);
+		} else if (time_status & STA_DEL) {
 			time_state = TIME_DEL;
+			ntp_next_leap_sec = secs + SECS_PER_DAY -
+						 ((secs+1) % SECS_PER_DAY);
+		}
 		break;
 	case TIME_INS:
-		if (!(time_status & STA_INS))
+		if (!(time_status & STA_INS)) {
+			ntp_next_leap_sec = TIME64_MAX;
 			time_state = TIME_OK;
-		else if (secs % SECS_PER_DAY == 0) {
+		} else if (secs % SECS_PER_DAY == 0) {
 			leap = -1;
 			time_state = TIME_OOP;
 			printk(KERN_NOTICE
@@ -399,19 +424,21 @@ int second_overflow(unsigned long secs)
 		}
 		break;
 	case TIME_DEL:
-		if (!(time_status & STA_DEL))
+		if (!(time_status & STA_DEL)) {
+			ntp_next_leap_sec = TIME64_MAX;
 			time_state = TIME_OK;
-		else if ((secs + 1) % SECS_PER_DAY == 0) {
+		} else if ((secs + 1) % SECS_PER_DAY == 0) {
 			leap = 1;
+			ntp_next_leap_sec = TIME64_MAX;
 			time_state = TIME_WAIT;
 			printk(KERN_NOTICE
 				"Clock: deleting leap second 23:59:59 UTC\n");
 		}
 		break;
 	case TIME_OOP:
+		ntp_next_leap_sec = TIME64_MAX;
 		time_state = TIME_WAIT;
 		break;
-
 	case TIME_WAIT:
 		if (!(time_status & (STA_INS | STA_DEL)))
 			time_state = TIME_OK;
@@ -548,6 +575,7 @@ static inline void process_adj_status(struct timex *txc, struct timespec64 *ts)
 	if ((time_status & STA_PLL) && !(txc->status & STA_PLL)) {
 		time_state = TIME_OK;
 		time_status = STA_UNSYNC;
+		ntp_next_leap_sec = TIME64_MAX;
 		/* restart PPS frequency calibration */
 		pps_reset_freq_interval();
 	}
diff --git a/kernel/time/ntp_internal.h b/kernel/time/ntp_internal.h
index bbd102ad9df7..65430504ca26 100644
--- a/kernel/time/ntp_internal.h
+++ b/kernel/time/ntp_internal.h
@@ -5,6 +5,7 @@ extern void ntp_init(void);
 extern void ntp_clear(void);
 /* Returns how long ticks are at present, in ns / 2^NTP_SCALE_SHIFT. */
 extern u64 ntp_tick_length(void);
+extern ktime_t ntp_get_next_leap(void);
 extern int second_overflow(unsigned long secs);
 extern int ntp_validate_timex(struct timex *);
 extern int __do_adjtimex(struct timex *, struct timespec64 *, s32 *);
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 849b93265904..5d67ffb7e317 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -539,6 +539,17 @@ int pvclock_gtod_unregister_notifier(struct notifier_block *nb)
 }
 EXPORT_SYMBOL_GPL(pvclock_gtod_unregister_notifier);
 
+/*
+ * tk_update_leap_state - helper to update the next_leap_ktime
+ */
+static inline void tk_update_leap_state(struct timekeeper *tk)
+{
+	tk->next_leap_ktime = ntp_get_next_leap();
+	if (tk->next_leap_ktime.tv64 != KTIME_MAX)
+		/* Convert to monotonic time */
+		tk->next_leap_ktime = ktime_sub(tk->next_leap_ktime, tk->offs_real);
+}
+
 /*
  * Update the ktime_t based scalar nsec members of the timekeeper
  */
@@ -580,6 +591,7 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action)
 		ntp_clear();
 	}
 
+	tk_update_leap_state(tk);
 	tk_update_ktime_data(tk);
 
 	update_vsyscall(tk);
@@ -1956,15 +1968,22 @@ ktime_t ktime_get_update_offsets_now(unsigned int *cwsseq, ktime_t *offs_real,
 
 		base = tk->tkr_mono.base;
 		nsecs = timekeeping_get_ns(&tk->tkr_mono);
+		base = ktime_add_ns(base, nsecs);
+
 		if (*cwsseq != tk->clock_was_set_seq) {
 			*cwsseq = tk->clock_was_set_seq;
 			*offs_real = tk->offs_real;
 			*offs_boot = tk->offs_boot;
 			*offs_tai = tk->offs_tai;
 		}
+
+		/* Handle leapsecond insertion adjustments */
+		if (unlikely(base.tv64 >= tk->next_leap_ktime.tv64))
+			*offs_real = ktime_sub(tk->offs_real, ktime_set(1, 0));
+
 	} while (read_seqcount_retry(&tk_core.seq, seq));
 
-	return ktime_add_ns(base, nsecs);
+	return base;
 }
 
 /**
@@ -2006,6 +2025,8 @@ int do_adjtimex(struct timex *txc)
 		__timekeeping_set_tai_offset(tk, tai);
 		timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET);
 	}
+	tk_update_leap_state(tk);
+
 	write_seqcount_end(&tk_core.seq);
 	raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
 
-- 
cgit v1.2.3


From 96efdcf2d080687e041b0353c604b708546689fd Mon Sep 17 00:00:00 2001
From: John Stultz
Date: Thu, 11 Jun 2015 15:54:56 -0700
Subject: ntp: Do leapsecond adjustment in adjtimex read path

Since the leapsecond is applied at tick-time, this means there is a
small window of time at the start of a leap-second where we cross into
the next second before applying the leap.

This patch modified adjtimex so that the leap-second is applied on the
second edge. Providing more correct leapsecond behavior.

This does make it so that adjtimex()'s returned time values can be
inconsistent with time values read from gettimeofday() or
clock_gettime(CLOCK_REALTIME,...)  for a brief period of one tick at
the leapsecond.  However, those other interfaces do not provide the
TIME_OOP time_state return that adjtimex() provides, which allows the
leapsecond to be properly represented. They instead only see a time
discontinuity, and cannot tell the first 23:59:59 from the repeated
23:59:59 leap second.

This seems like a reasonable tradeoff given clock_gettime() /
gettimeofday() cannot properly represent a leapsecond, and users
likely care more about performance, while folks who are using
adjtimex() more likely care about leap-second correctness.

Signed-off-by: John Stultz <john.stultz@linaro.org>
Cc: Prarit Bhargava <prarit@redhat.com>
Cc: Daniel Bristot de Oliveira <bristot@redhat.com>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jiri Bohac <jbohac@suse.cz>
Cc: Ingo Molnar <mingo@kernel.org>
Link: http://lkml.kernel.org/r/1434063297-28657-5-git-send-email-john.stultz@linaro.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/time/ntp.c | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

(limited to 'kernel')

diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 033743e3647a..fb4d98c7fd43 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -740,6 +740,24 @@ int __do_adjtimex(struct timex *txc, struct timespec64 *ts, s32 *time_tai)
 	if (!(time_status & STA_NANO))
 		txc->time.tv_usec /= NSEC_PER_USEC;
 
+	/* Handle leapsec adjustments */
+	if (unlikely(ts->tv_sec >= ntp_next_leap_sec)) {
+		if ((time_state == TIME_INS) && (time_status & STA_INS)) {
+			result = TIME_OOP;
+			txc->tai++;
+			txc->time.tv_sec--;
+		}
+		if ((time_state == TIME_DEL) && (time_status & STA_DEL)) {
+			result = TIME_WAIT;
+			txc->tai--;
+			txc->time.tv_sec++;
+		}
+		if ((time_state == TIME_OOP) &&
+					(ts->tv_sec == ntp_next_leap_sec)) {
+			result = TIME_WAIT;
+		}
+	}
+
 	return result;
 }
 
-- 
cgit v1.2.3


From a9d20988ac7db47fec4510cefc966e876a4ce674 Mon Sep 17 00:00:00 2001
From: Viresh Kumar
Date: Wed, 17 Jun 2015 16:04:46 +0530
Subject: clockevents: Check state instead of mode in suspend/resume path

CLOCK_EVT_MODE_* macros are present for backward compatibility (as most
of the drivers are still using old ->set_mode() interface).

These macro's shouldn't be used anymore in code, that is common to both
driver interfaces, i.e. ->set_mode() and ->set_state_*().

Drivers implementing ->set_state_*() interface, which have their
clkevt->mode set to 0 (clkevt device structures are normally globally
defined), will not participate in suspend/resume as they will always be
marked as UNUSED.

Fix this by checking state of the clockevent device instead of mode,
which is updated for both the interfaces.

Fixes: ac34ad27fc16 ("clockevents: Do not suspend/resume if unused")
Signed-off-by: Viresh Kumar <viresh.kumar@linaro.org>
Cc: linaro-kernel@lists.linaro.org
Cc: alexandre.belloni@free-electrons.com
Cc: sylvain.rochet@finsecur.com
Link: http://lkml.kernel.org/r/a1964eef6e8a47d02b1ff9083c6c91f73f0ff643.1434537215.git.viresh.kumar@linaro.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/time/clockevents.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 2397b97320d8..08ccc3da3ca0 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -639,7 +639,7 @@ void clockevents_suspend(void)
 	struct clock_event_device *dev;
 
 	list_for_each_entry_reverse(dev, &clockevent_devices, list)
-		if (dev->suspend && dev->mode != CLOCK_EVT_MODE_UNUSED)
+		if (dev->suspend && !clockevent_state_detached(dev))
 			dev->suspend(dev);
 }
 
@@ -651,7 +651,7 @@ void clockevents_resume(void)
 	struct clock_event_device *dev;
 
 	list_for_each_entry(dev, &clockevent_devices, list)
-		if (dev->resume && dev->mode != CLOCK_EVT_MODE_UNUSED)
+		if (dev->resume && !clockevent_state_detached(dev))
 			dev->resume(dev);
 }
 
-- 
cgit v1.2.3


From 906c55579a6360dd9ef5a3101bb2e3ae396dfb97 Mon Sep 17 00:00:00 2001
From: John Stultz
Date: Wed, 17 Jun 2015 10:05:53 -0700
Subject: timekeeping: Copy the shadow-timekeeper over the real timekeeper last

The fix in d151832650ed9 (time: Move clock_was_set_seq update
before updating shadow-timekeeper) was unfortunately incomplete.

The main gist of that change was to do the shadow-copy update
last, so that any state changes were properly duplicated, and
we wouldn't accidentally have stale data in the shadow.

Unfortunately in the main update_wall_time() logic, we update
use the shadow-timekeeper to calculate the next update values,
then while holding the lock, copy the shadow-timekeeper over,
then call timekeeping_update() to do some additional
bookkeeping, (skipping the shadow mirror). The bug with this is
the additional bookkeeping isn't all read-only, and some
changes timkeeper state. Thus we might then overwrite this state
change on the next update.

To avoid this problem, do the timekeeping_update() on the
shadow-timekeeper prior to copying the full state over to
the real-timekeeper.

This avoids problems with both the clock_was_set_seq and
next_leap_ktime being overwritten and possibly the
fast-timekeepers as well.

Many thanks to Prarit for his rigorous testing, which discovered
this problem, along with Prarit and Daniel's work validating this
fix.

Reported-by: Prarit Bhargava <prarit@redhat.com>
Tested-by: Prarit Bhargava <prarit@redhat.com>
Tested-by: Daniel Bristot de Oliveira <bristot@redhat.com>
Signed-off-by: John Stultz <john.stultz@linaro.org>
Cc: Richard Cochran <richardcochran@gmail.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jiri Bohac <jbohac@suse.cz>
Cc: Ingo Molnar <mingo@kernel.org>
Link: http://lkml.kernel.org/r/1434560753-7441-1-git-send-email-john.stultz@linaro.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/time/timekeeping.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 5d67ffb7e317..30b7a409bf1e 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -1853,8 +1853,9 @@ void update_wall_time(void)
 	 * memcpy under the tk_core.seq against one before we start
 	 * updating.
 	 */
+	timekeeping_update(tk, clock_set);
 	memcpy(real_tk, tk, sizeof(*tk));
-	timekeeping_update(real_tk, clock_set);
+	/* The memcpy must come last. Do not put anything here! */
 	write_seqcount_end(&tk_core.seq);
 out:
 	raw_spin_unlock_irqrestore(&timekeeper_lock, flags);
-- 
cgit v1.2.3


From c04dca02bc73096435a5c36efd5ccb2171edcbe1 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov
Date: Thu, 11 Jun 2015 14:46:44 +0200
Subject: hrtimer: Remove HRTIMER_STATE_MIGRATE

I do not understand HRTIMER_STATE_MIGRATE. Unless I am totally
confused it looks buggy and simply unneeded.

migrate_hrtimer_list() sets it to keep hrtimer_active() == T, but this
is not enough: this can fool, say, hrtimer_is_queued() in
dequeue_signal().

Can't migrate_hrtimer_list() simply use HRTIMER_STATE_ENQUEUED?
This fixes the race and we can kill STATE_MIGRATE.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: ktkhai@parallels.com
Cc: rostedt@goodmis.org
Cc: juri.lelli@gmail.com
Cc: pang.xunlei@linaro.org
Cc: wanpeng.li@linux.intel.com
Cc: umgwanakikbuti@gmail.com
Link: http://lkml.kernel.org/r/20150611124743.072387650@infradead.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/hrtimer.h | 6 +-----
 kernel/time/hrtimer.c   | 7 ++-----
 2 files changed, 3 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 3f82a7edc03d..2f9e57d3d126 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -70,17 +70,13 @@ enum hrtimer_restart {
  * the handling of the timer.
  *
  * The HRTIMER_STATE_ENQUEUED bit is always or'ed to the current state
- * to preserve the HRTIMER_STATE_CALLBACK in the above scenario. This
- * also affects HRTIMER_STATE_MIGRATE where the preservation is not
- * necessary. HRTIMER_STATE_MIGRATE is cleared after the timer is
- * enqueued on the new cpu.
+ * to preserve the HRTIMER_STATE_CALLBACK in the above scenario.
  *
  * All state transitions are protected by cpu_base->lock.
  */
 #define HRTIMER_STATE_INACTIVE	0x00
 #define HRTIMER_STATE_ENQUEUED	0x01
 #define HRTIMER_STATE_CALLBACK	0x02
-#define HRTIMER_STATE_MIGRATE	0x04
 
 /**
  * struct hrtimer - the basic hrtimer structure
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 278d4b36fd94..b1b795e5e0b1 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -1508,11 +1508,11 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
 		debug_deactivate(timer);
 
 		/*
-		 * Mark it as STATE_MIGRATE not INACTIVE otherwise the
+		 * Mark it as ENQUEUED not INACTIVE otherwise the
 		 * timer could be seen as !active and just vanish away
 		 * under us on another CPU
 		 */
-		__remove_hrtimer(timer, old_base, HRTIMER_STATE_MIGRATE, 0);
+		__remove_hrtimer(timer, old_base, HRTIMER_STATE_ENQUEUED, 0);
 		timer->base = new_base;
 		/*
 		 * Enqueue the timers on the new cpu. This does not
@@ -1523,9 +1523,6 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
 		 * event device.
 		 */
 		enqueue_hrtimer(timer, new_base);
-
-		/* Clear the migration state bit */
-		timer->state &= ~HRTIMER_STATE_MIGRATE;
 	}
 }
 
-- 
cgit v1.2.3


From 8edfb0362e8e52dec2de08fa163af01c9da2c9d0 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Thu, 11 Jun 2015 14:46:45 +0200
Subject: hrtimer: Fix hrtimer_is_queued() hole

A queued hrtimer that gets restarted (hrtimer_start*() while
hrtimer_is_queued()) will briefly appear as unqueued/inactive, even
though the timer has always been active, we just moved it.

Close this hole by preserving timer->state in
hrtimer_start_range_ns()'s remove_hrtimer() call.

Reported-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: ktkhai@parallels.com
Cc: rostedt@goodmis.org
Cc: juri.lelli@gmail.com
Cc: pang.xunlei@linaro.org
Cc: wanpeng.li@linux.intel.com
Cc: umgwanakikbuti@gmail.com
Link: http://lkml.kernel.org/r/20150611124743.175989138@infradead.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/time/hrtimer.c | 23 +++++++++++++----------
 1 file changed, 13 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index b1b795e5e0b1..1604157374d7 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -889,10 +889,10 @@ static void __remove_hrtimer(struct hrtimer *timer,
  * remove hrtimer, called with base lock held
  */
 static inline int
-remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base)
+remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, bool restart)
 {
 	if (hrtimer_is_queued(timer)) {
-		unsigned long state;
+		unsigned long state = timer->state;
 		int reprogram;
 
 		/*
@@ -906,12 +906,15 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base)
 		debug_deactivate(timer);
 		timer_stats_hrtimer_clear_start_info(timer);
 		reprogram = base->cpu_base == this_cpu_ptr(&hrtimer_bases);
-		/*
-		 * We must preserve the CALLBACK state flag here,
-		 * otherwise we could move the timer base in
-		 * switch_hrtimer_base.
-		 */
-		state = timer->state & HRTIMER_STATE_CALLBACK;
+
+		if (!restart) {
+			/*
+			 * We must preserve the CALLBACK state flag here,
+			 * otherwise we could move the timer base in
+			 * switch_hrtimer_base.
+			 */
+			state &= HRTIMER_STATE_CALLBACK;
+		}
 		__remove_hrtimer(timer, base, state, reprogram);
 		return 1;
 	}
@@ -936,7 +939,7 @@ void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
 	base = lock_hrtimer_base(timer, &flags);
 
 	/* Remove an active timer from the queue: */
-	remove_hrtimer(timer, base);
+	remove_hrtimer(timer, base, true);
 
 	if (mode & HRTIMER_MODE_REL) {
 		tim = ktime_add_safe(tim, base->get_time());
@@ -1005,7 +1008,7 @@ int hrtimer_try_to_cancel(struct hrtimer *timer)
 	base = lock_hrtimer_base(timer, &flags);
 
 	if (!hrtimer_callback_running(timer))
-		ret = remove_hrtimer(timer, base);
+		ret = remove_hrtimer(timer, base, false);
 
 	unlock_hrtimer_base(timer, &flags);
 
-- 
cgit v1.2.3


From 887d9dc989eb0154492e41e7c07492edbb088ba1 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Thu, 11 Jun 2015 14:46:48 +0200
Subject: hrtimer: Allow hrtimer::function() to free the timer

Currently an hrtimer callback function cannot free its own timer
because __run_hrtimer() still needs to clear HRTIMER_STATE_CALLBACK
after it. Freeing the timer would result in a clear use-after-free.

Solve this by using a scheme similar to regular timers; track the
current running timer in hrtimer_clock_base::running.

Suggested-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: ktkhai@parallels.com
Cc: rostedt@goodmis.org
Cc: juri.lelli@gmail.com
Cc: pang.xunlei@linaro.org
Cc: wanpeng.li@linux.intel.com
Cc: Al Viro <viro@ZenIV.linux.org.uk>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul McKenney <paulmck@linux.vnet.ibm.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: umgwanakikbuti@gmail.com
Link: http://lkml.kernel.org/r/20150611124743.471563047@infradead.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/hrtimer.h |  41 +++++++----------
 kernel/time/hrtimer.c   | 114 ++++++++++++++++++++++++++++++++++++++----------
 2 files changed, 107 insertions(+), 48 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 2f9e57d3d126..5db055821ef3 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -53,30 +53,25 @@ enum hrtimer_restart {
  *
  * 0x00		inactive
  * 0x01		enqueued into rbtree
- * 0x02		callback function running
- * 0x04		timer is migrated to another cpu
  *
- * Special cases:
- * 0x03		callback function running and enqueued
- *		(was requeued on another CPU)
- * 0x05		timer was migrated on CPU hotunplug
+ * The callback state is not part of the timer->state because clearing it would
+ * mean touching the timer after the callback, this makes it impossible to free
+ * the timer from the callback function.
  *
- * The "callback function running and enqueued" status is only possible on
- * SMP. It happens for example when a posix timer expired and the callback
+ * Therefore we track the callback state in:
+ *
+ *	timer->base->cpu_base->running == timer
+ *
+ * On SMP it is possible to have a "callback function running and enqueued"
+ * status. It happens for example when a posix timer expired and the callback
  * queued a signal. Between dropping the lock which protects the posix timer
  * and reacquiring the base lock of the hrtimer, another CPU can deliver the
- * signal and rearm the timer. We have to preserve the callback running state,
- * as otherwise the timer could be removed before the softirq code finishes the
- * the handling of the timer.
- *
- * The HRTIMER_STATE_ENQUEUED bit is always or'ed to the current state
- * to preserve the HRTIMER_STATE_CALLBACK in the above scenario.
+ * signal and rearm the timer.
  *
  * All state transitions are protected by cpu_base->lock.
  */
 #define HRTIMER_STATE_INACTIVE	0x00
 #define HRTIMER_STATE_ENQUEUED	0x01
-#define HRTIMER_STATE_CALLBACK	0x02
 
 /**
  * struct hrtimer - the basic hrtimer structure
@@ -163,6 +158,8 @@ enum  hrtimer_base_type {
  * struct hrtimer_cpu_base - the per cpu clock bases
  * @lock:		lock protecting the base and associated clock bases
  *			and timers
+ * @seq:		seqcount around __run_hrtimer
+ * @running:		pointer to the currently running hrtimer
  * @cpu:		cpu number
  * @active_bases:	Bitfield to mark bases with active timers
  * @clock_was_set_seq:	Sequence counter of clock was set events
@@ -184,6 +181,8 @@ enum  hrtimer_base_type {
  */
 struct hrtimer_cpu_base {
 	raw_spinlock_t			lock;
+	seqcount_t			seq;
+	struct hrtimer			*running;
 	unsigned int			cpu;
 	unsigned int			active_bases;
 	unsigned int			clock_was_set_seq;
@@ -391,15 +390,7 @@ extern ktime_t hrtimer_get_remaining(const struct hrtimer *timer);
 
 extern u64 hrtimer_get_next_event(void);
 
-/*
- * A timer is active, when it is enqueued into the rbtree or the
- * callback function is running or it's in the state of being migrated
- * to another cpu.
- */
-static inline int hrtimer_active(const struct hrtimer *timer)
-{
-	return timer->state != HRTIMER_STATE_INACTIVE;
-}
+extern bool hrtimer_active(const struct hrtimer *timer);
 
 /*
  * Helper function to check, whether the timer is on one of the queues
@@ -415,7 +406,7 @@ static inline int hrtimer_is_queued(struct hrtimer *timer)
  */
 static inline int hrtimer_callback_running(struct hrtimer *timer)
 {
-	return timer->state & HRTIMER_STATE_CALLBACK;
+	return timer->base->cpu_base->running == timer;
 }
 
 /* Forward a hrtimer so it expires after now: */
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 1604157374d7..f026413de4d6 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -67,6 +67,7 @@
 DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
 {
 	.lock = __RAW_SPIN_LOCK_UNLOCKED(hrtimer_bases.lock),
+	.seq = SEQCNT_ZERO(hrtimer_bases.seq),
 	.clock_base =
 	{
 		{
@@ -110,6 +111,18 @@ static inline int hrtimer_clockid_to_base(clockid_t clock_id)
  */
 #ifdef CONFIG_SMP
 
+/*
+ * We require the migration_base for lock_hrtimer_base()/switch_hrtimer_base()
+ * such that hrtimer_callback_running() can unconditionally dereference
+ * timer->base->cpu_base
+ */
+static struct hrtimer_cpu_base migration_cpu_base = {
+	.seq = SEQCNT_ZERO(migration_cpu_base),
+	.clock_base = { { .cpu_base = &migration_cpu_base, }, },
+};
+
+#define migration_base	migration_cpu_base.clock_base[0]
+
 /*
  * We are using hashed locking: holding per_cpu(hrtimer_bases)[n].lock
  * means that all timers which are tied to this base via timer->base are
@@ -119,8 +132,8 @@ static inline int hrtimer_clockid_to_base(clockid_t clock_id)
  * be found on the lists/queues.
  *
  * When the timer's base is locked, and the timer removed from list, it is
- * possible to set timer->base = NULL and drop the lock: the timer remains
- * locked.
+ * possible to set timer->base = &migration_base and drop the lock: the timer
+ * remains locked.
  */
 static
 struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
@@ -130,7 +143,7 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer,
 
 	for (;;) {
 		base = timer->base;
-		if (likely(base != NULL)) {
+		if (likely(base != &migration_base)) {
 			raw_spin_lock_irqsave(&base->cpu_base->lock, *flags);
 			if (likely(base == timer->base))
 				return base;
@@ -194,8 +207,8 @@ again:
 		if (unlikely(hrtimer_callback_running(timer)))
 			return base;
 
-		/* See the comment in lock_timer_base() */
-		timer->base = NULL;
+		/* See the comment in lock_hrtimer_base() */
+		timer->base = &migration_base;
 		raw_spin_unlock(&base->cpu_base->lock);
 		raw_spin_lock(&new_base->cpu_base->lock);
 
@@ -838,11 +851,7 @@ static int enqueue_hrtimer(struct hrtimer *timer,
 
 	base->cpu_base->active_bases |= 1 << base->index;
 
-	/*
-	 * HRTIMER_STATE_ENQUEUED is or'ed to the current state to preserve the
-	 * state of a possibly running callback.
-	 */
-	timer->state |= HRTIMER_STATE_ENQUEUED;
+	timer->state = HRTIMER_STATE_ENQUEUED;
 
 	return timerqueue_add(&base->active, &timer->node);
 }
@@ -907,14 +916,9 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, bool rest
 		timer_stats_hrtimer_clear_start_info(timer);
 		reprogram = base->cpu_base == this_cpu_ptr(&hrtimer_bases);
 
-		if (!restart) {
-			/*
-			 * We must preserve the CALLBACK state flag here,
-			 * otherwise we could move the timer base in
-			 * switch_hrtimer_base.
-			 */
-			state &= HRTIMER_STATE_CALLBACK;
-		}
+		if (!restart)
+			state = HRTIMER_STATE_INACTIVE;
+
 		__remove_hrtimer(timer, base, state, reprogram);
 		return 1;
 	}
@@ -1115,6 +1119,51 @@ void hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
 }
 EXPORT_SYMBOL_GPL(hrtimer_init);
 
+/*
+ * A timer is active, when it is enqueued into the rbtree or the
+ * callback function is running or it's in the state of being migrated
+ * to another cpu.
+ *
+ * It is important for this function to not return a false negative.
+ */
+bool hrtimer_active(const struct hrtimer *timer)
+{
+	struct hrtimer_cpu_base *cpu_base;
+	unsigned int seq;
+
+	do {
+		cpu_base = READ_ONCE(timer->base->cpu_base);
+		seq = raw_read_seqcount_begin(&cpu_base->seq);
+
+		if (timer->state != HRTIMER_STATE_INACTIVE ||
+		    cpu_base->running == timer)
+			return true;
+
+	} while (read_seqcount_retry(&cpu_base->seq, seq) ||
+		 cpu_base != READ_ONCE(timer->base->cpu_base));
+
+	return false;
+}
+EXPORT_SYMBOL_GPL(hrtimer_active);
+
+/*
+ * The write_seqcount_barrier()s in __run_hrtimer() split the thing into 3
+ * distinct sections:
+ *
+ *  - queued:	the timer is queued
+ *  - callback:	the timer is being ran
+ *  - post:	the timer is inactive or (re)queued
+ *
+ * On the read side we ensure we observe timer->state and cpu_base->running
+ * from the same section, if anything changed while we looked at it, we retry.
+ * This includes timer->base changing because sequence numbers alone are
+ * insufficient for that.
+ *
+ * The sequence numbers are required because otherwise we could still observe
+ * a false negative if the read side got smeared over multiple consequtive
+ * __run_hrtimer() invocations.
+ */
+
 static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
 			  struct hrtimer_clock_base *base,
 			  struct hrtimer *timer, ktime_t *now)
@@ -1122,10 +1171,21 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
 	enum hrtimer_restart (*fn)(struct hrtimer *);
 	int restart;
 
-	WARN_ON(!irqs_disabled());
+	lockdep_assert_held(&cpu_base->lock);
 
 	debug_deactivate(timer);
-	__remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0);
+	cpu_base->running = timer;
+
+	/*
+	 * Separate the ->running assignment from the ->state assignment.
+	 *
+	 * As with a regular write barrier, this ensures the read side in
+	 * hrtimer_active() cannot observe cpu_base->running == NULL &&
+	 * timer->state == INACTIVE.
+	 */
+	raw_write_seqcount_barrier(&cpu_base->seq);
+
+	__remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, 0);
 	timer_stats_account_hrtimer(timer);
 	fn = timer->function;
 
@@ -1141,7 +1201,7 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
 	raw_spin_lock(&cpu_base->lock);
 
 	/*
-	 * Note: We clear the CALLBACK bit after enqueue_hrtimer and
+	 * Note: We clear the running state after enqueue_hrtimer and
 	 * we do not reprogramm the event hardware. Happens either in
 	 * hrtimer_start_range_ns() or in hrtimer_interrupt()
 	 *
@@ -1153,9 +1213,17 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
 	    !(timer->state & HRTIMER_STATE_ENQUEUED))
 		enqueue_hrtimer(timer, base);
 
-	WARN_ON_ONCE(!(timer->state & HRTIMER_STATE_CALLBACK));
+	/*
+	 * Separate the ->running assignment from the ->state assignment.
+	 *
+	 * As with a regular write barrier, this ensures the read side in
+	 * hrtimer_active() cannot observe cpu_base->running == NULL &&
+	 * timer->state == INACTIVE.
+	 */
+	raw_write_seqcount_barrier(&cpu_base->seq);
 
-	timer->state &= ~HRTIMER_STATE_CALLBACK;
+	WARN_ON_ONCE(cpu_base->running != timer);
+	cpu_base->running = NULL;
 }
 
 static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now)
-- 
cgit v1.2.3


From 3bb475a3446facd0425d3f2fe7e85bf03c5c6c05 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Tue, 26 May 2015 22:50:24 +0000
Subject: timers: Sanitize catchup_timer_jiffies() usage

catchup_timer_jiffies() has been applied blindly to several functions
without looking for possible better ways to do it.

1) internal_add_timer()

   Move the update to base->all_timers before we actually insert the
   timer into the wheel.

2) detach_if_pending()

   Again the update to base->all_timers allows us to explicitely do
   the timer_jiffies update in place, if this was the last timer which
   got removed.

3) __run_timers()

   We only check on entry, which is silly, because base->timer_jiffies
   can be behind - especially on NOHZ kernels - and if there is a
   single deferrable timer somewhere between base->timer_jiffies and
   jiffies we expire it and then loop until base->timer_jiffies ==
   jiffies.

   Move it into the loop.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Paul McKenney <paulmck@linux.vnet.ibm.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Viresh Kumar <viresh.kumar@linaro.org>
Cc: John Stultz <john.stultz@linaro.org>
Cc: Joonwoo Park <joonwoop@codeaurora.org>
Cc: Wenbo Wang <wenbo.wang@memblaze.com>
Link: http://lkml.kernel.org/r/20150526224511.662994644@linutronix.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/time/timer.c | 40 ++++++++++++++++------------------------
 1 file changed, 16 insertions(+), 24 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 7775d454a204..d5e017970ac7 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -351,20 +351,6 @@ void set_timer_slack(struct timer_list *timer, int slack_hz)
 }
 EXPORT_SYMBOL_GPL(set_timer_slack);
 
-/*
- * If the list is empty, catch up ->timer_jiffies to the current time.
- * The caller must hold the tvec_base lock.  Returns true if the list
- * was empty and therefore ->timer_jiffies was updated.
- */
-static bool catchup_timer_jiffies(struct tvec_base *base)
-{
-	if (!base->all_timers) {
-		base->timer_jiffies = jiffies;
-		return true;
-	}
-	return false;
-}
-
 static void
 __internal_add_timer(struct tvec_base *base, struct timer_list *timer)
 {
@@ -411,7 +397,10 @@ __internal_add_timer(struct tvec_base *base, struct timer_list *timer)
 
 static void internal_add_timer(struct tvec_base *base, struct timer_list *timer)
 {
-	(void)catchup_timer_jiffies(base);
+	/* Advance base->jiffies, if the base is empty */
+	if (!base->all_timers++)
+		base->timer_jiffies = jiffies;
+
 	__internal_add_timer(base, timer);
 	/*
 	 * Update base->active_timers and base->next_timer
@@ -421,7 +410,6 @@ static void internal_add_timer(struct tvec_base *base, struct timer_list *timer)
 		    time_before(timer->expires, base->next_timer))
 			base->next_timer = timer->expires;
 	}
-	base->all_timers++;
 
 	/*
 	 * Check whether the other CPU is in dynticks mode and needs
@@ -718,7 +706,6 @@ detach_expired_timer(struct timer_list *timer, struct tvec_base *base)
 	if (!tbase_get_deferrable(timer->base))
 		base->active_timers--;
 	base->all_timers--;
-	(void)catchup_timer_jiffies(base);
 }
 
 static int detach_if_pending(struct timer_list *timer, struct tvec_base *base,
@@ -733,8 +720,9 @@ static int detach_if_pending(struct timer_list *timer, struct tvec_base *base,
 		if (timer->expires == base->next_timer)
 			base->next_timer = base->timer_jiffies;
 	}
-	base->all_timers--;
-	(void)catchup_timer_jiffies(base);
+	/* If this was the last timer, advance base->jiffies */
+	if (!--base->all_timers)
+		base->timer_jiffies = jiffies;
 	return 1;
 }
 
@@ -1184,14 +1172,18 @@ static inline void __run_timers(struct tvec_base *base)
 	struct timer_list *timer;
 
 	spin_lock_irq(&base->lock);
-	if (catchup_timer_jiffies(base)) {
-		spin_unlock_irq(&base->lock);
-		return;
-	}
+
 	while (time_after_eq(jiffies, base->timer_jiffies)) {
 		struct list_head work_list;
 		struct list_head *head = &work_list;
-		int index = base->timer_jiffies & TVR_MASK;
+		int index;
+
+		if (!base->all_timers) {
+			base->timer_jiffies = jiffies;
+			break;
+		}
+
+		index = base->timer_jiffies & TVR_MASK;
 
 		/*
 		 * Cascade timers:
-- 
cgit v1.2.3


From 1bd04bf6f68d65f5422b2b85c495d65d49587a54 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Tue, 26 May 2015 22:50:26 +0000
Subject: timer: Remove FIFO "guarantee"

The FIFO guarantee is only there if two timers are queued into the
same bucket at the same jiffie on the same cpu:

 - The slack value depends on the delta between expiry and enqueue
   time, so the resulting expiry time can be different for timers
   which are queued in different jiffies.

 - Timers which are queued into the secondary array end up after a
   later queued timer which was queued into the primary array due to
   cascading.

 - Timers can end up on different cpus due to the NOHZ target moving
   around. Obviously there is no guarantee of expiry ordering between
   cpus.

So anything which relies on FIFO behaviour of the timer wheel is
broken already.

This is a preparatory patch for converting the timer wheel to hlist
which reduces the memory foot print of the wheel by 50%.

It's a seperate patch so any (unlikely to happen) regression caused by
this can be identified clearly.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Viresh Kumar <viresh.kumar@linaro.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Paul McKenney <paulmck@linux.vnet.ibm.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: John Stultz <john.stultz@linaro.org>
Cc: Joonwoo Park <joonwoop@codeaurora.org>
Cc: Wenbo Wang <wenbo.wang@memblaze.com>
Cc: George Spelvin <linux@horizon.com>
Link: http://lkml.kernel.org/r/20150526224511.757520403@linutronix.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/time/timer.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index d5e017970ac7..e212df24ad3f 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -389,10 +389,8 @@ __internal_add_timer(struct tvec_base *base, struct timer_list *timer)
 		i = (expires >> (TVR_BITS + 3 * TVN_BITS)) & TVN_MASK;
 		vec = base->tv5.vec + i;
 	}
-	/*
-	 * Timers are FIFO:
-	 */
-	list_add_tail(&timer->entry, vec);
+
+	list_add(&timer->entry, vec);
 }
 
 static void internal_add_timer(struct tvec_base *base, struct timer_list *timer)
-- 
cgit v1.2.3


From 1dabbcec2c0a36fe43509d06499b9e512e70a028 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Tue, 26 May 2015 22:50:28 +0000
Subject: timer: Use hlist for the timer wheel hash buckets

This reduces the size of struct tvec_base by 50% and results in
slightly smaller code as well.

Before:
   struct tvec_base: size: 8256, cachelines: 129

   text	   data	    bss	    dec	    hex	filename
  17698	  13297	   8256	  39251	   9953	../build/kernel/time/timer.o

After:
  struct tvec_base: 4160, cachelines: 65

   text	   data	    bss	    dec	    hex	filename
  17491	   9201	   4160	  30852	   7884	../build/kernel/time/timer.o

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Viresh Kumar <viresh.kumar@linaro.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Paul McKenney <paulmck@linux.vnet.ibm.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: John Stultz <john.stultz@linaro.org>
Cc: Joonwoo Park <joonwoop@codeaurora.org>
Cc: Wenbo Wang <wenbo.wang@memblaze.com>
Link: http://lkml.kernel.org/r/20150526224511.854731214@linutronix.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/timer.h |  6 ++---
 kernel/time/timer.c   | 64 ++++++++++++++++++++++-----------------------------
 2 files changed, 30 insertions(+), 40 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/timer.h b/include/linux/timer.h
index fbb80e0030bf..064ee24d3f38 100644
--- a/include/linux/timer.h
+++ b/include/linux/timer.h
@@ -14,7 +14,7 @@ struct timer_list {
 	 * All fields that change during normal runtime grouped to the
 	 * same cacheline
 	 */
-	struct list_head entry;
+	struct hlist_node entry;
 	unsigned long expires;
 	struct tvec_base *base;
 
@@ -71,7 +71,7 @@ extern struct tvec_base boot_tvec_bases;
 #define TIMER_FLAG_MASK			0x3LU
 
 #define __TIMER_INITIALIZER(_function, _expires, _data, _flags) { \
-		.entry = { .prev = TIMER_ENTRY_STATIC },	\
+		.entry = { .next = TIMER_ENTRY_STATIC },	\
 		.function = (_function),			\
 		.expires = (_expires),				\
 		.data = (_data),				\
@@ -168,7 +168,7 @@ static inline void init_timer_on_stack_key(struct timer_list *timer,
  */
 static inline int timer_pending(const struct timer_list * timer)
 {
-	return timer->entry.next != NULL;
+	return timer->entry.pprev != NULL;
 }
 
 extern void add_timer_on(struct timer_list *timer, int cpu);
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index e212df24ad3f..3a5e0c840884 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -70,11 +70,11 @@ EXPORT_SYMBOL(jiffies_64);
 #define MAX_TVAL ((unsigned long)((1ULL << (TVR_BITS + 4*TVN_BITS)) - 1))
 
 struct tvec {
-	struct list_head vec[TVN_SIZE];
+	struct hlist_head vec[TVN_SIZE];
 };
 
 struct tvec_root {
-	struct list_head vec[TVR_SIZE];
+	struct hlist_head vec[TVR_SIZE];
 };
 
 struct tvec_base {
@@ -356,7 +356,7 @@ __internal_add_timer(struct tvec_base *base, struct timer_list *timer)
 {
 	unsigned long expires = timer->expires;
 	unsigned long idx = expires - base->timer_jiffies;
-	struct list_head *vec;
+	struct hlist_head *vec;
 
 	if (idx < TVR_SIZE) {
 		int i = expires & TVR_MASK;
@@ -390,7 +390,7 @@ __internal_add_timer(struct tvec_base *base, struct timer_list *timer)
 		vec = base->tv5.vec + i;
 	}
 
-	list_add(&timer->entry, vec);
+	hlist_add_head(&timer->entry, vec);
 }
 
 static void internal_add_timer(struct tvec_base *base, struct timer_list *timer)
@@ -504,8 +504,8 @@ static int timer_fixup_activate(void *addr, enum debug_obj_state state)
 		 * statically initialized. We just make sure that it
 		 * is tracked in the object tracker.
 		 */
-		if (timer->entry.next == NULL &&
-		    timer->entry.prev == TIMER_ENTRY_STATIC) {
+		if (timer->entry.pprev == NULL &&
+		    timer->entry.next == TIMER_ENTRY_STATIC) {
 			debug_object_init(timer, &timer_debug_descr);
 			debug_object_activate(timer, &timer_debug_descr);
 			return 0;
@@ -551,7 +551,7 @@ static int timer_fixup_assert_init(void *addr, enum debug_obj_state state)
 
 	switch (state) {
 	case ODEBUG_STATE_NOTAVAILABLE:
-		if (timer->entry.prev == TIMER_ENTRY_STATIC) {
+		if (timer->entry.next == TIMER_ENTRY_STATIC) {
 			/*
 			 * This is not really a fixup. The timer was
 			 * statically initialized. We just make sure that it
@@ -655,7 +655,7 @@ static void do_init_timer(struct timer_list *timer, unsigned int flags,
 {
 	struct tvec_base *base = raw_cpu_read(tvec_bases);
 
-	timer->entry.next = NULL;
+	timer->entry.pprev = NULL;
 	timer->base = (void *)((unsigned long)base | flags);
 	timer->slack = -1;
 #ifdef CONFIG_TIMER_STATS
@@ -687,14 +687,14 @@ EXPORT_SYMBOL(init_timer_key);
 
 static inline void detach_timer(struct timer_list *timer, bool clear_pending)
 {
-	struct list_head *entry = &timer->entry;
+	struct hlist_node *entry = &timer->entry;
 
 	debug_deactivate(timer);
 
-	__list_del(entry->prev, entry->next);
+	__hlist_del(entry);
 	if (clear_pending)
-		entry->next = NULL;
-	entry->prev = LIST_POISON2;
+		entry->pprev = NULL;
+	entry->next = LIST_POISON2;
 }
 
 static inline void
@@ -1095,16 +1095,17 @@ EXPORT_SYMBOL(del_timer_sync);
 static int cascade(struct tvec_base *base, struct tvec *tv, int index)
 {
 	/* cascade all the timers from tv up one level */
-	struct timer_list *timer, *tmp;
-	struct list_head tv_list;
+	struct timer_list *timer;
+	struct hlist_node *tmp;
+	struct hlist_head tv_list;
 
-	list_replace_init(tv->vec + index, &tv_list);
+	hlist_move_list(tv->vec + index, &tv_list);
 
 	/*
 	 * We are removing _all_ timers from the list, so we
 	 * don't have to detach them individually.
 	 */
-	list_for_each_entry_safe(timer, tmp, &tv_list, entry) {
+	hlist_for_each_entry_safe(timer, tmp, &tv_list, entry) {
 		BUG_ON(tbase_get_base(timer->base) != base);
 		/* No accounting, while moving them */
 		__internal_add_timer(base, timer);
@@ -1172,8 +1173,8 @@ static inline void __run_timers(struct tvec_base *base)
 	spin_lock_irq(&base->lock);
 
 	while (time_after_eq(jiffies, base->timer_jiffies)) {
-		struct list_head work_list;
-		struct list_head *head = &work_list;
+		struct hlist_head work_list;
+		struct hlist_head *head = &work_list;
 		int index;
 
 		if (!base->all_timers) {
@@ -1192,13 +1193,13 @@ static inline void __run_timers(struct tvec_base *base)
 					!cascade(base, &base->tv4, INDEX(2)))
 			cascade(base, &base->tv5, INDEX(3));
 		++base->timer_jiffies;
-		list_replace_init(base->tv1.vec + index, head);
-		while (!list_empty(head)) {
+		hlist_move_list(base->tv1.vec + index, head);
+		while (!hlist_empty(head)) {
 			void (*fn)(unsigned long);
 			unsigned long data;
 			bool irqsafe;
 
-			timer = list_first_entry(head, struct timer_list,entry);
+			timer = hlist_entry(head->first, struct timer_list, entry);
 			fn = timer->function;
 			data = timer->data;
 			irqsafe = tbase_get_irqsafe(timer->base);
@@ -1240,7 +1241,7 @@ static unsigned long __next_timer_interrupt(struct tvec_base *base)
 	/* Look for timer events in tv1. */
 	index = slot = timer_jiffies & TVR_MASK;
 	do {
-		list_for_each_entry(nte, base->tv1.vec + slot, entry) {
+		hlist_for_each_entry(nte, base->tv1.vec + slot, entry) {
 			if (tbase_get_deferrable(nte->base))
 				continue;
 
@@ -1271,7 +1272,7 @@ cascade:
 
 		index = slot = timer_jiffies & TVN_MASK;
 		do {
-			list_for_each_entry(nte, varp->vec + slot, entry) {
+			hlist_for_each_entry(nte, varp->vec + slot, entry) {
 				if (tbase_get_deferrable(nte->base))
 					continue;
 
@@ -1530,12 +1531,12 @@ signed long __sched schedule_timeout_uninterruptible(signed long timeout)
 EXPORT_SYMBOL(schedule_timeout_uninterruptible);
 
 #ifdef CONFIG_HOTPLUG_CPU
-static void migrate_timer_list(struct tvec_base *new_base, struct list_head *head)
+static void migrate_timer_list(struct tvec_base *new_base, struct hlist_head *head)
 {
 	struct timer_list *timer;
 
-	while (!list_empty(head)) {
-		timer = list_first_entry(head, struct timer_list, entry);
+	while (!hlist_empty(head)) {
+		timer = hlist_entry(head->first, struct timer_list, entry);
 		/* We ignore the accounting on the dying cpu */
 		detach_timer(timer, false);
 		timer_set_base(timer, new_base);
@@ -1603,23 +1604,12 @@ static inline void timer_register_cpu_notifier(void) { }
 
 static void __init init_timer_cpu(struct tvec_base *base, int cpu)
 {
-	int j;
-
 	BUG_ON(base != tbase_get_base(base));
 
 	base->cpu = cpu;
 	per_cpu(tvec_bases, cpu) = base;
 	spin_lock_init(&base->lock);
 
-	for (j = 0; j < TVN_SIZE; j++) {
-		INIT_LIST_HEAD(base->tv5.vec + j);
-		INIT_LIST_HEAD(base->tv4.vec + j);
-		INIT_LIST_HEAD(base->tv3.vec + j);
-		INIT_LIST_HEAD(base->tv2.vec + j);
-	}
-	for (j = 0; j < TVR_SIZE; j++)
-		INIT_LIST_HEAD(base->tv1.vec + j);
-
 	base->timer_jiffies = jiffies;
 	base->next_timer = base->timer_jiffies;
 }
-- 
cgit v1.2.3


From 0eeda71bc30d74f66f8231f45621d5ace3419186 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Tue, 26 May 2015 22:50:29 +0000
Subject: timer: Replace timer base by a cpu index

Instead of storing a pointer to the per cpu tvec_base we can simply
cache a CPU index in the timer_list and use that to get hold of the
correct per cpu tvec_base. This is only used in lock_timer_base() and
the slightly larger code is peanuts versus the spinlock operation and
the d-cache foot print of the timer wheel.

Aside of that this allows to get rid of following nuisances:

 - boot_tvec_base

   That statically allocated 4k bss data is just kept around so the
   timer has a home when it gets statically initialized. It serves no
   other purpose.

   With the CPU index we assign the timer to CPU0 at static
   initialization time and therefor can avoid the whole boot_tvec_base
   dance.  That also simplifies the init code, which just can use the
   per cpu base.

   Before:
     text	   data	    bss	    dec	    hex	filename
    17491	   9201	   4160	  30852	   7884	../build/kernel/time/timer.o
   After:
     text	   data	    bss	    dec	    hex	filename
    17440	   9193	      0	  26633	   6809	../build/kernel/time/timer.o

 - Overloading the base pointer with various flags

   The CPU index has enough space to hold the flags (deferrable,
   irqsafe) so we can get rid of the extra masking and bit fiddling
   with the base pointer.

As a benefit we reduce the size of struct timer_list on 64 bit
machines. 4 - 8 bytes, a size reduction up to 15% per struct timer_list,
which is a real win as we have tons of them embedded in other structs.

This changes also the newly added deferrable printout of the timer
start trace point to capture and print all timer->flags, which allows
us to decode the target cpu of the timer as well.

We might have used bitfields for this, but that would change the
static initializers and the init function for no value to accomodate
big endian bitfields.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Paul McKenney <paulmck@linux.vnet.ibm.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Viresh Kumar <viresh.kumar@linaro.org>
Cc: John Stultz <john.stultz@linaro.org>
Cc: Joonwoo Park <joonwoop@codeaurora.org>
Cc: Wenbo Wang <wenbo.wang@memblaze.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Badhri Jagan Sridharan <Badhri@google.com>
Link: http://lkml.kernel.org/r/20150526224511.950084301@linutronix.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>

---
 include/linux/timer.h        |  38 ++++++-------
 include/trace/events/timer.h |  13 ++---
 kernel/time/timer.c          | 127 ++++++++++++-------------------------------
 3 files changed, 58 insertions(+), 120 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/timer.h b/include/linux/timer.h
index 064ee24d3f38..4a0d52bc2073 100644
--- a/include/linux/timer.h
+++ b/include/linux/timer.h
@@ -14,27 +14,23 @@ struct timer_list {
 	 * All fields that change during normal runtime grouped to the
 	 * same cacheline
 	 */
-	struct hlist_node entry;
-	unsigned long expires;
-	struct tvec_base *base;
-
-	void (*function)(unsigned long);
-	unsigned long data;
-
-	int slack;
+	struct hlist_node	entry;
+	unsigned long		expires;
+	void			(*function)(unsigned long);
+	unsigned long		data;
+	u32			flags;
+	int			slack;
 
 #ifdef CONFIG_TIMER_STATS
-	int start_pid;
-	void *start_site;
-	char start_comm[16];
+	int			start_pid;
+	void			*start_site;
+	char			start_comm[16];
 #endif
 #ifdef CONFIG_LOCKDEP
-	struct lockdep_map lockdep_map;
+	struct lockdep_map	lockdep_map;
 #endif
 };
 
-extern struct tvec_base boot_tvec_bases;
-
 #ifdef CONFIG_LOCKDEP
 /*
  * NB: because we have to copy the lockdep_map, setting the lockdep_map key
@@ -49,9 +45,6 @@ extern struct tvec_base boot_tvec_bases;
 #endif
 
 /*
- * Note that all tvec_bases are at least 4 byte aligned and lower two bits
- * of base in timer_list is guaranteed to be zero. Use them for flags.
- *
  * A deferrable timer will work normally when the system is busy, but
  * will not cause a CPU to come out of idle just to service it; instead,
  * the timer will be serviced when the CPU eventually wakes up with a
@@ -65,17 +58,18 @@ extern struct tvec_base boot_tvec_bases;
  * workqueue locking issues. It's not meant for executing random crap
  * with interrupts disabled. Abuse is monitored!
  */
-#define TIMER_DEFERRABLE		0x1LU
-#define TIMER_IRQSAFE			0x2LU
-
-#define TIMER_FLAG_MASK			0x3LU
+#define TIMER_CPUMASK		0x0007FFFF
+#define TIMER_MIGRATING		0x00080000
+#define TIMER_BASEMASK		(TIMER_CPUMASK | TIMER_MIGRATING)
+#define TIMER_DEFERRABLE	0x00100000
+#define TIMER_IRQSAFE		0x00200000
 
 #define __TIMER_INITIALIZER(_function, _expires, _data, _flags) { \
 		.entry = { .next = TIMER_ENTRY_STATIC },	\
 		.function = (_function),			\
 		.expires = (_expires),				\
 		.data = (_data),				\
-		.base = (void *)((unsigned long)&boot_tvec_bases + (_flags)), \
+		.flags = (_flags),				\
 		.slack = -1,					\
 		__TIMER_LOCKDEP_MAP_INITIALIZER(		\
 			__FILE__ ":" __stringify(__LINE__))	\
diff --git a/include/trace/events/timer.h b/include/trace/events/timer.h
index d7abef1fe6e0..073b9ac245ba 100644
--- a/include/trace/events/timer.h
+++ b/include/trace/events/timer.h
@@ -45,16 +45,16 @@ TRACE_EVENT(timer_start,
 
 	TP_PROTO(struct timer_list *timer,
 		unsigned long expires,
-		unsigned int deferrable),
+		unsigned int flags),
 
-	TP_ARGS(timer, expires, deferrable),
+	TP_ARGS(timer, expires, flags),
 
 	TP_STRUCT__entry(
 		__field( void *,	timer		)
 		__field( void *,	function	)
 		__field( unsigned long,	expires		)
 		__field( unsigned long,	now		)
-		__field( unsigned int,	deferrable	)
+		__field( unsigned int,	flags		)
 	),
 
 	TP_fast_assign(
@@ -62,13 +62,12 @@ TRACE_EVENT(timer_start,
 		__entry->function	= timer->function;
 		__entry->expires	= expires;
 		__entry->now		= jiffies;
-		__entry->deferrable     = deferrable;
+		__entry->flags		= flags;
 	),
 
-	TP_printk("timer=%p function=%pf expires=%lu [timeout=%ld] defer=%c",
+	TP_printk("timer=%p function=%pf expires=%lu [timeout=%ld] flags=0x%08x",
 		  __entry->timer, __entry->function, __entry->expires,
-		  (long)__entry->expires - __entry->now,
-		  __entry->deferrable > 0 ? 'y':'n')
+		  (long)__entry->expires - __entry->now, __entry->flags)
 );
 
 /**
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 3a5e0c840884..1540af9f62eb 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -92,43 +92,8 @@ struct tvec_base {
 	struct tvec tv5;
 } ____cacheline_aligned;
 
-/*
- * __TIMER_INITIALIZER() needs to set ->base to a valid pointer (because we've
- * made NULL special, hint: lock_timer_base()) and we cannot get a compile time
- * pointer to per-cpu entries because we don't know where we'll map the section,
- * even for the boot cpu.
- *
- * And so we use boot_tvec_bases for boot CPU and per-cpu __tvec_bases for the
- * rest of them.
- */
-struct tvec_base boot_tvec_bases;
-EXPORT_SYMBOL(boot_tvec_bases);
-
-static DEFINE_PER_CPU(struct tvec_base *, tvec_bases) = &boot_tvec_bases;
-
-/* Functions below help us manage 'deferrable' flag */
-static inline unsigned int tbase_get_deferrable(struct tvec_base *base)
-{
-	return ((unsigned int)(unsigned long)base & TIMER_DEFERRABLE);
-}
-
-static inline unsigned int tbase_get_irqsafe(struct tvec_base *base)
-{
-	return ((unsigned int)(unsigned long)base & TIMER_IRQSAFE);
-}
-
-static inline struct tvec_base *tbase_get_base(struct tvec_base *base)
-{
-	return ((struct tvec_base *)((unsigned long)base & ~TIMER_FLAG_MASK));
-}
-
-static inline void
-timer_set_base(struct timer_list *timer, struct tvec_base *new_base)
-{
-	unsigned long flags = (unsigned long)timer->base & TIMER_FLAG_MASK;
 
-	timer->base = (struct tvec_base *)((unsigned long)(new_base) | flags);
-}
+static DEFINE_PER_CPU(struct tvec_base, tvec_bases);
 
 static unsigned long round_jiffies_common(unsigned long j, int cpu,
 		bool force_up)
@@ -403,7 +368,7 @@ static void internal_add_timer(struct tvec_base *base, struct timer_list *timer)
 	/*
 	 * Update base->active_timers and base->next_timer
 	 */
-	if (!tbase_get_deferrable(timer->base)) {
+	if (!(timer->flags & TIMER_DEFERRABLE)) {
 		if (!base->active_timers++ ||
 		    time_before(timer->expires, base->next_timer))
 			base->next_timer = timer->expires;
@@ -422,7 +387,7 @@ static void internal_add_timer(struct tvec_base *base, struct timer_list *timer)
 	 * require special care against races with idle_cpu(), lets deal
 	 * with that later.
 	 */
-	if (!tbase_get_deferrable(timer->base) || tick_nohz_full_cpu(base->cpu))
+	if (!(timer->flags & TIMER_DEFERRABLE) || tick_nohz_full_cpu(base->cpu))
 		wake_up_nohz_cpu(base->cpu);
 }
 
@@ -443,7 +408,7 @@ static void timer_stats_account_timer(struct timer_list *timer)
 
 	if (likely(!timer->start_site))
 		return;
-	if (unlikely(tbase_get_deferrable(timer->base)))
+	if (unlikely(timer->flags & TIMER_DEFERRABLE))
 		flag |= TIMER_STATS_FLAG_DEFERRABLE;
 
 	timer_stats_update_stats(timer, timer->start_pid, timer->start_site,
@@ -636,7 +601,7 @@ static inline void
 debug_activate(struct timer_list *timer, unsigned long expires)
 {
 	debug_timer_activate(timer);
-	trace_timer_start(timer, expires, tbase_get_deferrable(timer->base));
+	trace_timer_start(timer, expires, timer->flags);
 }
 
 static inline void debug_deactivate(struct timer_list *timer)
@@ -653,10 +618,8 @@ static inline void debug_assert_init(struct timer_list *timer)
 static void do_init_timer(struct timer_list *timer, unsigned int flags,
 			  const char *name, struct lock_class_key *key)
 {
-	struct tvec_base *base = raw_cpu_read(tvec_bases);
-
 	timer->entry.pprev = NULL;
-	timer->base = (void *)((unsigned long)base | flags);
+	timer->flags = flags | raw_smp_processor_id();
 	timer->slack = -1;
 #ifdef CONFIG_TIMER_STATS
 	timer->start_site = NULL;
@@ -701,7 +664,7 @@ static inline void
 detach_expired_timer(struct timer_list *timer, struct tvec_base *base)
 {
 	detach_timer(timer, true);
-	if (!tbase_get_deferrable(timer->base))
+	if (!(timer->flags & TIMER_DEFERRABLE))
 		base->active_timers--;
 	base->all_timers--;
 }
@@ -713,7 +676,7 @@ static int detach_if_pending(struct timer_list *timer, struct tvec_base *base,
 		return 0;
 
 	detach_timer(timer, clear_pending);
-	if (!tbase_get_deferrable(timer->base)) {
+	if (!(timer->flags & TIMER_DEFERRABLE)) {
 		base->active_timers--;
 		if (timer->expires == base->next_timer)
 			base->next_timer = base->timer_jiffies;
@@ -732,24 +695,22 @@ static int detach_if_pending(struct timer_list *timer, struct tvec_base *base,
  * So __run_timers/migrate_timers can safely modify all timers which could
  * be found on ->tvX lists.
  *
- * When the timer's base is locked, and the timer removed from list, it is
- * possible to set timer->base = NULL and drop the lock: the timer remains
- * locked.
+ * When the timer's base is locked and removed from the list, the
+ * TIMER_MIGRATING flag is set, FIXME
  */
 static struct tvec_base *lock_timer_base(struct timer_list *timer,
 					unsigned long *flags)
 	__acquires(timer->base->lock)
 {
-	struct tvec_base *base;
-
 	for (;;) {
-		struct tvec_base *prelock_base = timer->base;
-		base = tbase_get_base(prelock_base);
-		if (likely(base != NULL)) {
+		u32 tf = timer->flags;
+		struct tvec_base *base;
+
+		if (!(tf & TIMER_MIGRATING)) {
+			base = per_cpu_ptr(&tvec_bases, tf & TIMER_CPUMASK);
 			spin_lock_irqsave(&base->lock, *flags);
-			if (likely(prelock_base == timer->base))
+			if (timer->flags == tf)
 				return base;
-			/* The timer has migrated to another CPU */
 			spin_unlock_irqrestore(&base->lock, *flags);
 		}
 		cpu_relax();
@@ -776,7 +737,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires,
 	debug_activate(timer, expires);
 
 	cpu = get_nohz_timer_target(pinned);
-	new_base = per_cpu(tvec_bases, cpu);
+	new_base = per_cpu_ptr(&tvec_bases, cpu);
 
 	if (base != new_base) {
 		/*
@@ -788,11 +749,12 @@ __mod_timer(struct timer_list *timer, unsigned long expires,
 		 */
 		if (likely(base->running_timer != timer)) {
 			/* See the comment in lock_timer_base() */
-			timer_set_base(timer, NULL);
+			timer->flags |= TIMER_MIGRATING;
+
 			spin_unlock(&base->lock);
 			base = new_base;
 			spin_lock(&base->lock);
-			timer_set_base(timer, base);
+			timer->flags = (timer->flags & ~TIMER_BASEMASK) | cpu;
 		}
 	}
 
@@ -954,13 +916,13 @@ EXPORT_SYMBOL(add_timer);
  */
 void add_timer_on(struct timer_list *timer, int cpu)
 {
-	struct tvec_base *base = per_cpu(tvec_bases, cpu);
+	struct tvec_base *base = per_cpu_ptr(&tvec_bases, cpu);
 	unsigned long flags;
 
 	timer_stats_timer_set_start_info(timer);
 	BUG_ON(timer_pending(timer) || !timer->function);
 	spin_lock_irqsave(&base->lock, flags);
-	timer_set_base(timer, base);
+	timer->flags = (timer->flags & ~TIMER_BASEMASK) | cpu;
 	debug_activate(timer, timer->expires);
 	internal_add_timer(base, timer);
 	spin_unlock_irqrestore(&base->lock, flags);
@@ -1025,8 +987,6 @@ int try_to_del_timer_sync(struct timer_list *timer)
 EXPORT_SYMBOL(try_to_del_timer_sync);
 
 #ifdef CONFIG_SMP
-static DEFINE_PER_CPU(struct tvec_base, __tvec_bases);
-
 /**
  * del_timer_sync - deactivate a timer and wait for the handler to finish.
  * @timer: the timer to be deactivated
@@ -1081,7 +1041,7 @@ int del_timer_sync(struct timer_list *timer)
 	 * don't use it in hardirq context, because it
 	 * could lead to deadlock.
 	 */
-	WARN_ON(in_irq() && !tbase_get_irqsafe(timer->base));
+	WARN_ON(in_irq() && !(timer->flags & TIMER_IRQSAFE));
 	for (;;) {
 		int ret = try_to_del_timer_sync(timer);
 		if (ret >= 0)
@@ -1106,7 +1066,6 @@ static int cascade(struct tvec_base *base, struct tvec *tv, int index)
 	 * don't have to detach them individually.
 	 */
 	hlist_for_each_entry_safe(timer, tmp, &tv_list, entry) {
-		BUG_ON(tbase_get_base(timer->base) != base);
 		/* No accounting, while moving them */
 		__internal_add_timer(base, timer);
 	}
@@ -1202,7 +1161,7 @@ static inline void __run_timers(struct tvec_base *base)
 			timer = hlist_entry(head->first, struct timer_list, entry);
 			fn = timer->function;
 			data = timer->data;
-			irqsafe = tbase_get_irqsafe(timer->base);
+			irqsafe = timer->flags & TIMER_IRQSAFE;
 
 			timer_stats_account_timer(timer);
 
@@ -1242,7 +1201,7 @@ static unsigned long __next_timer_interrupt(struct tvec_base *base)
 	index = slot = timer_jiffies & TVR_MASK;
 	do {
 		hlist_for_each_entry(nte, base->tv1.vec + slot, entry) {
-			if (tbase_get_deferrable(nte->base))
+			if (nte->flags & TIMER_DEFERRABLE)
 				continue;
 
 			found = 1;
@@ -1273,7 +1232,7 @@ cascade:
 		index = slot = timer_jiffies & TVN_MASK;
 		do {
 			hlist_for_each_entry(nte, varp->vec + slot, entry) {
-				if (tbase_get_deferrable(nte->base))
+				if (nte->flags & TIMER_DEFERRABLE)
 					continue;
 
 				found = 1;
@@ -1343,7 +1302,7 @@ static u64 cmp_next_hrtimer_event(u64 basem, u64 expires)
  */
 u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
 {
-	struct tvec_base *base = __this_cpu_read(tvec_bases);
+	struct tvec_base *base = this_cpu_ptr(&tvec_bases);
 	u64 expires = KTIME_MAX;
 	unsigned long nextevt;
 
@@ -1395,7 +1354,7 @@ void update_process_times(int user_tick)
  */
 static void run_timer_softirq(struct softirq_action *h)
 {
-	struct tvec_base *base = __this_cpu_read(tvec_bases);
+	struct tvec_base *base = this_cpu_ptr(&tvec_bases);
 
 	if (time_after_eq(jiffies, base->timer_jiffies))
 		__run_timers(base);
@@ -1534,12 +1493,13 @@ EXPORT_SYMBOL(schedule_timeout_uninterruptible);
 static void migrate_timer_list(struct tvec_base *new_base, struct hlist_head *head)
 {
 	struct timer_list *timer;
+	int cpu = new_base->cpu;
 
 	while (!hlist_empty(head)) {
 		timer = hlist_entry(head->first, struct timer_list, entry);
 		/* We ignore the accounting on the dying cpu */
 		detach_timer(timer, false);
-		timer_set_base(timer, new_base);
+		timer->flags = (timer->flags & ~TIMER_BASEMASK) | cpu;
 		internal_add_timer(new_base, timer);
 	}
 }
@@ -1551,8 +1511,8 @@ static void migrate_timers(int cpu)
 	int i;
 
 	BUG_ON(cpu_online(cpu));
-	old_base = per_cpu(tvec_bases, cpu);
-	new_base = get_cpu_var(tvec_bases);
+	old_base = per_cpu_ptr(&tvec_bases, cpu);
+	new_base = this_cpu_ptr(&tvec_bases);
 	/*
 	 * The caller is globally serialized and nobody else
 	 * takes two locks at once, deadlock is not possible.
@@ -1576,7 +1536,6 @@ static void migrate_timers(int cpu)
 
 	spin_unlock(&old_base->lock);
 	spin_unlock_irq(&new_base->lock);
-	put_cpu_var(tvec_bases);
 }
 
 static int timer_cpu_notify(struct notifier_block *self,
@@ -1602,12 +1561,11 @@ static inline void timer_register_cpu_notifier(void)
 static inline void timer_register_cpu_notifier(void) { }
 #endif /* CONFIG_HOTPLUG_CPU */
 
-static void __init init_timer_cpu(struct tvec_base *base, int cpu)
+static void __init init_timer_cpu(int cpu)
 {
-	BUG_ON(base != tbase_get_base(base));
+	struct tvec_base *base = per_cpu_ptr(&tvec_bases, cpu);
 
 	base->cpu = cpu;
-	per_cpu(tvec_bases, cpu) = base;
 	spin_lock_init(&base->lock);
 
 	base->timer_jiffies = jiffies;
@@ -1616,27 +1574,14 @@ static void __init init_timer_cpu(struct tvec_base *base, int cpu)
 
 static void __init init_timer_cpus(void)
 {
-	struct tvec_base *base;
-	int local_cpu = smp_processor_id();
 	int cpu;
 
-	for_each_possible_cpu(cpu) {
-		if (cpu == local_cpu)
-			base = &boot_tvec_bases;
-#ifdef CONFIG_SMP
-		else
-			base = per_cpu_ptr(&__tvec_bases, cpu);
-#endif
-
-		init_timer_cpu(base, cpu);
-	}
+	for_each_possible_cpu(cpu)
+		init_timer_cpu(cpu);
 }
 
 void __init init_timers(void)
 {
-	/* ensure there are enough low bits for flags in timer->base pointer */
-	BUILD_BUG_ON(__alignof__(struct tvec_base) & TIMER_FLAG_MASK);
-
 	init_timer_cpus();
 	init_timer_stats();
 	timer_register_cpu_notifier();
-- 
cgit v1.2.3


From c74441a17eb975b604e339ca6c11b9ab9aaca11f Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Tue, 26 May 2015 22:50:31 +0000
Subject: timer: Stats: Simplify the flags handling

Simplify the handling of the flag storage for the timer statistics. No
intermediate storage anymore. Just hand over the flags field.

I left the printout of 'deferrable' for now because changing this
would be an ABI update and I have no idea how strong people feel about
that. OTOH, I wonder whether we should kill the whole timer stats
stuff because all of that information can be retrieved via ftrace/perf
as well.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Paul McKenney <paulmck@linux.vnet.ibm.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Viresh Kumar <viresh.kumar@linaro.org>
Cc: John Stultz <john.stultz@linaro.org>
Cc: Joonwoo Park <joonwoop@codeaurora.org>
Cc: Wenbo Wang <wenbo.wang@memblaze.com>
Link: http://lkml.kernel.org/r/20150526224512.046626248@linutronix.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/timer.h     |  5 +----
 kernel/time/timer.c       |  7 ++-----
 kernel/time/timer_stats.c | 10 +++++-----
 3 files changed, 8 insertions(+), 14 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/timer.h b/include/linux/timer.h
index 4a0d52bc2073..ff0689b6e297 100644
--- a/include/linux/timer.h
+++ b/include/linux/timer.h
@@ -188,13 +188,10 @@ extern void set_timer_slack(struct timer_list *time, int slack_hz);
 
 extern int timer_stats_active;
 
-#define TIMER_STATS_FLAG_DEFERRABLE	0x1
-
 extern void init_timer_stats(void);
 
 extern void timer_stats_update_stats(void *timer, pid_t pid, void *startf,
-				     void *timerf, char *comm,
-				     unsigned int timer_flag);
+				     void *timerf, char *comm, u32 flags);
 
 extern void __timer_stats_timer_set_start_info(struct timer_list *timer,
 					       void *addr);
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 1540af9f62eb..3398d93c74a7 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -404,15 +404,12 @@ void __timer_stats_timer_set_start_info(struct timer_list *timer, void *addr)
 
 static void timer_stats_account_timer(struct timer_list *timer)
 {
-	unsigned int flag = 0;
-
 	if (likely(!timer->start_site))
 		return;
-	if (unlikely(timer->flags & TIMER_DEFERRABLE))
-		flag |= TIMER_STATS_FLAG_DEFERRABLE;
 
 	timer_stats_update_stats(timer, timer->start_pid, timer->start_site,
-				 timer->function, timer->start_comm, flag);
+				 timer->function, timer->start_comm,
+				 timer->flags);
 }
 
 #else
diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c
index 1fb08f21302e..1adecb4b87c8 100644
--- a/kernel/time/timer_stats.c
+++ b/kernel/time/timer_stats.c
@@ -68,7 +68,7 @@ struct entry {
 	 * Number of timeout events:
 	 */
 	unsigned long		count;
-	unsigned int		timer_flag;
+	u32			flags;
 
 	/*
 	 * We save the command-line string to preserve
@@ -227,13 +227,13 @@ static struct entry *tstat_lookup(struct entry *entry, char *comm)
  * @startf:	pointer to the function which did the timer setup
  * @timerf:	pointer to the timer callback function of the timer
  * @comm:	name of the process which set up the timer
+ * @tflags:	The flags field of the timer
  *
  * When the timer is already registered, then the event counter is
  * incremented. Otherwise the timer is registered in a free slot.
  */
 void timer_stats_update_stats(void *timer, pid_t pid, void *startf,
-			      void *timerf, char *comm,
-			      unsigned int timer_flag)
+			      void *timerf, char *comm, u32 tflags)
 {
 	/*
 	 * It doesn't matter which lock we take:
@@ -251,7 +251,7 @@ void timer_stats_update_stats(void *timer, pid_t pid, void *startf,
 	input.start_func = startf;
 	input.expire_func = timerf;
 	input.pid = pid;
-	input.timer_flag = timer_flag;
+	input.flags = tflags;
 
 	raw_spin_lock_irqsave(lock, flags);
 	if (!timer_stats_active)
@@ -306,7 +306,7 @@ static int tstats_show(struct seq_file *m, void *v)
 
 	for (i = 0; i < nr_entries; i++) {
 		entry = entries + i;
-		if (entry->timer_flag & TIMER_STATS_FLAG_DEFERRABLE) {
+		if (entry->flags & TIMER_DEFERRABLE) {
 			seq_printf(m, "%4luD, %5d %-16s ",
 				entry->count, entry->pid, entry->comm);
 		} else {
-- 
cgit v1.2.3


From bc7a34b8b9ebfb0f4b8a35a72a0b134fd6c5ef50 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Tue, 26 May 2015 22:50:33 +0000
Subject: timer: Reduce timer migration overhead if disabled

Eric reported that the timer_migration sysctl is not really nice
performance wise as it needs to check at every timer insertion whether
the feature is enabled or not. Further the check does not live in the
timer code, so we have an extra function call which checks an extra
cache line to figure out that it is disabled.

We can do better and store that information in the per cpu (hr)timer
bases. I pondered to use a static key, but that's a nightmare to
update from the nohz code and the timer base cache line is hot anyway
when we select a timer base.

The old logic enabled the timer migration unconditionally if
CONFIG_NO_HZ was set even if nohz was disabled on the kernel command
line.

With this modification, we start off with migration disabled. The user
visible sysctl is still set to enabled. If the kernel switches to NOHZ
migration is enabled, if the user did not disable it via the sysctl
prior to the switch. If nohz=off is on the kernel command line,
migration stays disabled no matter what.

Before:
  47.76%  hog       [.] main
  14.84%  [kernel]  [k] _raw_spin_lock_irqsave
   9.55%  [kernel]  [k] _raw_spin_unlock_irqrestore
   6.71%  [kernel]  [k] mod_timer
   6.24%  [kernel]  [k] lock_timer_base.isra.38
   3.76%  [kernel]  [k] detach_if_pending
   3.71%  [kernel]  [k] del_timer
   2.50%  [kernel]  [k] internal_add_timer
   1.51%  [kernel]  [k] get_nohz_timer_target
   1.28%  [kernel]  [k] __internal_add_timer
   0.78%  [kernel]  [k] timerfn
   0.48%  [kernel]  [k] wake_up_nohz_cpu

After:
  48.10%  hog       [.] main
  15.25%  [kernel]  [k] _raw_spin_lock_irqsave
   9.76%  [kernel]  [k] _raw_spin_unlock_irqrestore
   6.50%  [kernel]  [k] mod_timer
   6.44%  [kernel]  [k] lock_timer_base.isra.38
   3.87%  [kernel]  [k] detach_if_pending
   3.80%  [kernel]  [k] del_timer
   2.67%  [kernel]  [k] internal_add_timer
   1.33%  [kernel]  [k] __internal_add_timer
   0.73%  [kernel]  [k] timerfn
   0.54%  [kernel]  [k] wake_up_nohz_cpu


Reported-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Paul McKenney <paulmck@linux.vnet.ibm.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Viresh Kumar <viresh.kumar@linaro.org>
Cc: John Stultz <john.stultz@linaro.org>
Cc: Joonwoo Park <joonwoop@codeaurora.org>
Cc: Wenbo Wang <wenbo.wang@memblaze.com>
Link: http://lkml.kernel.org/r/20150526224512.127050787@linutronix.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/hrtimer.h      |  2 ++
 include/linux/sched.h        |  6 +----
 include/linux/sched/sysctl.h | 12 ---------
 include/linux/timer.h        |  9 +++++++
 kernel/rcu/tree_plugin.h     |  2 --
 kernel/sched/core.c          |  9 +++----
 kernel/sysctl.c              | 18 +++++++-------
 kernel/time/hrtimer.c        | 35 ++++++++++++++++++++------
 kernel/time/tick-internal.h  | 14 +++++++++++
 kernel/time/tick-sched.c     | 25 ++++++++++---------
 kernel/time/timer.c          | 59 ++++++++++++++++++++++++++++++++++++++++----
 kernel/time/timer_list.c     |  2 --
 12 files changed, 133 insertions(+), 60 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 5db055821ef3..69551020bb97 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -163,6 +163,7 @@ enum  hrtimer_base_type {
  * @cpu:		cpu number
  * @active_bases:	Bitfield to mark bases with active timers
  * @clock_was_set_seq:	Sequence counter of clock was set events
+ * @migration_enabled:	The migration of hrtimers to other cpus is enabled
  * @expires_next:	absolute time of the next event which was scheduled
  *			via clock_set_next_event()
  * @next_timer:		Pointer to the first expiring timer
@@ -186,6 +187,7 @@ struct hrtimer_cpu_base {
 	unsigned int			cpu;
 	unsigned int			active_bases;
 	unsigned int			clock_was_set_seq;
+	bool				migration_enabled;
 #ifdef CONFIG_HIGH_RES_TIMERS
 	unsigned int			in_hrtirq	: 1,
 					hres_active	: 1,
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 26a2e6122734..d7151460b0cf 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -335,14 +335,10 @@ extern int runqueue_is_locked(int cpu);
 #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
 extern void nohz_balance_enter_idle(int cpu);
 extern void set_cpu_sd_state_idle(void);
-extern int get_nohz_timer_target(int pinned);
+extern int get_nohz_timer_target(void);
 #else
 static inline void nohz_balance_enter_idle(int cpu) { }
 static inline void set_cpu_sd_state_idle(void) { }
-static inline int get_nohz_timer_target(int pinned)
-{
-	return smp_processor_id();
-}
 #endif
 
 /*
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index 596a0e007c62..c9e4731cf10b 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -57,24 +57,12 @@ extern unsigned int sysctl_numa_balancing_scan_size;
 extern unsigned int sysctl_sched_migration_cost;
 extern unsigned int sysctl_sched_nr_migrate;
 extern unsigned int sysctl_sched_time_avg;
-extern unsigned int sysctl_timer_migration;
 extern unsigned int sysctl_sched_shares_window;
 
 int sched_proc_update_handler(struct ctl_table *table, int write,
 		void __user *buffer, size_t *length,
 		loff_t *ppos);
 #endif
-#ifdef CONFIG_SCHED_DEBUG
-static inline unsigned int get_sysctl_timer_migration(void)
-{
-	return sysctl_timer_migration;
-}
-#else
-static inline unsigned int get_sysctl_timer_migration(void)
-{
-	return 1;
-}
-#endif
 
 /*
  *  control realtime throttling:
diff --git a/include/linux/timer.h b/include/linux/timer.h
index ff0689b6e297..61aa61dc410c 100644
--- a/include/linux/timer.h
+++ b/include/linux/timer.h
@@ -238,6 +238,15 @@ extern void run_local_timers(void);
 struct hrtimer;
 extern enum hrtimer_restart it_real_fn(struct hrtimer *);
 
+#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
+#include <linux/sysctl.h>
+
+extern unsigned int sysctl_timer_migration;
+int timer_migration_handler(struct ctl_table *table, int write,
+			    void __user *buffer, size_t *lenp,
+			    loff_t *ppos);
+#endif
+
 unsigned long __round_jiffies(unsigned long j, int cpu);
 unsigned long __round_jiffies_relative(unsigned long j, int cpu);
 unsigned long round_jiffies(unsigned long j);
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 0ef80a0bbabb..d72fa24f2312 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -1432,8 +1432,6 @@ module_param(rcu_idle_gp_delay, int, 0644);
 static int rcu_idle_lazy_gp_delay = RCU_IDLE_LAZY_GP_DELAY;
 module_param(rcu_idle_lazy_gp_delay, int, 0644);
 
-extern int tick_nohz_active;
-
 /*
  * Try to advance callbacks for all flavors of RCU on the current CPU, but
  * only if it has been awhile since the last time we did so.  Afterwards,
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index ecb7c4216350..e9f25ce70c77 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -572,13 +572,12 @@ void resched_cpu(int cpu)
  * selecting an idle cpu will add more delays to the timers than intended
  * (as that cpu's timer base may not be uptodate wrt jiffies etc).
  */
-int get_nohz_timer_target(int pinned)
+int get_nohz_timer_target(void)
 {
-	int cpu = smp_processor_id();
-	int i;
+	int i, cpu = smp_processor_id();
 	struct sched_domain *sd;
 
-	if (pinned || !get_sysctl_timer_migration() || !idle_cpu(cpu))
+	if (!idle_cpu(cpu))
 		return cpu;
 
 	rcu_read_lock();
@@ -7050,8 +7049,6 @@ void __init sched_init_smp(void)
 }
 #endif /* CONFIG_SMP */
 
-const_debug unsigned int sysctl_timer_migration = 1;
-
 int in_sched_functions(unsigned long addr)
 {
 	return in_lock_functions(addr) ||
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 2082b1a88fb9..b13e9d2de302 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -349,15 +349,6 @@ static struct ctl_table kern_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
 	},
-	{
-		.procname	= "timer_migration",
-		.data		= &sysctl_timer_migration,
-		.maxlen		= sizeof(unsigned int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &zero,
-		.extra2		= &one,
-	},
 #endif /* CONFIG_SMP */
 #ifdef CONFIG_NUMA_BALANCING
 	{
@@ -1132,6 +1123,15 @@ static struct ctl_table kern_table[] = {
 		.extra1		= &zero,
 		.extra2		= &one,
 	},
+#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
+	{
+		.procname	= "timer_migration",
+		.data		= &sysctl_timer_migration,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= timer_migration_handler,
+	},
+#endif
 	{ }
 };
 
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index f026413de4d6..6115f4df119b 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -177,6 +177,24 @@ hrtimer_check_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base)
 #endif
 }
 
+#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
+static inline
+struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base,
+					 int pinned)
+{
+	if (pinned || !base->migration_enabled)
+		return this_cpu_ptr(&hrtimer_bases);
+	return &per_cpu(hrtimer_bases, get_nohz_timer_target());
+}
+#else
+static inline
+struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base,
+					 int pinned)
+{
+	return this_cpu_ptr(&hrtimer_bases);
+}
+#endif
+
 /*
  * Switch the timer base to the current CPU when possible.
  */
@@ -184,14 +202,13 @@ static inline struct hrtimer_clock_base *
 switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base,
 		    int pinned)
 {
+	struct hrtimer_cpu_base *new_cpu_base, *this_base;
 	struct hrtimer_clock_base *new_base;
-	struct hrtimer_cpu_base *new_cpu_base;
-	int this_cpu = smp_processor_id();
-	int cpu = get_nohz_timer_target(pinned);
 	int basenum = base->index;
 
+	this_base = this_cpu_ptr(&hrtimer_bases);
+	new_cpu_base = get_target_base(this_base, pinned);
 again:
-	new_cpu_base = &per_cpu(hrtimer_bases, cpu);
 	new_base = &new_cpu_base->clock_base[basenum];
 
 	if (base != new_base) {
@@ -212,17 +229,19 @@ again:
 		raw_spin_unlock(&base->cpu_base->lock);
 		raw_spin_lock(&new_base->cpu_base->lock);
 
-		if (cpu != this_cpu && hrtimer_check_target(timer, new_base)) {
-			cpu = this_cpu;
+		if (new_cpu_base != this_base &&
+		    hrtimer_check_target(timer, new_base)) {
 			raw_spin_unlock(&new_base->cpu_base->lock);
 			raw_spin_lock(&base->cpu_base->lock);
+			new_cpu_base = this_base;
 			timer->base = base;
 			goto again;
 		}
 		timer->base = new_base;
 	} else {
-		if (cpu != this_cpu && hrtimer_check_target(timer, new_base)) {
-			cpu = this_cpu;
+		if (new_cpu_base != this_base &&
+		    hrtimer_check_target(timer, new_base)) {
+			new_cpu_base = this_base;
 			goto again;
 		}
 	}
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index ec2208aabdd1..2edde84744df 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -149,4 +149,18 @@ extern void tick_nohz_init(void);
 static inline void tick_nohz_init(void) { }
 #endif
 
+#ifdef CONFIG_NO_HZ_COMMON
+extern unsigned long tick_nohz_active;
+#else
+#define tick_nohz_active (0)
+#endif
+
+#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
+extern void timers_update_migration(void);
+#else
+static inline void timers_update_migration(void) { }
+#endif
+
+DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases);
+
 extern u64 get_next_timer_interrupt(unsigned long basej, u64 basem);
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 812f7a3b9898..b1cb01699355 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -399,7 +399,7 @@ void __init tick_nohz_init(void)
  * NO HZ enabled ?
  */
 static int tick_nohz_enabled __read_mostly  = 1;
-int tick_nohz_active  __read_mostly;
+unsigned long tick_nohz_active  __read_mostly;
 /*
  * Enable / Disable tickless mode
  */
@@ -956,6 +956,16 @@ static void tick_nohz_handler(struct clock_event_device *dev)
 	tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1);
 }
 
+static inline void tick_nohz_activate(struct tick_sched *ts, int mode)
+{
+	if (!tick_nohz_enabled)
+		return;
+	ts->nohz_mode = mode;
+	/* One update is enough */
+	if (!test_and_set_bit(0, &tick_nohz_active))
+		timers_update_migration();
+}
+
 /**
  * tick_nohz_switch_to_nohz - switch to nohz mode
  */
@@ -970,9 +980,6 @@ static void tick_nohz_switch_to_nohz(void)
 	if (tick_switch_to_oneshot(tick_nohz_handler))
 		return;
 
-	tick_nohz_active = 1;
-	ts->nohz_mode = NOHZ_MODE_LOWRES;
-
 	/*
 	 * Recycle the hrtimer in ts, so we can share the
 	 * hrtimer_forward with the highres code.
@@ -984,6 +991,7 @@ static void tick_nohz_switch_to_nohz(void)
 	hrtimer_forward_now(&ts->sched_timer, tick_period);
 	hrtimer_set_expires(&ts->sched_timer, next);
 	tick_program_event(next, 1);
+	tick_nohz_activate(ts, NOHZ_MODE_LOWRES);
 }
 
 /*
@@ -1035,6 +1043,7 @@ static inline void tick_nohz_irq_enter(void)
 
 static inline void tick_nohz_switch_to_nohz(void) { }
 static inline void tick_nohz_irq_enter(void) { }
+static inline void tick_nohz_activate(struct tick_sched *ts, int mode) { }
 
 #endif /* CONFIG_NO_HZ_COMMON */
 
@@ -1117,13 +1126,7 @@ void tick_setup_sched_timer(void)
 
 	hrtimer_forward(&ts->sched_timer, now, tick_period);
 	hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED);
-
-#ifdef CONFIG_NO_HZ_COMMON
-	if (tick_nohz_enabled) {
-		ts->nohz_mode = NOHZ_MODE_HIGHRES;
-		tick_nohz_active = 1;
-	}
-#endif
+	tick_nohz_activate(ts, NOHZ_MODE_HIGHRES);
 }
 #endif /* HIGH_RES_TIMERS */
 
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 3398d93c74a7..343142ed996a 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -85,6 +85,7 @@ struct tvec_base {
 	unsigned long active_timers;
 	unsigned long all_timers;
 	int cpu;
+	bool migration_enabled;
 	struct tvec_root tv1;
 	struct tvec tv2;
 	struct tvec tv3;
@@ -95,6 +96,54 @@ struct tvec_base {
 
 static DEFINE_PER_CPU(struct tvec_base, tvec_bases);
 
+#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
+unsigned int sysctl_timer_migration = 1;
+
+void timers_update_migration(void)
+{
+	bool on = sysctl_timer_migration && tick_nohz_active;
+	unsigned int cpu;
+
+	/* Avoid the loop, if nothing to update */
+	if (this_cpu_read(tvec_bases.migration_enabled) == on)
+		return;
+
+	for_each_possible_cpu(cpu) {
+		per_cpu(tvec_bases.migration_enabled, cpu) = on;
+		per_cpu(hrtimer_bases.migration_enabled, cpu) = on;
+	}
+}
+
+int timer_migration_handler(struct ctl_table *table, int write,
+			    void __user *buffer, size_t *lenp,
+			    loff_t *ppos)
+{
+	static DEFINE_MUTEX(mutex);
+	int ret;
+
+	mutex_lock(&mutex);
+	ret = proc_dointvec(table, write, buffer, lenp, ppos);
+	if (!ret && write)
+		timers_update_migration();
+	mutex_unlock(&mutex);
+	return ret;
+}
+
+static inline struct tvec_base *get_target_base(struct tvec_base *base,
+						int pinned)
+{
+	if (pinned || !base->migration_enabled)
+		return this_cpu_ptr(&tvec_bases);
+	return per_cpu_ptr(&tvec_bases, get_nohz_timer_target());
+}
+#else
+static inline struct tvec_base *get_target_base(struct tvec_base *base,
+						int pinned)
+{
+	return this_cpu_ptr(&tvec_bases);
+}
+#endif
+
 static unsigned long round_jiffies_common(unsigned long j, int cpu,
 		bool force_up)
 {
@@ -716,11 +765,11 @@ static struct tvec_base *lock_timer_base(struct timer_list *timer,
 
 static inline int
 __mod_timer(struct timer_list *timer, unsigned long expires,
-						bool pending_only, int pinned)
+	    bool pending_only, int pinned)
 {
 	struct tvec_base *base, *new_base;
 	unsigned long flags;
-	int ret = 0 , cpu;
+	int ret = 0;
 
 	timer_stats_timer_set_start_info(timer);
 	BUG_ON(!timer->function);
@@ -733,8 +782,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires,
 
 	debug_activate(timer, expires);
 
-	cpu = get_nohz_timer_target(pinned);
-	new_base = per_cpu_ptr(&tvec_bases, cpu);
+	new_base = get_target_base(base, pinned);
 
 	if (base != new_base) {
 		/*
@@ -751,7 +799,8 @@ __mod_timer(struct timer_list *timer, unsigned long expires,
 			spin_unlock(&base->lock);
 			base = new_base;
 			spin_lock(&base->lock);
-			timer->flags = (timer->flags & ~TIMER_BASEMASK) | cpu;
+			timer->flags &= ~TIMER_BASEMASK;
+			timer->flags |= base->cpu;
 		}
 	}
 
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index 1327004429be..a4536e1e3e2a 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -29,8 +29,6 @@ struct timer_list_iter {
 
 typedef void (*print_fn_t)(struct seq_file *m, unsigned int *classes);
 
-DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases);
-
 /*
  * This allows printing both to /proc/timer_list and
  * to the console (on SysRq-Q):
-- 
cgit v1.2.3


From 683be13a284720205228e29207ef11a1c3c322b9 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner
Date: Tue, 26 May 2015 22:50:35 +0000
Subject: timer: Minimize nohz off overhead

If nohz is disabled on the kernel command line the [hr]timer code
still calls wake_up_nohz_cpu() and tick_nohz_full_cpu(), a pretty
pointless exercise. Cache nohz_active in [hr]timer per cpu bases and
avoid the overhead.

Before:
  48.10%  hog       [.] main
  15.25%  [kernel]  [k] _raw_spin_lock_irqsave
   9.76%  [kernel]  [k] _raw_spin_unlock_irqrestore
   6.50%  [kernel]  [k] mod_timer
   6.44%  [kernel]  [k] lock_timer_base.isra.38
   3.87%  [kernel]  [k] detach_if_pending
   3.80%  [kernel]  [k] del_timer
   2.67%  [kernel]  [k] internal_add_timer
   1.33%  [kernel]  [k] __internal_add_timer
   0.73%  [kernel]  [k] timerfn
   0.54%  [kernel]  [k] wake_up_nohz_cpu

After:
  48.73%  hog       [.] main
  15.36%  [kernel]  [k] _raw_spin_lock_irqsave
   9.77%  [kernel]  [k] _raw_spin_unlock_irqrestore
   6.61%  [kernel]  [k] lock_timer_base.isra.38
   6.42%  [kernel]  [k] mod_timer
   3.90%  [kernel]  [k] detach_if_pending
   3.76%  [kernel]  [k] del_timer
   2.41%  [kernel]  [k] internal_add_timer
   1.39%  [kernel]  [k] __internal_add_timer
   0.76%  [kernel]  [k] timerfn

We probably should have a cached value for nohz full in the per cpu
bases as well to avoid the cpumask check. The base cache line is hot
already, the cpumask not necessarily.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Paul McKenney <paulmck@linux.vnet.ibm.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Viresh Kumar <viresh.kumar@linaro.org>
Cc: John Stultz <john.stultz@linaro.org>
Cc: Joonwoo Park <joonwoop@codeaurora.org>
Cc: Wenbo Wang <wenbo.wang@memblaze.com>
Link: http://lkml.kernel.org/r/20150526224512.207378134@linutronix.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/hrtimer.h     |  2 ++
 kernel/time/hrtimer.c       |  3 ++-
 kernel/time/tick-internal.h |  4 ++--
 kernel/time/tick-sched.c    |  2 +-
 kernel/time/timer.c         | 16 ++++++++++++----
 5 files changed, 19 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index 69551020bb97..76dd4f0da5ca 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -164,6 +164,7 @@ enum  hrtimer_base_type {
  * @active_bases:	Bitfield to mark bases with active timers
  * @clock_was_set_seq:	Sequence counter of clock was set events
  * @migration_enabled:	The migration of hrtimers to other cpus is enabled
+ * @nohz_active:	The nohz functionality is enabled
  * @expires_next:	absolute time of the next event which was scheduled
  *			via clock_set_next_event()
  * @next_timer:		Pointer to the first expiring timer
@@ -188,6 +189,7 @@ struct hrtimer_cpu_base {
 	unsigned int			active_bases;
 	unsigned int			clock_was_set_seq;
 	bool				migration_enabled;
+	bool				nohz_active;
 #ifdef CONFIG_HIGH_RES_TIMERS
 	unsigned int			in_hrtirq	: 1,
 					hres_active	: 1,
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 6115f4df119b..db5c9508ed95 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -994,7 +994,8 @@ void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
 		 * Kick to reschedule the next tick to handle the new timer
 		 * on dynticks target.
 		 */
-		wake_up_nohz_cpu(new_base->cpu_base->cpu);
+		if (new_base->cpu_base->nohz_active)
+			wake_up_nohz_cpu(new_base->cpu_base->cpu);
 	} else {
 		hrtimer_reprogram(timer, new_base);
 	}
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index 2edde84744df..966a5a6fdd0a 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -156,9 +156,9 @@ extern unsigned long tick_nohz_active;
 #endif
 
 #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
-extern void timers_update_migration(void);
+extern void timers_update_migration(bool update_nohz);
 #else
-static inline void timers_update_migration(void) { }
+static inline void timers_update_migration(bool update_nohz) { }
 #endif
 
 DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases);
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index b1cb01699355..c792429e98c6 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -963,7 +963,7 @@ static inline void tick_nohz_activate(struct tick_sched *ts, int mode)
 	ts->nohz_mode = mode;
 	/* One update is enough */
 	if (!test_and_set_bit(0, &tick_nohz_active))
-		timers_update_migration();
+		timers_update_migration(true);
 }
 
 /**
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 343142ed996a..520499dd85af 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -86,6 +86,7 @@ struct tvec_base {
 	unsigned long all_timers;
 	int cpu;
 	bool migration_enabled;
+	bool nohz_active;
 	struct tvec_root tv1;
 	struct tvec tv2;
 	struct tvec tv3;
@@ -99,7 +100,7 @@ static DEFINE_PER_CPU(struct tvec_base, tvec_bases);
 #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
 unsigned int sysctl_timer_migration = 1;
 
-void timers_update_migration(void)
+void timers_update_migration(bool update_nohz)
 {
 	bool on = sysctl_timer_migration && tick_nohz_active;
 	unsigned int cpu;
@@ -111,6 +112,10 @@ void timers_update_migration(void)
 	for_each_possible_cpu(cpu) {
 		per_cpu(tvec_bases.migration_enabled, cpu) = on;
 		per_cpu(hrtimer_bases.migration_enabled, cpu) = on;
+		if (!update_nohz)
+			continue;
+		per_cpu(tvec_bases.nohz_active, cpu) = true;
+		per_cpu(hrtimer_bases.nohz_active, cpu) = true;
 	}
 }
 
@@ -124,7 +129,7 @@ int timer_migration_handler(struct ctl_table *table, int write,
 	mutex_lock(&mutex);
 	ret = proc_dointvec(table, write, buffer, lenp, ppos);
 	if (!ret && write)
-		timers_update_migration();
+		timers_update_migration(false);
 	mutex_unlock(&mutex);
 	return ret;
 }
@@ -436,8 +441,11 @@ static void internal_add_timer(struct tvec_base *base, struct timer_list *timer)
 	 * require special care against races with idle_cpu(), lets deal
 	 * with that later.
 	 */
-	if (!(timer->flags & TIMER_DEFERRABLE) || tick_nohz_full_cpu(base->cpu))
-		wake_up_nohz_cpu(base->cpu);
+	if (base->nohz_active) {
+		if (!(timer->flags & TIMER_DEFERRABLE) ||
+		    tick_nohz_full_cpu(base->cpu))
+			wake_up_nohz_cpu(base->cpu);
+	}
 }
 
 #ifdef CONFIG_TIMER_STATS
-- 
cgit v1.2.3