From 1a2b85f1e2a93a3f84243e654d225e4088735336 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 21 Oct 2020 12:07:49 -0700 Subject: timekeeping: Convert jiffies_seq to seqcount_raw_spinlock_t Use the new API and associate the seqcount with jiffies_lock, enabling lockdep support - although for this particular case the write-side locking and non-preemptibility are quite obvious. Signed-off-by: Davidlohr Bueso Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201021190749.19363-1-dave@stgolabs.net --- kernel/time/jiffies.c | 3 ++- kernel/time/timekeeping.h | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index eddcf4970444..a5cffe2a1770 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c @@ -59,7 +59,8 @@ static struct clocksource clocksource_jiffies = { }; __cacheline_aligned_in_smp DEFINE_RAW_SPINLOCK(jiffies_lock); -__cacheline_aligned_in_smp seqcount_t jiffies_seq; +__cacheline_aligned_in_smp seqcount_raw_spinlock_t jiffies_seq = + SEQCNT_RAW_SPINLOCK_ZERO(jiffies_seq, &jiffies_lock); #if (BITS_PER_LONG < 64) u64 get_jiffies_64(void) diff --git a/kernel/time/timekeeping.h b/kernel/time/timekeeping.h index 099737f6f10c..6c2cbd9ef999 100644 --- a/kernel/time/timekeeping.h +++ b/kernel/time/timekeeping.h @@ -26,7 +26,7 @@ extern void do_timer(unsigned long ticks); extern void update_wall_time(void); extern raw_spinlock_t jiffies_lock; -extern seqcount_t jiffies_seq; +extern seqcount_raw_spinlock_t jiffies_seq; #define CS_NAME_LEN 32 -- cgit v1.2.3 From da88f9b3113620dcd30fc203236aa53d5430ee98 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Wed, 4 Nov 2020 17:34:01 +0100 Subject: timer_list: Use printk format instead of open-coded symbol lookup Use the "%ps" printk format string to resolve symbol names. This works on all platforms, including ia64, ppc64 and parisc64, on which one needs to dereference pointers to function descriptors instead of function pointers. Signed-off-by: Helge Deller Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201104163401.GA3984@ls3530.fritz.box --- kernel/time/timer_list.c | 66 ++++++++++++++---------------------------- 1 file changed, 19 insertions(+), 47 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index acb326f5f50a..6939140ab7c5 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -42,24 +42,11 @@ static void SEQ_printf(struct seq_file *m, const char *fmt, ...)
va_end(args); } -static void print_name_offset(struct seq_file *m, void *sym) -{ - char symname[KSYM_NAME_LEN]; - - if (lookup_symbol_name((unsigned long)sym, symname) < 0) - SEQ_printf(m, "<%pK>", sym); - else - SEQ_printf(m, "%s", symname); -} - static void print_timer(struct seq_file *m, struct hrtimer *taddr, struct hrtimer *timer, int idx, u64 now) { - SEQ_printf(m, " #%d: ", idx); - print_name_offset(m, taddr); - SEQ_printf(m, ", "); - print_name_offset(m, timer->function); + SEQ_printf(m, " #%d: <%pK>, %ps", idx, taddr, timer->function); SEQ_printf(m, ", S:%02x", timer->state); SEQ_printf(m, "\n"); SEQ_printf(m, " # expires at %Lu-%Lu nsecs [in %Ld to %Ld nsecs]\n", @@ -116,9 +103,7 @@ print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now) SEQ_printf(m, " .resolution: %u nsecs\n", hrtimer_resolution); - SEQ_printf(m, " .get_time: "); - print_name_offset(m, base->get_time); - SEQ_printf(m, "\n"); + SEQ_printf(m, " .get_time: %ps\n", base->get_time); #ifdef CONFIG_HIGH_RES_TIMERS SEQ_printf(m, " .offset: %Lu nsecs\n", (unsigned long long) ktime_to_ns(base->offset)); @@ -218,42 +203,29 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu) SEQ_printf(m, " next_event: %Ld nsecs\n", (unsigned long long) ktime_to_ns(dev->next_event)); - SEQ_printf(m, " set_next_event: "); - print_name_offset(m, dev->set_next_event); - SEQ_printf(m, "\n"); + SEQ_printf(m, " set_next_event: %ps\n", dev->set_next_event); - if (dev->set_state_shutdown) { - SEQ_printf(m, " shutdown: "); - print_name_offset(m, dev->set_state_shutdown); - SEQ_printf(m, "\n"); - } + if (dev->set_state_shutdown) + SEQ_printf(m, " shutdown: %ps\n", + dev->set_state_shutdown); - if (dev->set_state_periodic) { - SEQ_printf(m, " periodic: "); - print_name_offset(m, dev->set_state_periodic); - SEQ_printf(m, "\n"); - } + if (dev->set_state_periodic) + SEQ_printf(m, " periodic: %ps\n", + dev->set_state_periodic); - if (dev->set_state_oneshot) { - SEQ_printf(m, " oneshot: "); - print_name_offset(m, dev->set_state_oneshot); - SEQ_printf(m, "\n"); - } + if (dev->set_state_oneshot) + SEQ_printf(m, " oneshot: %ps\n", + dev->set_state_oneshot); - if (dev->set_state_oneshot_stopped) { - SEQ_printf(m, " oneshot stopped: "); - print_name_offset(m, dev->set_state_oneshot_stopped); - SEQ_printf(m, "\n"); - } + if (dev->set_state_oneshot_stopped) + SEQ_printf(m, " oneshot stopped: %ps\n", + dev->set_state_oneshot_stopped); - if (dev->tick_resume) { - SEQ_printf(m, " resume: "); - print_name_offset(m, dev->tick_resume); - SEQ_printf(m, "\n"); - } + if (dev->tick_resume) + SEQ_printf(m, " resume: %ps\n", + dev->tick_resume); - SEQ_printf(m, " event_handler: "); - print_name_offset(m, dev->event_handler); + SEQ_printf(m, " event_handler: %ps\n", dev->event_handler); SEQ_printf(m, "\n"); SEQ_printf(m, " retries: %lu\n", dev->retries); SEQ_printf(m, "\n"); -- cgit v1.2.3 From c725dafc95f1b37027840aaeaa8b7e4e9cd20516 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 3 Nov 2020 20:09:37 +0100 Subject: timers: Don't block on ->expiry_lock for TIMER_IRQSAFE timers PREEMPT_RT does not spin and wait until a running timer completes its callback but instead it blocks on a sleeping lock to prevent a livelock in the case that the task waiting for the callback completion preempted the callback. This cannot be done for timers flagged with TIMER_IRQSAFE. These timers can be canceled from an interrupt disabled context even on RT kernels. 
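For context, this is roughly how a timer ends up flagged as TIMER_IRQSAFE in the first place (a minimal sketch; timer_setup(), mod_timer() and the TIMER_IRQSAFE flag are the real kernel API, while the my_* names are invented for illustration):

    #include <linux/timer.h>
    #include <linux/jiffies.h>

    static struct timer_list my_timer;	/* hypothetical example timer */

    /* The callback of a TIMER_IRQSAFE timer runs with interrupts
     * disabled, even on PREEMPT_RT. */
    static void my_irqsafe_timeout(struct timer_list *t)
    {
    }

    static void my_timer_start(void)
    {
            /* TIMER_IRQSAFE permits cancellation from interrupt
             * disabled context; the price is an irq-safe callback. */
            timer_setup(&my_timer, my_irqsafe_timeout, TIMER_IRQSAFE);
            mod_timer(&my_timer, jiffies + HZ);
    }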
The expiry callback of such timers is invoked with interrupts disabled so there is no need to use the expiry lock mechanism because obviously the callback cannot be preempted even on RT kernels. Do not use the timer_base::expiry_lock mechanism when waiting for a running callback to complete if the timer is flagged with TIMER_IRQSAFE. Also add a lockdep assertion for RT kernels to validate that the expiry lock mechanism is always invoked in preemptible context. Reported-by: Mike Galbraith Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201103190937.hga67rqhvknki3tp@linutronix.de --- kernel/time/timer.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/timer.c b/kernel/time/timer.c index de37e33a868d..af9ddfbd447d 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1288,7 +1288,7 @@ static void del_timer_wait_running(struct timer_list *timer) u32 tf; tf = READ_ONCE(timer->flags); - if (!(tf & TIMER_MIGRATING)) { + if (!(tf & (TIMER_MIGRATING | TIMER_IRQSAFE))) { struct timer_base *base = get_timer_base(tf); /* @@ -1372,6 +1372,13 @@ int del_timer_sync(struct timer_list *timer) */ WARN_ON(in_irq() && !(timer->flags & TIMER_IRQSAFE)); + /* + * Must be able to sleep on PREEMPT_RT because of the slowpath in + * del_timer_wait_running(). + */ + if (IS_ENABLED(CONFIG_PREEMPT_RT) && !(timer->flags & TIMER_IRQSAFE)) + lockdep_assert_preemption_enabled(); + do { ret = try_to_del_timer_sync(timer); -- cgit v1.2.3 From a0f5a65fa5faeef708d022698d5fcba290a35856 Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Fri, 13 Nov 2020 15:24:30 +0800 Subject: time: Add missing colons for parameter documentation of time64_to_tm() Address these kernel-doc warnings: kernel/time/timeconv.c:79: warning: Function parameter or member 'totalsecs' not described in 'time64_to_tm' kernel/time/timeconv.c:79: warning: Function parameter or member 'offset' not described in 'time64_to_tm' kernel/time/timeconv.c:79: warning: Function parameter or member 'result' not described in 'time64_to_tm' The parameters are described but lack colons after the parameter name. [ tglx: Massaged changelog ] Signed-off-by: Alex Shi Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/1605252275-63652-1-git-send-email-alex.shi@linux.alibaba.com --- kernel/time/timeconv.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timeconv.c b/kernel/time/timeconv.c index 589e0a552129..62e3b46717a6 100644 --- a/kernel/time/timeconv.c +++ b/kernel/time/timeconv.c @@ -70,10 +70,10 @@ static const unsigned short __mon_yday[2][13] = { /** * time64_to_tm - converts the calendar time to local broken-down time * - * @totalsecs the number of seconds elapsed since 00:00:00 on January 1, 1970, + * @totalsecs: the number of seconds elapsed since 00:00:00 on January 1, 1970, * Coordinated Universal Time (UTC). - * @offset offset seconds adding to totalsecs. - * @result pointer to struct tm variable to receive broken-down time + * @offset: offset seconds adding to totalsecs. 
+ * @result: pointer to struct tm variable to receive broken-down time */ void time64_to_tm(time64_t totalsecs, int offset, struct tm *result) { -- cgit v1.2.3 From 199d280c884de44c3b0daeb77438db43f6db01a2 Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Fri, 13 Nov 2020 15:24:33 +0800 Subject: timekeeping: Remove static functions from kernel-doc markup Various static functions in the timekeeping code have function comments which pretend to be kernel-doc, but are incomplete and trigger parser warnings. As these functions are local to the timekeeping core code there is no need to expose them via kernel-doc. Remove the double star kernel-doc marker and remove excess newlines. [ tglx: Massaged changelog and removed excess newlines ] Signed-off-by: Alex Shi Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/1605252275-63652-4-git-send-email-alex.shi@linux.alibaba.com --- kernel/time/timekeeping.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 6858a31364b6..570fc500d263 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1415,9 +1415,8 @@ void timekeeping_warp_clock(void) } } -/** +/* * __timekeeping_set_tai_offset - Sets the TAI offset from UTC and monotonic - * */ static void __timekeeping_set_tai_offset(struct timekeeper *tk, s32 tai_offset) { @@ -1425,7 +1424,7 @@ static void __timekeeping_set_tai_offset(struct timekeeper *tk, s32 tai_offset) tk->offs_tai = ktime_add(tk->offs_real, ktime_set(tai_offset, 0)); } -/** +/* * change_clocksource - Swaps clocksources if a new one is available * * Accumulates current time interval and initializes new clocksource @@ -2023,13 +2022,12 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset) } } -/** +/* * accumulate_nsecs_to_secs - Accumulates nsecs into secs * * Helper function that accumulates the nsecs greater than a second * from the xtime_nsec field to the xtime_secs field. * It also calls into the NTP code to handle leapsecond processing. 
- * */ static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk) { @@ -2071,7 +2069,7 @@ static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk) return clock_set; } -/** +/* * logarithmic_accumulation - shifted accumulation of cycles * * This functions accumulates a shifted interval of cycles into @@ -2314,7 +2312,7 @@ ktime_t ktime_get_update_offsets_now(unsigned int *cwsseq, ktime_t *offs_real, return base; } -/** +/* * timekeeping_validate_timex - Ensures the timex is ok for use in do_adjtimex */ static int timekeeping_validate_timex(const struct __kernel_timex *txc) -- cgit v1.2.3 From e025b03113d27139ce2b28b82599018e4d8fa5f6 Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Fri, 13 Nov 2020 15:24:31 +0800 Subject: timekeeping: Add missing parameter documentation for update_fast_timekeeper() Address the following warning: kernel/time/timekeeping.c:415: warning: Function parameter or member 'tkf' not described in 'update_fast_timekeeper' [ tglx: Remove the bogus ktime_get_mono_fast_ns() part ] Signed-off-by: Alex Shi Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/1605252275-63652-2-git-send-email-alex.shi@linux.alibaba.com --- kernel/time/timekeeping.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 570fc500d263..a823703c905e 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -407,6 +407,7 @@ static inline u64 timekeeping_cycles_to_ns(const struct tk_read_base *tkr, u64 c /** * update_fast_timekeeper - Update the fast and NMI safe monotonic timekeeper. * @tkr: Timekeeping readout base from which we take the update + * @tkf: Pointer to NMI safe timekeeper * * We want to use this from any context including NMI and tracing / * instrumenting the timekeeping code itself. -- cgit v1.2.3 From c1ce406e80fb15fa52b2b48dfd48fad6f3d2a32f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 15 Nov 2020 21:09:31 +0100 Subject: timekeeping: Fix up function documentation for the NMI safe accessors Alex reported the following warning: kernel/time/timekeeping.c:464: warning: Function parameter or member 'tkf' not described in '__ktime_get_fast_ns' which is not entirely correct because the documented function is ktime_get_mono_fast_ns() which does not have a parameter, but the kernel-doc parser looks at the function declaration which follows the comment and complains about the missing parameter documentation. Aside of that the documentation for the rest of the NMI safe accessors is either incomplete or missing. 
- Move the function documentation to the right place - Fixup the references and inconsistencies - Add the missing documentation for ktime_get_raw_fast_ns() Reported-by: Alex Shi Signed-off-by: Thomas Gleixner --- kernel/time/timekeeping.c | 58 +++++++++++++++++++++++++++-------------------- 1 file changed, 33 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index a823703c905e..ab4b83186331 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -437,6 +437,27 @@ static void update_fast_timekeeper(const struct tk_read_base *tkr, memcpy(base + 1, base, sizeof(*base)); } +static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf) +{ + struct tk_read_base *tkr; + unsigned int seq; + u64 now; + + do { + seq = raw_read_seqcount_latch(&tkf->seq); + tkr = tkf->base + (seq & 0x01); + now = ktime_to_ns(tkr->base); + + now += timekeeping_delta_to_ns(tkr, + clocksource_delta( + tk_clock_read(tkr), + tkr->cycle_last, + tkr->mask)); + } while (read_seqcount_latch_retry(&tkf->seq, seq)); + + return now; +} + /** * ktime_get_mono_fast_ns - Fast NMI safe access to clock monotonic * @@ -463,39 +484,24 @@ static void update_fast_timekeeper(const struct tk_read_base *tkr, * * So reader 6 will observe time going backwards versus reader 5. * - * While other CPUs are likely to be able observe that, the only way + * While other CPUs are likely to be able to observe that, the only way * for a CPU local observation is when an NMI hits in the middle of * the update. Timestamps taken from that NMI context might be ahead * of the following timestamps. Callers need to be aware of that and * deal with it. */ -static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf) -{ - struct tk_read_base *tkr; - unsigned int seq; - u64 now; - - do { - seq = raw_read_seqcount_latch(&tkf->seq); - tkr = tkf->base + (seq & 0x01); - now = ktime_to_ns(tkr->base); - - now += timekeeping_delta_to_ns(tkr, - clocksource_delta( - tk_clock_read(tkr), - tkr->cycle_last, - tkr->mask)); - } while (read_seqcount_latch_retry(&tkf->seq, seq)); - - return now; -} - u64 ktime_get_mono_fast_ns(void) { return __ktime_get_fast_ns(&tk_fast_mono); } EXPORT_SYMBOL_GPL(ktime_get_mono_fast_ns); +/** + * ktime_get_raw_fast_ns - Fast NMI safe access to clock monotonic raw + * + * Contrary to ktime_get_mono_fast_ns() this is always correct because the + * conversion factor is not affected by NTP/PTP correction. + */ u64 ktime_get_raw_fast_ns(void) { return __ktime_get_fast_ns(&tk_fast_raw); @@ -522,6 +528,9 @@ EXPORT_SYMBOL_GPL(ktime_get_raw_fast_ns); * (2) On 32-bit systems, the 64-bit boot offset (tk->offs_boot) may be * partially updated. Since the tk->offs_boot update is a rare event, this * should be a rare occurrence which postprocessing should be able to handle. + * + * The caveats vs. timestamp ordering as documented for ktime_get_fast_ns() + * apply as well. */ u64 notrace ktime_get_boot_fast_ns(void) { @@ -531,9 +540,6 @@ u64 notrace ktime_get_boot_fast_ns(void) } EXPORT_SYMBOL_GPL(ktime_get_boot_fast_ns); -/* - * See comment for __ktime_get_fast_ns() vs. timestamp ordering - */ static __always_inline u64 __ktime_get_real_fast(struct tk_fast *tkf, u64 *mono) { struct tk_read_base *tkr; @@ -558,6 +564,8 @@ static __always_inline u64 __ktime_get_real_fast(struct tk_fast *tkf, u64 *mono) /** * ktime_get_real_fast_ns: - NMI safe and fast access to clock realtime. + * + * See ktime_get_fast_ns() for documentation of the time stamp ordering. 
*/ u64 ktime_get_real_fast_ns(void) { -- cgit v1.2.3 From f27f7c3f100e74a7f451a63a15788f50c52f7cce Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Fri, 13 Nov 2020 15:24:32 +0800 Subject: timekeeping: Add missing parameter docs for pvclock_gtod_[un]register_notifier() The kernel-doc parser complains about: kernel/time/timekeeping.c:651: warning: Function parameter or member 'nb' not described in 'pvclock_gtod_register_notifier' kernel/time/timekeeping.c:670: warning: Function parameter or member 'nb' not described in 'pvclock_gtod_unregister_notifier' Add the missing parameter explanations. [ tglx: Massaged changelog ] Signed-off-by: Alex Shi Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/1605252275-63652-3-git-send-email-alex.shi@linux.alibaba.com --- kernel/time/timekeeping.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index ab4b83186331..9c9392360ade 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -663,6 +663,7 @@ static void update_pvclock_gtod(struct timekeeper *tk, bool was_set) /** * pvclock_gtod_register_notifier - register a pvclock timedata update listener + * @nb: Pointer to the notifier block to register */ int pvclock_gtod_register_notifier(struct notifier_block *nb) { @@ -682,6 +683,7 @@ EXPORT_SYMBOL_GPL(pvclock_gtod_register_notifier); /** * pvclock_gtod_unregister_notifier - unregister a pvclock * timedata update listener + * @nb: Pointer to the notifier block to unregister */ int pvclock_gtod_unregister_notifier(struct notifier_block *nb) { -- cgit v1.2.3 From 29efc4612ac1b888e65da408b41dafa4dd00842f Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Fri, 13 Nov 2020 15:24:35 +0800 Subject: timekeeping: Fix parameter docs of read_persistent_wall_and_boot_offset() Address the following kernel-doc markup warnings: kernel/time/timekeeping.c:1563: warning: Function parameter or member 'wall_time' not described in 'read_persistent_wall_and_boot_offset' kernel/time/timekeeping.c:1563: warning: Function parameter or member 'boot_offset' not described in 'read_persistent_wall_and_boot_offset' The parameters are described but miss the leading '@' and the colon after the parameter names. [ tglx: Massaged changelog ] Signed-off-by: Alex Shi Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/1605252275-63652-6-git-send-email-alex.shi@linux.alibaba.com --- kernel/time/timekeeping.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 9c9392360ade..75cba958ae29 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1576,8 +1576,9 @@ void __weak read_persistent_clock64(struct timespec64 *ts) * from the boot. * * Weak dummy function for arches that do not yet support it. - * wall_time - current time as returned by persistent clock - * boot_offset - offset that is defined as wall_time - boot_time + * @wall_time: - current time as returned by persistent clock + * @boot_offset: - offset that is defined as wall_time - boot_time + * * The default function calculates offset based on the current value of * local_clock(). 
This way architectures that support sched_clock() but don't * support dedicated boot time clock will provide the best estimate of the * boot time. -- cgit v1.2.3 From 6e5a91901c2dff3a0f2eb9f10e427dce2b0488fc Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Fri, 13 Nov 2020 15:24:34 +0800 Subject: timekeeping: Address parameter documentation issues for various functions The kernel-doc parser complains: kernel/time/timekeeping.c:1543: warning: Function parameter or member 'ts' not described in 'read_persistent_clock64' kernel/time/timekeeping.c:764: warning: Function parameter or member 'tk' not described in 'timekeeping_forward_now' kernel/time/timekeeping.c:1331: warning: Function parameter or member 'ts' not described in 'timekeeping_inject_offset' kernel/time/timekeeping.c:1331: warning: Excess function parameter 'tv' description in 'timekeeping_inject_offset' Add the missing parameter documentation and rename the 'tv' parameter of timekeeping_inject_offset() to 'ts' so it matches the implementation. [ tglx: Reworded a few docs and massaged changelog ] Signed-off-by: Alex Shi Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/1605252275-63652-5-git-send-email-alex.shi@linux.alibaba.com --- kernel/time/timekeeping.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 75cba958ae29..74503c0151e5 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -774,6 +774,7 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action) /** * timekeeping_forward_now - update clock to the current time + * @tk: Pointer to the timekeeper to update * * Forward the current clock to update its state since the last call to * update_wall_time(). This is useful before significant clock changes, @@ -1350,7 +1351,7 @@ EXPORT_SYMBOL(do_settimeofday64); /** * timekeeping_inject_offset - Adds or subtracts from the current time. - * @tv: pointer to the timespec variable containing the offset + * @ts: Pointer to the timespec variable containing the offset * * Adds or subtracts an offset value from the current time. */ @@ -1558,6 +1559,7 @@ u64 timekeeping_max_deferment(void) /** * read_persistent_clock64 - Return time from the persistent clock. + * @ts: Pointer to the storage for the readout value * * Weak dummy function for arches that do not yet support it. * Reads the time from the battery backed persistent clock. @@ -1663,7 +1665,8 @@ static struct timespec64 timekeeping_suspend_time; /** * __timekeeping_inject_sleeptime - Internal function to add sleep interval - * @delta: pointer to a timespec delta value + * @tk: Pointer to the timekeeper to be updated + * @delta: Pointer to the delta value in timespec64 format * * Takes a timespec offset measuring a suspend interval and properly * adds the sleep offset to the timekeeping variables. -- cgit v1.2.3 From cc947f2b9c04113d84eeef67cc7c6326e1982019 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 16 Nov 2020 10:53:38 +0100 Subject: timers: Make run_local_timers() static No users outside of the timer code. Move the caller below this function to avoid a pointless forward declaration.
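Schematically, the reordering looks like this (an illustrative skeleton with empty bodies, not the actual kernel code):

    /* Before: the caller precedes the callee, so making the callee
     * static requires a forward declaration. */
    static void run_local_timers(void);

    void update_process_times(int user_tick)
    {
            run_local_timers();
    }

    static void run_local_timers(void)
    {
    }

    /* After: defining the callee first makes the declaration redundant. */
    static void run_local_timers(void)
    {
    }

    void update_process_times(int user_tick)
    {
            run_local_timers();
    }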
Signed-off-by: Thomas Gleixner --- include/linux/timer.h | 1 - kernel/time/timer.c | 48 ++++++++++++++++++++++++------------------------ 2 files changed, 24 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/include/linux/timer.h b/include/linux/timer.h index d10bc7e73b41..fda13c9d1256 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -193,7 +193,6 @@ extern int try_to_del_timer_sync(struct timer_list *timer); #define del_singleshot_timer_sync(t) del_timer_sync(t) extern void init_timers(void); -extern void run_local_timers(void); struct hrtimer; extern enum hrtimer_restart it_real_fn(struct hrtimer *); diff --git a/kernel/time/timer.c b/kernel/time/timer.c index af9ddfbd447d..ebf3b26d2501 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1705,29 +1705,6 @@ void timer_clear_idle(void) } #endif -/* - * Called from the timer interrupt handler to charge one tick to the current - * process. user_tick is 1 if the tick is user time, 0 for system. - */ -void update_process_times(int user_tick) -{ - struct task_struct *p = current; - - PRANDOM_ADD_NOISE(jiffies, user_tick, p, 0); - - /* Note: this timer irq context must be accounted for as well. */ - account_process_tick(p, user_tick); - run_local_timers(); - rcu_sched_clock_irq(user_tick); -#ifdef CONFIG_IRQ_WORK - if (in_irq()) - irq_work_tick(); -#endif - scheduler_tick(); - if (IS_ENABLED(CONFIG_POSIX_TIMERS)) - run_posix_cpu_timers(); -} - /** * __run_timers - run all expired timers (if any) on this CPU. * @base: the timer vector to be processed. @@ -1777,7 +1754,7 @@ static __latent_entropy void run_timer_softirq(struct softirq_action *h) /* * Called by the local, per-CPU timer interrupt on SMP. */ -void run_local_timers(void) +static void run_local_timers(void) { struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]); @@ -1794,6 +1771,29 @@ void run_local_timers(void) raise_softirq(TIMER_SOFTIRQ); } +/* + * Called from the timer interrupt handler to charge one tick to the current + * process. user_tick is 1 if the tick is user time, 0 for system. + */ +void update_process_times(int user_tick) +{ + struct task_struct *p = current; + + PRANDOM_ADD_NOISE(jiffies, user_tick, p, 0); + + /* Note: this timer irq context must be accounted for as well. */ + account_process_tick(p, user_tick); + run_local_timers(); + rcu_sched_clock_irq(user_tick); +#ifdef CONFIG_IRQ_WORK + if (in_irq()) + irq_work_tick(); +#endif + scheduler_tick(); + if (IS_ENABLED(CONFIG_POSIX_TIMERS)) + run_posix_cpu_timers(); +} + /* * Since schedule_timeout()'s timer is defined on the stack, it must store * the target task on the stack as well. -- cgit v1.2.3 From 66981c37b3199d293c58f84cf2366e86a06e1a3d Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 16 Nov 2020 11:18:14 +0100 Subject: hrtimer: Fix kernel-doc markups The hrtimer_get_remaining() markup is documenting, instead, __hrtimer_get_remaining(), as it is placed at the C file. In order to properly document it, a kernel-doc markup is needed together with the function prototype. So, add a new one, while preserving the existing one, just fixing the function name. The hrtimer_is_queued prototype has a typo: it is using '=' instead of '-' to split: identifier - description as required by kernel-doc markup. 
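For reference, the shape the kernel-doc parser expects looks roughly like this (a generic skeleton, not taken from the patch):

    /**
     * function_name - short description, separated by '-' rather than '='
     * @arg: description of the parameter
     *
     * Returns: description of the return value
     */
    int function_name(int arg);

and the comment block must sit directly above the prototype it documents.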
Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/9dc87808c2fd07b7e050bafcd033c5ef05808fea.1605521731.git.mchehab+huawei@kernel.org --- include/linux/hrtimer.h | 6 +++++- kernel/time/hrtimer.c | 2 +- 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 107cedd7019a..bb5e7b0a4274 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -447,6 +447,10 @@ static inline void hrtimer_restart(struct hrtimer *timer) /* Query timers: */ extern ktime_t __hrtimer_get_remaining(const struct hrtimer *timer, bool adjust); +/** + * hrtimer_get_remaining - get remaining time for the timer + * @timer: the timer to read + */ static inline ktime_t hrtimer_get_remaining(const struct hrtimer *timer) { return __hrtimer_get_remaining(timer, false); @@ -458,7 +462,7 @@ extern u64 hrtimer_next_event_without(const struct hrtimer *exclude); extern bool hrtimer_active(const struct hrtimer *timer); /** - * hrtimer_is_queued = check, whether the timer is on one of the queues + * hrtimer_is_queued - check, whether the timer is on one of the queues * @timer: Timer to check * * Returns: True if the timer is queued, false otherwise diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 3624b9b5835d..61c39ff68439 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1289,7 +1289,7 @@ int hrtimer_cancel(struct hrtimer *timer) EXPORT_SYMBOL_GPL(hrtimer_cancel); /** - * hrtimer_get_remaining - get remaining time for the timer + * __hrtimer_get_remaining - get remaining time for the timer * @timer: the timer to read * @adjust: adjust relative timers when CONFIG_TIME_LOW_RES=y */ -- cgit v1.2.3 From f73f64d5687192bc8eb7f3d9521ca6256b79f224 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 17 Nov 2020 14:19:43 +0100 Subject: tick/broadcast: Serialize access to tick_next_period tick_broadcast_setup_oneshot() accesses tick_next_period twice without any serialization. This is wrong in two aspects: - Reading it twice might make the broadcast data inconsistent if the variable is updated concurrently. - On 32bit systems the access might see a partial update. Protect it with jiffies_lock. That's safe as none of the callchains leading up to this function can create a lock ordering violation: timer interrupt -> run_local_timers() -> hrtimer_run_queues() -> hrtimer_switch_to_hres() -> tick_init_highres() -> tick_switch_to_oneshot() -> tick_broadcast_switch_to_oneshot(), or tick_check_oneshot_change() -> tick_nohz_switch_to_nohz() -> tick_switch_to_oneshot() -> tick_broadcast_switch_to_oneshot() Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201117132006.061341507@linutronix.de --- kernel/time/tick-broadcast.c | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 36d7464c8962..2a47c8f80e53 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -877,6 +877,22 @@ static void tick_broadcast_init_next_event(struct cpumask *mask, } } +static inline ktime_t tick_get_next_period(void) +{ + ktime_t next; + + /* + * Protect against concurrent updates (store /load tearing on + * 32bit). It does not matter if the time is already in the + * past. The broadcast device which is about to be programmed will + * fire in any case.
+ */ + raw_spin_lock(&jiffies_lock); + next = tick_next_period; + raw_spin_unlock(&jiffies_lock); + return next; +} + /** * tick_broadcast_setup_oneshot - setup the broadcast device */ @@ -905,10 +921,11 @@ static void tick_broadcast_setup_oneshot(struct clock_event_device *bc) tick_broadcast_oneshot_mask, tmpmask); if (was_periodic && !cpumask_empty(tmpmask)) { + ktime_t nextevt = tick_get_next_period(); + clockevents_switch_state(bc, CLOCK_EVT_STATE_ONESHOT); - tick_broadcast_init_next_event(tmpmask, - tick_next_period); - tick_broadcast_set_event(bc, cpu, tick_next_period); + tick_broadcast_init_next_event(tmpmask, nextevt); + tick_broadcast_set_event(bc, cpu, nextevt); } else bc->next_event = KTIME_MAX; } -- cgit v1.2.3 From c398960cd82b233886fbff163986f998b5a5c008 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 17 Nov 2020 14:19:44 +0100 Subject: tick: Document protections for tick related data The protection rules for tick_next_period and last_jiffies_update are blurry at best. Clarify this. Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201117132006.197713794@linutronix.de --- kernel/time/tick-common.c | 4 +++- kernel/time/tick-sched.c | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 6c9c342dd0e5..68504eb0d38b 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -27,7 +27,9 @@ */ DEFINE_PER_CPU(struct tick_device, tick_cpu_device); /* - * Tick next event: keeps track of the tick time + * Tick next event: keeps track of the tick time. It's updated by the + * CPU which handles the tick and protected by jiffies_lock. There is + * no requirement to write hold the jiffies seqcount for it. */ ktime_t tick_next_period; ktime_t tick_period; diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 81632cd5e3b7..15360e652c85 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -44,7 +44,9 @@ struct tick_sched *tick_get_tick_sched(int cpu) #if defined(CONFIG_NO_HZ_COMMON) || defined(CONFIG_HIGH_RES_TIMERS) /* - * The time, when the last jiffy update happened. Protected by jiffies_lock. + * The time, when the last jiffy update happened. Write access must hold + * jiffies_lock and jiffies_seq. tick_nohz_next_event() needs to get a + * consistent view of jiffies and last_jiffies_update. */ static ktime_t last_jiffies_update; -- cgit v1.2.3 From 372acbbaa80940189593f9d69c7c069955f24f7a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 17 Nov 2020 14:19:45 +0100 Subject: tick/sched: Use tick_next_period for lockless quick check No point in doing calculations. tick_next_period = last_jiffies_update + tick_period Just check whether now is before tick_next_period to figure out whether jiffies need an update. Add a comment explaining why the intentional data race in the quick check is safe (or not so safe in a 32bit corner case) and why we don't worry about it.
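Distilled to its essentials, the resulting access protocol is (a schematic sketch; the real code follows in the hunk below):

    /* Writer: the CPU handling the tick, under jiffies_lock. */
    raw_spin_lock(&jiffies_lock);
    WRITE_ONCE(tick_next_period,
               ktime_add(last_jiffies_update, tick_period));
    raw_spin_unlock(&jiffies_lock);

    /* Lockless reader: the quick check. A racy answer is harmless;
     * at worst it causes a pointless trip into the locked slow path,
     * which re-evaluates under jiffies_lock. */
    if (ktime_before(now, READ_ONCE(tick_next_period)))
            return;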
Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201117132006.337366695@linutronix.de --- kernel/time/tick-sched.c | 46 +++++++++++++++++++++++++++++++++------------- 1 file changed, 33 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 15360e652c85..b4b6abc81e4a 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -59,11 +59,29 @@ static void tick_do_update_jiffies64(ktime_t now) ktime_t delta; /* - * Do a quick check without holding jiffies_lock: - * The READ_ONCE() pairs with two updates done later in this function. + * Do a quick check without holding jiffies_lock. The READ_ONCE() + * pairs with the update done later in this function. + * + * This is also an intentional data race which is even safe on + * 32bit in theory. If there is a concurrent update then the check + * might give a random answer. It does not matter because if it + * returns then the concurrent update is already taking care, if it + * falls through then it will pointlessly contend on jiffies_lock. + * + * Though there is one nasty case on 32bit due to store tearing of + * the 64bit value. If the first 32bit store makes the quick check + * return on all other CPUs and the writing CPU context gets + * delayed to complete the second store (scheduled out on virt) + * then jiffies can become stale for up to ~2^32 nanoseconds + * without noticing. After that point all CPUs will wait for + * jiffies lock. + * + * OTOH, this is not any different than the situation with NOHZ=off + * where one CPU is responsible for updating jiffies and + * timekeeping. If that CPU goes out for lunch then all other CPUs + * will operate on stale jiffies until it decides to come back. */ - delta = ktime_sub(now, READ_ONCE(last_jiffies_update)); - if (delta < tick_period) + if (ktime_before(now, READ_ONCE(tick_next_period))) return; /* Reevaluate with jiffies_lock held */ @@ -74,9 +92,8 @@ static void tick_do_update_jiffies64(ktime_t now) if (delta >= tick_period) { delta = ktime_sub(delta, tick_period); - /* Pairs with the lockless read in this function. */ - WRITE_ONCE(last_jiffies_update, - ktime_add(last_jiffies_update, tick_period)); + last_jiffies_update = ktime_add(last_jiffies_update, + tick_period); /* Slow path for long timeouts */ if (unlikely(delta >= tick_period)) { @@ -84,15 +101,18 @@ static void tick_do_update_jiffies64(ktime_t now) ticks = ktime_divns(delta, incr); - /* Pairs with the lockless read in this function. */ - WRITE_ONCE(last_jiffies_update, - ktime_add_ns(last_jiffies_update, - incr * ticks)); + last_jiffies_update = ktime_add_ns(last_jiffies_update, + incr * ticks); } do_timer(++ticks); - /* Keep the tick_next_period variable up to date */ - tick_next_period = ktime_add(last_jiffies_update, tick_period); + /* + * Keep the tick_next_period variable up to date. + * WRITE_ONCE() pairs with the READ_ONCE() in the lockless + * quick check above. + */ + WRITE_ONCE(tick_next_period, + ktime_add(last_jiffies_update, tick_period)); } else { write_seqcount_end(&jiffies_seq); raw_spin_unlock(&jiffies_lock); -- cgit v1.2.3 From 94ad2e3cedb82af034f6d97c58022f162b669f9b Mon Sep 17 00:00:00 2001 From: Yunfeng Ye Date: Tue, 17 Nov 2020 14:19:46 +0100 Subject: tick/sched: Reduce seqcount held scope in tick_do_update_jiffies64() If jiffies are up to date already (caller lost the race against another CPU) there is no point to change the sequence count. 
Doing that just forces other CPUs into the seqcount retry loop in tick_nohz_next_event() for nothing. Just bail out early. [ tglx: Rewrote most of it ] Signed-off-by: Yunfeng Ye Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201117132006.462195901@linutronix.de --- kernel/time/tick-sched.c | 47 ++++++++++++++++++++++------------------------- 1 file changed, 22 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index b4b6abc81e4a..ca9191ced4b5 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -86,38 +86,35 @@ static void tick_do_update_jiffies64(ktime_t now) /* Reevaluate with jiffies_lock held */ raw_spin_lock(&jiffies_lock); + if (ktime_before(now, tick_next_period)) { + raw_spin_unlock(&jiffies_lock); + return; + } + write_seqcount_begin(&jiffies_seq); - delta = ktime_sub(now, last_jiffies_update); - if (delta >= tick_period) { + last_jiffies_update = ktime_add(last_jiffies_update, tick_period); - delta = ktime_sub(delta, tick_period); - last_jiffies_update = ktime_add(last_jiffies_update, - tick_period); + delta = ktime_sub(now, tick_next_period); + if (unlikely(delta >= tick_period)) { + /* Slow path for long idle sleep times */ + s64 incr = ktime_to_ns(tick_period); - /* Slow path for long timeouts */ - if (unlikely(delta >= tick_period)) { - s64 incr = ktime_to_ns(tick_period); + ticks = ktime_divns(delta, incr); - ticks = ktime_divns(delta, incr); + last_jiffies_update = ktime_add_ns(last_jiffies_update, + incr * ticks); + } - last_jiffies_update = ktime_add_ns(last_jiffies_update, - incr * ticks); - } - do_timer(++ticks); + do_timer(++ticks); + + /* + * Keep the tick_next_period variable up to date. WRITE_ONCE() + * pairs with the READ_ONCE() in the lockless quick check above. + */ + WRITE_ONCE(tick_next_period, + ktime_add(last_jiffies_update, tick_period)); - /* - * Keep the tick_next_period variable up to date. - * WRITE_ONCE() pairs with the READ_ONCE() in the lockless - * quick check above. - */ - WRITE_ONCE(tick_next_period, - ktime_add(last_jiffies_update, tick_period)); - } else { - write_seqcount_end(&jiffies_seq); - raw_spin_unlock(&jiffies_lock); - return; - } write_seqcount_end(&jiffies_seq); raw_spin_unlock(&jiffies_lock); update_wall_time(); -- cgit v1.2.3 From 7a35bf2a6a871cd0252cd371d741e7d070b53af9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 17 Nov 2020 14:19:47 +0100 Subject: tick/sched: Optimize tick_do_update_jiffies64() further Now that it's clear that there is always one tick to account, simplify the calculations some more. 
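A worked example of the simplified flow (hypothetical numbers): assume now == tick_next_period + 3.5 * tick_period. Then:

    ticks = 1;                          /* the tick that is always due */
    delta = now - tick_next_period;     /* 3.5 tick periods */
    ticks += delta / tick_period;       /* += 3, so ticks == 4 */
    do_timer(ticks);                    /* jiffies_64 += 4 */

The remaining half period stays pending and is accounted by a later update.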
Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201117132006.565663056@linutronix.de --- kernel/time/tick-sched.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index ca9191ced4b5..306adeb6ce4c 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -55,7 +55,7 @@ static ktime_t last_jiffies_update; */ static void tick_do_update_jiffies64(ktime_t now) { - unsigned long ticks = 0; + unsigned long ticks = 1; ktime_t delta; /* @@ -93,20 +93,21 @@ static void tick_do_update_jiffies64(ktime_t now) write_seqcount_begin(&jiffies_seq); - last_jiffies_update = ktime_add(last_jiffies_update, tick_period); - delta = ktime_sub(now, tick_next_period); if (unlikely(delta >= tick_period)) { /* Slow path for long idle sleep times */ s64 incr = ktime_to_ns(tick_period); - ticks = ktime_divns(delta, incr); + ticks += ktime_divns(delta, incr); last_jiffies_update = ktime_add_ns(last_jiffies_update, incr * ticks); + } else { + last_jiffies_update = ktime_add(last_jiffies_update, + tick_period); } - do_timer(++ticks); + do_timer(ticks); /* * Keep the tick_next_period variable up to date. WRITE_ONCE() -- cgit v1.2.3 From 896b969e6732b68ee3c12ae4e1aeddf5db99bc46 Mon Sep 17 00:00:00 2001 From: Yunfeng Ye Date: Tue, 17 Nov 2020 14:19:48 +0100 Subject: tick/sched: Release seqcount before invoking calc_global_load() calc_global_load() does not need the sequence count protection. [ tglx: Split it up properly and added comments ] Signed-off-by: Yunfeng Ye Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201117132006.660902274@linutronix.de --- kernel/time/tick-sched.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 306adeb6ce4c..33c897bb88c6 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -107,7 +108,8 @@ static void tick_do_update_jiffies64(ktime_t now) tick_period); } - do_timer(ticks); + /* Advance jiffies to complete the jiffies_seq protected job */ + jiffies_64 += ticks; /* * Keep the tick_next_period variable up to date. WRITE_ONCE() @@ -116,7 +118,15 @@ static void tick_do_update_jiffies64(ktime_t now) WRITE_ONCE(tick_next_period, ktime_add(last_jiffies_update, tick_period)); + /* + * Release the sequence count. calc_global_load() below is not + * protected by it, but jiffies_lock needs to be held to prevent + * concurrent invocations. + */ write_seqcount_end(&jiffies_seq); + + calc_global_load(); + raw_spin_unlock(&jiffies_lock); update_wall_time(); } -- cgit v1.2.3 From b996544916429946bf4934c1c01a306d1690972c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 17 Nov 2020 14:19:49 +0100 Subject: tick: Get rid of tick_period The variable tick_period is initialized to NSEC_PER_SEC / HZ during boot and never updated again. If NSEC_PER_SEC is not an integer multiple of HZ this computation is less accurate than TICK_NSEC which has proper rounding in place. Aside of the inaccuracy there is no reason for having this variable at all. It's just a pointless indirection and all usage sites can just use the TICK_NSEC constant.
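The accuracy difference is easiest to see with a HZ value that does not divide NSEC_PER_SEC evenly (HZ=1024 is picked purely for illustration):

    /* include/linux/jiffies.h */
    #define TICK_NSEC ((NSEC_PER_SEC + HZ / 2) / HZ)

    /* With HZ == 1024:
     *   NSEC_PER_SEC / HZ == 976562 ns  (truncated)
     *   TICK_NSEC         == 976563 ns  (rounded to nearest)
     */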
Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/r/20201117132006.766643526@linutronix.de --- kernel/time/tick-broadcast.c | 2 +- kernel/time/tick-common.c | 8 +++----- kernel/time/tick-internal.h | 1 - kernel/time/tick-sched.c | 22 +++++++++++----------- 4 files changed, 15 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 2a47c8f80e53..5a23829372c7 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -331,7 +331,7 @@ static void tick_handle_periodic_broadcast(struct clock_event_device *dev) bc_local = tick_do_periodic_broadcast(); if (clockevent_state_oneshot(dev)) { - ktime_t next = ktime_add(dev->next_event, tick_period); + ktime_t next = ktime_add_ns(dev->next_event, TICK_NSEC); clockevents_program_event(dev, next, true); } diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 68504eb0d38b..a03764df5366 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -32,7 +32,6 @@ DEFINE_PER_CPU(struct tick_device, tick_cpu_device); * no requirement to write hold the jiffies seqcount for it. */ ktime_t tick_next_period; -ktime_t tick_period; /* * tick_do_timer_cpu is a timer core internal variable which holds the CPU NR @@ -90,7 +89,7 @@ static void tick_periodic(int cpu) write_seqcount_begin(&jiffies_seq); /* Keep track of the next tick event */ - tick_next_period = ktime_add(tick_next_period, tick_period); + tick_next_period = ktime_add_ns(tick_next_period, TICK_NSEC); do_timer(1); write_seqcount_end(&jiffies_seq); @@ -129,7 +128,7 @@ void tick_handle_periodic(struct clock_event_device *dev) * Setup the next period for devices, which do not have * periodic mode: */ - next = ktime_add(next, tick_period); + next = ktime_add_ns(next, TICK_NSEC); if (!clockevents_program_event(dev, next, false)) return; @@ -175,7 +174,7 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast) for (;;) { if (!clockevents_program_event(dev, next, false)) return; - next = ktime_add(next, tick_period); + next = ktime_add_ns(next, TICK_NSEC); } } } @@ -222,7 +221,6 @@ static void tick_setup_device(struct tick_device *td, tick_do_timer_cpu = cpu; tick_next_period = ktime_get(); - tick_period = NSEC_PER_SEC / HZ; #ifdef CONFIG_NO_HZ_FULL /* * The boot CPU may be nohz_full, in which case set diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index 7b2496136729..7a981c9e87a4 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -15,7 +15,6 @@ DECLARE_PER_CPU(struct tick_device, tick_cpu_device); extern ktime_t tick_next_period; -extern ktime_t tick_period; extern int tick_do_timer_cpu __read_mostly; extern void tick_setup_periodic(struct clock_event_device *dev, int broadcast); diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 33c897bb88c6..cc7cba20382e 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -95,17 +95,17 @@ static void tick_do_update_jiffies64(ktime_t now) write_seqcount_begin(&jiffies_seq); delta = ktime_sub(now, tick_next_period); - if (unlikely(delta >= tick_period)) { + if (unlikely(delta >= TICK_NSEC)) { /* Slow path for long idle sleep times */ - s64 incr = ktime_to_ns(tick_period); + s64 incr = TICK_NSEC; ticks += ktime_divns(delta, incr); last_jiffies_update = ktime_add_ns(last_jiffies_update, incr * ticks); } else { - last_jiffies_update = ktime_add(last_jiffies_update, - tick_period); + last_jiffies_update = 
ktime_add_ns(last_jiffies_update, + TICK_NSEC); } /* Advance jiffies to complete the jiffies_seq protected job */ @@ -116,7 +116,7 @@ static void tick_do_update_jiffies64(ktime_t now) * pairs with the READ_ONCE() in the lockless quick check above. */ WRITE_ONCE(tick_next_period, - ktime_add(last_jiffies_update, tick_period)); + ktime_add_ns(last_jiffies_update, TICK_NSEC)); /* * Release the sequence count. calc_global_load() below is not @@ -691,7 +691,7 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now) hrtimer_set_expires(&ts->sched_timer, ts->last_tick); /* Forward the time to expire in the future */ - hrtimer_forward(&ts->sched_timer, now, tick_period); + hrtimer_forward(&ts->sched_timer, now, TICK_NSEC); if (ts->nohz_mode == NOHZ_MODE_HIGHRES) { hrtimer_start_expires(&ts->sched_timer, @@ -1260,7 +1260,7 @@ static void tick_nohz_handler(struct clock_event_device *dev) if (unlikely(ts->tick_stopped)) return; - hrtimer_forward(&ts->sched_timer, now, tick_period); + hrtimer_forward(&ts->sched_timer, now, TICK_NSEC); tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1); } @@ -1297,7 +1297,7 @@ static void tick_nohz_switch_to_nohz(void) next = tick_init_jiffy_update(); hrtimer_set_expires(&ts->sched_timer, next); - hrtimer_forward_now(&ts->sched_timer, tick_period); + hrtimer_forward_now(&ts->sched_timer, TICK_NSEC); tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1); tick_nohz_activate(ts, NOHZ_MODE_LOWRES); } @@ -1363,7 +1363,7 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) if (unlikely(ts->tick_stopped)) return HRTIMER_NORESTART; - hrtimer_forward(timer, now, tick_period); + hrtimer_forward(timer, now, TICK_NSEC); return HRTIMER_RESTART; } @@ -1397,13 +1397,13 @@ void tick_setup_sched_timer(void) /* Offset the tick to avert jiffies_lock contention. */ if (sched_skew_tick) { - u64 offset = ktime_to_ns(tick_period) >> 1; + u64 offset = TICK_NSEC >> 1; do_div(offset, num_possible_cpus()); offset *= smp_processor_id(); hrtimer_add_expires_ns(&ts->sched_timer, offset); } - hrtimer_forward(&ts->sched_timer, now, tick_period); + hrtimer_forward(&ts->sched_timer, now, TICK_NSEC); hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED_HARD); tick_nohz_activate(ts, NOHZ_MODE_HIGHRES); } -- cgit v1.2.3 From c9e6189fb03123a7dfb93589280347b46f30b161 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 6 Dec 2020 22:46:18 +0100 Subject: ntp: Make the RTC synchronization more reliable Miroslav reported that the periodic RTC synchronization in the NTP code fails more often than not to hit the specified update window. The reason is that the code uses delayed_work to schedule the update which needs to be in thread context as the underlying RTC might be connected via a slow bus, e.g. I2C. In the update function it verifies whether the current time is correct vs. the requirements of the underlying RTC. But delayed_work is using the timer wheel for scheduling which is inaccurate by design. Depending on the distance to the expiry the wheel gets less granular to allow batching and to avoid the cascading of the original timer wheel. See 500462a9de65 ("timers: Switch to a non-cascading wheel") and the code for further details. The code already deals with this by splitting the 660 seconds period into a long 659 seconds timer and then retrying with a smaller delta. 
But looking at the actual granularities of the timer wheel (which depend on the HZ configuration) the 659 seconds timer ends up in an outer wheel level and is affected by a worst case granularity of: HZ=1000: 32s, HZ=250: 16s, HZ=100: 40s. So the initial timer can already be off by max 12.5%, which is not a big issue as the period of the sync is defined as ~11 minutes. The fine grained second attempt schedules to the desired update point with a timer expiring less than a second from now. Depending on the actual delta and the HZ setting even the second attempt can end up in outer wheel levels which have a large enough granularity to make the correctness check fail. As this is a fundamental property of the timer wheel there is no way to make this more accurate short of iterating in single jiffy steps towards the update point. Switch it to an hrtimer instead which schedules the actual update work. The hrtimer will expire precisely (max 1 jiffy delay when high resolution timers are not available). The actual scheduling delay of the work is the same as before. The update is triggered from do_adjtimex() which is a bit racy but not much more racy than it was before: if (ntp_synced()) queue_delayed_work(system_power_efficient_wq, &sync_work, 0); which is racy when the work is currently executed and has not managed to reschedule itself. This becomes now: if (ntp_synced() && !hrtimer_is_queued(&sync_hrtimer)) queue_work(system_power_efficient_wq, &sync_work); which is racy when the hrtimer has expired and the work is currently executed and has not yet managed to rearm the hrtimer. Not a big problem as it just schedules work for nothing. The new implementation has a safeguard in place to catch the case where the hrtimer is queued on entry to the work function and avoids an extra update attempt of the RTC that way.
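The HZ=1000 figure can be reproduced from the wheel geometry in kernel/time/timer.c (64 buckets per level, each level 8 times coarser than the previous one; the arithmetic below is an illustration of that layout):

    /* Level n: granularity 8^n jiffies, reach ~ 64 * 8^n jiffies.
     *
     * 659 s == 659000 jiffies at HZ=1000:
     *   level 4 reaches only 64 * 8^4 == 262144 jiffies (~262 s),
     *   so the timer lands in level 5, where the granularity is
     *   8^5 == 32768 jiffies, i.e. ~32 s.
     */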
Reported-by: Miroslav Lichvar Signed-off-by: Thomas Gleixner Tested-by: Miroslav Lichvar Reviewed-by: Jason Gunthorpe Acked-by: Alexandre Belloni Link: https://lore.kernel.org/r/20201206220542.062910520@linutronix.de --- include/linux/timex.h | 1 - kernel/time/ntp.c | 90 ++++++++++++++++++++++++---------------------- kernel/time/ntp_internal.h | 7 ++++ 3 files changed, 55 insertions(+), 43 deletions(-) (limited to 'kernel') diff --git a/include/linux/timex.h b/include/linux/timex.h index ce0859763670..9c2e54faf9b7 100644 --- a/include/linux/timex.h +++ b/include/linux/timex.h @@ -157,7 +157,6 @@ extern int do_clock_adjtime(const clockid_t which_clock, struct __kernel_timex * extern void hardpps(const struct timespec64 *, const struct timespec64 *); int read_current_timer(unsigned long *timer_val); -void ntp_notify_cmos_timer(void); /* The clock frequency of the i8253/i8254 PIT */ #define PIT_TICK_RATE 1193182ul diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 069ca78fb0bf..ff1a7b8ec4ef 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -494,65 +494,55 @@ out: return leap; } +#if defined(CONFIG_GENERIC_CMOS_UPDATE) || defined(CONFIG_RTC_SYSTOHC) static void sync_hw_clock(struct work_struct *work); -static DECLARE_DELAYED_WORK(sync_work, sync_hw_clock); - -static void sched_sync_hw_clock(struct timespec64 now, - unsigned long target_nsec, bool fail) +static DECLARE_WORK(sync_work, sync_hw_clock); +static struct hrtimer sync_hrtimer; +#define SYNC_PERIOD_NS (11UL * 60 * NSEC_PER_SEC) +static enum hrtimer_restart sync_timer_callback(struct hrtimer *timer) { - struct timespec64 next; + queue_work(system_power_efficient_wq, &sync_work); - ktime_get_real_ts64(&next); - if (!fail) - next.tv_sec = 659; - else { - /* - * Try again as soon as possible. Delaying long periods - * decreases the accuracy of the work queue timer. Due to this - * the algorithm is very likely to require a short-sleep retry - * after the above long sleep to synchronize ts_nsec. - */ - next.tv_sec = 0; - } + return HRTIMER_NORESTART; +} - /* Compute the needed delay that will get to tv_nsec == target_nsec */ - next.tv_nsec = target_nsec - next.tv_nsec; - if (next.tv_nsec <= 0) - next.tv_nsec += NSEC_PER_SEC; - if (next.tv_nsec >= NSEC_PER_SEC) { - next.tv_sec++; - next.tv_nsec -= NSEC_PER_SEC; - } +static void sched_sync_hw_clock(unsigned long offset_nsec, bool retry) +{ + ktime_t exp = ktime_set(ktime_get_real_seconds(), 0); + + if (retry) + exp = ktime_add_ns(exp, 2 * NSEC_PER_SEC - offset_nsec); + else + exp = ktime_add_ns(exp, SYNC_PERIOD_NS - offset_nsec); - queue_delayed_work(system_power_efficient_wq, &sync_work, - timespec64_to_jiffies(&next)); + hrtimer_start(&sync_hrtimer, exp, HRTIMER_MODE_ABS); } static void sync_rtc_clock(void) { - unsigned long target_nsec; - struct timespec64 adjust, now; + unsigned long offset_nsec; + struct timespec64 adjust; int rc; if (!IS_ENABLED(CONFIG_RTC_SYSTOHC)) return; - ktime_get_real_ts64(&now); + ktime_get_real_ts64(&adjust); - adjust = now; if (persistent_clock_is_local) adjust.tv_sec -= (sys_tz.tz_minuteswest * 60); /* - * The current RTC in use will provide the target_nsec it wants to be - * called at, and does rtc_tv_nsec_ok internally. + * The current RTC in use will provide the nanoseconds offset prior + * to a full second it wants to be called at, and invokes + * rtc_tv_nsec_ok() internally. 
*/ - rc = rtc_set_ntp_time(adjust, &target_nsec); + rc = rtc_set_ntp_time(adjust, &offset_nsec); if (rc == -ENODEV) return; - sched_sync_hw_clock(now, target_nsec, rc); + sched_sync_hw_clock(offset_nsec, rc != 0); } #ifdef CONFIG_GENERIC_CMOS_UPDATE @@ -599,7 +589,7 @@ static bool sync_cmos_clock(void) } } - sched_sync_hw_clock(now, target_nsec, rc); + sched_sync_hw_clock(target_nsec, rc != 0); return true; } @@ -613,7 +603,12 @@ static bool sync_cmos_clock(void) */ static void sync_hw_clock(struct work_struct *work) { - if (!ntp_synced()) + /* + * Don't update if STA_UNSYNC is set and if ntp_notify_cmos_timer() + * managed to schedule the work between the timer firing and the + * work being able to rearm the timer. Wait for the timer to expire. + */ + if (!ntp_synced() || hrtimer_is_queued(&sync_hrtimer)) return; if (sync_cmos_clock()) @@ -624,13 +619,23 @@ static void sync_hw_clock(struct work_struct *work) void ntp_notify_cmos_timer(void) { - if (!ntp_synced()) - return; + /* + * When the work is currently executed but has not yet the timer + * rearmed this queues the work immediately again. No big issue, + * just a pointless work scheduled. + */ + if (ntp_synced() && !hrtimer_is_queued(&sync_hrtimer)) + queue_work(system_power_efficient_wq, &sync_work); +} - if (IS_ENABLED(CONFIG_GENERIC_CMOS_UPDATE) || - IS_ENABLED(CONFIG_RTC_SYSTOHC)) - queue_delayed_work(system_power_efficient_wq, &sync_work, 0); +static void __init ntp_init_cmos_sync(void) +{ + hrtimer_init(&sync_hrtimer, CLOCK_REALTIME, HRTIMER_MODE_ABS); + sync_hrtimer.function = sync_timer_callback; } +#else /* CONFIG_GENERIC_CMOS_UPDATE) || defined(CONFIG_RTC_SYSTOHC) */ +static inline void __init ntp_init_cmos_sync(void) { } +#endif /* !CONFIG_GENERIC_CMOS_UPDATE) || defined(CONFIG_RTC_SYSTOHC) */ /* * Propagate a new txc->status value into the NTP state: @@ -1044,4 +1049,5 @@ __setup("ntp_tick_adj=", ntp_tick_adj_setup); void __init ntp_init(void) { ntp_clear(); + ntp_init_cmos_sync(); } diff --git a/kernel/time/ntp_internal.h b/kernel/time/ntp_internal.h index 908ecaa65fc3..23d1b74c3065 100644 --- a/kernel/time/ntp_internal.h +++ b/kernel/time/ntp_internal.h @@ -12,4 +12,11 @@ extern int __do_adjtimex(struct __kernel_timex *txc, const struct timespec64 *ts, s32 *time_tai, struct audit_ntp_data *ad); extern void __hardpps(const struct timespec64 *phase_ts, const struct timespec64 *raw_ts); + +#if defined(CONFIG_GENERIC_CMOS_UPDATE) || defined(CONFIG_RTC_SYSTOHC) +extern void ntp_notify_cmos_timer(void); +#else +static inline void ntp_notify_cmos_timer(void) { } +#endif + #endif /* _LINUX_NTP_INTERNAL_H */ -- cgit v1.2.3 From 33e62e832384c8cb523044e0e9d99d7133f98e93 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 6 Dec 2020 22:46:19 +0100 Subject: ntp, rtc: Move rtc_set_ntp_time() to ntp code rtc_set_ntp_time() is not really RTC functionality as the code is just a user of RTC. Move it into the NTP code which allows further cleanups. 
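For intuition about the set_offset_nsec logic in the moved code, consider a worked example (the numbers are invented):

    /* An RTC whose set_time path takes ~0.5 ms, i.e.
     * set_offset_nsec == 500000, and a wall clock currently at
     * 41.999700 s:
     *
     *   to_set = now + set_offset_nsec = 42.000200 s
     *   to_set.tv_nsec == 200000 ns < TIME_SET_NSEC_FUZZ
     *     -> round to_set down to exactly 42 s and call the driver
     *        now; the write completes ~0.5 ms later, right on the
     *        second boundary.
     *
     * Outside the fuzz window rtc_tv_nsec_ok() returns false and the
     * caller schedules a retry closer to the target.
     */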
Requested-by: Alexandre Belloni Signed-off-by: Thomas Gleixner Reviewed-by: Jason Gunthorpe Acked-by: Alexandre Belloni Link: https://lore.kernel.org/r/20201206220542.166871172@linutronix.de --- drivers/rtc/Makefile | 1 - drivers/rtc/systohc.c | 61 ----------------------------------- include/linux/rtc.h | 34 -------------------- kernel/time/ntp.c | 88 +++++++++++++++++++++++++++++++++++++++++++++++++-- 4 files changed, 85 insertions(+), 99 deletions(-) delete mode 100644 drivers/rtc/systohc.c (limited to 'kernel') diff --git a/drivers/rtc/Makefile b/drivers/rtc/Makefile index bfb57464118d..bb8f319b09fb 100644 --- a/drivers/rtc/Makefile +++ b/drivers/rtc/Makefile @@ -6,7 +6,6 @@ ccflags-$(CONFIG_RTC_DEBUG) := -DDEBUG obj-$(CONFIG_RTC_LIB) += lib.o -obj-$(CONFIG_RTC_SYSTOHC) += systohc.o obj-$(CONFIG_RTC_CLASS) += rtc-core.o obj-$(CONFIG_RTC_MC146818_LIB) += rtc-mc146818-lib.o rtc-core-y := class.o interface.o diff --git a/drivers/rtc/systohc.c b/drivers/rtc/systohc.c deleted file mode 100644 index 8b70f0520e13..000000000000 --- a/drivers/rtc/systohc.c +++ /dev/null @@ -1,61 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include -#include - -/** - * rtc_set_ntp_time - Save NTP synchronized time to the RTC - * @now: Current time of day - * @target_nsec: pointer for desired now->tv_nsec value - * - * Replacement for the NTP platform function update_persistent_clock64 - * that stores time for later retrieval by rtc_hctosys. - * - * Returns 0 on successful RTC update, -ENODEV if a RTC update is not - * possible at all, and various other -errno for specific temporary failure - * cases. - * - * -EPROTO is returned if now.tv_nsec is not close enough to *target_nsec. - * - * If temporary failure is indicated the caller should try again 'soon' - */ -int rtc_set_ntp_time(struct timespec64 now, unsigned long *target_nsec) -{ - struct rtc_device *rtc; - struct rtc_time tm; - struct timespec64 to_set; - int err = -ENODEV; - bool ok; - - rtc = rtc_class_open(CONFIG_RTC_SYSTOHC_DEVICE); - if (!rtc) - goto out_err; - - if (!rtc->ops || !rtc->ops->set_time) - goto out_close; - - /* Compute the value of tv_nsec we require the caller to supply in - * now.tv_nsec. This is the value such that (now + - * set_offset_nsec).tv_nsec == 0. - */ - set_normalized_timespec64(&to_set, 0, -rtc->set_offset_nsec); - *target_nsec = to_set.tv_nsec; - - /* The ntp code must call this with the correct value in tv_nsec, if - * it does not we update target_nsec and return EPROTO to make the ntp - * code try again later. 
- */ - ok = rtc_tv_nsec_ok(rtc->set_offset_nsec, &to_set, &now); - if (!ok) { - err = -EPROTO; - goto out_close; - } - - rtc_time64_to_tm(to_set.tv_sec, &tm); - - err = rtc_set_time(rtc, &tm); - -out_close: - rtc_class_close(rtc); -out_err: - return err; -} diff --git a/include/linux/rtc.h b/include/linux/rtc.h index 22d1575e4991..ff62680b48ca 100644 --- a/include/linux/rtc.h +++ b/include/linux/rtc.h @@ -165,7 +165,6 @@ int __rtc_register_device(struct module *owner, struct rtc_device *rtc); extern int rtc_read_time(struct rtc_device *rtc, struct rtc_time *tm); extern int rtc_set_time(struct rtc_device *rtc, struct rtc_time *tm); -extern int rtc_set_ntp_time(struct timespec64 now, unsigned long *target_nsec); int __rtc_read_alarm(struct rtc_device *rtc, struct rtc_wkalrm *alarm); extern int rtc_read_alarm(struct rtc_device *rtc, struct rtc_wkalrm *alrm); @@ -205,39 +204,6 @@ static inline bool is_leap_year(unsigned int year) return (!(year % 4) && (year % 100)) || !(year % 400); } -/* Determine if we can call to driver to set the time. Drivers can only be - * called to set a second aligned time value, and the field set_offset_nsec - * specifies how far away from the second aligned time to call the driver. - * - * This also computes 'to_set' which is the time we are trying to set, and has - * a zero in tv_nsecs, such that: - * to_set - set_delay_nsec == now +/- FUZZ - * - */ -static inline bool rtc_tv_nsec_ok(s64 set_offset_nsec, - struct timespec64 *to_set, - const struct timespec64 *now) -{ - /* Allowed error in tv_nsec, arbitarily set to 5 jiffies in ns. */ - const unsigned long TIME_SET_NSEC_FUZZ = TICK_NSEC * 5; - struct timespec64 delay = {.tv_sec = 0, - .tv_nsec = set_offset_nsec}; - - *to_set = timespec64_add(*now, delay); - - if (to_set->tv_nsec < TIME_SET_NSEC_FUZZ) { - to_set->tv_nsec = 0; - return true; - } - - if (to_set->tv_nsec > NSEC_PER_SEC - TIME_SET_NSEC_FUZZ) { - to_set->tv_sec++; - to_set->tv_nsec = 0; - return true; - } - return false; -} - #define rtc_register_device(device) \ __rtc_register_device(THIS_MODULE, device) diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index ff1a7b8ec4ef..84a554622cee 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -519,15 +519,94 @@ static void sched_sync_hw_clock(unsigned long offset_nsec, bool retry) hrtimer_start(&sync_hrtimer, exp, HRTIMER_MODE_ABS); } +/* + * Determine if we can call to driver to set the time. Drivers can only be + * called to set a second aligned time value, and the field set_offset_nsec + * specifies how far away from the second aligned time to call the driver. + * + * This also computes 'to_set' which is the time we are trying to set, and has + * a zero in tv_nsecs, such that: + * to_set - set_delay_nsec == now +/- FUZZ + * + */ +static inline bool rtc_tv_nsec_ok(long set_offset_nsec, + struct timespec64 *to_set, + const struct timespec64 *now) +{ + /* Allowed error in tv_nsec, arbitarily set to 5 jiffies in ns. 
*/ + const unsigned long TIME_SET_NSEC_FUZZ = TICK_NSEC * 5; + struct timespec64 delay = {.tv_sec = 0, + .tv_nsec = set_offset_nsec}; + + *to_set = timespec64_add(*now, delay); + + if (to_set->tv_nsec < TIME_SET_NSEC_FUZZ) { + to_set->tv_nsec = 0; + return true; + } + + if (to_set->tv_nsec > NSEC_PER_SEC - TIME_SET_NSEC_FUZZ) { + to_set->tv_sec++; + to_set->tv_nsec = 0; + return true; + } + return false; +} + +#ifdef CONFIG_RTC_SYSTOHC +/* + * rtc_set_ntp_time - Save NTP synchronized time to the RTC + */ +static int rtc_set_ntp_time(struct timespec64 now, unsigned long *target_nsec) +{ + struct rtc_device *rtc; + struct rtc_time tm; + struct timespec64 to_set; + int err = -ENODEV; + bool ok; + + rtc = rtc_class_open(CONFIG_RTC_SYSTOHC_DEVICE); + if (!rtc) + goto out_err; + + if (!rtc->ops || !rtc->ops->set_time) + goto out_close; + + /* + * Compute the value of tv_nsec we require the caller to supply in + * now.tv_nsec. This is the value such that (now + + * set_offset_nsec).tv_nsec == 0. + */ + set_normalized_timespec64(&to_set, 0, -rtc->set_offset_nsec); + *target_nsec = to_set.tv_nsec; + + /* + * The ntp code must call this with the correct value in tv_nsec, if + * it does not we update target_nsec and return EPROTO to make the ntp + * code try again later. + */ + ok = rtc_tv_nsec_ok(rtc->set_offset_nsec, &to_set, &now); + if (!ok) { + err = -EPROTO; + goto out_close; + } + + rtc_time64_to_tm(to_set.tv_sec, &tm); + + err = rtc_set_time(rtc, &tm); + +out_close: + rtc_class_close(rtc); +out_err: + return err; +} + static void sync_rtc_clock(void) { unsigned long offset_nsec; struct timespec64 adjust; int rc; - if (!IS_ENABLED(CONFIG_RTC_SYSTOHC)) - return; - ktime_get_real_ts64(&adjust); if (persistent_clock_is_local) @@ -544,6 +623,9 @@ static void sync_rtc_clock(void) sched_sync_hw_clock(offset_nsec, rc != 0); } +#else +static inline void sync_rtc_clock(void) { } +#endif #ifdef CONFIG_GENERIC_CMOS_UPDATE int __weak update_persistent_clock64(struct timespec64 now64) -- cgit v1.2.3 From 69eca258c85000564577642ba28335eb4e1df8f0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 6 Dec 2020 22:46:20 +0100 Subject: ntp: Make the RTC sync offset less obscure

The current RTC set_offset_nsec value is not really intuitive to understand.

  tsched        twrite(t2.tv_sec - 1)        t2 (seconds increment)

The offset is calculated from twrite based on the assumption that t2 - twrite == 1s. That means for the MC146818 RTC the offset needs to be negative so that the write happens 500ms before t2.

It's easier to understand when the whole calculation is based on t2. That avoids negative offsets and the meaning is obvious:

  t2 - twrite:     The time defined by the chip when seconds increment
                   after the write.

  twrite - tsched: The time for the transport to the point where the chip
                   is updated.

==> set_offset_nsec = t2 - tsched
    ttransport      = twrite - tsched
    tRTCinc         = t2 - twrite
==> set_offset_nsec = ttransport + tRTCinc

tRTCinc is a chip property and can be obtained from the data sheet. ttransport depends on how the RTC is connected. It is close to 0 for directly accessible RTCs. For RTCs behind a slow bus, e.g. i2c, it's the time required to send the update over the bus. This can be estimated or even calibrated, but that's a different problem.

Adjust the implementation and update comments accordingly.
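To make the new convention concrete, here is a small worked sketch using the two offsets from the hunks below (NSEC_PER_SEC / 2 for the MC146818, NSEC_PER_SEC plus the 5ms class-code default); the target instant t2 is made up for illustration:

#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_SEC  1000000000ULL
#define NSEC_PER_MSEC 1000000ULL

int main(void)
{
    /* set_offset_nsec = ttransport + tRTCinc */
    uint64_t cmos_offset  = 0 + NSEC_PER_SEC / 2;             /* direct access, increments 500ms after write */
    uint64_t class_offset = 5 * NSEC_PER_MSEC + NSEC_PER_SEC; /* 5ms transport, increments 1s after write */

    uint64_t t2 = 100 * NSEC_PER_SEC;  /* illustrative: chip shall tick to 100s here */

    /* tsched = t2 - set_offset_nsec, written value is t2.tv_sec - 1 */
    printf("cmos:  schedule write at %.3fs, write %llu\n",
           (double)(t2 - cmos_offset) / NSEC_PER_SEC,
           (unsigned long long)(t2 / NSEC_PER_SEC - 1));
    printf("class: schedule write at %.3fs, write %llu\n",
           (double)(t2 - class_offset) / NSEC_PER_SEC,
           (unsigned long long)(t2 / NSEC_PER_SEC - 1));
    return 0;
}

This prints a write scheduled at 99.500s for the CMOS case and at 98.995s for the class default, both writing the value 99. Either way the chip's own seconds increment lands exactly on t2; only the lead time differs, and it is always positive now.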
Signed-off-by: Thomas Gleixner Acked-by: Alexandre Belloni Link: https://lore.kernel.org/r/20201206220542.263204937@linutronix.de --- drivers/rtc/class.c | 9 +++++++-- drivers/rtc/rtc-cmos.c | 2 +- include/linux/rtc.h | 35 +++++++++++++++++++++++++++++------ kernel/time/ntp.c | 47 ++++++++++++++++++++++++----------------------- 4 files changed, 61 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/drivers/rtc/class.c b/drivers/rtc/class.c index d7957376eb96..5855aa2eef62 100644 --- a/drivers/rtc/class.c +++ b/drivers/rtc/class.c @@ -200,8 +200,13 @@ static struct rtc_device *rtc_allocate_device(void) device_initialize(&rtc->dev); - /* Drivers can revise this default after allocating the device. */ - rtc->set_offset_nsec = 5 * NSEC_PER_MSEC; + /* + * Drivers can revise this default after allocating the device. + * The default is what most RTCs do: Increment seconds exactly one + * second after the write happened. This adds a default transport + * time of 5ms which is at least halfways close to reality. + */ + rtc->set_offset_nsec = NSEC_PER_SEC + 5 * NSEC_PER_MSEC; rtc->irq_freq = 1; rtc->max_user_freq = 64; diff --git a/drivers/rtc/rtc-cmos.c b/drivers/rtc/rtc-cmos.c index 7728faca01b4..c5bcd2adc9fe 100644 --- a/drivers/rtc/rtc-cmos.c +++ b/drivers/rtc/rtc-cmos.c @@ -869,7 +869,7 @@ cmos_do_probe(struct device *dev, struct resource *ports, int rtc_irq) goto cleanup2; /* Set the sync offset for the periodic 11min update correct */ - cmos_rtc.rtc->set_offset_nsec = -(NSEC_PER_SEC / 2); + cmos_rtc.rtc->set_offset_nsec = NSEC_PER_SEC / 2; /* export at least the first block of NVRAM */ nvmem_cfg.size = address_space - NVRAM_OFFSET; diff --git a/include/linux/rtc.h b/include/linux/rtc.h index ff62680b48ca..b829382de6c3 100644 --- a/include/linux/rtc.h +++ b/include/linux/rtc.h @@ -110,13 +110,36 @@ struct rtc_device { /* Some hardware can't support UIE mode */ int uie_unsupported; - /* Number of nsec it takes to set the RTC clock. This influences when - * the set ops are called. An offset: - * - of 0.5 s will call RTC set for wall clock time 10.0 s at 9.5 s - * - of 1.5 s will call RTC set for wall clock time 10.0 s at 8.5 s - * - of -0.5 s will call RTC set for wall clock time 10.0 s at 10.5 s + /* + * This offset specifies the update timing of the RTC. + * + * tsched t1 write(t2.tv_sec - 1sec)) t2 RTC increments seconds + * + * The offset defines how tsched is computed so that the write to + * the RTC (t2.tv_sec - 1sec) is correct versus the time required + * for the transport of the write and the time which the RTC needs + * to increment seconds the first time after the write (t2). + * + * For direct accessible RTCs tsched ~= t1 because the write time + * is negligible. For RTCs behind slow busses the transport time is + * significant and has to be taken into account. + * + * The time between the write (t1) and the first increment after + * the write (t2) is RTC specific. For a MC146818 RTC it's 500ms, + * for many others it's exactly 1 second. Consult the datasheet. + * + * The value of this offset is also used to calculate the to be + * written value (t2.tv_sec - 1sec) at tsched. + * + * The default value for this is NSEC_PER_SEC + 10 msec default + * transport time. 
The offset can be adjusted by drivers so the + * calculation for the to be written value at tsched becomes + * correct: + * + * newval = tsched + set_offset_nsec - NSEC_PER_SEC + * and (tsched + set_offset_nsec) % NSEC_PER_SEC == 0 */ - long set_offset_nsec; + unsigned long set_offset_nsec; bool registered; diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 84a554622cee..a34ac069335f 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -520,22 +520,33 @@ static void sched_sync_hw_clock(unsigned long offset_nsec, bool retry) } /* - * Determine if we can call to driver to set the time. Drivers can only be - * called to set a second aligned time value, and the field set_offset_nsec - * specifies how far away from the second aligned time to call the driver. + * Check whether @now is correct versus the required time to update the RTC + * and calculate the value which needs to be written to the RTC so that the + * next seconds increment of the RTC after the write is aligned with the next + * seconds increment of clock REALTIME. * - * This also computes 'to_set' which is the time we are trying to set, and has - * a zero in tv_nsecs, such that: - * to_set - set_delay_nsec == now +/- FUZZ + * tsched t1 write(t2.tv_sec - 1sec)) t2 RTC increments seconds * + * t2.tv_nsec == 0 + * tsched = t2 - set_offset_nsec + * newval = t2 - NSEC_PER_SEC + * + * ==> neval = tsched + set_offset_nsec - NSEC_PER_SEC + * + * As the execution of this code is not guaranteed to happen exactly at + * tsched this allows it to happen within a fuzzy region: + * + * abs(now - tsched) < FUZZ + * + * If @now is not inside the allowed window the function returns false. */ -static inline bool rtc_tv_nsec_ok(long set_offset_nsec, +static inline bool rtc_tv_nsec_ok(unsigned long set_offset_nsec, struct timespec64 *to_set, const struct timespec64 *now) { /* Allowed error in tv_nsec, arbitarily set to 5 jiffies in ns. */ const unsigned long TIME_SET_NSEC_FUZZ = TICK_NSEC * 5; - struct timespec64 delay = {.tv_sec = 0, + struct timespec64 delay = {.tv_sec = -1, .tv_nsec = set_offset_nsec}; *to_set = timespec64_add(*now, delay); @@ -557,11 +568,11 @@ static inline bool rtc_tv_nsec_ok(long set_offset_nsec, /* * rtc_set_ntp_time - Save NTP synchronized time to the RTC */ -static int rtc_set_ntp_time(struct timespec64 now, unsigned long *target_nsec) +static int rtc_set_ntp_time(struct timespec64 now, unsigned long *offset_nsec) { + struct timespec64 to_set; struct rtc_device *rtc; struct rtc_time tm; - struct timespec64 to_set; int err = -ENODEV; bool ok; @@ -572,19 +583,9 @@ static int rtc_set_ntp_time(struct timespec64 now, unsigned long *target_nsec) if (!rtc->ops || !rtc->ops->set_time) goto out_close; - /* - * Compute the value of tv_nsec we require the caller to supply in - * now.tv_nsec. This is the value such that (now + - * set_offset_nsec).tv_nsec == 0. - */ - set_normalized_timespec64(&to_set, 0, -rtc->set_offset_nsec); - *target_nsec = to_set.tv_nsec; + /* Store the update offset for this RTC */ + *offset_nsec = rtc->set_offset_nsec; - /* - * The ntp code must call this with the correct value in tv_nsec, if - * it does not we update target_nsec and return EPROTO to make the ntp - * code try again later. - */ ok = rtc_tv_nsec_ok(rtc->set_offset_nsec, &to_set, &now); if (!ok) { err = -EPROTO; @@ -657,7 +658,7 @@ static bool sync_cmos_clock(void) * implement this legacy API. 
*/ ktime_get_real_ts64(&now); - if (rtc_tv_nsec_ok(-1 * target_nsec, &adjust, &now)) { + if (rtc_tv_nsec_ok(target_nsec, &adjust, &now)) { if (persistent_clock_is_local) adjust.tv_sec -= (sys_tz.tz_minuteswest * 60); rc = update_persistent_clock64(adjust); -- cgit v1.2.3 From 76e87d96b30b5fee91b381fbc444a3eabcd9469a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 6 Dec 2020 22:46:21 +0100 Subject: ntp: Consolidate the RTC update implementation The code for the legacy RTC and the RTC class based update are pretty much the same. Consolidate the common parts into one function and just invoke the actual setter functions. For RTC class based devices the update code checks whether the offset is valid for the device, which is usually not the case for the first invocation. If it's not the same it stores the correct offset and lets the caller try again. That's not much different from the previous approach where the first invocation had a pretty low probability to actually hit the allowed window. Signed-off-by: Thomas Gleixner Reviewed-by: Jason Gunthorpe Acked-by: Alexandre Belloni Link: https://lore.kernel.org/r/20201206220542.355743355@linutronix.de --- kernel/time/ntp.c | 144 ++++++++++++++++++++---------------------------------- 1 file changed, 52 insertions(+), 92 deletions(-) (limited to 'kernel') diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index a34ac069335f..7404d3831527 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -564,118 +564,53 @@ static inline bool rtc_tv_nsec_ok(unsigned long set_offset_nsec, return false; } +#ifdef CONFIG_GENERIC_CMOS_UPDATE +int __weak update_persistent_clock64(struct timespec64 now64) +{ + return -ENODEV; +} +#else +static inline int update_persistent_clock64(struct timespec64 now64) +{ + return -ENODEV; +} +#endif + #ifdef CONFIG_RTC_SYSTOHC -/* - * rtc_set_ntp_time - Save NTP synchronized time to the RTC - */ -static int rtc_set_ntp_time(struct timespec64 now, unsigned long *offset_nsec) +/* Save NTP synchronized time to the RTC */ +static int update_rtc(struct timespec64 *to_set, unsigned long *offset_nsec) { - struct timespec64 to_set; struct rtc_device *rtc; struct rtc_time tm; int err = -ENODEV; - bool ok; rtc = rtc_class_open(CONFIG_RTC_SYSTOHC_DEVICE); if (!rtc) - goto out_err; + return -ENODEV; if (!rtc->ops || !rtc->ops->set_time) goto out_close; - /* Store the update offset for this RTC */ - *offset_nsec = rtc->set_offset_nsec; - - ok = rtc_tv_nsec_ok(rtc->set_offset_nsec, &to_set, &now); - if (!ok) { - err = -EPROTO; - goto out_close; + /* First call might not have the correct offset */ + if (*offset_nsec == rtc->set_offset_nsec) { + rtc_time64_to_tm(to_set->tv_sec, &tm); + err = rtc_set_time(rtc, &tm); + } else { + /* Store the update offset and let the caller try again */ + *offset_nsec = rtc->set_offset_nsec; + err = -EAGAIN; } - - rtc_time64_to_tm(to_set.tv_sec, &tm); - - err = rtc_set_time(rtc, &tm); - out_close: rtc_class_close(rtc); -out_err: return err; } - -static void sync_rtc_clock(void) -{ - unsigned long offset_nsec; - struct timespec64 adjust; - int rc; - - ktime_get_real_ts64(&adjust); - - if (persistent_clock_is_local) - adjust.tv_sec -= (sys_tz.tz_minuteswest * 60); - - /* - * The current RTC in use will provide the nanoseconds offset prior - * to a full second it wants to be called at, and invokes - * rtc_tv_nsec_ok() internally. 
- */ - rc = rtc_set_ntp_time(adjust, &offset_nsec); - if (rc == -ENODEV) - return; - - sched_sync_hw_clock(offset_nsec, rc != 0); -} #else -static inline void sync_rtc_clock(void) { } -#endif - -#ifdef CONFIG_GENERIC_CMOS_UPDATE -int __weak update_persistent_clock64(struct timespec64 now64) +static inline int update_rtc(struct timespec64 *to_set, unsigned long *offset_nsec) { return -ENODEV; } #endif -static bool sync_cmos_clock(void) -{ - static bool no_cmos; - struct timespec64 now; - struct timespec64 adjust; - int rc = -EPROTO; - long target_nsec = NSEC_PER_SEC / 2; - - if (!IS_ENABLED(CONFIG_GENERIC_CMOS_UPDATE)) - return false; - - if (no_cmos) - return false; - - /* - * Historically update_persistent_clock64() has followed x86 - * semantics, which match the MC146818A/etc RTC. This RTC will store - * 'adjust' and then in .5s it will advance once second. - * - * Architectures are strongly encouraged to use rtclib and not - * implement this legacy API. - */ - ktime_get_real_ts64(&now); - if (rtc_tv_nsec_ok(target_nsec, &adjust, &now)) { - if (persistent_clock_is_local) - adjust.tv_sec -= (sys_tz.tz_minuteswest * 60); - rc = update_persistent_clock64(adjust); - /* - * The machine does not support update_persistent_clock64 even - * though it defines CONFIG_GENERIC_CMOS_UPDATE. - */ - if (rc == -ENODEV) { - no_cmos = true; - return false; - } - } - - sched_sync_hw_clock(target_nsec, rc != 0); - return true; -} - /* * If we have an externally synchronized Linux clock, then update RTC clock * accordingly every ~11 minutes. Generally RTCs can only store second @@ -686,6 +621,15 @@ static bool sync_cmos_clock(void) */ static void sync_hw_clock(struct work_struct *work) { + /* + * The default synchronization offset is 500ms for the deprecated + * update_persistent_clock64() under the assumption that it uses + * the infamous CMOS clock (MC146818). + */ + static unsigned long offset_nsec = NSEC_PER_SEC / 2; + struct timespec64 now, to_set; + int res = -EAGAIN; + /* * Don't update if STA_UNSYNC is set and if ntp_notify_cmos_timer() * managed to schedule the work between the timer firing and the @@ -694,10 +638,26 @@ static void sync_hw_clock(struct work_struct *work) if (!ntp_synced() || hrtimer_is_queued(&sync_hrtimer)) return; - if (sync_cmos_clock()) - return; + ktime_get_real_ts64(&now); + /* If @now is not in the allowed window, try again */ + if (!rtc_tv_nsec_ok(offset_nsec, &to_set, &now)) + goto rearm; - sync_rtc_clock(); + /* Take timezone adjusted RTCs into account */ + if (persistent_clock_is_local) + to_set.tv_sec -= (sys_tz.tz_minuteswest * 60); + + /* Try the legacy RTC first. */ + res = update_persistent_clock64(to_set); + if (res != -ENODEV) + goto rearm; + + /* Try the RTC class */ + res = update_rtc(&to_set, &offset_nsec); + if (res == -ENODEV) + return; +rearm: + sched_sync_hw_clock(offset_nsec, res != 0); } void ntp_notify_cmos_timer(void) -- cgit v1.2.3 From aa3b66f401b372598b29421bab4d17b631b92407 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 4 Dec 2020 11:55:19 +0100 Subject: tick/sched: Make jiffies update quick check more robust The quick check in tick_do_update_jiffies64() whether jiffies need to be updated is not really correct under all circumstances and on all architectures, especially not on 32bit systems. 
The quick check does:

    if (now < READ_ONCE(tick_next_period))
        return;

and the counterpart in the update is:

    WRITE_ONCE(tick_next_period, next_update_time);

This has two problems:

  1) On weakly ordered architectures there is no guarantee that the
     stores before the WRITE_ONCE() are visible, which means that other
     CPUs can operate on a stale jiffies value.

  2) On 32bit the store of tick_next_period, which is a u64, is split
     into two 32bit stores. If the first 32bit store advances
     tick_next_period far out and the second 32bit store is delayed
     (virt, NMI ...) then jiffies will become stale until the second
     32bit store happens.

Address this by separating the handling for 32bit and 64bit.

On 64bit problem #1 is addressed by replacing READ_ONCE() / WRITE_ONCE() with smp_load_acquire() / smp_store_release().

On 32bit problem #2 is addressed by protecting the quick check with the jiffies sequence counter. The load and stores can be plain because the sequence count mechanics provide the required barriers already. Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lore.kernel.org/r/87czzpc02w.fsf@nanos.tec.linutronix.de --- kernel/time/tick-sched.c | 74 ++++++++++++++++++++++++++++++------------------ 1 file changed, 47 insertions(+), 27 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index cc7cba20382e..a9e68936822d 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -57,36 +57,42 @@ static ktime_t last_jiffies_update; static void tick_do_update_jiffies64(ktime_t now) { unsigned long ticks = 1; - ktime_t delta; + ktime_t delta, nextp; /* - * Do a quick check without holding jiffies_lock. The READ_ONCE() + * 64bit can do a quick check without holding jiffies lock and + * without looking at the sequence count. The smp_load_acquire() * pairs with the update done later in this function. * - * This is also an intentional data race which is even safe on - * 32bit in theory. If there is a concurrent update then the check - * might give a random answer. It does not matter because if it - * returns then the concurrent update is already taking care, if it - * falls through then it will pointlessly contend on jiffies_lock. - * - * Though there is one nasty case on 32bit due to store tearing of - * the 64bit value. If the first 32bit store makes the quick check - * return on all other CPUs and the writing CPU context gets - * delayed to complete the second store (scheduled out on virt) - * then jiffies can become stale for up to ~2^32 nanoseconds - * without noticing. After that point all CPUs will wait for - * jiffies lock. - * - * OTOH, this is not any different than the situation with NOHZ=off - * where one CPU is responsible for updating jiffies and - * timekeeping. If that CPU goes out for lunch then all other CPUs - * will operate on stale jiffies until it decides to come back. + * 32bit cannot do that because the store of tick_next_period + * consists of two 32bit stores and the first store could move it + * to a random point in the future. */ - if (ktime_before(now, READ_ONCE(tick_next_period))) - return; + if (IS_ENABLED(CONFIG_64BIT)) { + if (ktime_before(now, smp_load_acquire(&tick_next_period))) + return; + } else { + unsigned int seq; - /* Reevaluate with jiffies_lock held */ + /* + * Avoid contention on jiffies_lock and protect the quick + * check with the sequence count.
+ */ + do { + seq = read_seqcount_begin(&jiffies_seq); + nextp = tick_next_period; + } while (read_seqcount_retry(&jiffies_seq, seq)); + + if (ktime_before(now, nextp)) + return; + } + + /* Quick check failed, i.e. update is required. */ raw_spin_lock(&jiffies_lock); + /* + * Reevaluate with the lock held. Another CPU might have done the + * update already. + */ if (ktime_before(now, tick_next_period)) { raw_spin_unlock(&jiffies_lock); return; @@ -112,11 +118,25 @@ static void tick_do_update_jiffies64(ktime_t now) jiffies_64 += ticks; /* - * Keep the tick_next_period variable up to date. WRITE_ONCE() - * pairs with the READ_ONCE() in the lockless quick check above. + * Keep the tick_next_period variable up to date. */ - WRITE_ONCE(tick_next_period, - ktime_add_ns(last_jiffies_update, TICK_NSEC)); + nextp = ktime_add_ns(last_jiffies_update, TICK_NSEC); + + if (IS_ENABLED(CONFIG_64BIT)) { + /* + * Pairs with smp_load_acquire() in the lockless quick + * check above and ensures that the update to jiffies_64 is + * not reordered vs. the store to tick_next_period, neither + * by the compiler nor by the CPU. + */ + smp_store_release(&tick_next_period, nextp); + } else { + /* + * A plain store is good enough on 32bit as the quick check + * above is protected by the sequence count. + */ + tick_next_period = nextp; + } /* * Release the sequence count. calc_global_load() below is not -- cgit v1.2.3
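The 64bit half of this scheme maps directly onto acquire/release semantics as found in C11. A minimal user-space sketch, assuming a simplified single-tick update path (the names are made up, and the real code additionally handles multi-tick deltas, the jiffies lock and the 32bit seqcount variant):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

#define TICK_NSEC 1000000ULL   /* illustrative 1ms tick */

static _Atomic uint64_t next_period;
static uint64_t jiffies64;     /* only written by the (elided) lock holder */

/* Update side: the release store publishes the jiffies64 update.
 * Any reader whose acquire load observes the new period value is
 * guaranteed to also observe the new jiffies64. */
static void update_jiffies(uint64_t now)
{
    jiffies64++;
    atomic_store_explicit(&next_period, now + TICK_NSEC,
                          memory_order_release);
}

/* Lockless quick check: pairs with the release store above. A reader
 * seeing a stale period value merely falls through to the locked
 * slow path; it can never see a new period with stale jiffies. */
static bool update_needed(uint64_t now)
{
    return now >= atomic_load_explicit(&next_period,
                                       memory_order_acquire);
}

On 32bit targets a 64bit release store is generally not a single atomic instruction, so the store can tear, which is exactly why the patch protects the read side with the jiffies_seq sequence counter there instead.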