From 58b79a91f57efec9457de8ff93a4cc4fb8daf753 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 6 Sep 2013 15:31:08 -0400 Subject: cgroup: fix cgroup post-order descendant walk of empty subtree bd8815a6d8 ("cgroup: make css_for_each_descendant() and friends include the origin css in the iteration") updated cgroup descendant iterators to include the origin css; unfortuantely, it forgot to drop special case handling in css_next_descendant_post() for empty subtree leading to failure to visit the origin css without any child. Fix it by dropping the special case handling and always returning the leftmost descendant on the first iteration. Signed-off-by: Tejun Heo Acked-by: Li Zefan --- kernel/cgroup.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index e0aeb32415ff..8075b72d22be 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3187,11 +3187,9 @@ css_next_descendant_post(struct cgroup_subsys_state *pos, WARN_ON_ONCE(!rcu_read_lock_held()); - /* if first iteration, visit the leftmost descendant */ - if (!pos) { - next = css_leftmost_descendant(root); - return next != root ? next : NULL; - } + /* if first iteration, visit leftmost descendant which may be @root */ + if (!pos) + return css_leftmost_descendant(root); /* if we visited @root, we're done */ if (pos == root) -- cgit v1.2.3 From ea84753c98a7ac6b74e530b64c444a912b3835ca Mon Sep 17 00:00:00 2001 From: Anjana V Kumar Date: Sat, 12 Oct 2013 10:59:17 +0800 Subject: cgroup: fix to break the while loop in cgroup_attach_task() correctly Both Anjana and Eunki reported a stall in the while_each_thread loop in cgroup_attach_task(). It's because, when we attach a single thread to a cgroup, if the cgroup is exiting or is already in that cgroup, we won't break the loop. If the task is already in the cgroup, the bug can lead to another thread being attached to the cgroup unexpectedly: # echo 5207 > tasks # cat tasks 5207 # echo 5207 > tasks # cat tasks 5207 5215 What's worse, if the task to be attached isn't the leader of the thread group, we might never exit the loop, hence cpu stall. Thanks for Oleg's analysis. This bug was introduced by commit 081aa458c38ba576bdd4265fc807fa95b48b9e79 ("cgroup: consolidate cgroup_attach_task() and cgroup_attach_proc()") [ lizf: - fixed the first continue, pointed out by Oleg, - rewrote changelog. ] Cc: # 3.9+ Reported-by: Eunki Kim Reported-by: Anjana V Kumar Signed-off-by: Anjana V Kumar Signed-off-by: Li Zefan Signed-off-by: Tejun Heo --- kernel/cgroup.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 8075b72d22be..1bf4f7a12703 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2038,7 +2038,7 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, /* @tsk either already exited or can't exit until the end */ if (tsk->flags & PF_EXITING) - continue; + goto next; /* as per above, nr_threads may decrease, but not increase. */ BUG_ON(i >= group_size); @@ -2046,7 +2046,7 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, ent.cgrp = task_cgroup_from_root(tsk, root); /* nothing to do if this task is already in the cgroup */ if (ent.cgrp == cgrp) - continue; + goto next; /* * saying GFP_ATOMIC has no effect here because we did prealloc * earlier, but it's good form to communicate our expectations. @@ -2054,7 +2054,7 @@ static int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk, retval = flex_array_put(group, i, &ent, GFP_ATOMIC); BUG_ON(retval != 0); i++; - + next: if (!threadgroup) break; } while_each_thread(leader, tsk); -- cgit v1.2.3 From 3090ffb5a2515990182f3f55b0688a7817325488 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Thu, 17 Oct 2013 19:32:15 +0200 Subject: perf: Disable PERF_RECORD_MMAP2 support For now, we disable the extended MMAP record support (MMAP2). We have identified cases where it would not report the correct mapping information, clone(VM_CLONE) but with separate pids. We will revisit the support once we find a solution for this case. The patch changes the kernel to return EINVAL if attr->mmap2 is set. The patch also modifies the perf tool to use regular PERF_RECORD_MMAP for synthetic events and it also prevents the tool from requesting attr->mmap2 mode because the kernel would reject it. The support will be revisited once the kenrel interface is updated. In V2, we reduce the patch to the strict minimum. In V3, we avoid calling perf_event_open() with mmap2 set because we know it will fail and require fallback retry. Signed-off-by: Stephane Eranian Cc: Andi Kleen Cc: David Ahern Cc: Ingo Molnar Cc: Jiri Olsa Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20131017173215.GA8820@quad Signed-off-by: Arnaldo Carvalho de Melo --- kernel/events/core.c | 4 ++++ tools/perf/util/event.c | 30 +++++++++++++----------------- tools/perf/util/evsel.c | 1 - 3 files changed, 17 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index d49a9d29334c..953c14348375 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6767,6 +6767,10 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr, if (ret) return -EFAULT; + /* disabled for now */ + if (attr->mmap2) + return -EINVAL; + if (attr->__reserved_1) return -EINVAL; diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c index 9b393e7dca6f..63df031fc9c7 100644 --- a/tools/perf/util/event.c +++ b/tools/perf/util/event.c @@ -187,7 +187,7 @@ static int perf_event__synthesize_mmap_events(struct perf_tool *tool, return -1; } - event->header.type = PERF_RECORD_MMAP2; + event->header.type = PERF_RECORD_MMAP; /* * Just like the kernel, see __perf_event_mmap in kernel/perf_event.c */ @@ -198,7 +198,6 @@ static int perf_event__synthesize_mmap_events(struct perf_tool *tool, char prot[5]; char execname[PATH_MAX]; char anonstr[] = "//anon"; - unsigned int ino; size_t size; ssize_t n; @@ -209,13 +208,10 @@ static int perf_event__synthesize_mmap_events(struct perf_tool *tool, strcpy(execname, ""); /* 00400000-0040c000 r-xp 00000000 fd:01 41038 /bin/cat */ - n = sscanf(bf, "%"PRIx64"-%"PRIx64" %s %"PRIx64" %x:%x %u %s\n", - &event->mmap2.start, &event->mmap2.len, prot, - &event->mmap2.pgoff, &event->mmap2.maj, - &event->mmap2.min, - &ino, execname); - - event->mmap2.ino = (u64)ino; + n = sscanf(bf, "%"PRIx64"-%"PRIx64" %s %"PRIx64" %*x:%*x %*u %s\n", + &event->mmap.start, &event->mmap.len, prot, + &event->mmap.pgoff, + execname); if (n != 8) continue; @@ -227,15 +223,15 @@ static int perf_event__synthesize_mmap_events(struct perf_tool *tool, strcpy(execname, anonstr); size = strlen(execname) + 1; - memcpy(event->mmap2.filename, execname, size); + memcpy(event->mmap.filename, execname, size); size = PERF_ALIGN(size, sizeof(u64)); - event->mmap2.len -= event->mmap.start; - event->mmap2.header.size = (sizeof(event->mmap2) - - (sizeof(event->mmap2.filename) - size)); - memset(event->mmap2.filename + size, 0, machine->id_hdr_size); - event->mmap2.header.size += machine->id_hdr_size; - event->mmap2.pid = tgid; - event->mmap2.tid = pid; + event->mmap.len -= event->mmap.start; + event->mmap.header.size = (sizeof(event->mmap) - + (sizeof(event->mmap.filename) - size)); + memset(event->mmap.filename + size, 0, machine->id_hdr_size); + event->mmap.header.size += machine->id_hdr_size; + event->mmap.pid = tgid; + event->mmap.tid = pid; if (process(tool, event, &synth_sample, machine) != 0) { rc = -1; diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index 0ce9febf1ba0..9f1ef9bee2d0 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -678,7 +678,6 @@ void perf_evsel__config(struct perf_evsel *evsel, attr->sample_type |= PERF_SAMPLE_WEIGHT; attr->mmap = track; - attr->mmap2 = track && !perf_missing_features.mmap2; attr->comm = track; /* -- cgit v1.2.3 From b0267507dfd0187fb7840a0ec461a510a7f041c5 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Thu, 17 Oct 2013 19:45:29 +0900 Subject: mutex: Avoid gcc version dependent __builtin_constant_p() usage Commit 040a0a37 ("mutex: Add support for wound/wait style locks") used "!__builtin_constant_p(p == NULL)" but gcc 3.x cannot handle such expression correctly, leading to boot failure when built with CONFIG_DEBUG_MUTEXES=y. Fix it by explicitly passing a bool which tells whether p != NULL or not. [ PeterZ: This is a sad patch, but provided it actually generates similar code I suppose its the best we can do bar whole sale deprecating gcc-3. ] Signed-off-by: Tetsuo Handa Acked-by: Peter Zijlstra Acked-by: Maarten Lankhorst Cc: peterz@infradead.org Cc: imirkin@alum.mit.edu Cc: daniel.vetter@ffwll.ch Cc: robdclark@gmail.com Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/201310171945.AGB17114.FSQVtHOJFOOFML@I-love.SAKURA.ne.jp Signed-off-by: Ingo Molnar --- kernel/mutex.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/mutex.c b/kernel/mutex.c index 6d647aedffea..d24105b1b794 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c @@ -410,7 +410,7 @@ ww_mutex_set_context_fastpath(struct ww_mutex *lock, static __always_inline int __sched __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, struct lockdep_map *nest_lock, unsigned long ip, - struct ww_acquire_ctx *ww_ctx) + struct ww_acquire_ctx *ww_ctx, const bool use_ww_ctx) { struct task_struct *task = current; struct mutex_waiter waiter; @@ -450,7 +450,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, struct task_struct *owner; struct mspin_node node; - if (!__builtin_constant_p(ww_ctx == NULL) && ww_ctx->acquired > 0) { + if (use_ww_ctx && ww_ctx->acquired > 0) { struct ww_mutex *ww; ww = container_of(lock, struct ww_mutex, base); @@ -480,7 +480,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, if ((atomic_read(&lock->count) == 1) && (atomic_cmpxchg(&lock->count, 1, 0) == 1)) { lock_acquired(&lock->dep_map, ip); - if (!__builtin_constant_p(ww_ctx == NULL)) { + if (use_ww_ctx) { struct ww_mutex *ww; ww = container_of(lock, struct ww_mutex, base); @@ -551,7 +551,7 @@ slowpath: goto err; } - if (!__builtin_constant_p(ww_ctx == NULL) && ww_ctx->acquired > 0) { + if (use_ww_ctx && ww_ctx->acquired > 0) { ret = __mutex_lock_check_stamp(lock, ww_ctx); if (ret) goto err; @@ -575,7 +575,7 @@ skip_wait: lock_acquired(&lock->dep_map, ip); mutex_set_owner(lock); - if (!__builtin_constant_p(ww_ctx == NULL)) { + if (use_ww_ctx) { struct ww_mutex *ww = container_of(lock, struct ww_mutex, base); struct mutex_waiter *cur; @@ -615,7 +615,7 @@ mutex_lock_nested(struct mutex *lock, unsigned int subclass) { might_sleep(); __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, - subclass, NULL, _RET_IP_, NULL); + subclass, NULL, _RET_IP_, NULL, 0); } EXPORT_SYMBOL_GPL(mutex_lock_nested); @@ -625,7 +625,7 @@ _mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest) { might_sleep(); __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, - 0, nest, _RET_IP_, NULL); + 0, nest, _RET_IP_, NULL, 0); } EXPORT_SYMBOL_GPL(_mutex_lock_nest_lock); @@ -635,7 +635,7 @@ mutex_lock_killable_nested(struct mutex *lock, unsigned int subclass) { might_sleep(); return __mutex_lock_common(lock, TASK_KILLABLE, - subclass, NULL, _RET_IP_, NULL); + subclass, NULL, _RET_IP_, NULL, 0); } EXPORT_SYMBOL_GPL(mutex_lock_killable_nested); @@ -644,7 +644,7 @@ mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass) { might_sleep(); return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, - subclass, NULL, _RET_IP_, NULL); + subclass, NULL, _RET_IP_, NULL, 0); } EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested); @@ -682,7 +682,7 @@ __ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) might_sleep(); ret = __mutex_lock_common(&lock->base, TASK_UNINTERRUPTIBLE, - 0, &ctx->dep_map, _RET_IP_, ctx); + 0, &ctx->dep_map, _RET_IP_, ctx, 1); if (!ret && ctx->acquired > 1) return ww_mutex_deadlock_injection(lock, ctx); @@ -697,7 +697,7 @@ __ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) might_sleep(); ret = __mutex_lock_common(&lock->base, TASK_INTERRUPTIBLE, - 0, &ctx->dep_map, _RET_IP_, ctx); + 0, &ctx->dep_map, _RET_IP_, ctx, 1); if (!ret && ctx->acquired > 1) return ww_mutex_deadlock_injection(lock, ctx); @@ -809,28 +809,28 @@ __mutex_lock_slowpath(atomic_t *lock_count) struct mutex *lock = container_of(lock_count, struct mutex, count); __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, - NULL, _RET_IP_, NULL); + NULL, _RET_IP_, NULL, 0); } static noinline int __sched __mutex_lock_killable_slowpath(struct mutex *lock) { return __mutex_lock_common(lock, TASK_KILLABLE, 0, - NULL, _RET_IP_, NULL); + NULL, _RET_IP_, NULL, 0); } static noinline int __sched __mutex_lock_interruptible_slowpath(struct mutex *lock) { return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0, - NULL, _RET_IP_, NULL); + NULL, _RET_IP_, NULL, 0); } static noinline int __sched __ww_mutex_lock_slowpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) { return __mutex_lock_common(&lock->base, TASK_UNINTERRUPTIBLE, 0, - NULL, _RET_IP_, ctx); + NULL, _RET_IP_, ctx, 1); } static noinline int __sched @@ -838,7 +838,7 @@ __ww_mutex_lock_interruptible_slowpath(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) { return __mutex_lock_common(&lock->base, TASK_INTERRUPTIBLE, 0, - NULL, _RET_IP_, ctx); + NULL, _RET_IP_, ctx, 1); } #endif -- cgit v1.2.3 From 97b9410643475d6557d2517c2aff9fd2221141a9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 24 Sep 2013 21:50:23 +0200 Subject: clockevents: Sanitize ticks to nsec conversion Marc Kleine-Budde pointed out, that commit 77cc982 "clocksource: use clockevents_config_and_register() where possible" caused a regression for some of the converted subarchs. The reason is, that the clockevents core code converts the minimal hardware tick delta to a nanosecond value for core internal usage. This conversion is affected by integer math rounding loss, so the backwards conversion to hardware ticks will likely result in a value which is less than the configured hardware limitation. The affected subarchs used their own workaround (SIGH!) which got lost in the conversion. The solution for the issue at hand is simple: adding evt->mult - 1 to the shifted value before the integer divison in the core conversion function takes care of it. But this only works for the case where for the scaled math mult/shift pair "mult <= 1 << shift" is true. For the case where "mult > 1 << shift" we can apply the rounding add only for the minimum delta value to make sure that the backward conversion is not less than the given hardware limit. For the upper bound we need to omit the rounding add, because the backwards conversion is always larger than the original latch value. That would violate the upper bound of the hardware device. Though looking closer at the details of that function reveals another bogosity: The upper bounds check is broken as well. Checking for a resulting "clc" value greater than KTIME_MAX after the conversion is pointless. The conversion does: u64 clc = (latch << evt->shift) / evt->mult; So there is no sanity check for (latch << evt->shift) exceeding the 64bit boundary. The latch argument is "unsigned long", so on a 64bit arch the handed in argument could easily lead to an unnoticed shift overflow. With the above rounding fix applied the calculation before the divison is: u64 clc = (latch << evt->shift) + evt->mult - 1; So we need to make sure, that neither the shift nor the rounding add is overflowing the u64 boundary. [ukl: move assignment to rnd after eventually changing mult, fix build issue and correct comment with the right math] Signed-off-by: Thomas Gleixner Cc: Russell King - ARM Linux Cc: Marc Kleine-Budde Cc: nicolas.ferre@atmel.com Cc: Marc Pignat Cc: john.stultz@linaro.org Cc: kernel@pengutronix.de Cc: Ronald Wahl Cc: LAK Cc: Ludovic Desroches Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/1380052223-24139-1-git-send-email-u.kleine-koenig@pengutronix.de Signed-off-by: Uwe Kleine-König --- kernel/time/clockevents.c | 65 ++++++++++++++++++++++++++++++++++++----------- 1 file changed, 50 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 38959c866789..662c5798a685 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -33,29 +33,64 @@ struct ce_unbind { int res; }; -/** - * clockevents_delta2ns - Convert a latch value (device ticks) to nanoseconds - * @latch: value to convert - * @evt: pointer to clock event device descriptor - * - * Math helper, returns latch value converted to nanoseconds (bound checked) - */ -u64 clockevent_delta2ns(unsigned long latch, struct clock_event_device *evt) +static u64 cev_delta2ns(unsigned long latch, struct clock_event_device *evt, + bool ismax) { u64 clc = (u64) latch << evt->shift; + u64 rnd; if (unlikely(!evt->mult)) { evt->mult = 1; WARN_ON(1); } + rnd = (u64) evt->mult - 1; + + /* + * Upper bound sanity check. If the backwards conversion is + * not equal latch, we know that the above shift overflowed. + */ + if ((clc >> evt->shift) != (u64)latch) + clc = ~0ULL; + + /* + * Scaled math oddities: + * + * For mult <= (1 << shift) we can safely add mult - 1 to + * prevent integer rounding loss. So the backwards conversion + * from nsec to device ticks will be correct. + * + * For mult > (1 << shift), i.e. device frequency is > 1GHz we + * need to be careful. Adding mult - 1 will result in a value + * which when converted back to device ticks can be larger + * than latch by up to (mult - 1) >> shift. For the min_delta + * calculation we still want to apply this in order to stay + * above the minimum device ticks limit. For the upper limit + * we would end up with a latch value larger than the upper + * limit of the device, so we omit the add to stay below the + * device upper boundary. + * + * Also omit the add if it would overflow the u64 boundary. + */ + if ((~0ULL - clc > rnd) && + (!ismax || evt->mult <= (1U << evt->shift))) + clc += rnd; do_div(clc, evt->mult); - if (clc < 1000) - clc = 1000; - if (clc > KTIME_MAX) - clc = KTIME_MAX; - return clc; + /* Deltas less than 1usec are pointless noise */ + return clc > 1000 ? clc : 1000; +} + +/** + * clockevents_delta2ns - Convert a latch value (device ticks) to nanoseconds + * @latch: value to convert + * @evt: pointer to clock event device descriptor + * + * Math helper, returns latch value converted to nanoseconds (bound checked) + */ +u64 clockevent_delta2ns(unsigned long latch, struct clock_event_device *evt) +{ + return cev_delta2ns(latch, evt, false); } EXPORT_SYMBOL_GPL(clockevent_delta2ns); @@ -380,8 +415,8 @@ void clockevents_config(struct clock_event_device *dev, u32 freq) sec = 600; clockevents_calc_mult_shift(dev, freq, sec); - dev->min_delta_ns = clockevent_delta2ns(dev->min_delta_ticks, dev); - dev->max_delta_ns = clockevent_delta2ns(dev->max_delta_ticks, dev); + dev->min_delta_ns = cev_delta2ns(dev->min_delta_ticks, dev, false); + dev->max_delta_ns = cev_delta2ns(dev->max_delta_ticks, dev, true); } /** -- cgit v1.2.3 From d3c345dbc7c083414ef74eb22ff26ba2bd100759 Mon Sep 17 00:00:00 2001 From: Russ Dill Date: Thu, 24 Oct 2013 14:25:26 +0100 Subject: PM / hibernate: Move software_resume to late_initcall_sync software_resume is being called after deferred_probe_initcall in drivers base. If the probing of the device that contains the resume image is deferred, and the system has been instructed to wait for it to show up, this wait will occur in software_resume. This causes a deadlock. Move software_resume into late_initcall_sync so that it happens after all the other late_initcalls. Signed-off-by: Russ Dill Acked-by: Pavel Machek Signed-off-by: Rafael J. Wysocki --- kernel/power/hibernate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index c9c759d5a15c..0121dab83f43 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -846,7 +846,7 @@ static int software_resume(void) goto Finish; } -late_initcall(software_resume); +late_initcall_sync(software_resume); static const char * const hibernation_modes[] = { -- cgit v1.2.3