aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLinus Torvalds2015-06-24 15:09:40 -0700
committerLinus Torvalds2015-06-24 15:09:40 -0700
commit98ec21a01896751b673b6c731ca8881daa8b2c6d (patch)
tree9d6d780675436efc894878475284c70f766126dd
parenta262948335bc5359b82f0ed5ef35f6e82ca44d16 (diff)
parentcbce1a686700595de65ee363b9b3283ae85d8fc5 (diff)
Merge branch 'sched-hrtimers-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler updates from Thomas Gleixner: "This series of scheduler updates depends on sched/core and timers/core branches, which are already in your tree: - Scheduler balancing overhaul to plug a hard to trigger race which causes an oops in the balancer (Peter Zijlstra) - Lockdep updates which are related to the balancing updates (Peter Zijlstra)" * 'sched-hrtimers-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: sched,lockdep: Employ lock pinning lockdep: Implement lock pinning lockdep: Simplify lock_release() sched: Streamline the task migration locking a little sched: Move code around sched,dl: Fix sched class hopping CBS hole sched, dl: Convert switched_{from, to}_dl() / prio_changed_dl() to balance callbacks sched,dl: Remove return value from pull_dl_task() sched, rt: Convert switched_{from, to}_rt() / prio_changed_rt() to balance callbacks sched,rt: Remove return value from pull_rt_task() sched: Allow balance callbacks for check_class_changed() sched: Use replace normalize_task() with __sched_setscheduler() sched: Replace post_schedule with a balance callback list
-rw-r--r--include/linux/lockdep.h10
-rw-r--r--kernel/locking/lockdep.c177
-rw-r--r--kernel/sched/core.c540
-rw-r--r--kernel/sched/deadline.c238
-rw-r--r--kernel/sched/fair.c11
-rw-r--r--kernel/sched/rt.c84
-rw-r--r--kernel/sched/sched.h29
7 files changed, 597 insertions, 492 deletions
diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h
index 2722111591a3..70400dc7660f 100644
--- a/include/linux/lockdep.h
+++ b/include/linux/lockdep.h
@@ -255,6 +255,7 @@ struct held_lock {
unsigned int check:1; /* see lock_acquire() comment */
unsigned int hardirqs_off:1;
unsigned int references:12; /* 32 bits */
+ unsigned int pin_count;
};
/*
@@ -354,6 +355,9 @@ extern void lockdep_set_current_reclaim_state(gfp_t gfp_mask);
extern void lockdep_clear_current_reclaim_state(void);
extern void lockdep_trace_alloc(gfp_t mask);
+extern void lock_pin_lock(struct lockdep_map *lock);
+extern void lock_unpin_lock(struct lockdep_map *lock);
+
# define INIT_LOCKDEP .lockdep_recursion = 0, .lockdep_reclaim_gfp = 0,
#define lockdep_depth(tsk) (debug_locks ? (tsk)->lockdep_depth : 0)
@@ -368,6 +372,9 @@ extern void lockdep_trace_alloc(gfp_t mask);
#define lockdep_recursing(tsk) ((tsk)->lockdep_recursion)
+#define lockdep_pin_lock(l) lock_pin_lock(&(l)->dep_map)
+#define lockdep_unpin_lock(l) lock_unpin_lock(&(l)->dep_map)
+
#else /* !CONFIG_LOCKDEP */
static inline void lockdep_off(void)
@@ -420,6 +427,9 @@ struct lock_class_key { };
#define lockdep_recursing(tsk) (0)
+#define lockdep_pin_lock(l) do { (void)(l); } while (0)
+#define lockdep_unpin_lock(l) do { (void)(l); } while (0)
+
#endif /* !LOCKDEP */
#ifdef CONFIG_LOCK_STAT
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 456614136f1a..8acfbf773e06 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -3157,6 +3157,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
hlock->waittime_stamp = 0;
hlock->holdtime_stamp = lockstat_clock();
#endif
+ hlock->pin_count = 0;
if (check && !mark_irqflags(curr, hlock))
return 0;
@@ -3260,26 +3261,6 @@ print_unlock_imbalance_bug(struct task_struct *curr, struct lockdep_map *lock,
return 0;
}
-/*
- * Common debugging checks for both nested and non-nested unlock:
- */
-static int check_unlock(struct task_struct *curr, struct lockdep_map *lock,
- unsigned long ip)
-{
- if (unlikely(!debug_locks))
- return 0;
- /*
- * Lockdep should run with IRQs disabled, recursion, head-ache, etc..
- */
- if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
- return 0;
-
- if (curr->lockdep_depth <= 0)
- return print_unlock_imbalance_bug(curr, lock, ip);
-
- return 1;
-}
-
static int match_held_lock(struct held_lock *hlock, struct lockdep_map *lock)
{
if (hlock->instance == lock)
@@ -3376,31 +3357,35 @@ found_it:
}
/*
- * Remove the lock to the list of currently held locks in a
- * potentially non-nested (out of order) manner. This is a
- * relatively rare operation, as all the unlock APIs default
- * to nested mode (which uses lock_release()):
+ * Remove the lock to the list of currently held locks - this gets
+ * called on mutex_unlock()/spin_unlock*() (or on a failed
+ * mutex_lock_interruptible()).
+ *
+ * @nested is an hysterical artifact, needs a tree wide cleanup.
*/
static int
-lock_release_non_nested(struct task_struct *curr,
- struct lockdep_map *lock, unsigned long ip)
+__lock_release(struct lockdep_map *lock, int nested, unsigned long ip)
{
+ struct task_struct *curr = current;
struct held_lock *hlock, *prev_hlock;
unsigned int depth;
int i;
- /*
- * Check whether the lock exists in the current stack
- * of held locks:
- */
+ if (unlikely(!debug_locks))
+ return 0;
+
depth = curr->lockdep_depth;
/*
* So we're all set to release this lock.. wait what lock? We don't
* own any locks, you've been drinking again?
*/
- if (DEBUG_LOCKS_WARN_ON(!depth))
- return 0;
+ if (DEBUG_LOCKS_WARN_ON(depth <= 0))
+ return print_unlock_imbalance_bug(curr, lock, ip);
+ /*
+ * Check whether the lock exists in the current stack
+ * of held locks:
+ */
prev_hlock = NULL;
for (i = depth-1; i >= 0; i--) {
hlock = curr->held_locks + i;
@@ -3419,6 +3404,8 @@ found_it:
if (hlock->instance == lock)
lock_release_holdtime(hlock);
+ WARN(hlock->pin_count, "releasing a pinned lock\n");
+
if (hlock->references) {
hlock->references--;
if (hlock->references) {
@@ -3456,91 +3443,66 @@ found_it:
*/
if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth - 1))
return 0;
+
return 1;
}
-/*
- * Remove the lock to the list of currently held locks - this gets
- * called on mutex_unlock()/spin_unlock*() (or on a failed
- * mutex_lock_interruptible()). This is done for unlocks that nest
- * perfectly. (i.e. the current top of the lock-stack is unlocked)
- */
-static int lock_release_nested(struct task_struct *curr,
- struct lockdep_map *lock, unsigned long ip)
+static int __lock_is_held(struct lockdep_map *lock)
{
- struct held_lock *hlock;
- unsigned int depth;
-
- /*
- * Pop off the top of the lock stack:
- */
- depth = curr->lockdep_depth - 1;
- hlock = curr->held_locks + depth;
-
- /*
- * Is the unlock non-nested:
- */
- if (hlock->instance != lock || hlock->references)
- return lock_release_non_nested(curr, lock, ip);
- curr->lockdep_depth--;
-
- /*
- * No more locks, but somehow we've got hash left over, who left it?
- */
- if (DEBUG_LOCKS_WARN_ON(!depth && (hlock->prev_chain_key != 0)))
- return 0;
+ struct task_struct *curr = current;
+ int i;
- curr->curr_chain_key = hlock->prev_chain_key;
+ for (i = 0; i < curr->lockdep_depth; i++) {
+ struct held_lock *hlock = curr->held_locks + i;
- lock_release_holdtime(hlock);
+ if (match_held_lock(hlock, lock))
+ return 1;
+ }
-#ifdef CONFIG_DEBUG_LOCKDEP
- hlock->prev_chain_key = 0;
- hlock->class_idx = 0;
- hlock->acquire_ip = 0;
- hlock->irq_context = 0;
-#endif
- return 1;
+ return 0;
}
-/*
- * Remove the lock to the list of currently held locks - this gets
- * called on mutex_unlock()/spin_unlock*() (or on a failed
- * mutex_lock_interruptible()). This is done for unlocks that nest
- * perfectly. (i.e. the current top of the lock-stack is unlocked)
- */
-static void
-__lock_release(struct lockdep_map *lock, int nested, unsigned long ip)
+static void __lock_pin_lock(struct lockdep_map *lock)
{
struct task_struct *curr = current;
+ int i;
- if (!check_unlock(curr, lock, ip))
+ if (unlikely(!debug_locks))
return;
- if (nested) {
- if (!lock_release_nested(curr, lock, ip))
- return;
- } else {
- if (!lock_release_non_nested(curr, lock, ip))
+ for (i = 0; i < curr->lockdep_depth; i++) {
+ struct held_lock *hlock = curr->held_locks + i;
+
+ if (match_held_lock(hlock, lock)) {
+ hlock->pin_count++;
return;
+ }
}
- check_chain_key(curr);
+ WARN(1, "pinning an unheld lock\n");
}
-static int __lock_is_held(struct lockdep_map *lock)
+static void __lock_unpin_lock(struct lockdep_map *lock)
{
struct task_struct *curr = current;
int i;
+ if (unlikely(!debug_locks))
+ return;
+
for (i = 0; i < curr->lockdep_depth; i++) {
struct held_lock *hlock = curr->held_locks + i;
- if (match_held_lock(hlock, lock))
- return 1;
+ if (match_held_lock(hlock, lock)) {
+ if (WARN(!hlock->pin_count, "unpinning an unpinned lock\n"))
+ return;
+
+ hlock->pin_count--;
+ return;
+ }
}
- return 0;
+ WARN(1, "unpinning an unheld lock\n");
}
/*
@@ -3639,7 +3601,8 @@ void lock_release(struct lockdep_map *lock, int nested,
check_flags(flags);
current->lockdep_recursion = 1;
trace_lock_release(lock, ip);
- __lock_release(lock, nested, ip);
+ if (__lock_release(lock, nested, ip))
+ check_chain_key(current);
current->lockdep_recursion = 0;
raw_local_irq_restore(flags);
}
@@ -3665,6 +3628,40 @@ int lock_is_held(struct lockdep_map *lock)
}
EXPORT_SYMBOL_GPL(lock_is_held);
+void lock_pin_lock(struct lockdep_map *lock)
+{
+ unsigned long flags;
+
+ if (unlikely(current->lockdep_recursion))
+ return;
+
+ raw_local_irq_save(flags);
+ check_flags(flags);
+
+ current->lockdep_recursion = 1;
+ __lock_pin_lock(lock);
+ current->lockdep_recursion = 0;
+ raw_local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(lock_pin_lock);
+
+void lock_unpin_lock(struct lockdep_map *lock)
+{
+ unsigned long flags;
+
+ if (unlikely(current->lockdep_recursion))
+ return;
+
+ raw_local_irq_save(flags);
+ check_flags(flags);
+
+ current->lockdep_recursion = 1;
+ __lock_unpin_lock(lock);
+ current->lockdep_recursion = 0;
+ raw_local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(lock_unpin_lock);
+
void lockdep_set_current_reclaim_state(gfp_t gfp_mask)
{
current->lockdep_reclaim_gfp = gfp_mask;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index c86935a7f1f8..b803e1b8ab0c 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1000,7 +1000,11 @@ inline int task_curr(const struct task_struct *p)
}
/*
- * Can drop rq->lock because from sched_class::switched_from() methods drop it.
+ * switched_from, switched_to and prio_changed must _NOT_ drop rq->lock,
+ * use the balance_callback list if you want balancing.
+ *
+ * this means any call to check_class_changed() must be followed by a call to
+ * balance_callback().
*/
static inline void check_class_changed(struct rq *rq, struct task_struct *p,
const struct sched_class *prev_class,
@@ -1009,7 +1013,7 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,
if (prev_class != p->sched_class) {
if (prev_class->switched_from)
prev_class->switched_from(rq, p);
- /* Possble rq->lock 'hole'. */
+
p->sched_class->switched_to(rq, p);
} else if (oldprio != p->prio || dl_task(p))
p->sched_class->prio_changed(rq, p, oldprio);
@@ -1041,6 +1045,177 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
}
#ifdef CONFIG_SMP
+/*
+ * This is how migration works:
+ *
+ * 1) we invoke migration_cpu_stop() on the target CPU using
+ * stop_one_cpu().
+ * 2) stopper starts to run (implicitly forcing the migrated thread
+ * off the CPU)
+ * 3) it checks whether the migrated task is still in the wrong runqueue.
+ * 4) if it's in the wrong runqueue then the migration thread removes
+ * it and puts it into the right queue.
+ * 5) stopper completes and stop_one_cpu() returns and the migration
+ * is done.
+ */
+
+/*
+ * move_queued_task - move a queued task to new rq.
+ *
+ * Returns (locked) new rq. Old rq's lock is released.
+ */
+static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int new_cpu)
+{
+ lockdep_assert_held(&rq->lock);
+
+ dequeue_task(rq, p, 0);
+ p->on_rq = TASK_ON_RQ_MIGRATING;
+ set_task_cpu(p, new_cpu);
+ raw_spin_unlock(&rq->lock);
+
+ rq = cpu_rq(new_cpu);
+
+ raw_spin_lock(&rq->lock);
+ BUG_ON(task_cpu(p) != new_cpu);
+ p->on_rq = TASK_ON_RQ_QUEUED;
+ enqueue_task(rq, p, 0);
+ check_preempt_curr(rq, p, 0);
+
+ return rq;
+}
+
+struct migration_arg {
+ struct task_struct *task;
+ int dest_cpu;
+};
+
+/*
+ * Move (not current) task off this cpu, onto dest cpu. We're doing
+ * this because either it can't run here any more (set_cpus_allowed()
+ * away from this CPU, or CPU going down), or because we're
+ * attempting to rebalance this task on exec (sched_exec).
+ *
+ * So we race with normal scheduler movements, but that's OK, as long
+ * as the task is no longer on this CPU.
+ */
+static struct rq *__migrate_task(struct rq *rq, struct task_struct *p, int dest_cpu)
+{
+ if (unlikely(!cpu_active(dest_cpu)))
+ return rq;
+
+ /* Affinity changed (again). */
+ if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
+ return rq;
+
+ rq = move_queued_task(rq, p, dest_cpu);
+
+ return rq;
+}
+
+/*
+ * migration_cpu_stop - this will be executed by a highprio stopper thread
+ * and performs thread migration by bumping thread off CPU then
+ * 'pushing' onto another runqueue.
+ */
+static int migration_cpu_stop(void *data)
+{
+ struct migration_arg *arg = data;
+ struct task_struct *p = arg->task;
+ struct rq *rq = this_rq();
+
+ /*
+ * The original target cpu might have gone down and we might
+ * be on another cpu but it doesn't matter.
+ */
+ local_irq_disable();
+ /*
+ * We need to explicitly wake pending tasks before running
+ * __migrate_task() such that we will not miss enforcing cpus_allowed
+ * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test.
+ */
+ sched_ttwu_pending();
+
+ raw_spin_lock(&p->pi_lock);
+ raw_spin_lock(&rq->lock);
+ /*
+ * If task_rq(p) != rq, it cannot be migrated here, because we're
+ * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because
+ * we're holding p->pi_lock.
+ */
+ if (task_rq(p) == rq && task_on_rq_queued(p))
+ rq = __migrate_task(rq, p, arg->dest_cpu);
+ raw_spin_unlock(&rq->lock);
+ raw_spin_unlock(&p->pi_lock);
+
+ local_irq_enable();
+ return 0;
+}
+
+void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
+{
+ if (p->sched_class->set_cpus_allowed)
+ p->sched_class->set_cpus_allowed(p, new_mask);
+
+ cpumask_copy(&p->cpus_allowed, new_mask);
+ p->nr_cpus_allowed = cpumask_weight(new_mask);
+}
+
+/*
+ * Change a given task's CPU affinity. Migrate the thread to a
+ * proper CPU and schedule it away if the CPU it's executing on
+ * is removed from the allowed bitmask.
+ *
+ * NOTE: the caller must have a valid reference to the task, the
+ * task must not exit() & deallocate itself prematurely. The
+ * call is not atomic; no spinlocks may be held.
+ */
+int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
+{
+ unsigned long flags;
+ struct rq *rq;
+ unsigned int dest_cpu;
+ int ret = 0;
+
+ rq = task_rq_lock(p, &flags);
+
+ if (cpumask_equal(&p->cpus_allowed, new_mask))
+ goto out;
+
+ if (!cpumask_intersects(new_mask, cpu_active_mask)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ do_set_cpus_allowed(p, new_mask);
+
+ /* Can the task run on the task's current CPU? If so, we're done */
+ if (cpumask_test_cpu(task_cpu(p), new_mask))
+ goto out;
+
+ dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
+ if (task_running(rq, p) || p->state == TASK_WAKING) {
+ struct migration_arg arg = { p, dest_cpu };
+ /* Need help from migration thread: drop lock and wait. */
+ task_rq_unlock(rq, p, &flags);
+ stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
+ tlb_migrate_finish(p->mm);
+ return 0;
+ } else if (task_on_rq_queued(p)) {
+ /*
+ * OK, since we're going to drop the lock immediately
+ * afterwards anyway.
+ */
+ lockdep_unpin_lock(&rq->lock);
+ rq = move_queued_task(rq, p, dest_cpu);
+ lockdep_pin_lock(&rq->lock);
+ }
+out:
+ task_rq_unlock(rq, p, &flags);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
+
void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
{
#ifdef CONFIG_SCHED_DEBUG
@@ -1181,13 +1356,6 @@ out:
return ret;
}
-struct migration_arg {
- struct task_struct *task;
- int dest_cpu;
-};
-
-static int migration_cpu_stop(void *data);
-
/*
* wait_task_inactive - wait for a thread to unschedule.
*
@@ -1320,9 +1488,7 @@ void kick_process(struct task_struct *p)
preempt_enable();
}
EXPORT_SYMBOL_GPL(kick_process);
-#endif /* CONFIG_SMP */
-#ifdef CONFIG_SMP
/*
* ->cpus_allowed is protected by both rq->lock and p->pi_lock
*/
@@ -1402,6 +1568,8 @@ out:
static inline
int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
{
+ lockdep_assert_held(&p->pi_lock);
+
if (p->nr_cpus_allowed > 1)
cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
@@ -1427,7 +1595,7 @@ static void update_avg(u64 *avg, u64 sample)
s64 diff = sample - *avg;
*avg += diff >> 3;
}
-#endif
+#endif /* CONFIG_SMP */
static void
ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
@@ -1490,8 +1658,15 @@ ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
p->state = TASK_RUNNING;
#ifdef CONFIG_SMP
- if (p->sched_class->task_woken)
+ if (p->sched_class->task_woken) {
+ /*
+ * Our task @p is fully woken up and running; so its safe to
+ * drop the rq->lock, hereafter rq is only used for statistics.
+ */
+ lockdep_unpin_lock(&rq->lock);
p->sched_class->task_woken(rq, p);
+ lockdep_pin_lock(&rq->lock);
+ }
if (rq->idle_stamp) {
u64 delta = rq_clock(rq) - rq->idle_stamp;
@@ -1510,6 +1685,8 @@ ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
static void
ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags)
{
+ lockdep_assert_held(&rq->lock);
+
#ifdef CONFIG_SMP
if (p->sched_contributes_to_load)
rq->nr_uninterruptible--;
@@ -1554,6 +1731,7 @@ void sched_ttwu_pending(void)
return;
raw_spin_lock_irqsave(&rq->lock, flags);
+ lockdep_pin_lock(&rq->lock);
while (llist) {
p = llist_entry(llist, struct task_struct, wake_entry);
@@ -1561,6 +1739,7 @@ void sched_ttwu_pending(void)
ttwu_do_activate(rq, p, 0);
}
+ lockdep_unpin_lock(&rq->lock);
raw_spin_unlock_irqrestore(&rq->lock, flags);
}
@@ -1657,7 +1836,9 @@ static void ttwu_queue(struct task_struct *p, int cpu)
#endif
raw_spin_lock(&rq->lock);
+ lockdep_pin_lock(&rq->lock);
ttwu_do_activate(rq, p, 0);
+ lockdep_unpin_lock(&rq->lock);
raw_spin_unlock(&rq->lock);
}
@@ -1752,9 +1933,17 @@ static void try_to_wake_up_local(struct task_struct *p)
lockdep_assert_held(&rq->lock);
if (!raw_spin_trylock(&p->pi_lock)) {
+ /*
+ * This is OK, because current is on_cpu, which avoids it being
+ * picked for load-balance and preemption/IRQs are still
+ * disabled avoiding further scheduler activity on it and we've
+ * not yet picked a replacement task.
+ */
+ lockdep_unpin_lock(&rq->lock);
raw_spin_unlock(&rq->lock);
raw_spin_lock(&p->pi_lock);
raw_spin_lock(&rq->lock);
+ lockdep_pin_lock(&rq->lock);
}
if (!(p->state & TASK_NORMAL))
@@ -2294,23 +2483,35 @@ static struct rq *finish_task_switch(struct task_struct *prev)
#ifdef CONFIG_SMP
/* rq->lock is NOT held, but preemption is disabled */
-static inline void post_schedule(struct rq *rq)
+static void __balance_callback(struct rq *rq)
{
- if (rq->post_schedule) {
- unsigned long flags;
+ struct callback_head *head, *next;
+ void (*func)(struct rq *rq);
+ unsigned long flags;
- raw_spin_lock_irqsave(&rq->lock, flags);
- if (rq->curr->sched_class->post_schedule)
- rq->curr->sched_class->post_schedule(rq);
- raw_spin_unlock_irqrestore(&rq->lock, flags);
+ raw_spin_lock_irqsave(&rq->lock, flags);
+ head = rq->balance_callback;
+ rq->balance_callback = NULL;
+ while (head) {
+ func = (void (*)(struct rq *))head->func;
+ next = head->next;
+ head->next = NULL;
+ head = next;
- rq->post_schedule = 0;
+ func(rq);
}
+ raw_spin_unlock_irqrestore(&rq->lock, flags);
+}
+
+static inline void balance_callback(struct rq *rq)
+{
+ if (unlikely(rq->balance_callback))
+ __balance_callback(rq);
}
#else
-static inline void post_schedule(struct rq *rq)
+static inline void balance_callback(struct rq *rq)
{
}
@@ -2328,7 +2529,7 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev)
/* finish_task_switch() drops rq->lock and enables preemtion */
preempt_disable();
rq = finish_task_switch(prev);
- post_schedule(rq);
+ balance_callback(rq);
preempt_enable();
if (current->set_child_tid)
@@ -2372,6 +2573,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
* of the scheduler it's an obvious special-case), so we
* do an early lockdep release here:
*/
+ lockdep_unpin_lock(&rq->lock);
spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
/* Here we just switch the register state and the stack. */
@@ -2794,6 +2996,7 @@ static void __sched __schedule(void)
*/
smp_mb__before_spinlock();
raw_spin_lock_irq(&rq->lock);
+ lockdep_pin_lock(&rq->lock);
rq->clock_skip_update <<= 1; /* promote REQ to ACT */
@@ -2836,10 +3039,12 @@ static void __sched __schedule(void)
rq = context_switch(rq, prev, next); /* unlocks the rq */
cpu = cpu_of(rq);
- } else
+ } else {
+ lockdep_unpin_lock(&rq->lock);
raw_spin_unlock_irq(&rq->lock);
+ }
- post_schedule(rq);
+ balance_callback(rq);
}
static inline void sched_submit_work(struct task_struct *tsk)
@@ -3103,7 +3308,11 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
check_class_changed(rq, p, prev_class, oldprio);
out_unlock:
+ preempt_disable(); /* avoid rq from going away on us */
__task_rq_unlock(rq);
+
+ balance_callback(rq);
+ preempt_enable();
}
#endif
@@ -3441,7 +3650,7 @@ static bool dl_param_changed(struct task_struct *p,
static int __sched_setscheduler(struct task_struct *p,
const struct sched_attr *attr,
- bool user)
+ bool user, bool pi)
{
int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 :
MAX_RT_PRIO - 1 - attr->sched_priority;
@@ -3627,18 +3836,20 @@ change:
p->sched_reset_on_fork = reset_on_fork;
oldprio = p->prio;
- /*
- * Take priority boosted tasks into account. If the new
- * effective priority is unchanged, we just store the new
- * normal parameters and do not touch the scheduler class and
- * the runqueue. This will be done when the task deboost
- * itself.
- */
- new_effective_prio = rt_mutex_get_effective_prio(p, newprio);
- if (new_effective_prio == oldprio) {
- __setscheduler_params(p, attr);
- task_rq_unlock(rq, p, &flags);
- return 0;
+ if (pi) {
+ /*
+ * Take priority boosted tasks into account. If the new
+ * effective priority is unchanged, we just store the new
+ * normal parameters and do not touch the scheduler class and
+ * the runqueue. This will be done when the task deboost
+ * itself.
+ */
+ new_effective_prio = rt_mutex_get_effective_prio(p, newprio);
+ if (new_effective_prio == oldprio) {
+ __setscheduler_params(p, attr);
+ task_rq_unlock(rq, p, &flags);
+ return 0;
+ }
}
queued = task_on_rq_queued(p);
@@ -3649,7 +3860,7 @@ change:
put_prev_task(rq, p);
prev_class = p->sched_class;
- __setscheduler(rq, p, attr, true);
+ __setscheduler(rq, p, attr, pi);
if (running)
p->sched_class->set_curr_task(rq);
@@ -3662,9 +3873,17 @@ change:
}
check_class_changed(rq, p, prev_class, oldprio);
+ preempt_disable(); /* avoid rq from going away on us */
task_rq_unlock(rq, p, &flags);
- rt_mutex_adjust_pi(p);
+ if (pi)
+ rt_mutex_adjust_pi(p);
+
+ /*
+ * Run balance callbacks after we've adjusted the PI chain.
+ */
+ balance_callback(rq);
+ preempt_enable();
return 0;
}
@@ -3685,7 +3904,7 @@ static int _sched_setscheduler(struct task_struct *p, int policy,
attr.sched_policy = policy;
}
- return __sched_setscheduler(p, &attr, check);
+ return __sched_setscheduler(p, &attr, check, true);
}
/**
* sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
@@ -3706,7 +3925,7 @@ EXPORT_SYMBOL_GPL(sched_setscheduler);
int sched_setattr(struct task_struct *p, const struct sched_attr *attr)
{
- return __sched_setscheduler(p, attr, true);
+ return __sched_setscheduler(p, attr, true, true);
}
EXPORT_SYMBOL_GPL(sched_setattr);
@@ -4754,149 +4973,6 @@ out:
}
#ifdef CONFIG_SMP
-/*
- * move_queued_task - move a queued task to new rq.
- *
- * Returns (locked) new rq. Old rq's lock is released.
- */
-static struct rq *move_queued_task(struct task_struct *p, int new_cpu)
-{
- struct rq *rq = task_rq(p);
-
- lockdep_assert_held(&rq->lock);
-
- dequeue_task(rq, p, 0);
- p->on_rq = TASK_ON_RQ_MIGRATING;
- set_task_cpu(p, new_cpu);
- raw_spin_unlock(&rq->lock);
-
- rq = cpu_rq(new_cpu);
-
- raw_spin_lock(&rq->lock);
- BUG_ON(task_cpu(p) != new_cpu);
- p->on_rq = TASK_ON_RQ_QUEUED;
- enqueue_task(rq, p, 0);
- check_preempt_curr(rq, p, 0);
-
- return rq;
-}
-
-void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
-{
- if (p->sched_class->set_cpus_allowed)
- p->sched_class->set_cpus_allowed(p, new_mask);
-
- cpumask_copy(&p->cpus_allowed, new_mask);
- p->nr_cpus_allowed = cpumask_weight(new_mask);
-}
-
-/*
- * This is how migration works:
- *
- * 1) we invoke migration_cpu_stop() on the target CPU using
- * stop_one_cpu().
- * 2) stopper starts to run (implicitly forcing the migrated thread
- * off the CPU)
- * 3) it checks whether the migrated task is still in the wrong runqueue.
- * 4) if it's in the wrong runqueue then the migration thread removes
- * it and puts it into the right queue.
- * 5) stopper completes and stop_one_cpu() returns and the migration
- * is done.
- */
-
-/*
- * Change a given task's CPU affinity. Migrate the thread to a
- * proper CPU and schedule it away if the CPU it's executing on
- * is removed from the allowed bitmask.
- *
- * NOTE: the caller must have a valid reference to the task, the
- * task must not exit() & deallocate itself prematurely. The
- * call is not atomic; no spinlocks may be held.
- */
-int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
-{
- unsigned long flags;
- struct rq *rq;
- unsigned int dest_cpu;
- int ret = 0;
-
- rq = task_rq_lock(p, &flags);
-
- if (cpumask_equal(&p->cpus_allowed, new_mask))
- goto out;
-
- if (!cpumask_intersects(new_mask, cpu_active_mask)) {
- ret = -EINVAL;
- goto out;
- }
-
- do_set_cpus_allowed(p, new_mask);
-
- /* Can the task run on the task's current CPU? If so, we're done */
- if (cpumask_test_cpu(task_cpu(p), new_mask))
- goto out;
-
- dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
- if (task_running(rq, p) || p->state == TASK_WAKING) {
- struct migration_arg arg = { p, dest_cpu };
- /* Need help from migration thread: drop lock and wait. */
- task_rq_unlock(rq, p, &flags);
- stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
- tlb_migrate_finish(p->mm);
- return 0;
- } else if (task_on_rq_queued(p))
- rq = move_queued_task(p, dest_cpu);
-out:
- task_rq_unlock(rq, p, &flags);
-
- return ret;
-}
-EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
-
-/*
- * Move (not current) task off this cpu, onto dest cpu. We're doing
- * this because either it can't run here any more (set_cpus_allowed()
- * away from this CPU, or CPU going down), or because we're
- * attempting to rebalance this task on exec (sched_exec).
- *
- * So we race with normal scheduler movements, but that's OK, as long
- * as the task is no longer on this CPU.
- *
- * Returns non-zero if task was successfully migrated.
- */
-static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
-{
- struct rq *rq;
- int ret = 0;
-
- if (unlikely(!cpu_active(dest_cpu)))
- return ret;
-
- rq = cpu_rq(src_cpu);
-
- raw_spin_lock(&p->pi_lock);
- raw_spin_lock(&rq->lock);
- /* Already moved. */
- if (task_cpu(p) != src_cpu)
- goto done;
-
- /* Affinity changed (again). */
- if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
- goto fail;
-
- /*
- * If we're not on a rq, the next wake-up will ensure we're
- * placed properly.
- */
- if (task_on_rq_queued(p))
- rq = move_queued_task(p, dest_cpu);
-done:
- ret = 1;
-fail:
- raw_spin_unlock(&rq->lock);
- raw_spin_unlock(&p->pi_lock);
- return ret;
-}
#ifdef CONFIG_NUMA_BALANCING
/* Migrate current task p to target_cpu */
@@ -4944,35 +5020,9 @@ void sched_setnuma(struct task_struct *p, int nid)
enqueue_task(rq, p, 0);
task_rq_unlock(rq, p, &flags);
}
-#endif
-
-/*
- * migration_cpu_stop - this will be executed by a highprio stopper thread
- * and performs thread migration by bumping thread off CPU then
- * 'pushing' onto another runqueue.
- */
-static int migration_cpu_stop(void *data)
-{
- struct migration_arg *arg = data;
-
- /*
- * The original target cpu might have gone down and we might
- * be on another cpu but it doesn't matter.
- */
- local_irq_disable();
- /*
- * We need to explicitly wake pending tasks before running
- * __migrate_task() such that we will not miss enforcing cpus_allowed
- * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test.
- */
- sched_ttwu_pending();
- __migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu);
- local_irq_enable();
- return 0;
-}
+#endif /* CONFIG_NUMA_BALANCING */
#ifdef CONFIG_HOTPLUG_CPU
-
/*
* Ensures that the idle task is using init_mm right before its cpu goes
* offline.
@@ -5028,9 +5078,9 @@ static struct task_struct fake_task = {
* there's no concurrency possible, we hold the required locks anyway
* because of lock validation efforts.
*/
-static void migrate_tasks(unsigned int dead_cpu)
+static void migrate_tasks(struct rq *dead_rq)
{
- struct rq *rq = cpu_rq(dead_cpu);
+ struct rq *rq = dead_rq;
struct task_struct *next, *stop = rq->stop;
int dest_cpu;
@@ -5052,7 +5102,7 @@ static void migrate_tasks(unsigned int dead_cpu)
*/
update_rq_clock(rq);
- for ( ; ; ) {
+ for (;;) {
/*
* There's this thread running, bail when that's the only
* remaining thread.
@@ -5060,22 +5110,29 @@ static void migrate_tasks(unsigned int dead_cpu)
if (rq->nr_running == 1)
break;
+ /*
+ * Ensure rq->lock covers the entire task selection
+ * until the migration.
+ */
+ lockdep_pin_lock(&rq->lock);
next = pick_next_task(rq, &fake_task);
BUG_ON(!next);
next->sched_class->put_prev_task(rq, next);
/* Find suitable destination for @next, with force if needed. */
- dest_cpu = select_fallback_rq(dead_cpu, next);
- raw_spin_unlock(&rq->lock);
-
- __migrate_task(next, dead_cpu, dest_cpu);
-
- raw_spin_lock(&rq->lock);
+ dest_cpu = select_fallback_rq(dead_rq->cpu, next);
+
+ lockdep_unpin_lock(&rq->lock);
+ rq = __migrate_task(rq, next, dest_cpu);
+ if (rq != dead_rq) {
+ raw_spin_unlock(&rq->lock);
+ rq = dead_rq;
+ raw_spin_lock(&rq->lock);
+ }
}
rq->stop = stop;
}
-
#endif /* CONFIG_HOTPLUG_CPU */
#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
@@ -5254,7 +5311,7 @@ static void register_sched_domain_sysctl(void)
static void unregister_sched_domain_sysctl(void)
{
}
-#endif
+#endif /* CONFIG_SCHED_DEBUG && CONFIG_SYSCTL */
static void set_rq_online(struct rq *rq)
{
@@ -5323,7 +5380,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
set_rq_offline(rq);
}
- migrate_tasks(cpu);
+ migrate_tasks(rq);
BUG_ON(rq->nr_running != 1); /* the migration thread */
raw_spin_unlock_irqrestore(&rq->lock, flags);
break;
@@ -5401,9 +5458,6 @@ static int __init migration_init(void)
return 0;
}
early_initcall(migration_init);
-#endif
-
-#ifdef CONFIG_SMP
static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */
@@ -6629,7 +6683,7 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
struct sched_group *sg;
struct sched_group_capacity *sgc;
- sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
+ sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
GFP_KERNEL, cpu_to_node(j));
if (!sd)
return -ENOMEM;
@@ -7235,7 +7289,7 @@ void __init sched_init(void)
rq->sd = NULL;
rq->rd = NULL;
rq->cpu_capacity = rq->cpu_capacity_orig = SCHED_CAPACITY_SCALE;
- rq->post_schedule = 0;
+ rq->balance_callback = NULL;
rq->active_balance = 0;
rq->next_balance = jiffies;
rq->push_cpu = 0;
@@ -7365,32 +7419,12 @@ EXPORT_SYMBOL(___might_sleep);
#endif
#ifdef CONFIG_MAGIC_SYSRQ
-static void normalize_task(struct rq *rq, struct task_struct *p)
+void normalize_rt_tasks(void)
{
- const struct sched_class *prev_class = p->sched_class;
+ struct task_struct *g, *p;
struct sched_attr attr = {
.sched_policy = SCHED_NORMAL,
};
- int old_prio = p->prio;
- int queued;
-
- queued = task_on_rq_queued(p);
- if (queued)
- dequeue_task(rq, p, 0);
- __setscheduler(rq, p, &attr, false);
- if (queued) {
- enqueue_task(rq, p, 0);
- resched_curr(rq);
- }
-
- check_class_changed(rq, p, prev_class, old_prio);
-}
-
-void normalize_rt_tasks(void)
-{
- struct task_struct *g, *p;
- unsigned long flags;
- struct rq *rq;
read_lock(&tasklist_lock);
for_each_process_thread(g, p) {
@@ -7417,9 +7451,7 @@ void normalize_rt_tasks(void)
continue;
}
- rq = task_rq_lock(p, &flags);
- normalize_task(rq, p);
- task_rq_unlock(rq, p, &flags);
+ __sched_setscheduler(p, &attr, false, false);
}
read_unlock(&tasklist_lock);
}
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index eac20c557a55..0a17af35670a 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -213,14 +213,28 @@ static inline bool need_pull_dl_task(struct rq *rq, struct task_struct *prev)
return dl_task(prev);
}
-static inline void set_post_schedule(struct rq *rq)
+static DEFINE_PER_CPU(struct callback_head, dl_push_head);
+static DEFINE_PER_CPU(struct callback_head, dl_pull_head);
+
+static void push_dl_tasks(struct rq *);
+static void pull_dl_task(struct rq *);
+
+static inline void queue_push_tasks(struct rq *rq)
{
- rq->post_schedule = has_pushable_dl_tasks(rq);
+ if (!has_pushable_dl_tasks(rq))
+ return;
+
+ queue_balance_callback(rq, &per_cpu(dl_push_head, rq->cpu), push_dl_tasks);
+}
+
+static inline void queue_pull_task(struct rq *rq)
+{
+ queue_balance_callback(rq, &per_cpu(dl_pull_head, rq->cpu), pull_dl_task);
}
static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq);
-static void dl_task_offline_migration(struct rq *rq, struct task_struct *p)
+static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p)
{
struct rq *later_rq = NULL;
bool fallback = false;
@@ -254,14 +268,19 @@ static void dl_task_offline_migration(struct rq *rq, struct task_struct *p)
double_lock_balance(rq, later_rq);
}
+ /*
+ * By now the task is replenished and enqueued; migrate it.
+ */
deactivate_task(rq, p, 0);
set_task_cpu(p, later_rq->cpu);
- activate_task(later_rq, p, ENQUEUE_REPLENISH);
+ activate_task(later_rq, p, 0);
if (!fallback)
resched_curr(later_rq);
- double_unlock_balance(rq, later_rq);
+ double_unlock_balance(later_rq, rq);
+
+ return later_rq;
}
#else
@@ -291,12 +310,15 @@ static inline bool need_pull_dl_task(struct rq *rq, struct task_struct *prev)
return false;
}
-static inline int pull_dl_task(struct rq *rq)
+static inline void pull_dl_task(struct rq *rq)
+{
+}
+
+static inline void queue_push_tasks(struct rq *rq)
{
- return 0;
}
-static inline void set_post_schedule(struct rq *rq)
+static inline void queue_pull_task(struct rq *rq)
{
}
#endif /* CONFIG_SMP */
@@ -498,22 +520,23 @@ static void update_dl_entity(struct sched_dl_entity *dl_se,
* actually started or not (i.e., the replenishment instant is in
* the future or in the past).
*/
-static int start_dl_timer(struct sched_dl_entity *dl_se, bool boosted)
+static int start_dl_timer(struct task_struct *p)
{
- struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
- struct rq *rq = rq_of_dl_rq(dl_rq);
+ struct sched_dl_entity *dl_se = &p->dl;
+ struct hrtimer *timer = &dl_se->dl_timer;
+ struct rq *rq = task_rq(p);
ktime_t now, act;
s64 delta;
- if (boosted)
- return 0;
+ lockdep_assert_held(&rq->lock);
+
/*
* We want the timer to fire at the deadline, but considering
* that it is actually coming from rq->clock and not from
* hrtimer's time base reading.
*/
act = ns_to_ktime(dl_se->deadline);
- now = hrtimer_cb_get_time(&dl_se->dl_timer);
+ now = hrtimer_cb_get_time(timer);
delta = ktime_to_ns(now) - rq_clock(rq);
act = ktime_add_ns(act, delta);
@@ -525,7 +548,19 @@ static int start_dl_timer(struct sched_dl_entity *dl_se, bool boosted)
if (ktime_us_delta(act, now) < 0)
return 0;
- hrtimer_start(&dl_se->dl_timer, act, HRTIMER_MODE_ABS);
+ /*
+ * !enqueued will guarantee another callback; even if one is already in
+ * progress. This ensures a balanced {get,put}_task_struct().
+ *
+ * The race against __run_timer() clearing the enqueued state is
+ * harmless because we're holding task_rq()->lock, therefore the timer
+ * expiring after we've done the check will wait on its task_rq_lock()
+ * and observe our state.
+ */
+ if (!hrtimer_is_queued(timer)) {
+ get_task_struct(p);
+ hrtimer_start(timer, act, HRTIMER_MODE_ABS);
+ }
return 1;
}
@@ -555,35 +590,40 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
rq = task_rq_lock(p, &flags);
/*
- * We need to take care of several possible races here:
- *
- * - the task might have changed its scheduling policy
- * to something different than SCHED_DEADLINE
- * - the task might have changed its reservation parameters
- * (through sched_setattr())
- * - the task might have been boosted by someone else and
- * might be in the boosting/deboosting path
+ * The task might have changed its scheduling policy to something
+ * different than SCHED_DEADLINE (through switched_fromd_dl()).
+ */
+ if (!dl_task(p)) {
+ __dl_clear_params(p);
+ goto unlock;
+ }
+
+ /*
+ * This is possible if switched_from_dl() raced against a running
+ * callback that took the above !dl_task() path and we've since then
+ * switched back into SCHED_DEADLINE.
*
- * In all this cases we bail out, as the task is already
- * in the runqueue or is going to be enqueued back anyway.
+ * There's nothing to do except drop our task reference.
*/
- if (!dl_task(p) || dl_se->dl_new ||
- dl_se->dl_boosted || !dl_se->dl_throttled)
+ if (dl_se->dl_new)
goto unlock;
- sched_clock_tick();
- update_rq_clock(rq);
+ /*
+ * The task might have been boosted by someone else and might be in the
+ * boosting/deboosting path, its not throttled.
+ */
+ if (dl_se->dl_boosted)
+ goto unlock;
-#ifdef CONFIG_SMP
/*
- * If we find that the rq the task was on is no longer
- * available, we need to select a new rq.
+ * Spurious timer due to start_dl_timer() race; or we already received
+ * a replenishment from rt_mutex_setprio().
*/
- if (unlikely(!rq->online)) {
- dl_task_offline_migration(rq, p);
+ if (!dl_se->dl_throttled)
goto unlock;
- }
-#endif
+
+ sched_clock_tick();
+ update_rq_clock(rq);
/*
* If the throttle happened during sched-out; like:
@@ -609,17 +649,38 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
check_preempt_curr_dl(rq, p, 0);
else
resched_curr(rq);
+
#ifdef CONFIG_SMP
/*
- * Queueing this task back might have overloaded rq,
- * check if we need to kick someone away.
+ * Perform balancing operations here; after the replenishments. We
+ * cannot drop rq->lock before this, otherwise the assertion in
+ * start_dl_timer() about not missing updates is not true.
+ *
+ * If we find that the rq the task was on is no longer available, we
+ * need to select a new rq.
+ *
+ * XXX figure out if select_task_rq_dl() deals with offline cpus.
+ */
+ if (unlikely(!rq->online))
+ rq = dl_task_offline_migration(rq, p);
+
+ /*
+ * Queueing this task back might have overloaded rq, check if we need
+ * to kick someone away.
*/
if (has_pushable_dl_tasks(rq))
push_dl_task(rq);
#endif
+
unlock:
task_rq_unlock(rq, p, &flags);
+ /*
+ * This can free the task_struct, including this hrtimer, do not touch
+ * anything related to that after this.
+ */
+ put_task_struct(p);
+
return HRTIMER_NORESTART;
}
@@ -679,7 +740,7 @@ static void update_curr_dl(struct rq *rq)
if (dl_runtime_exceeded(dl_se)) {
dl_se->dl_throttled = 1;
__dequeue_task_dl(rq, curr, 0);
- if (unlikely(!start_dl_timer(dl_se, curr->dl.dl_boosted)))
+ if (unlikely(dl_se->dl_boosted || !start_dl_timer(curr)))
enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH);
if (!is_leftmost(curr, &rq->dl))
@@ -1036,8 +1097,6 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p)
resched_curr(rq);
}
-static int pull_dl_task(struct rq *this_rq);
-
#endif /* CONFIG_SMP */
/*
@@ -1094,7 +1153,15 @@ struct task_struct *pick_next_task_dl(struct rq *rq, struct task_struct *prev)
dl_rq = &rq->dl;
if (need_pull_dl_task(rq, prev)) {
+ /*
+ * This is OK, because current is on_cpu, which avoids it being
+ * picked for load-balance and preemption/IRQs are still
+ * disabled avoiding further scheduler activity on it and we're
+ * being very careful to re-start the picking loop.
+ */
+ lockdep_unpin_lock(&rq->lock);
pull_dl_task(rq);
+ lockdep_pin_lock(&rq->lock);
/*
* pull_rt_task() can drop (and re-acquire) rq->lock; this
* means a stop task can slip in, in which case we need to
@@ -1128,7 +1195,7 @@ struct task_struct *pick_next_task_dl(struct rq *rq, struct task_struct *prev)
if (hrtick_enabled(rq))
start_hrtick_dl(rq, p);
- set_post_schedule(rq);
+ queue_push_tasks(rq);
return p;
}
@@ -1165,7 +1232,6 @@ static void task_fork_dl(struct task_struct *p)
static void task_dead_dl(struct task_struct *p)
{
- struct hrtimer *timer = &p->dl.dl_timer;
struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
/*
@@ -1175,8 +1241,6 @@ static void task_dead_dl(struct task_struct *p)
/* XXX we should retain the bw until 0-lag */
dl_b->total_bw -= p->dl.dl_bw;
raw_spin_unlock_irq(&dl_b->lock);
-
- hrtimer_cancel(timer);
}
static void set_curr_task_dl(struct rq *rq)
@@ -1504,15 +1568,16 @@ static void push_dl_tasks(struct rq *rq)
;
}
-static int pull_dl_task(struct rq *this_rq)
+static void pull_dl_task(struct rq *this_rq)
{
- int this_cpu = this_rq->cpu, ret = 0, cpu;
+ int this_cpu = this_rq->cpu, cpu;
struct task_struct *p;
+ bool resched = false;
struct rq *src_rq;
u64 dmin = LONG_MAX;
if (likely(!dl_overloaded(this_rq)))
- return 0;
+ return;
/*
* Match the barrier from dl_set_overloaded; this guarantees that if we
@@ -1567,7 +1632,7 @@ static int pull_dl_task(struct rq *this_rq)
src_rq->curr->dl.deadline))
goto skip;
- ret = 1;
+ resched = true;
deactivate_task(src_rq, p, 0);
set_task_cpu(p, this_cpu);
@@ -1580,12 +1645,8 @@ skip:
double_unlock_balance(this_rq, src_rq);
}
- return ret;
-}
-
-static void post_schedule_dl(struct rq *rq)
-{
- push_dl_tasks(rq);
+ if (resched)
+ resched_curr(this_rq);
}
/*
@@ -1701,37 +1762,16 @@ void __init init_sched_dl_class(void)
#endif /* CONFIG_SMP */
-/*
- * Ensure p's dl_timer is cancelled. May drop rq->lock for a while.
- */
-static void cancel_dl_timer(struct rq *rq, struct task_struct *p)
-{
- struct hrtimer *dl_timer = &p->dl.dl_timer;
-
- /* Nobody will change task's class if pi_lock is held */
- lockdep_assert_held(&p->pi_lock);
-
- if (hrtimer_active(dl_timer)) {
- int ret = hrtimer_try_to_cancel(dl_timer);
-
- if (unlikely(ret == -1)) {
- /*
- * Note, p may migrate OR new deadline tasks
- * may appear in rq when we are unlocking it.
- * A caller of us must be fine with that.
- */
- raw_spin_unlock(&rq->lock);
- hrtimer_cancel(dl_timer);
- raw_spin_lock(&rq->lock);
- }
- }
-}
-
static void switched_from_dl(struct rq *rq, struct task_struct *p)
{
- /* XXX we should retain the bw until 0-lag */
- cancel_dl_timer(rq, p);
- __dl_clear_params(p);
+ /*
+ * Start the deadline timer; if we switch back to dl before this we'll
+ * continue consuming our current CBS slice. If we stay outside of
+ * SCHED_DEADLINE until the deadline passes, the timer will reset the
+ * task.
+ */
+ if (!start_dl_timer(p))
+ __dl_clear_params(p);
/*
* Since this might be the only -deadline task on the rq,
@@ -1741,8 +1781,7 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p)
if (!task_on_rq_queued(p) || rq->dl.dl_nr_running)
return;
- if (pull_dl_task(rq))
- resched_curr(rq);
+ queue_pull_task(rq);
}
/*
@@ -1751,21 +1790,16 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p)
*/
static void switched_to_dl(struct rq *rq, struct task_struct *p)
{
- int check_resched = 1;
-
if (task_on_rq_queued(p) && rq->curr != p) {
#ifdef CONFIG_SMP
- if (p->nr_cpus_allowed > 1 && rq->dl.overloaded &&
- push_dl_task(rq) && rq != task_rq(p))
- /* Only reschedule if pushing failed */
- check_resched = 0;
-#endif /* CONFIG_SMP */
- if (check_resched) {
- if (dl_task(rq->curr))
- check_preempt_curr_dl(rq, p, 0);
- else
- resched_curr(rq);
- }
+ if (p->nr_cpus_allowed > 1 && rq->dl.overloaded)
+ queue_push_tasks(rq);
+#else
+ if (dl_task(rq->curr))
+ check_preempt_curr_dl(rq, p, 0);
+ else
+ resched_curr(rq);
+#endif
}
}
@@ -1785,15 +1819,14 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p,
* or lowering its prio, so...
*/
if (!rq->dl.overloaded)
- pull_dl_task(rq);
+ queue_pull_task(rq);
/*
* If we now have a earlier deadline task than p,
* then reschedule, provided p is still on this
* runqueue.
*/
- if (dl_time_before(rq->dl.earliest_dl.curr, p->dl.deadline) &&
- rq->curr == p)
+ if (dl_time_before(rq->dl.earliest_dl.curr, p->dl.deadline))
resched_curr(rq);
#else
/*
@@ -1823,7 +1856,6 @@ const struct sched_class dl_sched_class = {
.set_cpus_allowed = set_cpus_allowed_dl,
.rq_online = rq_online_dl,
.rq_offline = rq_offline_dl,
- .post_schedule = post_schedule_dl,
.task_woken = task_woken_dl,
#endif
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 40a7fcbf491e..3d57cc0ca0a6 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5392,7 +5392,15 @@ simple:
return p;
idle:
+ /*
+ * This is OK, because current is on_cpu, which avoids it being picked
+ * for load-balance and preemption/IRQs are still disabled avoiding
+ * further scheduler activity on it and we're being very careful to
+ * re-start the picking loop.
+ */
+ lockdep_unpin_lock(&rq->lock);
new_tasks = idle_balance(rq);
+ lockdep_pin_lock(&rq->lock);
/*
* Because idle_balance() releases (and re-acquires) rq->lock, it is
* possible for any higher priority task to appear. In that case we
@@ -7426,9 +7434,6 @@ static int idle_balance(struct rq *this_rq)
goto out;
}
- /*
- * Drop the rq->lock, but keep IRQ/preempt disabled.
- */
raw_spin_unlock(&this_rq->lock);
update_blocked_averages(this_cpu);
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 7d7093c51f8d..0d193a243e96 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -260,7 +260,7 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
#ifdef CONFIG_SMP
-static int pull_rt_task(struct rq *this_rq);
+static void pull_rt_task(struct rq *this_rq);
static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev)
{
@@ -354,13 +354,23 @@ static inline int has_pushable_tasks(struct rq *rq)
return !plist_head_empty(&rq->rt.pushable_tasks);
}
-static inline void set_post_schedule(struct rq *rq)
+static DEFINE_PER_CPU(struct callback_head, rt_push_head);
+static DEFINE_PER_CPU(struct callback_head, rt_pull_head);
+
+static void push_rt_tasks(struct rq *);
+static void pull_rt_task(struct rq *);
+
+static inline void queue_push_tasks(struct rq *rq)
{
- /*
- * We detect this state here so that we can avoid taking the RQ
- * lock again later if there is no need to push
- */
- rq->post_schedule = has_pushable_tasks(rq);
+ if (!has_pushable_tasks(rq))
+ return;
+
+ queue_balance_callback(rq, &per_cpu(rt_push_head, rq->cpu), push_rt_tasks);
+}
+
+static inline void queue_pull_task(struct rq *rq)
+{
+ queue_balance_callback(rq, &per_cpu(rt_pull_head, rq->cpu), pull_rt_task);
}
static void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
@@ -412,12 +422,11 @@ static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev)
return false;
}
-static inline int pull_rt_task(struct rq *this_rq)
+static inline void pull_rt_task(struct rq *this_rq)
{
- return 0;
}
-static inline void set_post_schedule(struct rq *rq)
+static inline void queue_push_tasks(struct rq *rq)
{
}
#endif /* CONFIG_SMP */
@@ -1469,7 +1478,15 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev)
struct rt_rq *rt_rq = &rq->rt;
if (need_pull_rt_task(rq, prev)) {
+ /*
+ * This is OK, because current is on_cpu, which avoids it being
+ * picked for load-balance and preemption/IRQs are still
+ * disabled avoiding further scheduler activity on it and we're
+ * being very careful to re-start the picking loop.
+ */
+ lockdep_unpin_lock(&rq->lock);
pull_rt_task(rq);
+ lockdep_pin_lock(&rq->lock);
/*
* pull_rt_task() can drop (and re-acquire) rq->lock; this
* means a dl or stop task can slip in, in which case we need
@@ -1497,7 +1514,7 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev)
/* The running task is never eligible for pushing */
dequeue_pushable_task(rq, p);
- set_post_schedule(rq);
+ queue_push_tasks(rq);
return p;
}
@@ -1952,14 +1969,15 @@ static void push_irq_work_func(struct irq_work *work)
}
#endif /* HAVE_RT_PUSH_IPI */
-static int pull_rt_task(struct rq *this_rq)
+static void pull_rt_task(struct rq *this_rq)
{
- int this_cpu = this_rq->cpu, ret = 0, cpu;
+ int this_cpu = this_rq->cpu, cpu;
+ bool resched = false;
struct task_struct *p;
struct rq *src_rq;
if (likely(!rt_overloaded(this_rq)))
- return 0;
+ return;
/*
* Match the barrier from rt_set_overloaded; this guarantees that if we
@@ -1970,7 +1988,7 @@ static int pull_rt_task(struct rq *this_rq)
#ifdef HAVE_RT_PUSH_IPI
if (sched_feat(RT_PUSH_IPI)) {
tell_cpu_to_push(this_rq);
- return 0;
+ return;
}
#endif
@@ -2023,7 +2041,7 @@ static int pull_rt_task(struct rq *this_rq)
if (p->prio < src_rq->curr->prio)
goto skip;
- ret = 1;
+ resched = true;
deactivate_task(src_rq, p, 0);
set_task_cpu(p, this_cpu);
@@ -2039,12 +2057,8 @@ skip:
double_unlock_balance(this_rq, src_rq);
}
- return ret;
-}
-
-static void post_schedule_rt(struct rq *rq)
-{
- push_rt_tasks(rq);
+ if (resched)
+ resched_curr(this_rq);
}
/*
@@ -2140,8 +2154,7 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p)
if (!task_on_rq_queued(p) || rq->rt.rt_nr_running)
return;
- if (pull_rt_task(rq))
- resched_curr(rq);
+ queue_pull_task(rq);
}
void __init init_sched_rt_class(void)
@@ -2162,8 +2175,6 @@ void __init init_sched_rt_class(void)
*/
static void switched_to_rt(struct rq *rq, struct task_struct *p)
{
- int check_resched = 1;
-
/*
* If we are already running, then there's nothing
* that needs to be done. But if we are not running
@@ -2173,13 +2184,12 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p)
*/
if (task_on_rq_queued(p) && rq->curr != p) {
#ifdef CONFIG_SMP
- if (p->nr_cpus_allowed > 1 && rq->rt.overloaded &&
- /* Don't resched if we changed runqueues */
- push_rt_task(rq) && rq != task_rq(p))
- check_resched = 0;
-#endif /* CONFIG_SMP */
- if (check_resched && p->prio < rq->curr->prio)
+ if (p->nr_cpus_allowed > 1 && rq->rt.overloaded)
+ queue_push_tasks(rq);
+#else
+ if (p->prio < rq->curr->prio)
resched_curr(rq);
+#endif /* CONFIG_SMP */
}
}
@@ -2200,14 +2210,13 @@ prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
* may need to pull tasks to this runqueue.
*/
if (oldprio < p->prio)
- pull_rt_task(rq);
+ queue_pull_task(rq);
+
/*
* If there's a higher priority task waiting to run
- * then reschedule. Note, the above pull_rt_task
- * can release the rq lock and p could migrate.
- * Only reschedule if p is still on the same runqueue.
+ * then reschedule.
*/
- if (p->prio > rq->rt.highest_prio.curr && rq->curr == p)
+ if (p->prio > rq->rt.highest_prio.curr)
resched_curr(rq);
#else
/* For UP simply resched on drop of prio */
@@ -2318,7 +2327,6 @@ const struct sched_class rt_sched_class = {
.set_cpus_allowed = set_cpus_allowed_rt,
.rq_online = rq_online_rt,
.rq_offline = rq_offline_rt,
- .post_schedule = post_schedule_rt,
.task_woken = task_woken_rt,
.switched_from = switched_from_rt,
#endif
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index aea7c1f393cb..885889190a1f 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -624,9 +624,10 @@ struct rq {
unsigned long cpu_capacity;
unsigned long cpu_capacity_orig;
+ struct callback_head *balance_callback;
+
unsigned char idle_balance;
/* For active balancing */
- int post_schedule;
int active_balance;
int push_cpu;
struct cpu_stop_work active_balance_work;
@@ -767,6 +768,21 @@ extern int migrate_swap(struct task_struct *, struct task_struct *);
#ifdef CONFIG_SMP
+static inline void
+queue_balance_callback(struct rq *rq,
+ struct callback_head *head,
+ void (*func)(struct rq *rq))
+{
+ lockdep_assert_held(&rq->lock);
+
+ if (unlikely(head->next))
+ return;
+
+ head->func = (void (*)(struct callback_head *))func;
+ head->next = rq->balance_callback;
+ rq->balance_callback = head;
+}
+
extern void sched_ttwu_pending(void);
#define rcu_dereference_check_sched_domain(p) \
@@ -1192,7 +1208,6 @@ struct sched_class {
int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags);
void (*migrate_task_rq)(struct task_struct *p, int next_cpu);
- void (*post_schedule) (struct rq *this_rq);
void (*task_waking) (struct task_struct *task);
void (*task_woken) (struct rq *this_rq, struct task_struct *task);
@@ -1423,8 +1438,10 @@ static inline struct rq *__task_rq_lock(struct task_struct *p)
for (;;) {
rq = task_rq(p);
raw_spin_lock(&rq->lock);
- if (likely(rq == task_rq(p) && !task_on_rq_migrating(p)))
+ if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
+ lockdep_pin_lock(&rq->lock);
return rq;
+ }
raw_spin_unlock(&rq->lock);
while (unlikely(task_on_rq_migrating(p)))
@@ -1461,8 +1478,10 @@ static inline struct rq *task_rq_lock(struct task_struct *p, unsigned long *flag
* If we observe the new cpu in task_rq_lock, the acquire will
* pair with the WMB to ensure we must then also see migrating.
*/
- if (likely(rq == task_rq(p) && !task_on_rq_migrating(p)))
+ if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
+ lockdep_pin_lock(&rq->lock);
return rq;
+ }
raw_spin_unlock(&rq->lock);
raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
@@ -1474,6 +1493,7 @@ static inline struct rq *task_rq_lock(struct task_struct *p, unsigned long *flag
static inline void __task_rq_unlock(struct rq *rq)
__releases(rq->lock)
{
+ lockdep_unpin_lock(&rq->lock);
raw_spin_unlock(&rq->lock);
}
@@ -1482,6 +1502,7 @@ task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags)
__releases(rq->lock)
__releases(p->pi_lock)
{
+ lockdep_unpin_lock(&rq->lock);
raw_spin_unlock(&rq->lock);
raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
}