summaryrefslogtreecommitdiff
path: root/kernel/sched/core.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/sched/core.c')
-rw-r--r--kernel/sched/core.c741
1 files changed, 410 insertions, 331 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 123673291..78b4bad10 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -90,26 +90,6 @@
#define CREATE_TRACE_POINTS
#include <trace/events/sched.h>
-void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
-{
- unsigned long delta;
- ktime_t soft, hard, now;
-
- for (;;) {
- if (hrtimer_active(period_timer))
- break;
-
- now = hrtimer_cb_get_time(period_timer);
- hrtimer_forward(period_timer, now, period);
-
- soft = hrtimer_get_softexpires(period_timer);
- hard = hrtimer_get_expires(period_timer);
- delta = ktime_to_ns(ktime_sub(hard, soft));
- __hrtimer_start_range_ns(period_timer, soft, delta,
- HRTIMER_MODE_ABS_PINNED, 0);
- }
-}
-
DEFINE_MUTEX(sched_domains_mutex);
DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
@@ -355,12 +335,11 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer)
#ifdef CONFIG_SMP
-static int __hrtick_restart(struct rq *rq)
+static void __hrtick_restart(struct rq *rq)
{
struct hrtimer *timer = &rq->hrtick_timer;
- ktime_t time = hrtimer_get_softexpires(timer);
- return __hrtimer_start_range_ns(timer, time, 0, HRTIMER_MODE_ABS_PINNED, 0);
+ hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED);
}
/*
@@ -440,8 +419,8 @@ void hrtick_start(struct rq *rq, u64 delay)
* doesn't make sense. Rely on vruntime for fairness.
*/
delay = max_t(u64, delay, 10000LL);
- __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0,
- HRTIMER_MODE_REL_PINNED, 0);
+ hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay),
+ HRTIMER_MODE_REL_PINNED);
}
static inline void init_hrtick(void)
@@ -511,7 +490,7 @@ static bool set_nr_and_not_polling(struct task_struct *p)
static bool set_nr_if_polling(struct task_struct *p)
{
struct thread_info *ti = task_thread_info(p);
- typeof(ti->flags) old, val = ACCESS_ONCE(ti->flags);
+ typeof(ti->flags) old, val = READ_ONCE(ti->flags);
for (;;) {
if (!(val & _TIF_POLLING_NRFLAG))
@@ -541,6 +520,52 @@ static bool set_nr_if_polling(struct task_struct *p)
#endif
#endif
+void wake_q_add(struct wake_q_head *head, struct task_struct *task)
+{
+ struct wake_q_node *node = &task->wake_q;
+
+ /*
+ * Atomically grab the task, if ->wake_q is !nil already it means
+ * its already queued (either by us or someone else) and will get the
+ * wakeup due to that.
+ *
+ * This cmpxchg() implies a full barrier, which pairs with the write
+ * barrier implied by the wakeup in wake_up_list().
+ */
+ if (cmpxchg(&node->next, NULL, WAKE_Q_TAIL))
+ return;
+
+ get_task_struct(task);
+
+ /*
+ * The head is context local, there can be no concurrency.
+ */
+ *head->lastp = node;
+ head->lastp = &node->next;
+}
+
+void wake_up_q(struct wake_q_head *head)
+{
+ struct wake_q_node *node = head->first;
+
+ while (node != WAKE_Q_TAIL) {
+ struct task_struct *task;
+
+ task = container_of(node, struct task_struct, wake_q);
+ BUG_ON(!task);
+ /* task can safely be re-inserted now */
+ node = node->next;
+ task->wake_q.next = NULL;
+
+ /*
+ * wake_up_process() implies a wmb() to pair with the queueing
+ * in wake_q_add() so as not to miss wakeups.
+ */
+ wake_up_process(task);
+ put_task_struct(task);
+ }
+}
+
/*
* resched_curr - mark rq's current task 'to be rescheduled now'.
*
@@ -593,13 +618,12 @@ void resched_cpu(int cpu)
* selecting an idle cpu will add more delays to the timers than intended
* (as that cpu's timer base may not be uptodate wrt jiffies etc).
*/
-int get_nohz_timer_target(int pinned)
+int get_nohz_timer_target(void)
{
- int cpu = smp_processor_id();
- int i;
+ int i, cpu = smp_processor_id();
struct sched_domain *sd;
- if (pinned || !get_sysctl_timer_migration() || !idle_cpu(cpu))
+ if (!idle_cpu(cpu))
return cpu;
rcu_read_lock();
@@ -976,7 +1000,11 @@ inline int task_curr(const struct task_struct *p)
}
/*
- * Can drop rq->lock because from sched_class::switched_from() methods drop it.
+ * switched_from, switched_to and prio_changed must _NOT_ drop rq->lock,
+ * use the balance_callback list if you want balancing.
+ *
+ * this means any call to check_class_changed() must be followed by a call to
+ * balance_callback().
*/
static inline void check_class_changed(struct rq *rq, struct task_struct *p,
const struct sched_class *prev_class,
@@ -985,7 +1013,7 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,
if (prev_class != p->sched_class) {
if (prev_class->switched_from)
prev_class->switched_from(rq, p);
- /* Possble rq->lock 'hole'. */
+
p->sched_class->switched_to(rq, p);
} else if (oldprio != p->prio || dl_task(p))
p->sched_class->prio_changed(rq, p, oldprio);
@@ -1017,6 +1045,177 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
}
#ifdef CONFIG_SMP
+/*
+ * This is how migration works:
+ *
+ * 1) we invoke migration_cpu_stop() on the target CPU using
+ * stop_one_cpu().
+ * 2) stopper starts to run (implicitly forcing the migrated thread
+ * off the CPU)
+ * 3) it checks whether the migrated task is still in the wrong runqueue.
+ * 4) if it's in the wrong runqueue then the migration thread removes
+ * it and puts it into the right queue.
+ * 5) stopper completes and stop_one_cpu() returns and the migration
+ * is done.
+ */
+
+/*
+ * move_queued_task - move a queued task to new rq.
+ *
+ * Returns (locked) new rq. Old rq's lock is released.
+ */
+static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int new_cpu)
+{
+ lockdep_assert_held(&rq->lock);
+
+ dequeue_task(rq, p, 0);
+ p->on_rq = TASK_ON_RQ_MIGRATING;
+ set_task_cpu(p, new_cpu);
+ raw_spin_unlock(&rq->lock);
+
+ rq = cpu_rq(new_cpu);
+
+ raw_spin_lock(&rq->lock);
+ BUG_ON(task_cpu(p) != new_cpu);
+ p->on_rq = TASK_ON_RQ_QUEUED;
+ enqueue_task(rq, p, 0);
+ check_preempt_curr(rq, p, 0);
+
+ return rq;
+}
+
+struct migration_arg {
+ struct task_struct *task;
+ int dest_cpu;
+};
+
+/*
+ * Move (not current) task off this cpu, onto dest cpu. We're doing
+ * this because either it can't run here any more (set_cpus_allowed()
+ * away from this CPU, or CPU going down), or because we're
+ * attempting to rebalance this task on exec (sched_exec).
+ *
+ * So we race with normal scheduler movements, but that's OK, as long
+ * as the task is no longer on this CPU.
+ */
+static struct rq *__migrate_task(struct rq *rq, struct task_struct *p, int dest_cpu)
+{
+ if (unlikely(!cpu_active(dest_cpu)))
+ return rq;
+
+ /* Affinity changed (again). */
+ if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
+ return rq;
+
+ rq = move_queued_task(rq, p, dest_cpu);
+
+ return rq;
+}
+
+/*
+ * migration_cpu_stop - this will be executed by a highprio stopper thread
+ * and performs thread migration by bumping thread off CPU then
+ * 'pushing' onto another runqueue.
+ */
+static int migration_cpu_stop(void *data)
+{
+ struct migration_arg *arg = data;
+ struct task_struct *p = arg->task;
+ struct rq *rq = this_rq();
+
+ /*
+ * The original target cpu might have gone down and we might
+ * be on another cpu but it doesn't matter.
+ */
+ local_irq_disable();
+ /*
+ * We need to explicitly wake pending tasks before running
+ * __migrate_task() such that we will not miss enforcing cpus_allowed
+ * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test.
+ */
+ sched_ttwu_pending();
+
+ raw_spin_lock(&p->pi_lock);
+ raw_spin_lock(&rq->lock);
+ /*
+ * If task_rq(p) != rq, it cannot be migrated here, because we're
+ * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because
+ * we're holding p->pi_lock.
+ */
+ if (task_rq(p) == rq && task_on_rq_queued(p))
+ rq = __migrate_task(rq, p, arg->dest_cpu);
+ raw_spin_unlock(&rq->lock);
+ raw_spin_unlock(&p->pi_lock);
+
+ local_irq_enable();
+ return 0;
+}
+
+void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
+{
+ if (p->sched_class->set_cpus_allowed)
+ p->sched_class->set_cpus_allowed(p, new_mask);
+
+ cpumask_copy(&p->cpus_allowed, new_mask);
+ p->nr_cpus_allowed = cpumask_weight(new_mask);
+}
+
+/*
+ * Change a given task's CPU affinity. Migrate the thread to a
+ * proper CPU and schedule it away if the CPU it's executing on
+ * is removed from the allowed bitmask.
+ *
+ * NOTE: the caller must have a valid reference to the task, the
+ * task must not exit() & deallocate itself prematurely. The
+ * call is not atomic; no spinlocks may be held.
+ */
+int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
+{
+ unsigned long flags;
+ struct rq *rq;
+ unsigned int dest_cpu;
+ int ret = 0;
+
+ rq = task_rq_lock(p, &flags);
+
+ if (cpumask_equal(&p->cpus_allowed, new_mask))
+ goto out;
+
+ if (!cpumask_intersects(new_mask, cpu_active_mask)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ do_set_cpus_allowed(p, new_mask);
+
+ /* Can the task run on the task's current CPU? If so, we're done */
+ if (cpumask_test_cpu(task_cpu(p), new_mask))
+ goto out;
+
+ dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
+ if (task_running(rq, p) || p->state == TASK_WAKING) {
+ struct migration_arg arg = { p, dest_cpu };
+ /* Need help from migration thread: drop lock and wait. */
+ task_rq_unlock(rq, p, &flags);
+ stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
+ tlb_migrate_finish(p->mm);
+ return 0;
+ } else if (task_on_rq_queued(p)) {
+ /*
+ * OK, since we're going to drop the lock immediately
+ * afterwards anyway.
+ */
+ lockdep_unpin_lock(&rq->lock);
+ rq = move_queued_task(rq, p, dest_cpu);
+ lockdep_pin_lock(&rq->lock);
+ }
+out:
+ task_rq_unlock(rq, p, &flags);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
+
void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
{
#ifdef CONFIG_SCHED_DEBUG
@@ -1049,7 +1248,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
if (p->sched_class->migrate_task_rq)
p->sched_class->migrate_task_rq(p, new_cpu);
p->se.nr_migrations++;
- perf_sw_event_sched(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 0);
+ perf_event_task_migrate(p);
}
__set_task_cpu(p, new_cpu);
@@ -1157,13 +1356,6 @@ out:
return ret;
}
-struct migration_arg {
- struct task_struct *task;
- int dest_cpu;
-};
-
-static int migration_cpu_stop(void *data);
-
/*
* wait_task_inactive - wait for a thread to unschedule.
*
@@ -1296,9 +1488,7 @@ void kick_process(struct task_struct *p)
preempt_enable();
}
EXPORT_SYMBOL_GPL(kick_process);
-#endif /* CONFIG_SMP */
-#ifdef CONFIG_SMP
/*
* ->cpus_allowed is protected by both rq->lock and p->pi_lock
*/
@@ -1378,6 +1568,8 @@ out:
static inline
int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
{
+ lockdep_assert_held(&p->pi_lock);
+
if (p->nr_cpus_allowed > 1)
cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
@@ -1403,7 +1595,7 @@ static void update_avg(u64 *avg, u64 sample)
s64 diff = sample - *avg;
*avg += diff >> 3;
}
-#endif
+#endif /* CONFIG_SMP */
static void
ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
@@ -1466,8 +1658,15 @@ ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
p->state = TASK_RUNNING;
#ifdef CONFIG_SMP
- if (p->sched_class->task_woken)
+ if (p->sched_class->task_woken) {
+ /*
+ * Our task @p is fully woken up and running; so its safe to
+ * drop the rq->lock, hereafter rq is only used for statistics.
+ */
+ lockdep_unpin_lock(&rq->lock);
p->sched_class->task_woken(rq, p);
+ lockdep_pin_lock(&rq->lock);
+ }
if (rq->idle_stamp) {
u64 delta = rq_clock(rq) - rq->idle_stamp;
@@ -1486,6 +1685,8 @@ ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
static void
ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags)
{
+ lockdep_assert_held(&rq->lock);
+
#ifdef CONFIG_SMP
if (p->sched_contributes_to_load)
rq->nr_uninterruptible--;
@@ -1530,6 +1731,7 @@ void sched_ttwu_pending(void)
return;
raw_spin_lock_irqsave(&rq->lock, flags);
+ lockdep_pin_lock(&rq->lock);
while (llist) {
p = llist_entry(llist, struct task_struct, wake_entry);
@@ -1537,6 +1739,7 @@ void sched_ttwu_pending(void)
ttwu_do_activate(rq, p, 0);
}
+ lockdep_unpin_lock(&rq->lock);
raw_spin_unlock_irqrestore(&rq->lock, flags);
}
@@ -1633,7 +1836,9 @@ static void ttwu_queue(struct task_struct *p, int cpu)
#endif
raw_spin_lock(&rq->lock);
+ lockdep_pin_lock(&rq->lock);
ttwu_do_activate(rq, p, 0);
+ lockdep_unpin_lock(&rq->lock);
raw_spin_unlock(&rq->lock);
}
@@ -1728,9 +1933,17 @@ static void try_to_wake_up_local(struct task_struct *p)
lockdep_assert_held(&rq->lock);
if (!raw_spin_trylock(&p->pi_lock)) {
+ /*
+ * This is OK, because current is on_cpu, which avoids it being
+ * picked for load-balance and preemption/IRQs are still
+ * disabled avoiding further scheduler activity on it and we've
+ * not yet picked a replacement task.
+ */
+ lockdep_unpin_lock(&rq->lock);
raw_spin_unlock(&rq->lock);
raw_spin_lock(&p->pi_lock);
raw_spin_lock(&rq->lock);
+ lockdep_pin_lock(&rq->lock);
}
if (!(p->state & TASK_NORMAL))
@@ -1951,7 +2164,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
set_task_cpu(p, cpu);
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
-#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
+#ifdef CONFIG_SCHED_INFO
if (likely(sched_info_on()))
memset(&p->sched_info, 0, sizeof(p->sched_info));
#endif
@@ -2105,12 +2318,29 @@ void wake_up_new_task(struct task_struct *p)
#ifdef CONFIG_PREEMPT_NOTIFIERS
+static struct static_key preempt_notifier_key = STATIC_KEY_INIT_FALSE;
+
+void preempt_notifier_inc(void)
+{
+ static_key_slow_inc(&preempt_notifier_key);
+}
+EXPORT_SYMBOL_GPL(preempt_notifier_inc);
+
+void preempt_notifier_dec(void)
+{
+ static_key_slow_dec(&preempt_notifier_key);
+}
+EXPORT_SYMBOL_GPL(preempt_notifier_dec);
+
/**
* preempt_notifier_register - tell me when current is being preempted & rescheduled
* @notifier: notifier struct to register
*/
void preempt_notifier_register(struct preempt_notifier *notifier)
{
+ if (!static_key_false(&preempt_notifier_key))
+ WARN(1, "registering preempt_notifier while notifiers disabled\n");
+
hlist_add_head(&notifier->link, &current->preempt_notifiers);
}
EXPORT_SYMBOL_GPL(preempt_notifier_register);
@@ -2119,7 +2349,7 @@ EXPORT_SYMBOL_GPL(preempt_notifier_register);
* preempt_notifier_unregister - no longer interested in preemption notifications
* @notifier: notifier struct to unregister
*
- * This is safe to call from within a preemption notifier.
+ * This is *not* safe to call from within a preemption notifier.
*/
void preempt_notifier_unregister(struct preempt_notifier *notifier)
{
@@ -2127,7 +2357,7 @@ void preempt_notifier_unregister(struct preempt_notifier *notifier)
}
EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
-static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
+static void __fire_sched_in_preempt_notifiers(struct task_struct *curr)
{
struct preempt_notifier *notifier;
@@ -2135,9 +2365,15 @@ static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
notifier->ops->sched_in(notifier, raw_smp_processor_id());
}
+static __always_inline void fire_sched_in_preempt_notifiers(struct task_struct *curr)
+{
+ if (static_key_false(&preempt_notifier_key))
+ __fire_sched_in_preempt_notifiers(curr);
+}
+
static void
-fire_sched_out_preempt_notifiers(struct task_struct *curr,
- struct task_struct *next)
+__fire_sched_out_preempt_notifiers(struct task_struct *curr,
+ struct task_struct *next)
{
struct preempt_notifier *notifier;
@@ -2145,13 +2381,21 @@ fire_sched_out_preempt_notifiers(struct task_struct *curr,
notifier->ops->sched_out(notifier, next);
}
+static __always_inline void
+fire_sched_out_preempt_notifiers(struct task_struct *curr,
+ struct task_struct *next)
+{
+ if (static_key_false(&preempt_notifier_key))
+ __fire_sched_out_preempt_notifiers(curr, next);
+}
+
#else /* !CONFIG_PREEMPT_NOTIFIERS */
-static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
+static inline void fire_sched_in_preempt_notifiers(struct task_struct *curr)
{
}
-static void
+static inline void
fire_sched_out_preempt_notifiers(struct task_struct *curr,
struct task_struct *next)
{
@@ -2252,23 +2496,35 @@ static struct rq *finish_task_switch(struct task_struct *prev)
#ifdef CONFIG_SMP
/* rq->lock is NOT held, but preemption is disabled */
-static inline void post_schedule(struct rq *rq)
+static void __balance_callback(struct rq *rq)
{
- if (rq->post_schedule) {
- unsigned long flags;
+ struct callback_head *head, *next;
+ void (*func)(struct rq *rq);
+ unsigned long flags;
- raw_spin_lock_irqsave(&rq->lock, flags);
- if (rq->curr->sched_class->post_schedule)
- rq->curr->sched_class->post_schedule(rq);
- raw_spin_unlock_irqrestore(&rq->lock, flags);
+ raw_spin_lock_irqsave(&rq->lock, flags);
+ head = rq->balance_callback;
+ rq->balance_callback = NULL;
+ while (head) {
+ func = (void (*)(struct rq *))head->func;
+ next = head->next;
+ head->next = NULL;
+ head = next;
- rq->post_schedule = 0;
+ func(rq);
}
+ raw_spin_unlock_irqrestore(&rq->lock, flags);
+}
+
+static inline void balance_callback(struct rq *rq)
+{
+ if (unlikely(rq->balance_callback))
+ __balance_callback(rq);
}
#else
-static inline void post_schedule(struct rq *rq)
+static inline void balance_callback(struct rq *rq)
{
}
@@ -2286,7 +2542,7 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev)
/* finish_task_switch() drops rq->lock and enables preemtion */
preempt_disable();
rq = finish_task_switch(prev);
- post_schedule(rq);
+ balance_callback(rq);
preempt_enable();
if (current->set_child_tid)
@@ -2330,9 +2586,9 @@ context_switch(struct rq *rq, struct task_struct *prev,
* of the scheduler it's an obvious special-case), so we
* do an early lockdep release here:
*/
+ lockdep_unpin_lock(&rq->lock);
spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
- context_tracking_task_switch(prev, next);
/* Here we just switch the register state and the stack. */
switch_to(prev, next, prev);
barrier();
@@ -2397,9 +2653,9 @@ unsigned long nr_iowait_cpu(int cpu)
void get_iowait_load(unsigned long *nr_waiters, unsigned long *load)
{
- struct rq *this = this_rq();
- *nr_waiters = atomic_read(&this->nr_iowait);
- *load = this->cpu_load[0];
+ struct rq *rq = this_rq();
+ *nr_waiters = atomic_read(&rq->nr_iowait);
+ *load = rq->load.weight;
}
#ifdef CONFIG_SMP
@@ -2497,6 +2753,7 @@ void scheduler_tick(void)
update_rq_clock(rq);
curr->sched_class->task_tick(rq, curr, 0);
update_cpu_load_active(rq);
+ calc_global_load_tick(rq);
raw_spin_unlock(&rq->lock);
perf_event_task_tick();
@@ -2525,7 +2782,7 @@ void scheduler_tick(void)
u64 scheduler_tick_max_deferment(void)
{
struct rq *rq = this_rq();
- unsigned long next, now = ACCESS_ONCE(jiffies);
+ unsigned long next, now = READ_ONCE(jiffies);
next = rq->last_sched_tick + HZ;
@@ -2726,9 +2983,7 @@ again:
* - return from syscall or exception to user-space
* - return from interrupt-handler to user-space
*
- * WARNING: all callers must re-check need_resched() afterward and reschedule
- * accordingly in case an event triggered the need for rescheduling (such as
- * an interrupt waking up a task) while preemption was disabled in __schedule().
+ * WARNING: must be called with preemption disabled!
*/
static void __sched __schedule(void)
{
@@ -2737,7 +2992,6 @@ static void __sched __schedule(void)
struct rq *rq;
int cpu;
- preempt_disable();
cpu = smp_processor_id();
rq = cpu_rq(cpu);
rcu_note_context_switch();
@@ -2755,6 +3009,7 @@ static void __sched __schedule(void)
*/
smp_mb__before_spinlock();
raw_spin_lock_irq(&rq->lock);
+ lockdep_pin_lock(&rq->lock);
rq->clock_skip_update <<= 1; /* promote REQ to ACT */
@@ -2797,12 +3052,12 @@ static void __sched __schedule(void)
rq = context_switch(rq, prev, next); /* unlocks the rq */
cpu = cpu_of(rq);
- } else
+ } else {
+ lockdep_unpin_lock(&rq->lock);
raw_spin_unlock_irq(&rq->lock);
+ }
- post_schedule(rq);
-
- sched_preempt_enable_no_resched();
+ balance_callback(rq);
}
static inline void sched_submit_work(struct task_struct *tsk)
@@ -2823,7 +3078,9 @@ asmlinkage __visible void __sched schedule(void)
sched_submit_work(tsk);
do {
+ preempt_disable();
__schedule();
+ sched_preempt_enable_no_resched();
} while (need_resched());
}
EXPORT_SYMBOL(schedule);
@@ -2862,15 +3119,14 @@ void __sched schedule_preempt_disabled(void)
static void __sched notrace preempt_schedule_common(void)
{
do {
- __preempt_count_add(PREEMPT_ACTIVE);
+ preempt_active_enter();
__schedule();
- __preempt_count_sub(PREEMPT_ACTIVE);
+ preempt_active_exit();
/*
* Check again in case we missed a preemption opportunity
* between schedule and now.
*/
- barrier();
} while (need_resched());
}
@@ -2894,9 +3150,8 @@ asmlinkage __visible void __sched notrace preempt_schedule(void)
NOKPROBE_SYMBOL(preempt_schedule);
EXPORT_SYMBOL(preempt_schedule);
-#ifdef CONFIG_CONTEXT_TRACKING
/**
- * preempt_schedule_context - preempt_schedule called by tracing
+ * preempt_schedule_notrace - preempt_schedule called by tracing
*
* The tracing infrastructure uses preempt_enable_notrace to prevent
* recursion and tracing preempt enabling caused by the tracing
@@ -2909,7 +3164,7 @@ EXPORT_SYMBOL(preempt_schedule);
* instead of preempt_schedule() to exit user context if needed before
* calling the scheduler.
*/
-asmlinkage __visible void __sched notrace preempt_schedule_context(void)
+asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
{
enum ctx_state prev_ctx;
@@ -2917,7 +3172,13 @@ asmlinkage __visible void __sched notrace preempt_schedule_context(void)
return;
do {
- __preempt_count_add(PREEMPT_ACTIVE);
+ /*
+ * Use raw __prempt_count() ops that don't call function.
+ * We can't call functions before disabling preemption which
+ * disarm preemption tracing recursions.
+ */
+ __preempt_count_add(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET);
+ barrier();
/*
* Needs preempt disabled in case user_exit() is traced
* and the tracer calls preempt_enable_notrace() causing
@@ -2927,12 +3188,11 @@ asmlinkage __visible void __sched notrace preempt_schedule_context(void)
__schedule();
exception_exit(prev_ctx);
- __preempt_count_sub(PREEMPT_ACTIVE);
barrier();
+ __preempt_count_sub(PREEMPT_ACTIVE + PREEMPT_DISABLE_OFFSET);
} while (need_resched());
}
-EXPORT_SYMBOL_GPL(preempt_schedule_context);
-#endif /* CONFIG_CONTEXT_TRACKING */
+EXPORT_SYMBOL_GPL(preempt_schedule_notrace);
#endif /* CONFIG_PREEMPT */
@@ -2952,17 +3212,11 @@ asmlinkage __visible void __sched preempt_schedule_irq(void)
prev_state = exception_enter();
do {
- __preempt_count_add(PREEMPT_ACTIVE);
+ preempt_active_enter();
local_irq_enable();
__schedule();
local_irq_disable();
- __preempt_count_sub(PREEMPT_ACTIVE);
-
- /*
- * Check again in case we missed a preemption opportunity
- * between schedule and now.
- */
- barrier();
+ preempt_active_exit();
} while (need_resched());
exception_exit(prev_state);
@@ -3040,7 +3294,6 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
if (!dl_prio(p->normal_prio) ||
(pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) {
p->dl.dl_boosted = 1;
- p->dl.dl_throttled = 0;
enqueue_flag = ENQUEUE_REPLENISH;
} else
p->dl.dl_boosted = 0;
@@ -3068,7 +3321,11 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
check_class_changed(rq, p, prev_class, oldprio);
out_unlock:
+ preempt_disable(); /* avoid rq from going away on us */
__task_rq_unlock(rq);
+
+ balance_callback(rq);
+ preempt_enable();
}
#endif
@@ -3406,7 +3663,7 @@ static bool dl_param_changed(struct task_struct *p,
static int __sched_setscheduler(struct task_struct *p,
const struct sched_attr *attr,
- bool user)
+ bool user, bool pi)
{
int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 :
MAX_RT_PRIO - 1 - attr->sched_priority;
@@ -3592,18 +3849,20 @@ change:
p->sched_reset_on_fork = reset_on_fork;
oldprio = p->prio;
- /*
- * Take priority boosted tasks into account. If the new
- * effective priority is unchanged, we just store the new
- * normal parameters and do not touch the scheduler class and
- * the runqueue. This will be done when the task deboost
- * itself.
- */
- new_effective_prio = rt_mutex_get_effective_prio(p, newprio);
- if (new_effective_prio == oldprio) {
- __setscheduler_params(p, attr);
- task_rq_unlock(rq, p, &flags);
- return 0;
+ if (pi) {
+ /*
+ * Take priority boosted tasks into account. If the new
+ * effective priority is unchanged, we just store the new
+ * normal parameters and do not touch the scheduler class and
+ * the runqueue. This will be done when the task deboost
+ * itself.
+ */
+ new_effective_prio = rt_mutex_get_effective_prio(p, newprio);
+ if (new_effective_prio == oldprio) {
+ __setscheduler_params(p, attr);
+ task_rq_unlock(rq, p, &flags);
+ return 0;
+ }
}
queued = task_on_rq_queued(p);
@@ -3614,7 +3873,7 @@ change:
put_prev_task(rq, p);
prev_class = p->sched_class;
- __setscheduler(rq, p, attr, true);
+ __setscheduler(rq, p, attr, pi);
if (running)
p->sched_class->set_curr_task(rq);
@@ -3627,9 +3886,17 @@ change:
}
check_class_changed(rq, p, prev_class, oldprio);
+ preempt_disable(); /* avoid rq from going away on us */
task_rq_unlock(rq, p, &flags);
- rt_mutex_adjust_pi(p);
+ if (pi)
+ rt_mutex_adjust_pi(p);
+
+ /*
+ * Run balance callbacks after we've adjusted the PI chain.
+ */
+ balance_callback(rq);
+ preempt_enable();
return 0;
}
@@ -3650,7 +3917,7 @@ static int _sched_setscheduler(struct task_struct *p, int policy,
attr.sched_policy = policy;
}
- return __sched_setscheduler(p, &attr, check);
+ return __sched_setscheduler(p, &attr, check, true);
}
/**
* sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
@@ -3671,7 +3938,7 @@ EXPORT_SYMBOL_GPL(sched_setscheduler);
int sched_setattr(struct task_struct *p, const struct sched_attr *attr)
{
- return __sched_setscheduler(p, attr, true);
+ return __sched_setscheduler(p, attr, true, true);
}
EXPORT_SYMBOL_GPL(sched_setattr);
@@ -4719,149 +4986,6 @@ out:
}
#ifdef CONFIG_SMP
-/*
- * move_queued_task - move a queued task to new rq.
- *
- * Returns (locked) new rq. Old rq's lock is released.
- */
-static struct rq *move_queued_task(struct task_struct *p, int new_cpu)
-{
- struct rq *rq = task_rq(p);
-
- lockdep_assert_held(&rq->lock);
-
- dequeue_task(rq, p, 0);
- p->on_rq = TASK_ON_RQ_MIGRATING;
- set_task_cpu(p, new_cpu);
- raw_spin_unlock(&rq->lock);
-
- rq = cpu_rq(new_cpu);
-
- raw_spin_lock(&rq->lock);
- BUG_ON(task_cpu(p) != new_cpu);
- p->on_rq = TASK_ON_RQ_QUEUED;
- enqueue_task(rq, p, 0);
- check_preempt_curr(rq, p, 0);
-
- return rq;
-}
-
-void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
-{
- if (p->sched_class->set_cpus_allowed)
- p->sched_class->set_cpus_allowed(p, new_mask);
-
- cpumask_copy(&p->cpus_allowed, new_mask);
- p->nr_cpus_allowed = cpumask_weight(new_mask);
-}
-
-/*
- * This is how migration works:
- *
- * 1) we invoke migration_cpu_stop() on the target CPU using
- * stop_one_cpu().
- * 2) stopper starts to run (implicitly forcing the migrated thread
- * off the CPU)
- * 3) it checks whether the migrated task is still in the wrong runqueue.
- * 4) if it's in the wrong runqueue then the migration thread removes
- * it and puts it into the right queue.
- * 5) stopper completes and stop_one_cpu() returns and the migration
- * is done.
- */
-
-/*
- * Change a given task's CPU affinity. Migrate the thread to a
- * proper CPU and schedule it away if the CPU it's executing on
- * is removed from the allowed bitmask.
- *
- * NOTE: the caller must have a valid reference to the task, the
- * task must not exit() & deallocate itself prematurely. The
- * call is not atomic; no spinlocks may be held.
- */
-int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
-{
- unsigned long flags;
- struct rq *rq;
- unsigned int dest_cpu;
- int ret = 0;
-
- rq = task_rq_lock(p, &flags);
-
- if (cpumask_equal(&p->cpus_allowed, new_mask))
- goto out;
-
- if (!cpumask_intersects(new_mask, cpu_active_mask)) {
- ret = -EINVAL;
- goto out;
- }
-
- do_set_cpus_allowed(p, new_mask);
-
- /* Can the task run on the task's current CPU? If so, we're done */
- if (cpumask_test_cpu(task_cpu(p), new_mask))
- goto out;
-
- dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
- if (task_running(rq, p) || p->state == TASK_WAKING) {
- struct migration_arg arg = { p, dest_cpu };
- /* Need help from migration thread: drop lock and wait. */
- task_rq_unlock(rq, p, &flags);
- stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
- tlb_migrate_finish(p->mm);
- return 0;
- } else if (task_on_rq_queued(p))
- rq = move_queued_task(p, dest_cpu);
-out:
- task_rq_unlock(rq, p, &flags);
-
- return ret;
-}
-EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
-
-/*
- * Move (not current) task off this cpu, onto dest cpu. We're doing
- * this because either it can't run here any more (set_cpus_allowed()
- * away from this CPU, or CPU going down), or because we're
- * attempting to rebalance this task on exec (sched_exec).
- *
- * So we race with normal scheduler movements, but that's OK, as long
- * as the task is no longer on this CPU.
- *
- * Returns non-zero if task was successfully migrated.
- */
-static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
-{
- struct rq *rq;
- int ret = 0;
-
- if (unlikely(!cpu_active(dest_cpu)))
- return ret;
-
- rq = cpu_rq(src_cpu);
-
- raw_spin_lock(&p->pi_lock);
- raw_spin_lock(&rq->lock);
- /* Already moved. */
- if (task_cpu(p) != src_cpu)
- goto done;
-
- /* Affinity changed (again). */
- if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
- goto fail;
-
- /*
- * If we're not on a rq, the next wake-up will ensure we're
- * placed properly.
- */
- if (task_on_rq_queued(p))
- rq = move_queued_task(p, dest_cpu);
-done:
- ret = 1;
-fail:
- raw_spin_unlock(&rq->lock);
- raw_spin_unlock(&p->pi_lock);
- return ret;
-}
#ifdef CONFIG_NUMA_BALANCING
/* Migrate current task p to target_cpu */
@@ -4909,35 +5033,9 @@ void sched_setnuma(struct task_struct *p, int nid)
enqueue_task(rq, p, 0);
task_rq_unlock(rq, p, &flags);
}
-#endif
-
-/*
- * migration_cpu_stop - this will be executed by a highprio stopper thread
- * and performs thread migration by bumping thread off CPU then
- * 'pushing' onto another runqueue.
- */
-static int migration_cpu_stop(void *data)
-{
- struct migration_arg *arg = data;
-
- /*
- * The original target cpu might have gone down and we might
- * be on another cpu but it doesn't matter.
- */
- local_irq_disable();
- /*
- * We need to explicitly wake pending tasks before running
- * __migrate_task() such that we will not miss enforcing cpus_allowed
- * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test.
- */
- sched_ttwu_pending();
- __migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu);
- local_irq_enable();
- return 0;
-}
+#endif /* CONFIG_NUMA_BALANCING */
#ifdef CONFIG_HOTPLUG_CPU
-
/*
* Ensures that the idle task is using init_mm right before its cpu goes
* offline.
@@ -4993,9 +5091,9 @@ static struct task_struct fake_task = {
* there's no concurrency possible, we hold the required locks anyway
* because of lock validation efforts.
*/
-static void migrate_tasks(unsigned int dead_cpu)
+static void migrate_tasks(struct rq *dead_rq)
{
- struct rq *rq = cpu_rq(dead_cpu);
+ struct rq *rq = dead_rq;
struct task_struct *next, *stop = rq->stop;
int dest_cpu;
@@ -5017,7 +5115,7 @@ static void migrate_tasks(unsigned int dead_cpu)
*/
update_rq_clock(rq);
- for ( ; ; ) {
+ for (;;) {
/*
* There's this thread running, bail when that's the only
* remaining thread.
@@ -5025,22 +5123,29 @@ static void migrate_tasks(unsigned int dead_cpu)
if (rq->nr_running == 1)
break;
+ /*
+ * Ensure rq->lock covers the entire task selection
+ * until the migration.
+ */
+ lockdep_pin_lock(&rq->lock);
next = pick_next_task(rq, &fake_task);
BUG_ON(!next);
next->sched_class->put_prev_task(rq, next);
/* Find suitable destination for @next, with force if needed. */
- dest_cpu = select_fallback_rq(dead_cpu, next);
- raw_spin_unlock(&rq->lock);
-
- __migrate_task(next, dead_cpu, dest_cpu);
-
- raw_spin_lock(&rq->lock);
+ dest_cpu = select_fallback_rq(dead_rq->cpu, next);
+
+ lockdep_unpin_lock(&rq->lock);
+ rq = __migrate_task(rq, next, dest_cpu);
+ if (rq != dead_rq) {
+ raw_spin_unlock(&rq->lock);
+ rq = dead_rq;
+ raw_spin_lock(&rq->lock);
+ }
}
rq->stop = stop;
}
-
#endif /* CONFIG_HOTPLUG_CPU */
#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
@@ -5219,7 +5324,7 @@ static void register_sched_domain_sysctl(void)
static void unregister_sched_domain_sysctl(void)
{
}
-#endif
+#endif /* CONFIG_SCHED_DEBUG && CONFIG_SYSCTL */
static void set_rq_online(struct rq *rq)
{
@@ -5288,7 +5393,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
set_rq_offline(rq);
}
- migrate_tasks(cpu);
+ migrate_tasks(rq);
BUG_ON(rq->nr_running != 1); /* the migration thread */
raw_spin_unlock_irqrestore(&rq->lock, flags);
break;
@@ -5314,7 +5419,7 @@ static struct notifier_block migration_notifier = {
.priority = CPU_PRI_MIGRATION,
};
-static void __cpuinit set_cpu_rq_start_time(void)
+static void set_cpu_rq_start_time(void)
{
int cpu = smp_processor_id();
struct rq *rq = cpu_rq(cpu);
@@ -5366,9 +5471,6 @@ static int __init migration_init(void)
return 0;
}
early_initcall(migration_init);
-#endif
-
-#ifdef CONFIG_SMP
static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */
@@ -6594,7 +6696,7 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
struct sched_group *sg;
struct sched_group_capacity *sgc;
- sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
+ sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
GFP_KERNEL, cpu_to_node(j));
if (!sd)
return -ENOMEM;
@@ -7032,6 +7134,9 @@ void __init sched_init_smp(void)
alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
+ /* nohz_full won't take effect without isolating the cpus. */
+ tick_nohz_full_add_cpus_to(cpu_isolated_map);
+
sched_init_numa();
/*
@@ -7068,8 +7173,6 @@ void __init sched_init_smp(void)
}
#endif /* CONFIG_SMP */
-const_debug unsigned int sysctl_timer_migration = 1;
-
int in_sched_functions(unsigned long addr)
{
return in_lock_functions(addr) ||
@@ -7199,7 +7302,7 @@ void __init sched_init(void)
rq->sd = NULL;
rq->rd = NULL;
rq->cpu_capacity = rq->cpu_capacity_orig = SCHED_CAPACITY_SCALE;
- rq->post_schedule = 0;
+ rq->balance_callback = NULL;
rq->active_balance = 0;
rq->next_balance = jiffies;
rq->push_cpu = 0;
@@ -7329,32 +7432,12 @@ EXPORT_SYMBOL(___might_sleep);
#endif
#ifdef CONFIG_MAGIC_SYSRQ
-static void normalize_task(struct rq *rq, struct task_struct *p)
+void normalize_rt_tasks(void)
{
- const struct sched_class *prev_class = p->sched_class;
+ struct task_struct *g, *p;
struct sched_attr attr = {
.sched_policy = SCHED_NORMAL,
};
- int old_prio = p->prio;
- int queued;
-
- queued = task_on_rq_queued(p);
- if (queued)
- dequeue_task(rq, p, 0);
- __setscheduler(rq, p, &attr, false);
- if (queued) {
- enqueue_task(rq, p, 0);
- resched_curr(rq);
- }
-
- check_class_changed(rq, p, prev_class, old_prio);
-}
-
-void normalize_rt_tasks(void)
-{
- struct task_struct *g, *p;
- unsigned long flags;
- struct rq *rq;
read_lock(&tasklist_lock);
for_each_process_thread(g, p) {
@@ -7381,9 +7464,7 @@ void normalize_rt_tasks(void)
continue;
}
- rq = task_rq_lock(p, &flags);
- normalize_task(rq, p);
- task_rq_unlock(rq, p, &flags);
+ __sched_setscheduler(p, &attr, false, false);
}
read_unlock(&tasklist_lock);
}
@@ -7734,11 +7815,11 @@ static long sched_group_rt_runtime(struct task_group *tg)
return rt_runtime_us;
}
-static int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
+static int sched_group_set_rt_period(struct task_group *tg, u64 rt_period_us)
{
u64 rt_runtime, rt_period;
- rt_period = (u64)rt_period_us * NSEC_PER_USEC;
+ rt_period = rt_period_us * NSEC_PER_USEC;
rt_runtime = tg->rt_bandwidth.rt_runtime;
return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
@@ -8105,10 +8186,8 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
__refill_cfs_bandwidth_runtime(cfs_b);
/* restart the period timer (if active) to handle new period expiry */
- if (runtime_enabled && cfs_b->timer_active) {
- /* force a reprogram */
- __start_cfs_bandwidth(cfs_b, true);
- }
+ if (runtime_enabled)
+ start_cfs_bandwidth(cfs_b);
raw_spin_unlock_irq(&cfs_b->lock);
for_each_online_cpu(i) {