Diffstat (limited to 'kernel/sched')
 -rw-r--r--  kernel/sched/Makefile                                         |    4
 -rw-r--r--  kernel/sched/MuQSS.c  (renamed from kernel/sched/bfs.c)       | 2494
 -rw-r--r--  kernel/sched/MuQSS.h  (renamed from kernel/sched/bfs_sched.h) |  102
 -rw-r--r--  kernel/sched/cpufreq.c                                        |    4
 -rw-r--r--  kernel/sched/cpufreq_schedutil.c                              |    4
 -rw-r--r--  kernel/sched/idle.c                                           |    4
 -rw-r--r--  kernel/sched/stats.c                                          |    4
 7 files changed, 1621 insertions(+), 995 deletions(-)
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 8c1204fa3..a787aa942 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -15,8 +15,8 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer
endif
-ifdef CONFIG_SCHED_BFS
-obj-y += bfs.o clock.o
+ifdef CONFIG_SCHED_MUQSS
+obj-y += MuQSS.o clock.o
else
obj-y += core.o loadavg.o clock.o cputime.o
obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o
diff --git a/kernel/sched/bfs.c b/kernel/sched/MuQSS.c
index bb5bac4b2..6a656ad4b 100644
--- a/kernel/sched/bfs.c
+++ b/kernel/sched/MuQSS.c
@@ -1,5 +1,5 @@
/*
- * kernel/sched/bfs.c, was kernel/sched.c
+ * kernel/sched/MuQSS.c, was kernel/sched.c
*
* Kernel scheduler and related syscalls
*
@@ -26,6 +26,8 @@
* Thomas Gleixner, Mike Kravetz
* 2009-08-13 Brainfuck deadline scheduling policy by Con Kolivas deletes
* a whole lot of those previous things.
+ * 2016-10-01 Multiple Queue Skiplist Scheduler scalable evolution of BFS
+ * scheduler by Con Kolivas.
*/
#include <linux/kasan.h>
@@ -74,7 +76,7 @@
#include <linux/context_tracking.h>
#include <linux/sched/prio.h>
#include <linux/tick.h>
-#include <linux/skip_lists.h>
+#include <linux/skip_list.h>
#include <asm/irq_regs.h>
#include <asm/switch_to.h>
@@ -92,11 +94,10 @@
#define CREATE_TRACE_POINTS
#include <trace/events/sched.h>
-#include "bfs_sched.h"
+#include "MuQSS.h"
#define rt_prio(prio) unlikely((prio) < MAX_RT_PRIO)
#define rt_task(p) rt_prio((p)->prio)
-#define rt_queue(rq) rt_prio((rq)->rq_prio)
#define batch_task(p) (unlikely((p)->policy == SCHED_BATCH))
#define is_rt_policy(policy) ((policy) == SCHED_FIFO || \
(policy) == SCHED_RR)
@@ -105,29 +106,26 @@
#define is_idle_policy(policy) ((policy) == SCHED_IDLEPRIO)
#define idleprio_task(p) unlikely(is_idle_policy((p)->policy))
#define task_running_idle(p) unlikely((p)->prio == IDLE_PRIO)
-#define idle_queue(rq) (unlikely(is_idle_policy((rq)->rq_policy)))
#define is_iso_policy(policy) ((policy) == SCHED_ISO)
#define iso_task(p) unlikely(is_iso_policy((p)->policy))
-#define iso_queue(rq) unlikely(is_iso_policy((rq)->rq_policy))
#define task_running_iso(p) unlikely((p)->prio == ISO_PRIO)
-#define rq_running_iso(rq) ((rq)->rq_prio == ISO_PRIO)
#define rq_idle(rq) ((rq)->rq_prio == PRIO_LIMIT)
-#define ISO_PERIOD ((5 * HZ * grq.noc) + 1)
+#define ISO_PERIOD (5 * HZ)
-#define SCHED_PRIO(p) ((p) + MAX_RT_PRIO)
#define STOP_PRIO (MAX_RT_PRIO - 1)
/*
* Some helpers for converting to/from various scales. Use shifts to get
* approximate multiples of ten for less overhead.
*/
-#define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ))
-#define JIFFY_NS (1000000000 / HZ)
-#define HALF_JIFFY_NS (1000000000 / HZ / 2)
-#define HALF_JIFFY_US (1000000 / HZ / 2)
+#define JIFFIES_TO_NS(TIME) ((TIME) * (1073741824 / HZ))
+#define JIFFY_NS (1073741824 / HZ)
+#define NS_TO_JIFFIES(TIME) ((TIME) / JIFFY_NS)
+#define HALF_JIFFY_NS (1073741824 / HZ / 2)
+#define HALF_JIFFY_US (1048576 / HZ / 2)
#define MS_TO_NS(TIME) ((TIME) << 20)
#define MS_TO_US(TIME) ((TIME) << 10)
#define NS_TO_MS(TIME) ((TIME) >> 20)
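
A quick numeric check of the power-of-two scales above (standalone sketch, not part of the patch; HZ of 1000 assumed) suggests why the 2^30 base fits the shift-based MS/NS macros better than the old 10^9 base:

#include <stdio.h>

/* Illustrative only: compare the 2^30-based jiffy conversion with a true
 * 10^9 base, for an assumed HZ of 1000. */
int main(void)
{
	const long hz = 1000;
	long jiffy_ns_pow2 = 1073741824 / hz;	/* JIFFY_NS as defined above   */
	long jiffy_ns_true = 1000000000 / hz;	/* what the old macro produced */

	printf("2^30 base: %ld -> NS_TO_MS %ld\n", jiffy_ns_pow2, jiffy_ns_pow2 >> 20);
	printf("10^9 base: %ld -> NS_TO_MS %ld\n", jiffy_ns_true, jiffy_ns_true >> 20);
	/* prints 1073741 -> 1 (one jiffy rounds to one "ms"), but 1000000 -> 0,
	 * i.e. a true-nanosecond jiffy truncates to nothing through >> 20. */
	return 0;
}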
@@ -137,7 +135,7 @@
void print_scheduler_version(void)
{
- printk(KERN_INFO "BFS CPU scheduler v0.512 by Con Kolivas.\n");
+ printk(KERN_INFO "MuQSS CPU scheduler v0.114 by Con Kolivas.\n");
}
/*
@@ -182,30 +180,24 @@ static inline int timeslice(void)
}
/*
- * The global runqueue data that all CPUs work off. Data is protected either
- * by the global grq lock, or the discrete lock that precedes the data in this
- * struct.
+ * The global runqueue data that all CPUs work off. Contains only atomic
+ * variables and a cpu bitmap set atomically.
*/
struct global_rq {
- raw_spinlock_t lock;
- unsigned long nr_running;
- unsigned long nr_uninterruptible;
- unsigned long long nr_switches;
- unsigned long qnr; /* queued not running */
+#ifdef CONFIG_SMP
+ atomic_t nr_running ____cacheline_aligned_in_smp;
+ atomic_t nr_uninterruptible ____cacheline_aligned_in_smp;
+ atomic64_t nr_switches ____cacheline_aligned_in_smp;
+ atomic_t qnr ____cacheline_aligned_in_smp; /* queued not running */
+#else
+ atomic_t nr_running ____cacheline_aligned;
+ atomic_t nr_uninterruptible ____cacheline_aligned;
+ atomic64_t nr_switches ____cacheline_aligned;
+ atomic_t qnr ____cacheline_aligned; /* queued not running */
+#endif
#ifdef CONFIG_SMP
cpumask_t cpu_idle_map;
- bool idle_cpus;
#endif
- int noc; /* num_online_cpus stored and updated when it changes */
- u64 niffies; /* Nanosecond jiffies */
- unsigned long last_jiffy; /* Last jiffy we updated niffies */
-
- raw_spinlock_t iso_lock;
- int iso_ticks;
- bool iso_refractory;
-
- skiplist_node node;
- skiplist *sl;
};
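
The counters above are adjusted with plain atomic operations elsewhere in this patch (inc_qnr(), activate_task() and friends), so no lock is taken just to account a task. A minimal sketch of that usage, assuming only the grq fields defined here:

/* Sketch only: lock-free accounting mirroring the accessors added later. */
static inline void example_account_new_runnable(void)
{
	atomic_inc(&grq.nr_running);
	atomic_inc(&grq.qnr);		/* queued but not yet running */
}

static inline void example_account_now_running(void)
{
	atomic_dec(&grq.qnr);		/* a CPU picked the task up */
}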
#ifdef CONFIG_SMP
@@ -241,7 +233,11 @@ static struct root_domain def_root_domain;
#endif /* CONFIG_SMP */
/* There can be only one */
-static struct global_rq grq;
+#ifdef CONFIG_SMP
+static struct global_rq grq ____cacheline_aligned_in_smp;
+#else
+static struct global_rq grq ____cacheline_aligned;
+#endif
static DEFINE_MUTEX(sched_hotcpu_mutex);
@@ -276,77 +272,16 @@ int __weak arch_sd_sibling_asym_packing(void)
struct rq *uprq;
#endif /* CONFIG_SMP */
-static inline void update_rq_clock(struct rq *rq);
-
-/*
- * Sanity check should sched_clock return bogus values. We make sure it does
- * not appear to go backwards, and use jiffies to determine the maximum and
- * minimum it could possibly have increased, and round down to the nearest
- * jiffy when it falls outside this.
- */
-static inline void niffy_diff(s64 *niff_diff, int jiff_diff)
-{
- unsigned long min_diff, max_diff;
-
- if (jiff_diff > 1)
- min_diff = JIFFIES_TO_NS(jiff_diff - 1);
- else
- min_diff = 1;
- /* Round up to the nearest tick for maximum */
- max_diff = JIFFIES_TO_NS(jiff_diff + 1);
-
- if (unlikely(*niff_diff < min_diff || *niff_diff > max_diff))
- *niff_diff = min_diff;
-}
-
#ifdef CONFIG_SMP
static inline int cpu_of(struct rq *rq)
{
return rq->cpu;
}
-
-/*
- * Niffies are a globally increasing nanosecond counter. Whenever a runqueue
- * clock is updated with the grq.lock held, it is an opportunity to update the
- * niffies value. Any CPU can update it by adding how much its clock has
- * increased since it last updated niffies, minus any added niffies by other
- * CPUs.
- */
-static inline void update_clocks(struct rq *rq)
-{
- s64 ndiff;
- long jdiff;
-
- update_rq_clock(rq);
- ndiff = rq->clock - rq->old_clock;
- /* old_clock is only updated when we are updating niffies */
- rq->old_clock = rq->clock;
- ndiff -= grq.niffies - rq->last_niffy;
- jdiff = jiffies - grq.last_jiffy;
- niffy_diff(&ndiff, jdiff);
- grq.last_jiffy += jdiff;
- grq.niffies += ndiff;
- rq->last_niffy = grq.niffies;
-}
#else /* CONFIG_SMP */
static inline int cpu_of(struct rq *rq)
{
return 0;
}
-
-static inline void update_clocks(struct rq *rq)
-{
- s64 ndiff;
- long jdiff;
-
- update_rq_clock(rq);
- ndiff = rq->clock - rq->old_clock;
- rq->old_clock = rq->clock;
- jdiff = jiffies - grq.last_jiffy;
- niffy_diff(&ndiff, jdiff);
- grq.last_jiffy += jdiff;
- grq.niffies += ndiff;
-}
#endif
#include "stats.h"
@@ -362,10 +297,10 @@ static inline void update_clocks(struct rq *rq)
#endif
/*
- * All common locking functions performed on grq.lock. rq->clock is local to
+ * All common locking functions performed on rq->lock. rq->clock is local to
* the CPU accessing it so it can be modified just with interrupts disabled
* when we're not updating niffies.
- * Looking up task_rq must be done under grq.lock to be safe.
+ * Looking up task_rq must be done under rq->lock to be safe.
*/
static void update_rq_clock_task(struct rq *rq, s64 delta);
@@ -379,102 +314,501 @@ static inline void update_rq_clock(struct rq *rq)
update_rq_clock_task(rq, delta);
}
-static inline bool task_running(struct task_struct *p)
+/*
+ * Niffies are a globally increasing nanosecond counter. They're only used by
+ * update_load_avg and time_slice_expired, however deadlines are based on them
+ * across CPUs. Update them whenever we will call one of those functions, and
+ * synchronise them across CPUs whenever we hold both runqueue locks.
+ */
+static inline void update_clocks(struct rq *rq)
{
+ s64 ndiff, minndiff;
+ long jdiff;
+
+ update_rq_clock(rq);
+ ndiff = rq->clock - rq->old_clock;
+ rq->old_clock = rq->clock;
+ jdiff = jiffies - rq->last_jiffy;
+
+ /* Subtract any niffies added by balancing with other rqs */
+ ndiff -= rq->niffies - rq->last_niffy;
+ minndiff = JIFFIES_TO_NS(jdiff) - rq->niffies + rq->last_jiffy_niffies;
+ if (minndiff < 0)
+ minndiff = 0;
+ ndiff = max(ndiff, minndiff);
+ rq->niffies += ndiff;
+ rq->last_niffy = rq->niffies;
+ if (jdiff) {
+ rq->last_jiffy += jdiff;
+ rq->last_jiffy_niffies = rq->niffies;
+ }
+}
+
+static inline int task_current(struct rq *rq, struct task_struct *p)
+{
+ return rq->curr == p;
+}
+
+static inline int task_running(struct rq *rq, struct task_struct *p)
+{
+#ifdef CONFIG_SMP
return p->on_cpu;
+#else
+ return task_current(rq, p);
+#endif
}
-static inline void grq_lock(void)
- __acquires(grq.lock)
+static inline int task_on_rq_queued(struct task_struct *p)
{
- raw_spin_lock(&grq.lock);
+ return p->on_rq == TASK_ON_RQ_QUEUED;
}
-static inline void grq_unlock(void)
- __releases(grq.lock)
+static inline int task_on_rq_migrating(struct task_struct *p)
{
- raw_spin_unlock(&grq.lock);
+ return p->on_rq == TASK_ON_RQ_MIGRATING;
}
-static inline void grq_lock_irq(void)
- __acquires(grq.lock)
+static inline void rq_lock(struct rq *rq)
+ __acquires(rq->lock)
{
- raw_spin_lock_irq(&grq.lock);
+ raw_spin_lock(&rq->lock);
}
-static inline void time_lock_grq(struct rq *rq)
+static inline int rq_trylock(struct rq *rq)
+ __acquires(rq->lock)
{
- grq_lock();
- update_clocks(rq);
+ return raw_spin_trylock(&rq->lock);
+}
+
+static inline void rq_unlock(struct rq *rq)
+ __releases(rq->lock)
+{
+ raw_spin_unlock(&rq->lock);
}
-static inline void grq_unlock_irq(void)
- __releases(grq.lock)
+static inline struct rq *this_rq_lock(void)
+ __acquires(rq->lock)
{
- raw_spin_unlock_irq(&grq.lock);
+ struct rq *rq;
+
+ local_irq_disable();
+ rq = this_rq();
+ raw_spin_lock(&rq->lock);
+
+ return rq;
}
-static inline void grq_lock_irqsave(unsigned long *flags)
- __acquires(grq.lock)
+/*
+ * Any time we have two runqueues locked we use that as an opportunity to
+ * synchronise niffies to the highest value as idle ticks may have artificially
+ * kept niffies low on one CPU and the truth can only be later.
+ */
+static inline void synchronise_niffies(struct rq *rq1, struct rq *rq2)
{
- raw_spin_lock_irqsave(&grq.lock, *flags);
+ if (rq1->niffies > rq2->niffies)
+ rq2->niffies = rq1->niffies;
+ else
+ rq1->niffies = rq2->niffies;
}
-static inline void grq_unlock_irqrestore(unsigned long *flags)
- __releases(grq.lock)
+/*
+ * double_rq_lock - safely lock two runqueues
+ *
+ * Note this does not disable interrupts like task_rq_lock,
+ * you need to do so manually before calling.
+ */
+
+/* For when we know rq1 != rq2 */
+static inline void __double_rq_lock(struct rq *rq1, struct rq *rq2)
+ __acquires(rq1->lock)
+ __acquires(rq2->lock)
{
- raw_spin_unlock_irqrestore(&grq.lock, *flags);
+ if (rq1 < rq2) {
+ raw_spin_lock(&rq1->lock);
+ raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
+ } else {
+ raw_spin_lock(&rq2->lock);
+ raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
+ }
}
-static inline struct rq
-*task_grq_lock(struct task_struct *p, unsigned long *flags)
- __acquires(p->pi_lock)
+static inline void double_rq_lock(struct rq *rq1, struct rq *rq2)
+ __acquires(rq1->lock)
+ __acquires(rq2->lock)
+{
+ BUG_ON(!irqs_disabled());
+ if (rq1 == rq2) {
+ raw_spin_lock(&rq1->lock);
+ __acquire(rq2->lock); /* Fake it out ;) */
+ } else
+ __double_rq_lock(rq1, rq2);
+ synchronise_niffies(rq1, rq2);
+}
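
Because __double_rq_lock() always takes the lower-addressed lock first, two CPUs locking the same pair in opposite argument order cannot deadlock. A usage sketch (illustrative only; as the comment above notes, interrupts must already be disabled by the caller):

/* Hypothetical caller, not from the patch. */
static void example_with_both_locked(struct rq *rqa, struct rq *rqb)
{
	local_irq_disable();
	double_rq_lock(rqa, rqb);	/* address-ordered, also syncs niffies */
	/* ... work while both runqueues are stable ... */
	double_rq_unlock(rqa, rqb);
	local_irq_enable();
}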
+
+/*
+ * double_rq_unlock - safely unlock two runqueues
+ *
+ * Note this does not restore interrupts like task_rq_unlock,
+ * you need to do so manually after calling.
+ */
+static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2)
+ __releases(rq1->lock)
+ __releases(rq2->lock)
+{
+ raw_spin_unlock(&rq1->lock);
+ if (rq1 != rq2)
+ raw_spin_unlock(&rq2->lock);
+ else
+ __release(rq2->lock);
+}
+
+/* Must be sure rq1 != rq2 and irqs are disabled */
+static inline void lock_second_rq(struct rq *rq1, struct rq *rq2)
+ __releases(rq1->lock)
+ __acquires(rq1->lock)
+ __acquires(rq2->lock)
+{
+ BUG_ON(!irqs_disabled());
+ if (unlikely(!raw_spin_trylock(&rq2->lock))) {
+ raw_spin_unlock(&rq1->lock);
+ __double_rq_lock(rq1, rq2);
+ }
+ synchronise_niffies(rq1, rq2);
+}
+
+static inline void lock_all_rqs(void)
+{
+ int cpu;
+
+ preempt_disable();
+ for_each_possible_cpu(cpu) {
+ struct rq *rq = cpu_rq(cpu);
+
+ do_raw_spin_lock(&rq->lock);
+ }
+}
+
+static inline void unlock_all_rqs(void)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ struct rq *rq = cpu_rq(cpu);
+
+ do_raw_spin_unlock(&rq->lock);
+ }
+ preempt_enable();
+}
+
+/* Specially nest trylock an rq */
+static inline bool trylock_rq(struct rq *this_rq, struct rq *rq)
+{
+ if (unlikely(!do_raw_spin_trylock(&rq->lock)))
+ return false;
+ spin_acquire(&rq->lock.dep_map, SINGLE_DEPTH_NESTING, 1, _RET_IP_);
+ synchronise_niffies(this_rq, rq);
+ return true;
+}
+
+/* Unlock a specially nested trylocked rq */
+static inline void unlock_rq(struct rq *rq)
+{
+ spin_release(&rq->lock.dep_map, 1, _RET_IP_);
+ do_raw_spin_unlock(&rq->lock);
+}
+
+static inline void rq_lock_irq(struct rq *rq)
+ __acquires(rq->lock)
+{
+ raw_spin_lock_irq(&rq->lock);
+}
+
+static inline void rq_unlock_irq(struct rq *rq)
+ __releases(rq->lock)
+{
+ raw_spin_unlock_irq(&rq->lock);
+}
+
+static inline void rq_lock_irqsave(struct rq *rq, unsigned long *flags)
+ __acquires(rq->lock)
{
- raw_spin_lock_irqsave(&p->pi_lock, *flags);
- grq_lock();
- return task_rq(p);
+ raw_spin_lock_irqsave(&rq->lock, *flags);
+}
+
+static inline void rq_unlock_irqrestore(struct rq *rq, unsigned long *flags)
+ __releases(rq->lock)
+{
+ raw_spin_unlock_irqrestore(&rq->lock, *flags);
}
static inline struct rq
-*time_task_grq_lock(struct task_struct *p, unsigned long *flags)
+*task_rq_lock(struct task_struct *p, unsigned long *flags)
+ __acquires(p->pi_lock)
+ __acquires(rq->lock)
{
- struct rq *rq = task_grq_lock(p, flags);
+ struct rq *rq;
- update_clocks(rq);
+ while (42) {
+ raw_spin_lock_irqsave(&p->pi_lock, *flags);
+ rq = task_rq(p);
+ raw_spin_lock(&rq->lock);
+ if (likely(rq == task_rq(p)))
+ break;
+ raw_spin_unlock(&rq->lock);
+ raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
+ }
return rq;
}
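
The loop above exists because the task can migrate between reading task_rq(p) and acquiring that runqueue's lock; only once the lock is held and task_rq(p) still matches is the task pinned. A minimal caller sketch (illustrative only, helper name made up):

/* Sketch of the expected task_rq_lock()/task_rq_unlock() calling pattern. */
static void example_inspect_task(struct task_struct *p)
{
	unsigned long flags;
	struct rq *rq = task_rq_lock(p, &flags);

	/* p cannot change runqueue here: p->pi_lock and rq->lock are both held */

	task_rq_unlock(rq, p, &flags);
}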
-static inline void task_grq_unlock(struct task_struct *p, unsigned long *flags)
+static inline void task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags)
+ __releases(rq->lock)
__releases(p->pi_lock)
{
- grq_unlock();
+ rq_unlock(rq);
raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
}
-static inline void time_grq_lock(struct rq *rq, unsigned long *flags)
+static inline struct rq *__task_rq_lock(struct task_struct *p)
+ __acquires(rq->lock)
+{
+ struct rq *rq;
+
+ lockdep_assert_held(&p->pi_lock);
+
+ while (42) {
+ rq = task_rq(p);
+ raw_spin_lock(&rq->lock);
+ if (likely(rq == task_rq(p)))
+ break;
+ raw_spin_unlock(&rq->lock);
+ }
+ return rq;
+}
+
+static inline void __task_rq_unlock(struct rq *rq)
+{
+ rq_unlock(rq);
+}
+
+/*
+ * cmpxchg based fetch_or, macro so it works for different integer types
+ */
+#define fetch_or(ptr, mask) \
+ ({ \
+ typeof(ptr) _ptr = (ptr); \
+ typeof(mask) _mask = (mask); \
+ typeof(*_ptr) _old, _val = *_ptr; \
+ \
+ for (;;) { \
+ _old = cmpxchg(_ptr, _val, _val | _mask); \
+ if (_old == _val) \
+ break; \
+ _val = _old; \
+ } \
+ _old; \
+})
+
+#if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG)
+/*
+ * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG,
+ * this avoids any races wrt polling state changes and thereby avoids
+ * spurious IPIs.
+ */
+static bool set_nr_and_not_polling(struct task_struct *p)
+{
+ struct thread_info *ti = task_thread_info(p);
+ return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG);
+}
+
+/*
+ * Atomically set TIF_NEED_RESCHED if TIF_POLLING_NRFLAG is set.
+ *
+ * If this returns true, then the idle task promises to call
+ * sched_ttwu_pending() and reschedule soon.
+ */
+static bool set_nr_if_polling(struct task_struct *p)
+{
+ struct thread_info *ti = task_thread_info(p);
+ typeof(ti->flags) old, val = READ_ONCE(ti->flags);
+
+ for (;;) {
+ if (!(val & _TIF_POLLING_NRFLAG))
+ return false;
+ if (val & _TIF_NEED_RESCHED)
+ return true;
+ old = cmpxchg(&ti->flags, val, val | _TIF_NEED_RESCHED);
+ if (old == val)
+ break;
+ val = old;
+ }
+ return true;
+}
+
+#else
+static bool set_nr_and_not_polling(struct task_struct *p)
+{
+ set_tsk_need_resched(p);
+ return true;
+}
+
+#ifdef CONFIG_SMP
+static bool set_nr_if_polling(struct task_struct *p)
+{
+ return false;
+}
+#endif
+#endif
+
+void wake_q_add(struct wake_q_head *head, struct task_struct *task)
{
- local_irq_save(*flags);
- time_lock_grq(rq);
+ struct wake_q_node *node = &task->wake_q;
+
+ /*
+ * Atomically grab the task, if ->wake_q is !nil already it means
+ * its already queued (either by us or someone else) and will get the
+ * wakeup due to that.
+ *
+ * This cmpxchg() implies a full barrier, which pairs with the write
+ * barrier implied by the wakeup in wake_up_q().
+ */
+ if (cmpxchg(&node->next, NULL, WAKE_Q_TAIL))
+ return;
+
+ get_task_struct(task);
+
+ /*
+ * The head is context local, there can be no concurrency.
+ */
+ *head->lastp = node;
+ head->lastp = &node->next;
+}
+
+void wake_up_q(struct wake_q_head *head)
+{
+ struct wake_q_node *node = head->first;
+
+ while (node != WAKE_Q_TAIL) {
+ struct task_struct *task;
+
+ task = container_of(node, struct task_struct, wake_q);
+ BUG_ON(!task);
+ /* task can safely be re-inserted now */
+ node = node->next;
+ task->wake_q.next = NULL;
+
+ /*
+ * wake_up_process() implies a wmb() to pair with the queueing
+ * in wake_q_add() so as not to miss wakeups.
+ */
+ wake_up_process(task);
+ put_task_struct(task);
+ }
}
static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
{
+ next->on_cpu = 1;
+}
+
+/*
+ * resched_task - mark a task 'to be rescheduled now'.
+ *
+ * On UP this means the setting of the need_resched flag, on SMP it
+ * might also involve a cross-CPU call to trigger the scheduler on
+ * the target CPU.
+ */
+void resched_task(struct task_struct *p)
+{
+ int cpu;
+#ifdef CONFIG_LOCKDEP
+ struct rq *rq = task_rq(p);
+
+ lockdep_assert_held(&rq->lock);
+#endif
+ if (test_tsk_need_resched(p))
+ return;
+
+ cpu = task_cpu(p);
+ if (cpu == smp_processor_id()) {
+ set_tsk_need_resched(p);
+ set_preempt_need_resched();
+ return;
+ }
+
+ if (set_nr_and_not_polling(p))
+ smp_send_reschedule(cpu);
+ else
+ trace_sched_wake_idle_without_ipi(cpu);
+}
+
+/*
+ * A task that is not running or queued will not have a node set.
+ * A task that is queued but not running will have a node set.
+ * A task that is currently running will have ->on_cpu set but no node set.
+ */
+static inline bool task_queued(struct task_struct *p)
+{
+ return !skiplist_node_empty(&p->node);
}
+static void enqueue_task(struct rq *rq, struct task_struct *p, int flags);
+static inline void resched_if_idle(struct rq *rq);
+
static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
{
+#ifdef CONFIG_SMP
+ /*
+ * After ->on_cpu is cleared, the task can be moved to a different CPU.
+ * We must ensure this doesn't happen until the switch is completely
+ * finished.
+ *
+ * In particular, the load of prev->state in finish_task_switch() must
+ * happen before this.
+ *
+ * Pairs with the smp_cond_load_acquire() in try_to_wake_up().
+ */
+ smp_store_release(&prev->on_cpu, 0);
+#endif
#ifdef CONFIG_DEBUG_SPINLOCK
/* this is a valid case when another task releases the spinlock */
- grq.lock.owner = current;
+ rq->lock.owner = current;
#endif
/*
* If we are tracking spinlock dependencies then we have to
* fix up the runqueue lock - which gets 'carried over' from
* prev into current:
*/
- spin_acquire(&grq.lock.dep_map, 0, 0, _THIS_IP_);
+ spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
- grq_unlock_irq();
+#ifdef CONFIG_SMP
+ /*
+ * If prev was marked as migrating to another CPU in return_task, drop
+ * the local runqueue lock but leave interrupts disabled and grab the
+ * remote lock we're migrating it to before enabling them.
+ */
+ if (unlikely(task_on_rq_migrating(prev))) {
+ sched_info_dequeued(rq, prev);
+ /*
+ * We move the ownership of prev to the new cpu now. ttwu can't
+ * activate prev to the wrong cpu since it has to grab this
+ * runqueue in ttwu_remote.
+ */
+ task_thread_info(prev)->cpu = prev->wake_cpu;
+ raw_spin_unlock(&rq->lock);
+
+ raw_spin_lock(&prev->pi_lock);
+ rq = __task_rq_lock(prev);
+ /* Check that someone else hasn't already queued prev */
+ if (likely(!task_queued(prev))) {
+ enqueue_task(rq, prev, 0);
+ prev->on_rq = TASK_ON_RQ_QUEUED;
+ /* Wake up the CPU if it's not already running */
+ resched_if_idle(rq);
+ }
+ raw_spin_unlock(&prev->pi_lock);
+ }
+#endif
+ raw_spin_unlock_irq(&rq->lock);
}
static inline bool deadline_before(u64 deadline, u64 time)
@@ -482,11 +816,6 @@ static inline bool deadline_before(u64 deadline, u64 time)
return (deadline < time);
}
-static inline bool deadline_after(u64 deadline, u64 time)
-{
- return (deadline > time);
-}
-
/*
* Deadline is "now" in niffies + (offset by priority). Setting the deadline
* is the key to everything. It distributes cpu fairly amongst tasks of the
@@ -521,26 +850,54 @@ static inline int ms_longest_deadline_diff(void)
return NS_TO_MS(longest_deadline_diff());
}
+static inline int rq_load(struct rq *rq)
+{
+ return rq->sl->entries + !rq_idle(rq);
+}
+
+static inline bool rq_local(struct rq *rq);
+
/*
- * A task that is not running or queued will not have a node set.
- * A task that is queued but not running will have a node set.
- * A task that is currently running will have ->on_cpu set but no node set.
+ * Update the load average for feeding into cpu frequency governors. Use a
+ * rough estimate of a rolling average with ~ time constant of 32ms.
+ * 80/128 ~ 0.63. * 80 / 32768 / 128 == * 5 / 262144
+ * Make sure a call to update_clocks has been made before calling this to get
+ * an updated rq->niffies.
*/
-static inline bool task_queued(struct task_struct *p)
+static void update_load_avg(struct rq *rq)
{
- return !skiplist_node_empty(&p->node);
+ /* rq clock can go backwards so skip update if that happens */
+ if (likely(rq->clock > rq->load_update)) {
+ unsigned long us_interval = (rq->clock - rq->load_update) >> 10;
+ long load, curload = rq_load(rq);
+
+ load = rq->load_avg - (rq->load_avg * us_interval * 5 / 262144);
+ if (unlikely(load < 0))
+ load = 0;
+ load += curload * curload * SCHED_CAPACITY_SCALE * us_interval * 5 / 262144;
+ rq->load_avg = load;
+ } else
+ return;
+
+ rq->load_update = rq->clock;
+ if (likely(rq_local(rq)))
+ cpufreq_trigger(rq->niffies, rq->load_avg);
}
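
As a rough check of the ~32ms time constant quoted above (standalone sketch, not part of the patch): one decay step spanning 32768us removes 32768 * 5 / 262144 = 80/128, about 63% of the old load, leaving roughly 1/e of it behind.

#include <stdio.h>

int main(void)
{
	long long load = 1000000;
	long long us_interval = 32768;			/* ~32ms in microseconds */

	load -= load * us_interval * 5 / 262144;	/* same factor as update_load_avg() */
	printf("load left after one ~32ms step: %lld\n", load);	/* 375000 */
	return 0;
}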
/*
- * Removing from the global runqueue. Enter with grq locked. Deleting a task
+ * Removing from the runqueue. Enter with rq locked. Deleting a task
* from the skip list is done via the stored node reference in the task struct
* and does not require a full look up. Thus it occurs in O(k) time where k
- * is the "level" of the list the task was stored at - usually < 4, max 16.
+ * is the "level" of the list the task was stored at - usually < 4, max 8.
*/
-static void dequeue_task(struct task_struct *p)
+static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
{
- skiplist_delete(grq.sl, &p->node);
- sched_info_dequeued(task_rq(p), p);
+ skiplist_delete(rq->sl, &p->node);
+ rq->best_key = rq->node.next[0]->key;
+ update_clocks(rq);
+ if (!(flags & DEQUEUE_SAVE))
+ sched_info_dequeued(task_rq(p), p);
+ update_load_avg(rq);
}
#ifdef CONFIG_PREEMPT_RCU
@@ -566,15 +923,15 @@ static bool idleprio_suitable(struct task_struct *p)
* To determine if a task of SCHED_ISO can run in pseudo-realtime, we check
* that the iso_refractory flag is not set.
*/
-static bool isoprio_suitable(void)
+static inline bool isoprio_suitable(struct rq *rq)
{
- return !grq.iso_refractory;
+ return !rq->iso_refractory;
}
/*
- * Adding to the global runqueue. Enter with grq locked.
+ * Adding to the runqueue. Enter with rq locked.
*/
-static void enqueue_task(struct task_struct *p, struct rq *rq)
+static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
{
unsigned int randseed;
u64 sl_id;
@@ -582,7 +939,7 @@ static void enqueue_task(struct task_struct *p, struct rq *rq)
if (!rt_task(p)) {
/* Check it hasn't gotten rt from PI */
if ((idleprio_task(p) && idleprio_suitable(p)) ||
- (iso_task(p) && isoprio_suitable()))
+ (iso_task(p) && isoprio_suitable(rq)))
p->prio = p->normal_prio;
else
p->prio = NORMAL_PRIO;
@@ -604,9 +961,8 @@ static void enqueue_task(struct task_struct *p, struct rq *rq)
else {
sl_id = p->deadline;
if (idleprio_task(p)) {
- /* Set it to cope with 4 left shifts with locality_diff */
if (p->prio == IDLE_PRIO)
- sl_id |= 0x00FF000000000000;
+ sl_id |= 0xF000000000000000;
else
sl_id += longest_deadline_diff();
}
@@ -615,14 +971,13 @@ static void enqueue_task(struct task_struct *p, struct rq *rq)
* Some architectures don't have better than microsecond resolution
* so mask out ~microseconds as the random seed for skiplist insertion.
*/
- randseed = (grq.niffies >> 10) & 0xFFFFFFFF;
- skiplist_insert(grq.sl, &p->node, sl_id, p, randseed);
- sched_info_queued(rq, p);
-}
-
-static inline void requeue_task(struct task_struct *p)
-{
- sched_info_queued(task_rq(p), p);
+ update_clocks(rq);
+ if (!(flags & ENQUEUE_RESTORE))
+ sched_info_queued(rq, p);
+ randseed = (rq->niffies >> 10) & 0xFFFFFFFF;
+ skiplist_insert(rq->sl, &p->node, sl_id, p, randseed);
+ rq->best_key = rq->node.next[0]->key;
+ update_load_avg(rq);
}
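
Since each task embeds its skiplist node, requeueing never needs a lookup: the stored node goes straight back through skiplist_delete() and skiplist_insert() with a new key, which is what keeps removal O(k). A hedged sketch using only the calls visible in this hunk (the helper name is made up):

/* Sketch only: O(k) delete-and-reinsert via the node embedded in the task. */
static void example_requeue(struct rq *rq, struct task_struct *p, u64 new_key)
{
	unsigned int randseed = (rq->niffies >> 10) & 0xFFFFFFFF;

	skiplist_delete(rq->sl, &p->node);		/* no search required */
	skiplist_insert(rq->sl, &p->node, new_key, p, randseed);
	rq->best_key = rq->node.next[0]->key;		/* refresh the cached head key */
}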
/*
@@ -644,13 +999,6 @@ static inline int task_timeslice(struct task_struct *p)
return (rr_interval * task_prio_ratio(p) / 128);
}
-static void resched_task(struct task_struct *p);
-
-static inline void resched_curr(struct rq *rq)
-{
- resched_task(rq->curr);
-}
-
/*
* qnr is the "queued but not running" count which is the total number of
* tasks on the global runqueue list waiting for cpu time but not actually
@@ -658,24 +1006,31 @@ static inline void resched_curr(struct rq *rq)
*/
static inline void inc_qnr(void)
{
- grq.qnr++;
+ atomic_inc(&grq.qnr);
}
static inline void dec_qnr(void)
{
- grq.qnr--;
+ atomic_dec(&grq.qnr);
}
static inline int queued_notrunning(void)
{
- return grq.qnr;
+ return atomic_read(&grq.qnr);
}
-static unsigned long rq_load_avg(struct rq *rq)
+#ifdef CONFIG_SMP
+/* Entered with rq locked */
+static inline void resched_if_idle(struct rq *rq)
{
- return rq->soft_affined * SCHED_CAPACITY_SCALE;
+ if (rq_idle(rq))
+ resched_task(rq->curr);
}
+static inline bool rq_local(struct rq *rq)
+{
+ return (rq->cpu == smp_processor_id());
+}
#ifdef CONFIG_SMT_NICE
static const cpumask_t *thread_cpumask(int cpu);
@@ -751,35 +1106,70 @@ static bool smt_should_schedule(struct task_struct *p, struct rq *this_rq)
#else /* CONFIG_SMT_NICE */
#define smt_schedule(p, this_rq) (true)
#endif /* CONFIG_SMT_NICE */
-#ifdef CONFIG_SMP
+
+static inline void atomic_set_cpu(int cpu, cpumask_t *cpumask)
+{
+ set_bit(cpu, (volatile unsigned long *)cpumask);
+}
+
/*
* The cpu_idle_map stores a bitmap of all the CPUs currently idle to
* allow easy lookup of whether any suitable idle CPUs are available.
* It's cheaper to maintain a binary yes/no if there are any idle CPUs on the
- * idle_cpus variable than to do a full bitmask check when we are busy.
+ * idle_cpus variable than to do a full bitmask check when we are busy. The
+ * bits are set atomically but read locklessly, as an occasional false positive /
+ * negative is harmless.
*/
static inline void set_cpuidle_map(int cpu)
{
- if (likely(cpu_online(cpu))) {
- cpumask_set_cpu(cpu, &grq.cpu_idle_map);
- grq.idle_cpus = true;
- }
+ if (likely(cpu_online(cpu)))
+ atomic_set_cpu(cpu, &grq.cpu_idle_map);
+}
+
+static inline void atomic_clear_cpu(int cpu, cpumask_t *cpumask)
+{
+ clear_bit(cpu, (volatile unsigned long *)cpumask);
}
static inline void clear_cpuidle_map(int cpu)
{
- cpumask_clear_cpu(cpu, &grq.cpu_idle_map);
- if (cpumask_empty(&grq.cpu_idle_map))
- grq.idle_cpus = false;
+ atomic_clear_cpu(cpu, &grq.cpu_idle_map);
}
static bool suitable_idle_cpus(struct task_struct *p)
{
- if (!grq.idle_cpus)
- return false;
return (cpumask_intersects(&p->cpus_allowed, &grq.cpu_idle_map));
}
+/*
+ * Resched current on rq. We don't know if rq is local to this CPU nor if it
+ * is locked so we do not use an intermediate variable for the task to avoid
+ * having it dereferenced.
+ */
+static void resched_curr(struct rq *rq)
+{
+ int cpu;
+
+ if (test_tsk_need_resched(rq->curr))
+ return;
+
+ rq->preempt = rq->curr;
+ cpu = rq->cpu;
+
+ /* We're doing this without holding the rq lock if it's not task_rq */
+
+ if (cpu == smp_processor_id()) {
+ set_tsk_need_resched(rq->curr);
+ set_preempt_need_resched();
+ return;
+ }
+
+ if (set_nr_and_not_polling(rq->curr))
+ smp_send_reschedule(cpu);
+ else
+ trace_sched_wake_idle_without_ipi(cpu);
+}
+
#define CPUIDLE_DIFF_THREAD (1)
#define CPUIDLE_DIFF_CORE (2)
#define CPUIDLE_CACHE_BUSY (4)
@@ -855,30 +1245,48 @@ bool cpus_share_cache(int this_cpu, int that_cpu)
return (this_rq->cpu_locality[that_cpu] < 3);
}
-static bool resched_best_idle(struct task_struct *p)
+/* As per resched_curr but only will resched idle task */
+static inline void resched_idle(struct rq *rq)
+{
+ if (test_tsk_need_resched(rq->idle))
+ return;
+
+ rq->preempt = rq->idle;
+
+ set_tsk_need_resched(rq->idle);
+
+ if (rq_local(rq)) {
+ set_preempt_need_resched();
+ return;
+ }
+
+ smp_send_reschedule(rq->cpu);
+}
+
+static struct rq *resched_best_idle(struct task_struct *p, int cpu)
{
cpumask_t tmpmask;
struct rq *rq;
int best_cpu;
cpumask_and(&tmpmask, &p->cpus_allowed, &grq.cpu_idle_map);
- best_cpu = best_mask_cpu(task_cpu(p), task_rq(p), &tmpmask);
+ best_cpu = best_mask_cpu(cpu, task_rq(p), &tmpmask);
rq = cpu_rq(best_cpu);
if (!smt_schedule(p, rq))
- return false;
- resched_curr(rq);
- return true;
+ return NULL;
+ resched_idle(rq);
+ return rq;
}
static inline void resched_suitable_idle(struct task_struct *p)
{
if (suitable_idle_cpus(p))
- resched_best_idle(p);
+ resched_best_idle(p, task_cpu(p));
}
-static inline int locality_diff(int cpu, struct rq *rq)
+static inline struct rq *rq_order(struct rq *rq, int cpu)
{
- return rq->cpu_locality[cpu];
+ return rq->rq_order[cpu];
}
#else /* CONFIG_SMP */
static inline void set_cpuidle_map(int cpu)
@@ -898,9 +1306,28 @@ static inline void resched_suitable_idle(struct task_struct *p)
{
}
-static inline int locality_diff(int cpu, struct rq *rq)
+static inline void resched_curr(struct rq *rq)
{
- return 0;
+ resched_task(rq->curr);
+}
+
+static inline void resched_if_idle(struct rq *rq)
+{
+}
+
+static inline bool rq_local(struct rq *rq)
+{
+ return true;
+}
+
+static inline struct rq *rq_order(struct rq *rq, int cpu)
+{
+ return rq;
+}
+
+static inline bool smt_schedule(struct task_struct *p, struct rq *rq)
+{
+ return true;
}
#endif /* CONFIG_SMP */
@@ -935,32 +1362,11 @@ static int effective_prio(struct task_struct *p)
}
/*
- * Update the load average for feeding into cpu frequency governors. Use a
- * rough estimate of a rolling average with ~ time constant of 32ms.
- * 80/128 ~ 0.63. * 80 / 32768 / 128 == * 5 / 262144
- */
-static void update_load_avg(struct rq *rq)
-{
- /* rq clock can go backwards so skip update if that happens */
- if (likely(rq->clock > rq->load_update)) {
- unsigned long us_interval = (rq->clock - rq->load_update) >> 10;
- long load;
-
- load = rq->load_avg - (rq->load_avg * us_interval * 5 / 262144);
- if (unlikely(load < 0))
- load = 0;
- load += rq->soft_affined * rq_load_avg(rq) * us_interval * 5 / 262144;
- rq->load_avg = load;
- }
- rq->load_update = rq->clock;
-}
-
-/*
- * activate_task - move a task to the runqueue. Enter with grq locked.
+ * activate_task - move a task to the runqueue. Enter with rq locked.
*/
static void activate_task(struct task_struct *p, struct rq *rq)
{
- update_clocks(rq);
+ resched_if_idle(rq);
/*
* Sleep time is in units of nanosecs, so shift by 20 to get a
@@ -970,134 +1376,137 @@ static void activate_task(struct task_struct *p, struct rq *rq)
if (unlikely(prof_on == SLEEP_PROFILING)) {
if (p->state == TASK_UNINTERRUPTIBLE)
profile_hits(SLEEP_PROFILING, (void *)get_wchan(p),
- (rq->clock_task - p->last_ran) >> 20);
+ (rq->niffies - p->last_ran) >> 20);
}
p->prio = effective_prio(p);
if (task_contributes_to_load(p))
- grq.nr_uninterruptible--;
- enqueue_task(p, rq);
- rq->soft_affined++;
- p->on_rq = 1;
- grq.nr_running++;
+ atomic_dec(&grq.nr_uninterruptible);
+
+ enqueue_task(rq, p, 0);
+ p->on_rq = TASK_ON_RQ_QUEUED;
+ atomic_inc(&grq.nr_running);
inc_qnr();
- update_load_avg(rq);
- cpufreq_trigger(grq.niffies, rq->load_avg);
}
/*
- * deactivate_task - If it's running, it's not on the grq and we can just
- * decrement the nr_running. Enter with grq locked.
+ * deactivate_task - If it's running, it's not on the runqueue and we can just
+ * decrement the nr_running. Enter with rq locked.
*/
static inline void deactivate_task(struct task_struct *p, struct rq *rq)
{
if (task_contributes_to_load(p))
- grq.nr_uninterruptible++;
- rq->soft_affined--;
+ atomic_inc(&grq.nr_uninterruptible);
+
p->on_rq = 0;
- grq.nr_running--;
- update_load_avg(rq);
- cpufreq_trigger(grq.niffies, rq->load_avg);
+ atomic_dec(&grq.nr_running);
+ sched_info_dequeued(rq, p);
}
#ifdef CONFIG_SMP
void set_task_cpu(struct task_struct *p, unsigned int cpu)
{
+ struct rq *rq = task_rq(p);
+ bool queued;
+
#ifdef CONFIG_LOCKDEP
/*
- * The caller should hold either p->pi_lock or grq lock, when changing
- * a task's CPU. ->pi_lock for waking tasks, grq lock for runnable tasks.
+ * The caller should hold either p->pi_lock or rq->lock, when changing
+ * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks.
*
* Furthermore, all task_rq users should acquire both locks, see
- * task_grq_lock().
+ * task_rq_lock().
*/
WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
- lockdep_is_held(&grq.lock)));
+ lockdep_is_held(&task_rq(p)->lock)));
#endif
- if (task_cpu(p) == cpu)
+ if (p->wake_cpu == cpu)
return;
trace_sched_migrate_task(p, cpu);
perf_event_task_migrate(p);
/*
- * After ->cpu is set up to a new value, task_grq_lock(p, ...) can be
+ * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be
* successfully executed on another CPU. We must ensure that updates of
* per-task data have been completed by this moment.
*/
smp_wmb();
- if (p->on_rq) {
- struct rq *rq = task_rq(p);
+ if (task_running(rq, p)) {
+ /*
+ * We should only be calling this on a running task if we're
+ * holding rq lock.
+ */
+ lockdep_assert_held(&rq->lock);
- rq->soft_affined--;
- update_load_avg(rq);
- rq = cpu_rq(cpu);
- rq->soft_affined++;
- update_load_avg(rq);
+ /*
+ * We can't change the task_thread_info cpu on a running task
+ * as p will still be protected by the rq lock of the cpu it
+ * is still running on so we set the wake_cpu for it to be
+ * lazily updated once off the cpu.
+ */
+ p->wake_cpu = cpu;
+ return;
}
- task_thread_info(p)->cpu = cpu;
+
+ if ((queued = task_queued(p)))
+ dequeue_task(rq, p, 0);
+ task_thread_info(p)->cpu = p->wake_cpu = cpu;
+ if (queued)
+ enqueue_task(cpu_rq(cpu), p, 0);
}
#endif /* CONFIG_SMP */
/*
- * Move a task off the global queue and take it to a cpu for it will
+ * Move a task off the runqueue and take it to a cpu where it will
* become the running task.
*/
-static inline void take_task(int cpu, struct task_struct *p)
+static inline void take_task(struct rq *rq, int cpu, struct task_struct *p)
{
+ struct rq *p_rq = task_rq(p);
+
+ dequeue_task(p_rq, p, DEQUEUE_SAVE);
+ if (p_rq != rq) {
+ sched_info_dequeued(p_rq, p);
+ sched_info_queued(rq, p);
+ }
set_task_cpu(p, cpu);
- dequeue_task(p);
dec_qnr();
}
/*
- * Returns a descheduling task to the grq runqueue unless it is being
+ * Returns a descheduling task to the runqueue unless it is being
* deactivated.
*/
-static inline void return_task(struct task_struct *p, struct rq *rq, bool deactivate)
+static inline void return_task(struct task_struct *p, struct rq *rq,
+ int cpu, bool deactivate)
{
if (deactivate)
deactivate_task(p, rq);
else {
inc_qnr();
- enqueue_task(p, rq);
+#ifdef CONFIG_SMP
+ /*
+ * set_task_cpu was called on the running task that doesn't
+ * want to deactivate so it has to be enqueued to a different
+ * CPU and we need its lock. Tag it to be moved; the move happens as the
+ * lock is dropped in finish_lock_switch.
+ */
+ if (unlikely(p->wake_cpu != cpu))
+ p->on_rq = TASK_ON_RQ_MIGRATING;
+ else
+#endif
+ enqueue_task(rq, p, ENQUEUE_RESTORE);
}
}
-/* Enter with grq lock held. We know p is on the local cpu */
+/* Enter with rq lock held. We know p is on the local cpu */
static inline void __set_tsk_resched(struct task_struct *p)
{
set_tsk_need_resched(p);
set_preempt_need_resched();
}
-/*
- * resched_task - mark a task 'to be rescheduled now'.
- *
- * On UP this means the setting of the need_resched flag, on SMP it
- * might also involve a cross-CPU call to trigger the scheduler on
- * the target CPU.
- */
-void resched_task(struct task_struct *p)
-{
- int cpu;
-
- lockdep_assert_held(&grq.lock);
-
- if (test_tsk_need_resched(p))
- return;
-
- set_tsk_need_resched(p);
-
- cpu = task_cpu(p);
- if (cpu == smp_processor_id()) {
- set_preempt_need_resched();
- return;
- }
-
- smp_send_reschedule(cpu);
-}
-
/**
* task_curr - is this task currently executing on a CPU?
* @p: the task in question.
@@ -1128,8 +1537,8 @@ inline int task_curr(const struct task_struct *p)
*/
unsigned long wait_task_inactive(struct task_struct *p, long match_state)
{
+ int running, queued;
unsigned long flags;
- bool running, on_rq;
unsigned long ncsw;
struct rq *rq;
@@ -1147,25 +1556,25 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
* if the runqueue has changed and p is actually now
* running somewhere else!
*/
- while (task_running(p) && p == rq->curr) {
+ while (task_running(rq, p)) {
if (match_state && unlikely(p->state != match_state))
return 0;
cpu_relax();
}
/*
- * Ok, time to look more closely! We need the grq
+ * Ok, time to look more closely! We need the rq
* lock now, to be *sure*. If we're wrong, we'll
* just go back and repeat.
*/
- rq = task_grq_lock(p, &flags);
+ rq = task_rq_lock(p, &flags);
trace_sched_wait_task(p);
- running = task_running(p);
- on_rq = p->on_rq;
+ running = task_running(rq, p);
+ queued = task_on_rq_queued(p);
ncsw = 0;
if (!match_state || p->state == match_state)
ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
- task_grq_unlock(p, &flags);
+ task_rq_unlock(rq, p, &flags);
/*
* If it changed from the expected state, bail out now.
@@ -1193,7 +1602,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
* running right now), it's preempted, and we should
* yield - it could be a while.
*/
- if (unlikely(on_rq)) {
+ if (unlikely(queued)) {
ktime_t to = ktime_set(0, NSEC_PER_SEC / HZ);
set_current_state(TASK_UNINTERRUPTIBLE);
@@ -1252,32 +1661,15 @@ can_preempt(struct task_struct *p, int prio, u64 deadline)
return true;
if (p->prio > prio)
return false;
- /* SCHED_NORMAL, BATCH and ISO will preempt based on deadline */
+ if (p->policy == SCHED_BATCH)
+ return false;
+ /* SCHED_NORMAL and ISO will preempt based on deadline */
if (!deadline_before(p->deadline, deadline))
return false;
return true;
}
#ifdef CONFIG_SMP
-#define cpu_online_map (*(cpumask_t *)cpu_online_mask)
-#ifdef CONFIG_HOTPLUG_CPU
-/*
- * Check to see if there is a task that is affined only to offline CPUs but
- * still wants runtime. This happens to kernel threads during suspend/halt and
- * disabling of CPUs.
- */
-static inline bool online_cpus(struct task_struct *p)
-{
- return (likely(cpumask_intersects(&cpu_online_map, &p->cpus_allowed)));
-}
-#else /* CONFIG_HOTPLUG_CPU */
-/* All available CPUs are always online without hotplug. */
-static inline bool online_cpus(struct task_struct *p)
-{
- return true;
-}
-#endif
-
/*
* Check to see if p can run on cpu, and if not, whether there are any online
* CPUs it can run on instead.
@@ -1288,13 +1680,14 @@ static inline bool needs_other_cpu(struct task_struct *p, int cpu)
return true;
return false;
}
+#define cpu_online_map (*(cpumask_t *)cpu_online_mask)
static void try_preempt(struct task_struct *p, struct rq *this_rq)
{
- int i, this_entries = this_rq->soft_affined;
+ int i, this_entries = rq_load(this_rq);
cpumask_t tmp;
- if (suitable_idle_cpus(p) && resched_best_idle(p))
+ if (suitable_idle_cpus(p) && resched_best_idle(p, task_cpu(p)))
return;
/* IDLEPRIO tasks never preempt anything but idle */
@@ -1303,26 +1696,15 @@ static void try_preempt(struct task_struct *p, struct rq *this_rq)
cpumask_and(&tmp, &cpu_online_map, &p->cpus_allowed);
- /*
- * We iterate over CPUs in locality order using rq_order, finding the
- * first one we can preempt if possible, thus staying closest in
- * locality.
- */
for (i = 0; i < num_possible_cpus(); i++) {
struct rq *rq = this_rq->rq_order[i];
if (!cpumask_test_cpu(rq->cpu, &tmp))
continue;
- if (!sched_interactive && rq != this_rq && rq->soft_affined <= this_entries)
+ if (!sched_interactive && rq != this_rq && rq_load(rq) <= this_entries)
continue;
if (smt_schedule(p, rq) && can_preempt(p, rq->rq_prio, rq->rq_deadline)) {
- /*
- * If we have decided this task should preempt this CPU,
- * set the task's CPU to match thereby speeding up matching
- * this task in earliest_deadline_task.
- */
- set_task_cpu(p, rq->cpu);
resched_curr(rq);
return;
}
@@ -1352,6 +1734,13 @@ static inline int __set_cpus_allowed_ptr(struct task_struct *p,
}
#endif /* CONFIG_SMP */
+/*
+ * wake flags
+ */
+#define WF_SYNC 0x01 /* waker goes to sleep after wakeup */
+#define WF_FORK 0x02 /* child wakeup after fork */
+#define WF_MIGRATED 0x04 /* internal use, task got migrated */
+
static void
ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
{
@@ -1382,27 +1771,96 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
#endif /* CONFIG_SCHEDSTATS */
}
-void wake_up_if_idle(int cpu)
+static inline void ttwu_activate(struct rq *rq, struct task_struct *p)
{
- struct rq *rq = cpu_rq(cpu);
- unsigned long flags;
+ activate_task(p, rq);
- rcu_read_lock();
+ /* if a worker is waking up, notify workqueue */
+ if (p->flags & PF_WQ_WORKER)
+ wq_worker_waking_up(p, cpu_of(rq));
+}
- if (!is_idle_task(rcu_dereference(rq->curr)))
- goto out;
+/*
+ * Mark the task runnable and perform wakeup-preemption.
+ */
+static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
+{
+ /*
+ * Sync wakeups (i.e. those types of wakeups where the waker
+ * has indicated that it will leave the CPU in short order)
+ * don't trigger a preemption if there are no idle cpus,
+ * instead waiting for current to deschedule.
+ */
+ if (wake_flags & WF_SYNC)
+ resched_suitable_idle(p);
+ else
+ try_preempt(p, rq);
+ p->state = TASK_RUNNING;
+ trace_sched_wakeup(p);
+}
- grq_lock_irqsave(&flags);
- if (likely(is_idle_task(rq->curr)))
- smp_send_reschedule(cpu);
- /* Else cpu is not in idle, do nothing here */
- grq_unlock_irqrestore(&flags);
+static void
+ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags)
+{
+ lockdep_assert_held(&rq->lock);
-out:
- rcu_read_unlock();
+#ifdef CONFIG_SMP
+ if (p->sched_contributes_to_load)
+ atomic_dec(&grq.nr_uninterruptible);
+#endif
+
+ ttwu_activate(rq, p);
+ ttwu_do_wakeup(rq, p, wake_flags);
+}
+
+/*
+ * Called in case the task @p isn't fully descheduled from its runqueue,
+ * in this case we must do a remote wakeup. It's a 'light' wakeup though,
+ * since all we need to do is flip p->state to TASK_RUNNING, since
+ * the task is still ->on_rq.
+ */
+static int ttwu_remote(struct task_struct *p, int wake_flags)
+{
+ struct rq *rq;
+ int ret = 0;
+
+ rq = __task_rq_lock(p);
+ if (likely(task_on_rq_queued(p))) {
+ ttwu_do_wakeup(rq, p, wake_flags);
+ ret = 1;
+ }
+ __task_rq_unlock(rq);
+
+ return ret;
}
#ifdef CONFIG_SMP
+static bool sched_smp_initialized __read_mostly;
+
+void sched_ttwu_pending(void)
+{
+ struct rq *rq = this_rq();
+ struct llist_node *llist = llist_del_all(&rq->wake_list);
+ struct task_struct *p;
+ unsigned long flags;
+
+ if (!llist)
+ return;
+
+ raw_spin_lock_irqsave(&rq->lock, flags);
+
+ while (llist) {
+ int wake_flags = 0;
+
+ p = llist_entry(llist, struct task_struct, wake_entry);
+ llist = llist_next(llist);
+
+ ttwu_do_activate(rq, p, wake_flags);
+ }
+
+ raw_spin_unlock_irqrestore(&rq->lock, flags);
+}
+
void scheduler_ipi(void)
{
/*
@@ -1411,45 +1869,152 @@ void scheduler_ipi(void)
* this IPI.
*/
preempt_fold_need_resched();
-}
-#endif
-static inline void ttwu_activate(struct task_struct *p, struct rq *rq,
- bool is_sync)
-{
- activate_task(p, rq);
+ if (llist_empty(&this_rq()->wake_list) && (!idle_cpu(smp_processor_id()) || need_resched()))
+ return;
/*
- * Sync wakeups (i.e. those types of wakeups where the waker
- * has indicated that it will leave the CPU in short order)
- * don't trigger a preemption if there are no idle cpus,
- * instead waiting for current to deschedule.
+ * Not all reschedule IPI handlers call irq_enter/irq_exit, since
+ * traditionally all their work was done from the interrupt return
+ * path. Now that we actually do some work, we need to make sure
+ * we do call them.
+ *
+ * Some archs already do call them, luckily irq_enter/exit nest
+ * properly.
+ *
+ * Arguably we should visit all archs and update all handlers,
+ * however a fair share of IPIs are still resched only so this would
+ * somewhat pessimize the simple resched case.
*/
- if (!is_sync || suitable_idle_cpus(p))
- try_preempt(p, rq);
+ irq_enter();
+ sched_ttwu_pending();
+ irq_exit();
}
-static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq,
- bool success)
+static void ttwu_queue_remote(struct task_struct *p, int cpu, int wake_flags)
{
- trace_sched_wakeup(p);
- p->state = TASK_RUNNING;
+ struct rq *rq = cpu_rq(cpu);
- /*
- * if a worker is waking up, notify workqueue. Note that on BFS, we
- * don't really know what cpu it will be, so we fake it for
- * wq_worker_waking_up :/
- */
- if ((p->flags & PF_WQ_WORKER) && success)
- wq_worker_waking_up(p, cpu_of(rq));
+ if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) {
+ if (!set_nr_if_polling(rq->idle))
+ smp_send_reschedule(cpu);
+ else
+ trace_sched_wake_idle_without_ipi(cpu);
+ }
+}
+
+void wake_up_if_idle(int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+ unsigned long flags;
+
+ rcu_read_lock();
+
+ if (!is_idle_task(rcu_dereference(rq->curr)))
+ goto out;
+
+ if (set_nr_if_polling(rq->idle)) {
+ trace_sched_wake_idle_without_ipi(cpu);
+ } else {
+ rq_lock_irqsave(rq, &flags);
+ if (likely(is_idle_task(rq->curr)))
+ smp_send_reschedule(cpu);
+ /* Else cpu is not in idle, do nothing here */
+ rq_unlock_irqrestore(rq, &flags);
+ }
+
+out:
+ rcu_read_unlock();
+}
+
+static int valid_task_cpu(struct task_struct *p)
+{
+ cpumask_t valid_mask;
+
+ if (p->flags & PF_KTHREAD)
+ cpumask_and(&valid_mask, tsk_cpus_allowed(p), cpu_online_mask);
+ else
+ cpumask_and(&valid_mask, tsk_cpus_allowed(p), cpu_active_mask);
+
+ if (unlikely(!cpumask_weight(&valid_mask))) {
+ /* Hotplug boot threads do this before the CPU is up */
+ WARN_ON(sched_smp_initialized);
+ return cpumask_any(tsk_cpus_allowed(p));
+ }
+ return cpumask_any(&valid_mask);
}
/*
- * wake flags
+ * For a task that's just being woken up we have a valuable balancing
+ * opportunity so choose the nearest-cache, most lightly loaded runqueue.
+ * Entered with rq locked and returns with the chosen runqueue locked.
*/
-#define WF_SYNC 0x01 /* waker goes to sleep after wakeup */
-#define WF_FORK 0x02 /* child wakeup after fork */
-#define WF_MIGRATED 0x4 /* internal use, task got migrated */
+static inline int select_best_cpu(struct task_struct *p)
+{
+ unsigned int idlest = ~0U;
+ struct rq *rq = NULL;
+ int i;
+
+ if (suitable_idle_cpus(p)) {
+ int cpu = task_cpu(p);
+
+ if (unlikely(needs_other_cpu(p, cpu)))
+ cpu = valid_task_cpu(p);
+ rq = resched_best_idle(p, cpu);
+ if (likely(rq))
+ return rq->cpu;
+ }
+
+ for (i = 0; i < num_possible_cpus(); i++) {
+ struct rq *other_rq = task_rq(p)->rq_order[i];
+ int entries;
+
+ if (!other_rq->online)
+ continue;
+ if (needs_other_cpu(p, other_rq->cpu))
+ continue;
+ entries = rq_load(other_rq);
+ if (entries >= idlest)
+ continue;
+ idlest = entries;
+ rq = other_rq;
+ }
+ if (unlikely(!rq))
+ return smp_processor_id();
+ return rq->cpu;
+}
+#else /* CONFIG_SMP */
+static int valid_task_cpu(struct task_struct *p)
+{
+ return 0;
+}
+
+static inline int select_best_cpu(struct task_struct *p)
+{
+ return 0;
+}
+
+static struct rq *resched_best_idle(struct task_struct *p, int cpu)
+{
+ return NULL;
+}
+#endif /* CONFIG_SMP */
+
+static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
+{
+ struct rq *rq = cpu_rq(cpu);
+
+#if defined(CONFIG_SMP)
+ if (!cpus_share_cache(smp_processor_id(), cpu)) {
+ sched_clock_cpu(cpu); /* sync clocks x-cpu */
+ ttwu_queue_remote(p, cpu, wake_flags);
+ return;
+ }
+#endif
+ rq_lock(rq);
+ ttwu_do_activate(rq, p, wake_flags);
+ rq_unlock(rq);
+}
/***
* try_to_wake_up - wake up a thread
@@ -1466,13 +2031,11 @@ static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq,
* Return: %true if @p was woken up, %false if it was already running.
* or @state didn't match @p's state.
*/
-static bool try_to_wake_up(struct task_struct *p, unsigned int state,
- int wake_flags)
+static int
+try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
{
- bool success = false;
unsigned long flags;
- struct rq *rq;
- int cpu;
+ int cpu, success = 0;
/*
* If we are going to wake up a thread waiting for CONDITION we
@@ -1481,68 +2044,133 @@ static bool try_to_wake_up(struct task_struct *p, unsigned int state,
* set_current_state() the waiting thread does.
*/
smp_mb__before_spinlock();
-
- /*
- * No need to do time_lock_grq as we only need to update the rq clock
- * if we activate the task
- */
- rq = task_grq_lock(p, &flags);
- cpu = task_cpu(p);
-
+ raw_spin_lock_irqsave(&p->pi_lock, flags);
/* state is a volatile long, why that is, I don't know */
if (!((unsigned int)p->state & state))
- goto out_unlock;
+ goto out;
trace_sched_waking(p);
- if (task_queued(p) || task_running(p))
- goto out_running;
+ success = 1; /* we're going to change ->state */
+ cpu = task_cpu(p);
- ttwu_activate(p, rq, wake_flags & WF_SYNC);
- success = true;
+ /*
+ * Ensure we load p->on_rq _after_ p->state, otherwise it would
+ * be possible to, falsely, observe p->on_rq == 0 and get stuck
+ * in smp_cond_load_acquire() below.
+ *
+ * sched_ttwu_pending() try_to_wake_up()
+ * [S] p->on_rq = 1; [L] P->state
+ * UNLOCK rq->lock -----.
+ * \
+ * +--- RMB
+ * schedule() /
+ * LOCK rq->lock -----'
+ * UNLOCK rq->lock
+ *
+ * [task p]
+ * [S] p->state = UNINTERRUPTIBLE [L] p->on_rq
+ *
+ * Pairs with the UNLOCK+LOCK on rq->lock from the
+ * last wakeup of our task and the schedule that got our task
+ * current.
+ */
+ smp_rmb();
+ if (p->on_rq && ttwu_remote(p, wake_flags))
+ goto stat;
-out_running:
- ttwu_post_activation(p, rq, success);
-out_unlock:
- task_grq_unlock(p, &flags);
+#ifdef CONFIG_SMP
+ /*
+ * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be
+ * possible to, falsely, observe p->on_cpu == 0.
+ *
+ * One must be running (->on_cpu == 1) in order to remove oneself
+ * from the runqueue.
+ *
+ * [S] ->on_cpu = 1; [L] ->on_rq
+ * UNLOCK rq->lock
+ * RMB
+ * LOCK rq->lock
+ * [S] ->on_rq = 0; [L] ->on_cpu
+ *
+ * Pairs with the full barrier implied in the UNLOCK+LOCK on rq->lock
+ * from the consecutive calls to schedule(); the first switching to our
+ * task, the second putting it to sleep.
+ */
+ smp_rmb();
+
+ /*
+ * If the owning (remote) cpu is still in the middle of schedule() with
+ * this task as prev, wait until it's done referencing the task.
+ *
+ * Pairs with the smp_store_release() in finish_lock_switch().
+ *
+ * This ensures that tasks getting woken will be fully ordered against
+ * their previous state and preserve Program Order.
+ */
+ smp_cond_load_acquire(&p->on_cpu, !VAL);
+
+ p->sched_contributes_to_load = !!task_contributes_to_load(p);
+ p->state = TASK_WAKING;
+
+ cpu = select_best_cpu(p);
+ if (task_cpu(p) != cpu)
+ set_task_cpu(p, cpu);
+#endif /* CONFIG_SMP */
+ ttwu_queue(p, cpu, wake_flags);
+stat:
if (schedstat_enabled())
ttwu_stat(p, cpu, wake_flags);
+out:
+ raw_spin_unlock_irqrestore(&p->pi_lock, flags);
return success;
}
/**
- * try_to_wake_up_local - try to wake up a local task with grq lock held
+ * try_to_wake_up_local - try to wake up a local task with rq lock held
* @p: the thread to be awakened
*
* Put @p on the run-queue if it's not already there. The caller must
- * ensure that grq is locked and, @p is not the current task.
- * grq stays locked over invocation.
+ * ensure that rq is locked and, @p is not the current task.
+ * rq stays locked over invocation.
*/
static void try_to_wake_up_local(struct task_struct *p)
{
struct rq *rq = task_rq(p);
- bool success = false;
- lockdep_assert_held(&grq.lock);
+ if (WARN_ON_ONCE(rq != this_rq()) ||
+ WARN_ON_ONCE(p == current))
+ return;
+
+ lockdep_assert_held(&rq->lock);
+
+ if (!raw_spin_trylock(&p->pi_lock)) {
+ /*
+ * This is OK, because current is on_cpu, which avoids it being
+ * picked for load-balance and preemption/IRQs are still
+ * disabled avoiding further scheduler activity on it and we've
+ * not yet picked a replacement task.
+ */
+ raw_spin_unlock(&rq->lock);
+ raw_spin_lock(&p->pi_lock);
+ raw_spin_lock(&rq->lock);
+ }
if (!(p->state & TASK_NORMAL))
- return;
+ goto out;
trace_sched_waking(p);
- if (!task_queued(p)) {
- if (likely(!task_running(p))) {
- schedstat_inc(rq, ttwu_count);
- schedstat_inc(rq, ttwu_local);
- }
- ttwu_activate(p, rq, false);
- if (schedstat_enabled())
- ttwu_stat(p, smp_processor_id(), 0);
- success = true;
- }
- ttwu_post_activation(p, rq, success);
+ if (!task_on_rq_queued(p))
+ ttwu_activate(rq, p);
+
+ ttwu_do_wakeup(rq, p, 0);
+ if (schedstat_enabled())
+ ttwu_stat(p, smp_processor_id(), 0);
+out:
+ raw_spin_unlock(&p->pi_lock);
}
/**
@@ -1568,7 +2196,7 @@ int wake_up_state(struct task_struct *p, unsigned int state)
return try_to_wake_up(p, state, 0);
}
-static void time_slice_expired(struct task_struct *p);
+static void time_slice_expired(struct task_struct *p, struct rq *rq);
/*
* Perform scheduler related setup for a newly forked process p.
@@ -1576,35 +2204,39 @@ static void time_slice_expired(struct task_struct *p);
*/
int sched_fork(unsigned long __maybe_unused clone_flags, struct task_struct *p)
{
+ unsigned long flags;
+ int cpu = get_cpu();
+
#ifdef CONFIG_PREEMPT_NOTIFIERS
INIT_HLIST_HEAD(&p->preempt_notifiers);
#endif
/*
+ * We mark the process as NEW here. This guarantees that
+ * nobody will actually run it, and a signal or other external
+ * event cannot wake it up and insert it on the runqueue either.
+ */
+ p->state = TASK_NEW;
+
+ /*
* The process state is set to the same value of the process executing
* do_fork() code. That is running. This guarantees that nobody will
* actually run it, and a signal or other external event cannot wake
* it up and insert it on the runqueue either.
*/
- /* Should be reset in fork.c but done here for ease of bfs patching */
+ /* Should be reset in fork.c but done here for ease of MuQSS patching */
+ p->on_cpu =
p->on_rq =
p->utime =
p->stime =
p->utimescaled =
p->stimescaled =
p->sched_time =
- p->stime_pc =
- p->utime_pc = 0;
+ p->stime_ns =
+ p->utime_ns = 0;
skiplist_node_init(&p->node);
/*
- * We mark the process as NEW here. This guarantees that
- * nobody will actually run it, and a signal or other external
- * event cannot wake it up and insert it on the runqueue either.
- */
- p->state = TASK_NEW;
-
- /*
* Revert to default priority/policy on fork if requested.
*/
if (unlikely(p->sched_reset_on_fork)) {
@@ -1625,12 +2257,20 @@ int sched_fork(unsigned long __maybe_unused clone_flags, struct task_struct *p)
p->sched_reset_on_fork = 0;
}
+ /*
+ * Silence PROVE_RCU.
+ */
+ raw_spin_lock_irqsave(&p->pi_lock, flags);
+ set_task_cpu(p, cpu);
+ raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+
#ifdef CONFIG_SCHED_INFO
if (unlikely(sched_info_on()))
memset(&p->sched_info, 0, sizeof(p->sched_info));
#endif
- p->on_cpu = false;
init_task_preempt_count(p);
+
+ put_cpu();
return 0;
}
@@ -1725,24 +2365,19 @@ void wake_up_new_task(struct task_struct *p)
unsigned long flags;
parent = p->parent;
- rq = task_grq_lock(p, &flags);
- if (unlikely(needs_other_cpu(p, task_cpu(p))))
- set_task_cpu(p, cpumask_any(tsk_cpus_allowed(p)));
- rq_curr = rq->curr;
- p->state = TASK_RUNNING;
-
- /*
- * Reinit new task deadline as its creator deadline could have changed
- * since call to dup_task_struct().
- */
- p->deadline = rq->rq_deadline;
- /* The new task might not be able to run on the same CPU as rq->curr */
+ raw_spin_lock_irqsave(&p->pi_lock, flags);
+ p->state = TASK_RUNNING;
+ /* Task_rq can't change yet on a new task */
+ new_rq = rq = task_rq(p);
if (unlikely(needs_other_cpu(p, task_cpu(p)))) {
- set_task_cpu(p, cpumask_any(tsk_cpus_allowed(p)));
+ set_task_cpu(p, valid_task_cpu(p));
new_rq = task_rq(p);
- } else
- new_rq = rq;
+ }
+
+ double_rq_lock(rq, new_rq);
+ update_clocks(rq);
+ rq_curr = rq->curr;
/*
* Make sure we do not leak PI boosting priority to the child.
@@ -1756,30 +2391,29 @@ void wake_up_new_task(struct task_struct *p)
* Share the timeslice between parent and child, thus the
* total amount of pending timeslices in the system doesn't change,
* resulting in more scheduling fairness. If it's negative, it won't
- * matter since that's the same as being 0. current's time_slice is
- * actually in rq_time_slice when it's running, as is its last_ran
- * value. rq->rq_deadline is only modified within schedule() so it
- * is always equal to current->deadline.
+ * matter since that's the same as being 0. rq->rq_deadline is only
+ * modified within schedule() so it is always equal to
+ * current->deadline.
*/
- p->last_ran = rq->rq_last_ran;
+ p->last_ran = rq_curr->last_ran;
if (likely(rq_curr->policy != SCHED_FIFO)) {
- rq->rq_time_slice /= 2;
- if (unlikely(rq->rq_time_slice < RESCHED_US)) {
+ rq_curr->time_slice /= 2;
+ if (unlikely(rq_curr->time_slice < RESCHED_US)) {
/*
* Forking task has run out of timeslice. Reschedule it and
* start its child with a new time slice and deadline. The
* child will end up running first because its deadline will
* be slightly earlier.
*/
- rq->rq_time_slice = 0;
+ rq_curr->time_slice = 0;
__set_tsk_resched(rq_curr);
- time_slice_expired(p);
+ time_slice_expired(p, new_rq);
if (suitable_idle_cpus(p))
- resched_best_idle(p);
+ resched_best_idle(p, task_cpu(p));
else if (unlikely(rq != new_rq))
try_preempt(p, new_rq);
} else {
- p->time_slice = rq->rq_time_slice;
+ p->time_slice = rq_curr->time_slice;
if (rq_curr == parent && rq == new_rq && !suitable_idle_cpus(p)) {
/*
* The VM isn't cloned, so we're in a good position to
@@ -1791,10 +2425,11 @@ void wake_up_new_task(struct task_struct *p)
try_preempt(p, new_rq);
}
} else {
- time_slice_expired(p);
+ time_slice_expired(p, new_rq);
try_preempt(p, new_rq);
}
- task_grq_unlock(p, &flags);
+ double_rq_unlock(rq, new_rq);
+ raw_spin_unlock_irqrestore(&p->pi_lock, flags);
}
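
A hedged worked example of the slice sharing above, as a standalone C program (RESCHED_US and the full timeslice are hypothetical values; the real ones come from tunables): a parent with 6000us left keeps 3000us and the child starts with the same 3000us, so the pending time in the system is preserved, while a parent with only 40us left is rescheduled and the child gets a fresh slice and deadline.

#include <stdio.h>

#define RESCHED_US	100	/* hypothetical resched threshold */
#define TIMESLICE_US	6000	/* hypothetical full timeslice from timeslice() */

static void fork_split(int parent_us)
{
	int child_us;

	parent_us /= 2;				/* parent keeps half */
	if (parent_us < RESCHED_US) {
		parent_us = 0;			/* parent will reschedule */
		child_us = TIMESLICE_US;	/* child starts a fresh slice */
	} else {
		child_us = parent_us;		/* child inherits the other half */
	}
	printf("parent %dus, child %dus\n", parent_us, child_us);
}

int main(void)
{
	fork_split(6000);	/* parent 3000us, child 3000us */
	fork_split(40);		/* parent 0us, child 6000us */
	return 0;
}
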
#ifdef CONFIG_PREEMPT_NOTIFIERS
@@ -1928,7 +2563,7 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev,
* because prev may have moved to another CPU.
*/
static struct rq *finish_task_switch(struct task_struct *prev)
- __releases(grq.lock)
+ __releases(rq->lock)
{
struct rq *rq = this_rq();
struct mm_struct *mm = rq->prev_mm;
@@ -1988,7 +2623,7 @@ static struct rq *finish_task_switch(struct task_struct *prev)
* @prev: the thread we just switched away from.
*/
asmlinkage __visible void schedule_tail(struct task_struct *prev)
- __releases(grq.lock)
+ __releases(rq->lock)
{
struct rq *rq;
@@ -2045,7 +2680,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
* of the scheduler it's an obvious special-case), so we
* do an early lockdep release here:
*/
- spin_release(&grq.lock.dep_map, 1, _THIS_IP_);
+ spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
/* Here we just switch the register state and the stack. */
switch_to(prev, next, prev);
@@ -2058,26 +2693,16 @@ context_switch(struct rq *rq, struct task_struct *prev,
* nr_running, nr_uninterruptible and nr_context_switches:
*
* externally visible scheduler statistics: current number of runnable
- * threads, total number of context switches performed since bootup. All are
- * measured without grabbing the grq lock but the occasional inaccurate result
- * doesn't matter so long as it's positive.
+ * threads, total number of context switches performed since bootup.
*/
unsigned long nr_running(void)
{
- long nr = grq.nr_running;
-
- if (unlikely(nr < 0))
- nr = 0;
- return (unsigned long)nr;
+ return atomic_read(&grq.nr_running);
}
static unsigned long nr_uninterruptible(void)
{
- long nu = grq.nr_uninterruptible;
-
- if (unlikely(nu < 0))
- nu = 0;
- return nu;
+ return atomic_read(&grq.nr_uninterruptible);
}
/*
@@ -2095,7 +2720,9 @@ static unsigned long nr_uninterruptible(void)
*/
bool single_task_running(void)
{
- if (cpu_rq(smp_processor_id())->soft_affined == 1)
+ struct rq *rq = cpu_rq(smp_processor_id());
+
+ if (rq_load(rq) == 1)
return true;
else
return false;
@@ -2104,12 +2731,7 @@ EXPORT_SYMBOL(single_task_running);
unsigned long long nr_context_switches(void)
{
- long long ns = grq.nr_switches;
-
- /* This is of course impossible */
- if (unlikely(ns < 0))
- ns = 1;
- return (unsigned long long)ns;
+ return (unsigned long long)atomic64_read(&grq.nr_switches);
}
unsigned long nr_iowait(void)
@@ -2133,15 +2755,16 @@ unsigned long nr_active(void)
return nr_running() + nr_uninterruptible();
}
-/* Beyond a task running on this CPU, load is equal everywhere on BFS, so we
- * base it on the number of running or queued tasks with their ->rq pointer
- * set to this cpu as being the CPU they're more likely to run on. */
+/*
+ * I/O wait is the number of tasks in iowait on this rq; load is the number
+ * of running or queued tasks with their ->rq pointer set to this cpu as
+ * being the CPU they're more likely to run on.
+ */
void get_iowait_load(unsigned long *nr_waiters, unsigned long *load)
{
struct rq *rq = this_rq();
*nr_waiters = atomic_read(&rq->nr_iowait);
- *load = rq->soft_affined;
+ *load = rq_load(rq);
}
/* Variables and functions for calc_load */
@@ -2364,7 +2987,6 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
delta -= steal;
}
#endif
-
rq->clock_task += delta;
}
@@ -2456,86 +3078,89 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
}
/*
- * On each tick, see what percentage of that tick was attributed to each
- * component and add the percentage to the _pc values. Once a _pc value has
- * accumulated one tick's worth, account for that. This means the total
- * percentage of load components will always be 128 (pseudo 100) per tick.
+ * On each tick, add the number of nanoseconds to the unbanked variables and
+ * once one tick's worth has accumulated, account it allowing for accurate
+ * sub-tick accounting and totals.
*/
-static void pc_idle_time(struct rq *rq, struct task_struct *idle, unsigned long pc)
+static void pc_idle_time(struct rq *rq, struct task_struct *idle, unsigned long ns)
{
u64 *cpustat = kcpustat_this_cpu->cpustat;
+ unsigned long ticks;
if (atomic_read(&rq->nr_iowait) > 0) {
- rq->iowait_pc += pc;
- if (rq->iowait_pc >= 128) {
- cpustat[CPUTIME_IOWAIT] += (__force u64)cputime_one_jiffy * rq->iowait_pc / 128;
- rq->iowait_pc %= 128;
+ rq->iowait_ns += ns;
+ if (rq->iowait_ns >= JIFFY_NS) {
+ ticks = NS_TO_JIFFIES(rq->iowait_ns);
+ cpustat[CPUTIME_IOWAIT] += (__force u64)cputime_one_jiffy * ticks;
+ rq->iowait_ns %= JIFFY_NS;
}
} else {
- rq->idle_pc += pc;
- if (rq->idle_pc >= 128) {
- cpustat[CPUTIME_IDLE] += (__force u64)cputime_one_jiffy * rq->idle_pc / 128;
- rq->idle_pc %= 128;
+ rq->idle_ns += ns;
+ if (rq->idle_ns >= JIFFY_NS) {
+ ticks = NS_TO_JIFFIES(rq->idle_ns);
+ cpustat[CPUTIME_IDLE] += (__force u64)cputime_one_jiffy * ticks;
+ rq->idle_ns %= JIFFY_NS;
}
}
acct_update_integrals(idle);
}
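
The same banking pattern repeats for all of the _ns counters below. A minimal standalone sketch, assuming HZ=250 and the same power-of-two JIFFY_NS approximation as the patch: nanoseconds accumulate untouched until at least one jiffy's worth has banked, whole ticks are then accounted and the sub-tick remainder carries forward.

#include <stdio.h>
#include <stdint.h>

#define HZ		250			/* assumed config value */
#define JIFFY_NS	(1073741824 / HZ)
#define NS_TO_JIFFIES(t)	((t) / JIFFY_NS)

struct bucket {
	uint64_t banked_ns;	/* not yet accounted */
	uint64_t ticks;		/* accounted whole ticks */
};

static void bank_ns(struct bucket *b, uint64_t ns)
{
	b->banked_ns += ns;
	if (b->banked_ns >= JIFFY_NS) {
		b->ticks += NS_TO_JIFFIES(b->banked_ns);
		b->banked_ns %= JIFFY_NS;	/* carry the remainder */
	}
}

int main(void)
{
	struct bucket idle = { 0, 0 };
	int i;

	for (i = 0; i < 10; i++)
		bank_ns(&idle, 1000000);	/* bank 1ms per call */
	/* With HZ=250 this prints ticks=2 remainder=1410066ns. */
	printf("ticks=%llu remainder=%lluns\n",
	       (unsigned long long)idle.ticks,
	       (unsigned long long)idle.banked_ns);
	return 0;
}
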
-static void
-pc_system_time(struct rq *rq, struct task_struct *p, int hardirq_offset,
- unsigned long pc, unsigned long ns)
+static void pc_system_time(struct rq *rq, struct task_struct *p,
+ int hardirq_offset, unsigned long ns)
{
- u64 *cpustat = kcpustat_this_cpu->cpustat;
cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
+ u64 *cpustat = kcpustat_this_cpu->cpustat;
+ unsigned long ticks;
- p->stime_pc += pc;
- if (p->stime_pc >= 128) {
- int jiffs = p->stime_pc / 128;
-
- p->stime_pc %= 128;
- p->stime += (__force u64)cputime_one_jiffy * jiffs;
- p->stimescaled += one_jiffy_scaled * jiffs;
- account_group_system_time(p, cputime_one_jiffy * jiffs);
+ p->stime_ns += ns;
+ if (p->stime_ns >= JIFFY_NS) {
+ ticks = NS_TO_JIFFIES(p->stime_ns);
+ p->stime_ns %= JIFFY_NS;
+ p->stime += (__force u64)cputime_one_jiffy * ticks;
+ p->stimescaled += one_jiffy_scaled * ticks;
+ account_group_system_time(p, cputime_one_jiffy * ticks);
}
p->sched_time += ns;
account_group_exec_runtime(p, ns);
if (hardirq_count() - hardirq_offset) {
- rq->irq_pc += pc;
- if (rq->irq_pc >= 128) {
- cpustat[CPUTIME_IRQ] += (__force u64)cputime_one_jiffy * rq->irq_pc / 128;
- rq->irq_pc %= 128;
+ rq->irq_ns += ns;
+ if (rq->irq_ns >= JIFFY_NS) {
+ ticks = NS_TO_JIFFIES(rq->irq_ns);
+ cpustat[CPUTIME_IRQ] += (__force u64)cputime_one_jiffy * ticks;
+ rq->irq_ns %= JIFFY_NS;
}
} else if (in_serving_softirq()) {
- rq->softirq_pc += pc;
- if (rq->softirq_pc >= 128) {
- cpustat[CPUTIME_SOFTIRQ] += (__force u64)cputime_one_jiffy * rq->softirq_pc / 128;
- rq->softirq_pc %= 128;
+ rq->softirq_ns += ns;
+ if (rq->softirq_ns >= JIFFY_NS) {
+ ticks = NS_TO_JIFFIES(rq->softirq_ns);
+ cpustat[CPUTIME_SOFTIRQ] += (__force u64)cputime_one_jiffy * ticks;
+ rq->softirq_ns %= JIFFY_NS;
}
} else {
- rq->system_pc += pc;
- if (rq->system_pc >= 128) {
- cpustat[CPUTIME_SYSTEM] += (__force u64)cputime_one_jiffy * rq->system_pc / 128;
- rq->system_pc %= 128;
+ rq->system_ns += ns;
+ if (rq->system_ns >= JIFFY_NS) {
+ ticks = NS_TO_JIFFIES(rq->system_ns);
+ cpustat[CPUTIME_SYSTEM] += (__force u64)cputime_one_jiffy * ticks;
+ rq->system_ns %= JIFFY_NS;
}
}
acct_update_integrals(p);
}
-static void pc_user_time(struct rq *rq, struct task_struct *p,
- unsigned long pc, unsigned long ns)
+static void pc_user_time(struct rq *rq, struct task_struct *p, unsigned long ns)
{
- u64 *cpustat = kcpustat_this_cpu->cpustat;
cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
+ u64 *cpustat = kcpustat_this_cpu->cpustat;
+ unsigned long ticks;
- p->utime_pc += pc;
- if (p->utime_pc >= 128) {
- int jiffs = p->utime_pc / 128;
-
- p->utime_pc %= 128;
- p->utime += (__force u64)cputime_one_jiffy * jiffs;
- p->utimescaled += one_jiffy_scaled * jiffs;
- account_group_user_time(p, cputime_one_jiffy * jiffs);
+ p->utime_ns += ns;
+ if (p->utime_ns >= JIFFY_NS) {
+ ticks = NS_TO_JIFFIES(p->utime_ns);
+ p->utime_ns %= JIFFY_NS;
+ p->utime += (__force u64)cputime_one_jiffy * ticks;
+ p->utimescaled += one_jiffy_scaled * ticks;
+ account_group_user_time(p, cputime_one_jiffy * ticks);
}
p->sched_time += ns;
account_group_exec_runtime(p, ns);
@@ -2545,36 +3170,33 @@ static void pc_user_time(struct rq *rq, struct task_struct *p,
* ksoftirqd time do not get accounted in cpu_softirq_time.
* So, we have to handle it separately here.
*/
- rq->softirq_pc += pc;
- if (rq->softirq_pc >= 128) {
- cpustat[CPUTIME_SOFTIRQ] += (__force u64)cputime_one_jiffy * rq->softirq_pc / 128;
- rq->softirq_pc %= 128;
+ rq->softirq_ns += ns;
+ if (rq->softirq_ns >= JIFFY_NS) {
+ ticks = NS_TO_JIFFIES(rq->softirq_ns);
+ cpustat[CPUTIME_SOFTIRQ] += (__force u64)cputime_one_jiffy * ticks;
+ rq->softirq_ns %= JIFFY_NS;
}
}
if (task_nice(p) > 0 || idleprio_task(p)) {
- rq->nice_pc += pc;
- if (rq->nice_pc >= 128) {
- cpustat[CPUTIME_NICE] += (__force u64)cputime_one_jiffy * rq->nice_pc / 128;
- rq->nice_pc %= 128;
+ rq->nice_ns += ns;
+ if (rq->nice_ns >= JIFFY_NS) {
+ ticks = NS_TO_JIFFIES(rq->nice_ns);
+ cpustat[CPUTIME_NICE] += (__force u64)cputime_one_jiffy * ticks;
+ rq->nice_ns %= JIFFY_NS;
}
} else {
- rq->user_pc += pc;
- if (rq->user_pc >= 128) {
- cpustat[CPUTIME_USER] += (__force u64)cputime_one_jiffy * rq->user_pc / 128;
- rq->user_pc %= 128;
+ rq->user_ns += ns;
+ if (rq->user_ns >= JIFFY_NS) {
+ ticks = NS_TO_JIFFIES(rq->user_ns);
+ cpustat[CPUTIME_USER] += (__force u64)cputime_one_jiffy * ticks;
+ rq->user_ns %= JIFFY_NS;
}
}
acct_update_integrals(p);
}
/*
- * Convert nanoseconds to pseudo percentage of one tick. Use 128 for fast
- * shifts instead of 100
- */
-#define NS_TO_PC(NS) (NS * 128 / JIFFY_NS)
-
-/*
* This is called on clock ticks.
* Bank in p->sched_time the ns elapsed since the last tick or switch.
* CPU scheduler quota accounting is also performed here in microseconds.
@@ -2582,38 +3204,29 @@ static void pc_user_time(struct rq *rq, struct task_struct *p,
static void
update_cpu_clock_tick(struct rq *rq, struct task_struct *p)
{
- long account_ns = rq->clock_task - rq->rq_last_ran;
+ s64 account_ns = rq->niffies - p->last_ran;
struct task_struct *idle = rq->idle;
- unsigned long account_pc;
- if (unlikely(account_ns < 0) || steal_account_process_tick())
+ if (steal_account_process_tick())
goto ts_account;
- account_pc = NS_TO_PC(account_ns);
-
/* Accurate tick timekeeping */
if (user_mode(get_irq_regs()))
- pc_user_time(rq, p, account_pc, account_ns);
- else if (p != idle || (irq_count() != HARDIRQ_OFFSET))
- pc_system_time(rq, p, HARDIRQ_OFFSET,
- account_pc, account_ns);
- else
- pc_idle_time(rq, idle, account_pc);
+ pc_user_time(rq, p, account_ns);
+ else if (p != idle || (irq_count() != HARDIRQ_OFFSET)) {
+ pc_system_time(rq, p, HARDIRQ_OFFSET, account_ns);
+ } else
+ pc_idle_time(rq, idle, account_ns);
if (sched_clock_irqtime)
irqtime_account_hi_si();
ts_account:
/* time_slice accounting is done in usecs to avoid overflow on 32bit */
- if (rq->rq_policy != SCHED_FIFO && p != idle) {
- s64 time_diff = rq->clock - rq->timekeep_clock;
-
- niffy_diff(&time_diff, 1);
- rq->rq_time_slice -= NS_TO_US(time_diff);
- }
+ if (p->policy != SCHED_FIFO && p != idle)
+ p->time_slice -= NS_TO_US(account_ns);
- rq->rq_last_ran = rq->clock_task;
- rq->timekeep_clock = rq->clock;
+ p->last_ran = rq->niffies;
}
/*
@@ -2624,40 +3237,25 @@ ts_account:
static void
update_cpu_clock_switch(struct rq *rq, struct task_struct *p)
{
- long account_ns = rq->clock_task - rq->rq_last_ran;
+ s64 account_ns = rq->niffies - p->last_ran;
struct task_struct *idle = rq->idle;
- unsigned long account_pc;
-
- if (unlikely(account_ns < 0))
- goto ts_account;
-
- account_pc = NS_TO_PC(account_ns);
/* Accurate subtick timekeeping */
- if (p != idle) {
- pc_user_time(rq, p, account_pc, account_ns);
- }
+ if (p != idle)
+ pc_user_time(rq, p, account_ns);
else
- pc_idle_time(rq, idle, account_pc);
+ pc_idle_time(rq, idle, account_ns);
-ts_account:
/* time_slice accounting is done in usecs to avoid overflow on 32bit */
- if (rq->rq_policy != SCHED_FIFO && p != idle) {
- s64 time_diff = rq->clock - rq->timekeep_clock;
-
- niffy_diff(&time_diff, 1);
- rq->rq_time_slice -= NS_TO_US(time_diff);
- }
-
- rq->rq_last_ran = rq->clock_task;
- rq->timekeep_clock = rq->clock;
+ if (p->policy != SCHED_FIFO && p != idle)
+ p->time_slice -= NS_TO_US(account_ns);
}
/*
* Return any ns on the sched_clock that have not yet been accounted in
* @p in case that task is currently running.
*
- * Called with task_grq_lock() held.
+ * Called with task_rq_lock(p) held.
*/
static inline u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
{
@@ -2668,11 +3266,9 @@ static inline u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
* project cycles that may never be accounted to this
* thread, breaking clock_gettime().
*/
- if (p == rq->curr && p->on_rq) {
+ if (p == rq->curr && task_on_rq_queued(p)) {
update_clocks(rq);
- ns = rq->clock_task - rq->rq_last_ran;
- if (unlikely((s64)ns < 0))
- ns = 0;
+ ns = rq->niffies - p->last_ran;
}
return ns;
@@ -2702,13 +3298,13 @@ unsigned long long task_sched_runtime(struct task_struct *p)
* If we see ->on_cpu without ->on_rq, the task is leaving, and has
* been accounted, so we're correct here as well.
*/
- if (!p->on_cpu || !p->on_rq)
+ if (!p->on_cpu || !task_on_rq_queued(p))
return tsk_seruntime(p);
#endif
- rq = task_grq_lock(p, &flags);
+ rq = task_rq_lock(p, &flags);
ns = p->sched_time + do_task_delta_exec(p, rq);
- task_grq_unlock(p, &flags);
+ task_rq_unlock(rq, p, &flags);
return ns;
}
@@ -2841,37 +3437,12 @@ void account_idle_ticks(unsigned long ticks)
}
#endif
-static inline void grq_iso_lock(void)
- __acquires(grq.iso_lock)
-{
- raw_spin_lock(&grq.iso_lock);
-}
-
-static inline void grq_iso_unlock(void)
- __releases(grq.iso_lock)
-{
- raw_spin_unlock(&grq.iso_lock);
-}
-
/*
* Functions to test for when SCHED_ISO tasks have used their allocated
- * quota as real time scheduling and convert them back to SCHED_NORMAL.
- * Where possible, the data is tested lockless, to avoid grabbing iso_lock
- * because the occasional inaccurate result won't matter. However the
- * tick data is only ever modified under lock. iso_refractory is only simply
- * set to 0 or 1 so it's not worth grabbing the lock yet again for that.
+ * quota as real time scheduling and convert them back to SCHED_NORMAL. All
+ * data is modified only by the local runqueue during scheduler_tick with
+ * interrupts disabled.
*/
-static bool set_iso_refractory(void)
-{
- grq.iso_refractory = true;
- return grq.iso_refractory;
-}
-
-static bool clear_iso_refractory(void)
-{
- grq.iso_refractory = false;
- return grq.iso_refractory;
-}
/*
* Test if SCHED_ISO tasks have run longer than their allotted period as RT
@@ -2879,97 +3450,82 @@ static bool clear_iso_refractory(void)
* for unsetting the flag. 115/128 is ~90/100 as a fast shift instead of a
* slow division.
*/
-static bool test_ret_isorefractory(struct rq *rq)
+static inline void iso_tick(struct rq *rq)
{
- if (likely(!grq.iso_refractory)) {
- if (grq.iso_ticks > ISO_PERIOD * sched_iso_cpu)
- return set_iso_refractory();
- } else {
- if (grq.iso_ticks < ISO_PERIOD * (sched_iso_cpu * 115 / 128))
- return clear_iso_refractory();
+ rq->iso_ticks = rq->iso_ticks * (ISO_PERIOD - 1) / ISO_PERIOD;
+ rq->iso_ticks += 100;
+ if (rq->iso_ticks > ISO_PERIOD * sched_iso_cpu) {
+ rq->iso_refractory = true;
+ if (unlikely(rq->iso_ticks > ISO_PERIOD * 100))
+ rq->iso_ticks = ISO_PERIOD * 100;
}
- return grq.iso_refractory;
-}
-
-static void iso_tick(void)
-{
- grq_iso_lock();
- grq.iso_ticks += 100;
- grq_iso_unlock();
}
/* No SCHED_ISO task was running so decrease rq->iso_ticks */
-static inline void no_iso_tick(void)
-{
- if (grq.iso_ticks) {
- grq_iso_lock();
- grq.iso_ticks -= grq.iso_ticks / ISO_PERIOD + 1;
- if (unlikely(grq.iso_refractory && grq.iso_ticks <
- ISO_PERIOD * (sched_iso_cpu * 115 / 128)))
- clear_iso_refractory();
- grq_iso_unlock();
+static inline void no_iso_tick(struct rq *rq, int ticks)
+{
+ if (rq->iso_ticks > 0 || rq->iso_refractory) {
+ rq->iso_ticks = rq->iso_ticks * (ISO_PERIOD - ticks) / ISO_PERIOD;
+ if (rq->iso_ticks < ISO_PERIOD * (sched_iso_cpu * 115 / 128)) {
+ rq->iso_refractory = false;
+ if (unlikely(rq->iso_ticks < 0))
+ rq->iso_ticks = 0;
+ }
}
}
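
To put numbers on the hysteresis above (sched_iso_cpu defaults to 70 in the BFS/MuQSS documentation; treat that and HZ=250 as assumptions of this sketch): refractory is entered once iso_ticks exceeds ISO_PERIOD * 70 and left once the counter decays below ISO_PERIOD * (70 * 115 / 128), which integer arithmetic puts at roughly 88-90% of the entry threshold.

#include <stdio.h>

#define HZ		250		/* assumed config value */
#define ISO_PERIOD	(5 * HZ)

int main(void)
{
	int sched_iso_cpu = 70;		/* assumed default tunable */
	int set_at   = ISO_PERIOD * sched_iso_cpu;			/* enter refractory */
	int clear_at = ISO_PERIOD * (sched_iso_cpu * 115 / 128);	/* leave refractory */

	/* Prints: set at 87500, clear at 77500 (~88% of set) */
	printf("set at %d, clear at %d (~%d%% of set)\n",
	       set_at, clear_at, clear_at * 100 / set_at);
	return 0;
}
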
/* This manages tasks that have run out of timeslice during a scheduler_tick */
static void task_running_tick(struct rq *rq)
{
- struct task_struct *p;
+ struct task_struct *p = rq->curr;
/*
* If a SCHED_ISO task is running we increment the iso_ticks. In
* order to prevent SCHED_ISO tasks from causing starvation in the
* presence of true RT tasks we account those as iso_ticks as well.
*/
- if ((rt_queue(rq) || (iso_queue(rq) && !grq.iso_refractory))) {
- if (grq.iso_ticks <= (ISO_PERIOD * 128) - 128)
- iso_tick();
- } else
- no_iso_tick();
+ if (rt_task(p) || task_running_iso(p))
+ iso_tick(rq);
+ else
+ no_iso_tick(rq, 1);
- if (iso_queue(rq)) {
- if (unlikely(test_ret_isorefractory(rq))) {
- if (rq_running_iso(rq)) {
+ /* SCHED_FIFO tasks never run out of timeslice. */
+ if (p->policy == SCHED_FIFO)
+ return;
+
+ if (iso_task(p)) {
+ if (task_running_iso(p)) {
+ if (rq->iso_refractory) {
/*
* SCHED_ISO task is running as RT and limit
* has been hit. Force it to reschedule as
* SCHED_NORMAL by zeroing its time_slice
*/
- rq->rq_time_slice = 0;
+ p->time_slice = 0;
}
+ } else if (!rq->iso_refractory) {
+ /* Can now run as ISO again. Reschedule to pick up prio */
+ goto out_resched;
}
}
- /* SCHED_FIFO tasks never run out of timeslice. */
- if (rq->rq_policy == SCHED_FIFO)
- return;
/*
* Tasks that were scheduled in the first half of a tick are not
* allowed to run into the 2nd half of the next tick if they will
* run out of time slice in the interim. Otherwise, if they have
* less than RESCHED_US μs of time slice left they will be rescheduled.
*/
- if (rq->dither) {
- if (rq->rq_time_slice > HALF_JIFFY_US)
- return;
- else
- rq->rq_time_slice = 0;
- } else if (rq->rq_time_slice >= RESCHED_US)
- return;
-
- /* p->time_slice < RESCHED_US. We only modify task_struct under grq lock */
- p = rq->curr;
-
- grq_lock();
- requeue_task(p);
- resched_task(p);
- grq_unlock();
+ if (p->time_slice - rq->dither >= RESCHED_US)
+ return;
+out_resched:
+ rq_lock(rq);
+ __set_tsk_resched(p);
+ rq_unlock(rq);
}
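
A small standalone illustration of the dither check above, assuming HZ=250 and a hypothetical RESCHED_US of 128: dither is either 0 or HALF_JIFFY_US, so a task scheduled in the first half of a tick must keep enough slice to run into the second half of the next tick or it is rescheduled now.

#include <stdio.h>

#define HZ		250			/* assumed config value */
#define HALF_JIFFY_US	(1048576 / HZ / 2)	/* same approximation as the patch */
#define RESCHED_US	128			/* hypothetical threshold */

/* Mirrors the check at the end of task_running_tick(). */
static int must_resched(int time_slice_us, int dither_us)
{
	return !(time_slice_us - dither_us >= RESCHED_US);
}

int main(void)
{
	/* 2000us left survives an exact check but not a dithered one
	 * (HALF_JIFFY_US is ~2097us at HZ=250). */
	printf("no dither: %d, dithered: %d\n",
	       must_resched(2000, 0), must_resched(2000, HALF_JIFFY_US));
	return 0;
}
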
/*
* This function gets called by the timer code, with HZ frequency.
- * We call it with interrupts disabled. The data modified is all
- * local to struct rq so we don't need to grab grq lock.
+ * We call it with interrupts disabled.
*/
void scheduler_tick(void)
{
@@ -2977,15 +3533,14 @@ void scheduler_tick(void)
struct rq *rq = cpu_rq(cpu);
sched_clock_tick();
- /* grq lock not grabbed, so only update rq clock */
update_rq_clock(rq);
- update_cpu_clock_tick(rq, rq->curr);
update_load_avg(rq);
- cpufreq_trigger(grq.niffies, rq->load_avg);
+ update_cpu_clock_tick(rq, rq->curr);
if (!rq_idle(rq))
task_running_tick(rq);
else
- no_iso_tick();
+ no_iso_tick(rq, rq->last_scheduler_tick - rq->last_jiffy);
+ rq->last_scheduler_tick = rq->last_jiffy;
rq->last_tick = rq->clock;
perf_event_task_tick();
}
@@ -3068,12 +3623,13 @@ static inline void preempt_latency_stop(int val) { }
/*
* The time_slice is only refilled when it is empty and that is when we set a
- * new deadline.
+ * new deadline. Make sure update_clocks has been called recently to update
+ * rq->niffies.
*/
-static void time_slice_expired(struct task_struct *p)
+static void time_slice_expired(struct task_struct *p, struct rq *rq)
{
p->time_slice = timeslice();
- p->deadline = grq.niffies + task_deadline_diff(p);
+ p->deadline = rq->niffies + task_deadline_diff(p);
#ifdef CONFIG_SMT_NICE
if (!p->mm)
p->smt_bias = 0;
@@ -3100,107 +3656,107 @@ static void time_slice_expired(struct task_struct *p)
* SCHED_NORMAL tasks.
*/
-static inline void check_deadline(struct task_struct *p)
+static inline void check_deadline(struct task_struct *p, struct rq *rq)
{
if (p->time_slice < RESCHED_US || batch_task(p))
- time_slice_expired(p);
+ time_slice_expired(p, rq);
}
#define BITOP_WORD(nr) ((nr) / BITS_PER_LONG)
/*
- * Scheduler queue bitmap specific find next bit.
- */
-static inline unsigned long
-next_sched_bit(const unsigned long *addr, unsigned long offset)
-{
- const unsigned long *p;
- unsigned long result;
- unsigned long size;
- unsigned long tmp;
-
- size = PRIO_LIMIT;
- if (offset >= size)
- return size;
-
- p = addr + BITOP_WORD(offset);
- result = offset & ~(BITS_PER_LONG-1);
- size -= result;
- offset %= BITS_PER_LONG;
- if (offset) {
- tmp = *(p++);
- tmp &= (~0UL << offset);
- if (size < BITS_PER_LONG)
- goto found_first;
- if (tmp)
- goto found_middle;
- size -= BITS_PER_LONG;
- result += BITS_PER_LONG;
- }
- while (size & ~(BITS_PER_LONG-1)) {
- if ((tmp = *(p++)))
- goto found_middle;
- result += BITS_PER_LONG;
- size -= BITS_PER_LONG;
- }
- if (!size)
- return result;
- tmp = *p;
-
-found_first:
- tmp &= (~0UL >> (BITS_PER_LONG - size));
- if (tmp == 0UL) /* Are any bits set? */
- return result + size; /* Nope. */
-found_middle:
- return result + __ffs(tmp);
-}
-
-/*
* Task selection with skiplists is a simple matter of picking off the first
- * task in the sorted list, an O(1) operation. The only time it takes longer
- * is if tasks do not have suitable affinity and then we iterate over entries
- * till we find the first that does. Worst case here is no tasks with suitable
- * affinity and taking O(n).
+ * task in the sorted list, an O(1) operation. The lookup is amortised O(1),
+ * bounded by the number of processors.
+ *
+ * Runqueues are selectively locked based on their unlocked data and then
+ * unlocked if not needed. At most 3 locks will be held at any time and are
+ * released as soon as they're no longer needed. All balancing between CPUs
+ * is thus done here in an extremely simple first come best fit manner.
+ *
+ * This iterates over runqueues in cache locality order. In interactive mode
+ * it iterates over all CPUs and finds the task with the best key/deadline.
+ * In non-interactive mode it will only take a task if it's from the current
+ * runqueue or a runqueue with more tasks than the current one with a better
+ * key/deadline.
*/
static inline struct
task_struct *earliest_deadline_task(struct rq *rq, int cpu, struct task_struct *idle)
{
- skiplist_node *node = &grq.node;
struct task_struct *edt = idle;
- u64 earliest_deadline = ~0ULL;
+ struct rq *locked = NULL;
+ int i, best_entries = 0;
+ u64 best_key = ~0ULL;
- while ((node = node->next[0]) != &grq.node) {
- struct task_struct *p = node->value;
+ for (i = 0; i < num_possible_cpus(); i++) {
+ struct rq *other_rq = rq_order(rq, i);
+ int entries = other_rq->sl->entries;
+ struct task_struct *p;
+ u64 key;
- /* Make sure affinity is ok */
- if (needs_other_cpu(p, cpu))
+ /*
+ * Check for queued entries lockless first. The local runqueue
+ * is locked so entries will always be accurate.
+ */
+ if (!sched_interactive) {
+ if (entries <= best_entries)
+ continue;
+ } else if (!entries)
continue;
- if (!smt_schedule(p, rq))
- continue;
+ /* if (i) implies other_rq != rq */
+ if (i) {
+ /* Check for best id queued lockless first */
+ if (other_rq->best_key >= best_key)
+ continue;
- if (!sched_interactive) {
- int tcpu;
+ if (unlikely(!trylock_rq(rq, other_rq)))
+ continue;
- if ((tcpu = task_cpu(p)) != cpu) {
- u64 dl = p->deadline << locality_diff(tcpu, rq);
+ /* Need to reevaluate entries after locking */
+ entries = other_rq->sl->entries;
+ if (unlikely(!entries)) {
+ unlock_rq(other_rq);
+ continue;
+ }
+ }
+ key = other_rq->node.next[0]->key;
+ /* Reevaluate key after locking */
+ if (unlikely(key >= best_key)) {
+ if (i)
+ unlock_rq(other_rq);
+ continue;
+ }
- if (!deadline_before(dl, earliest_deadline))
- continue;
- earliest_deadline = dl;
- edt = p;
- /* We continue even though we've found the earliest
- * deadline task as the locality offset means there
- * may be a better candidate after it. */
+ p = other_rq->node.next[0]->value;
+ if (!smt_schedule(p, rq)) {
+ if (i)
+ unlock_rq(other_rq);
+ continue;
+ }
+
+ /* Make sure affinity is ok */
+ if (i) {
+ if (needs_other_cpu(p, cpu)) {
+ unlock_rq(other_rq);
continue;
}
+ if (locked)
+ unlock_rq(locked);
+ locked = other_rq;
}
- /* We've encountered the best deadline local task */
+
+ best_entries = entries;
+ best_key = key;
edt = p;
- break;
}
+
if (likely(edt != idle))
- take_task(cpu, edt);
+ take_task(rq, cpu, edt);
+
+ if (locked)
+ unlock_rq(locked);
+
return edt;
}
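
The loop above repeatedly applies one shape: reject a remote runqueue from its lockless entries/best_key snapshot, trylock it only when it might win, then recheck under the lock before claiming anything. A condensed userspace sketch of that shape, assuming a hypothetical queue type with the same lazily-read fields (pthread mutexes stand in for rq locks):

#include <pthread.h>
#include <stdbool.h>
#include <stdint.h>

struct queue {
	pthread_mutex_t lock;
	int entries;		/* readable without the lock, may be stale */
	uint64_t best_key;	/* readable without the lock, may be stale */
};

/* Returns true with q->lock held iff q still beats best_key once locked. */
static bool try_claim(struct queue *q, uint64_t best_key)
{
	if (!q->entries || q->best_key >= best_key)
		return false;			/* lockless reject, no lock taken */
	if (pthread_mutex_trylock(&q->lock) != 0)
		return false;			/* contended, try another queue */
	if (!q->entries || q->best_key >= best_key) {
		pthread_mutex_unlock(&q->lock);	/* data went stale, reject */
		return false;
	}
	return true;				/* caller takes a task, then unlocks */
}
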
@@ -3226,9 +3782,6 @@ static noinline void __schedule_bug(struct task_struct *prev)
pr_cont("\n");
}
#endif
- if (panic_on_warn)
- panic("scheduling while atomic\n");
-
dump_stack();
add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
}
@@ -3256,15 +3809,11 @@ static inline void schedule_debug(struct task_struct *prev)
/*
* The currently running task's information is all stored in rq local data
- * which is only modified by the local CPU, thereby allowing the data to be
- * changed without grabbing the grq lock.
+ * which is only modified by the local CPU.
*/
static inline void set_rq_task(struct rq *rq, struct task_struct *p)
{
- rq->rq_time_slice = p->time_slice;
rq->rq_deadline = p->deadline;
- rq->rq_last_ran = p->last_ran = rq->clock_task;
- rq->rq_policy = p->policy;
rq->rq_prio = p->prio;
#ifdef CONFIG_SMT_NICE
rq->rq_mm = p->mm;
@@ -3272,15 +3821,6 @@ static inline void set_rq_task(struct rq *rq, struct task_struct *p)
#endif
}
-static void reset_rq_task(struct rq *rq, struct task_struct *p)
-{
- rq->rq_policy = p->policy;
- rq->rq_prio = p->prio;
-#ifdef CONFIG_SMT_NICE
- rq->rq_smt_bias = p->smt_bias;
-#endif
-}
-
#ifdef CONFIG_SMT_NICE
static void check_no_siblings(struct rq __maybe_unused *this_rq) {}
static void wake_no_siblings(struct rq __maybe_unused *this_rq) {}
@@ -3381,11 +3921,13 @@ static void __sched notrace __schedule(bool preempt)
unsigned long *switch_count;
bool deactivate = false;
struct rq *rq;
+ u64 niffies;
int cpu;
cpu = smp_processor_id();
rq = cpu_rq(cpu);
prev = rq->curr;
+ idle = rq->idle;
/*
* do_exit() calls schedule() with preemption disabled as an exception;
@@ -3409,7 +3951,23 @@ static void __sched notrace __schedule(bool preempt)
* done by the caller to avoid the race with signal_wake_up().
*/
smp_mb__before_spinlock();
- grq_lock();
+ rq_lock(rq);
+#ifdef CONFIG_SMP
+ if (rq->preempt) {
+ /*
+ * Make sure resched_curr hasn't triggered a preemption
+ * locklessly on a task that has since scheduled away. Spurious
+ * wakeup of idle is okay though.
+ */
+ if (unlikely(preempt && prev != idle && !test_tsk_need_resched(prev))) {
+ rq->preempt = NULL;
+ clear_preempt_need_resched();
+ rq_unlock_irq(rq);
+ return;
+ }
+ rq->preempt = NULL;
+ }
+#endif
switch_count = &prev->nivcsw;
if (!preempt && prev->state) {
@@ -3431,7 +3989,7 @@ static void __sched notrace __schedule(bool preempt)
to_wakeup = wq_worker_sleeping(prev);
if (to_wakeup) {
/* This shouldn't happen, but does */
- if (unlikely(to_wakeup == prev))
+ if (WARN_ONCE((to_wakeup == prev), "Waking up prev as worker\n"))
deactivate = false;
else
try_to_wake_up_local(to_wakeup);
@@ -3441,64 +3999,64 @@ static void __sched notrace __schedule(bool preempt)
switch_count = &prev->nvcsw;
}
+ /*
+ * Store the niffy value here for use by the next task's last_ran
+ * below to avoid losing niffies due to update_clocks being called
+ * again after this point.
+ */
update_clocks(rq);
+ niffies = rq->niffies;
update_cpu_clock_switch(rq, prev);
if (rq->clock - rq->last_tick > HALF_JIFFY_NS)
- rq->dither = false;
+ rq->dither = 0;
else
- rq->dither = true;
+ rq->dither = HALF_JIFFY_US;
clear_tsk_need_resched(prev);
clear_preempt_need_resched();
- idle = rq->idle;
if (idle != prev) {
- /* Update all the information stored on struct rq */
- prev->time_slice = rq->rq_time_slice;
- prev->deadline = rq->rq_deadline;
- check_deadline(prev);
- prev->last_ran = rq->clock_task;
- return_task(prev, rq, deactivate);
+ check_deadline(prev, rq);
+ return_task(prev, rq, cpu, deactivate);
}
if (unlikely(!queued_notrunning())) {
- /*
- * This CPU is now truly idle as opposed to when idle is
- * scheduled as a high priority task in its own right.
- */
next = idle;
schedstat_inc(rq, sched_goidle);
set_cpuidle_map(cpu);
+ update_load_avg(rq);
} else {
next = earliest_deadline_task(rq, cpu, idle);
if (likely(next->prio != PRIO_LIMIT))
clear_cpuidle_map(cpu);
- else
+ else {
set_cpuidle_map(cpu);
+ update_load_avg(rq);
+ }
}
+ set_rq_task(rq, next);
+ next->last_ran = niffies;
+
if (likely(prev != next)) {
/*
* Don't reschedule an idle task or deactivated tasks
*/
if (prev != idle && !deactivate)
resched_suitable_idle(prev);
- set_rq_task(rq, next);
if (next != idle)
check_siblings(rq);
else
wake_siblings(rq);
- grq.nr_switches++;
- prev->on_cpu = false;
- next->on_cpu = true;
+ atomic64_inc(&grq.nr_switches);
rq->curr = next;
++*switch_count;
trace_sched_switch(preempt, prev, next);
- rq = context_switch(rq, prev, next); /* unlocks the grq */
+ rq = context_switch(rq, prev, next); /* unlocks the rq */
} else {
check_siblings(rq);
- grq_unlock_irq();
+ rq_unlock_irq(rq);
}
}
@@ -3713,13 +4271,12 @@ EXPORT_SYMBOL(default_wake_function);
*/
void rt_mutex_setprio(struct task_struct *p, int prio)
{
- unsigned long flags;
struct rq *rq;
int oldprio;
BUG_ON(prio < 0 || prio > MAX_PRIO);
- rq = task_grq_lock(p, &flags);
+ rq = __task_rq_lock(p);
/*
* Idle task boosting is a nono in general. There is one
@@ -3742,17 +4299,17 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
trace_sched_pi_setprio(p, prio);
oldprio = p->prio;
p->prio = prio;
- if (task_running(p)){
+ if (task_running(rq, p)){
if (prio > oldprio)
resched_task(p);
} else if (task_queued(p)) {
- dequeue_task(p);
- enqueue_task(p, rq);
+ dequeue_task(rq, p, DEQUEUE_SAVE);
+ enqueue_task(rq, p, ENQUEUE_RESTORE);
if (prio < oldprio)
try_preempt(p, rq);
}
out_unlock:
- task_grq_unlock(p, &flags);
+ __task_rq_unlock(rq);
}
#endif
@@ -3779,7 +4336,7 @@ void set_user_nice(struct task_struct *p, long nice)
* We have to be careful, if called from sys_setpriority(),
* the task might be in the middle of scheduling on another CPU.
*/
- rq = time_task_grq_lock(p, &flags);
+ rq = task_rq_lock(p, &flags);
/*
* The RT priorities are set via sched_setscheduler(), but we still
* allow the 'normal' nice value to be set - but as expected
@@ -3797,17 +4354,17 @@ void set_user_nice(struct task_struct *p, long nice)
p->prio = effective_prio(p);
if (task_queued(p)) {
- dequeue_task(p);
- enqueue_task(p, rq);
+ dequeue_task(rq, p, DEQUEUE_SAVE);
+ enqueue_task(rq, p, ENQUEUE_RESTORE);
if (new_static < old_static)
try_preempt(p, rq);
- } else if (task_running(p)) {
- reset_rq_task(rq, p);
+ } else if (task_running(rq, p)) {
+ set_rq_task(rq, p);
if (old_static < new_static)
resched_task(p);
}
out_unlock:
- task_grq_unlock(p, &flags);
+ task_rq_unlock(rq, p, &flags);
}
EXPORT_SYMBOL(set_user_nice);
@@ -3878,7 +4435,7 @@ int task_prio(const struct task_struct *p)
goto out;
/* Convert to ms to avoid overflows */
- delta = NS_TO_MS(p->deadline - grq.niffies);
+ delta = NS_TO_MS(p->deadline - task_rq(p)->niffies);
delta = delta * 40 / ms_longest_deadline_diff();
if (delta > 0 && delta <= 80)
prio += delta;
@@ -3921,7 +4478,7 @@ static inline struct task_struct *find_process_by_pid(pid_t pid)
return pid ? find_task_by_vpid(pid) : current;
}
-/* Actually do priority change: must hold grq lock. */
+/* Actually do priority change: must hold rq lock. */
static void __setscheduler(struct task_struct *p, struct rq *rq, int policy,
int prio, bool keep_boost)
{
@@ -3948,12 +4505,12 @@ static void __setscheduler(struct task_struct *p, struct rq *rq, int policy,
} else
p->prio = p->normal_prio;
- if (task_running(p)) {
- reset_rq_task(rq, p);
+ if (task_running(rq, p)) {
+ set_rq_task(rq, p);
resched_task(p);
} else if (task_queued(p)) {
- dequeue_task(p);
- enqueue_task(p, rq);
+ dequeue_task(rq, p, DEQUEUE_SAVE);
+ enqueue_task(rq, p, ENQUEUE_RESTORE);
if (p->prio < oldprio || p->rt_priority > oldrtprio)
try_preempt(p, rq);
}
@@ -4055,7 +4612,7 @@ recheck:
case SCHED_ISO:
if (policy == SCHED_ISO)
goto out;
- if (policy == SCHED_NORMAL)
+ if (policy != SCHED_NORMAL)
return -EPERM;
break;
case SCHED_BATCH:
@@ -4092,16 +4649,16 @@ recheck:
* make sure no PI-waiters arrive (or leave) while we are
* changing the priority of the task:
*
- * To be able to change p->policy safely, the grunqueue lock must be
+ * To be able to change p->policy safely, the runqueue lock must be
* held.
*/
- rq = task_grq_lock(p, &flags);
+ rq = task_rq_lock(p, &flags);
/*
* Changing the policy of the stop threads is a very bad idea
*/
if (p == rq->stop) {
- task_grq_unlock(p, &flags);
+ task_rq_unlock(rq, p, &flags);
return -EINVAL;
}
@@ -4110,22 +4667,20 @@ recheck:
*/
if (unlikely(policy == p->policy && (!is_rt_policy(policy) ||
param->sched_priority == p->rt_priority))) {
-
- task_grq_unlock(p, &flags);
+ task_rq_unlock(rq, p, &flags);
return 0;
}
/* recheck policy now with rq lock held */
if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
policy = oldpolicy = -1;
- task_grq_unlock(p, &flags);
+ task_rq_unlock(rq, p, &flags);
goto recheck;
}
- update_clocks(rq);
p->sched_reset_on_fork = reset_on_fork;
__setscheduler(p, rq, policy, param->sched_priority, pi);
- task_grq_unlock(p, &flags);
+ task_rq_unlock(rq, p, &flags);
if (pi)
rt_mutex_adjust_pi(p);
@@ -4625,9 +5180,9 @@ long sched_getaffinity(pid_t pid, cpumask_t *mask)
if (retval)
goto out_unlock;
- grq_lock_irqsave(&flags);
+ raw_spin_lock_irqsave(&p->pi_lock, flags);
cpumask_and(mask, tsk_cpus_allowed(p), cpu_active_mask);
- grq_unlock_irqrestore(&flags);
+ raw_spin_unlock_irqrestore(&p->pi_lock, flags);
out_unlock:
rcu_read_unlock();
@@ -4642,8 +5197,7 @@ out_unlock:
* @len: length in bytes of the bitmask pointed to by user_mask_ptr
* @user_mask_ptr: user-space pointer to hold the current cpu mask
*
- * Return: size of CPU mask copied to user_mask_ptr on success. An
- * error code otherwise.
+ * Return: 0 on success. An error code otherwise.
*/
SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
unsigned long __user *, user_mask_ptr)
@@ -4685,19 +5239,20 @@ SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
SYSCALL_DEFINE0(sched_yield)
{
struct task_struct *p;
+ struct rq *rq;
p = current;
- grq_lock_irq();
+ rq = this_rq_lock();
+ time_slice_expired(p, rq);
schedstat_inc(task_rq(p), yld_count);
- requeue_task(p);
/*
* Since we are going to call schedule() anyway, there's
* no need to preempt or enable interrupts:
*/
- __release(grq.lock);
- spin_release(&grq.lock.dep_map, 1, _THIS_IP_);
- do_raw_spin_unlock(&grq.lock);
+ __release(rq->lock);
+ spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
+ do_raw_spin_unlock(&rq->lock);
sched_preempt_enable_no_resched();
schedule();
@@ -4803,29 +5358,44 @@ EXPORT_SYMBOL(yield);
*/
int __sched yield_to(struct task_struct *p, bool preempt)
{
+ struct task_struct *rq_p;
struct rq *rq, *p_rq;
unsigned long flags;
int yielded = 0;
+ local_irq_save(flags);
rq = this_rq();
- grq_lock_irqsave(&flags);
- if (task_running(p) || p->state) {
+
+again:
+ p_rq = task_rq(p);
+ /*
+ * If we're the only runnable task on the rq and target rq also
+ * has only one task, there's absolutely no point in yielding.
+ */
+ if (task_running(p_rq, p) || p->state) {
yielded = -ESRCH;
- goto out_unlock;
+ goto out_irq;
+ }
+
+ double_rq_lock(rq, p_rq);
+ if (unlikely(task_rq(p) != p_rq)) {
+ double_rq_unlock(rq, p_rq);
+ goto again;
}
- p_rq = task_rq(p);
yielded = 1;
- if (p->deadline > rq->rq_deadline)
- p->deadline = rq->rq_deadline;
- p->time_slice += rq->rq_time_slice;
- rq->rq_time_slice = 0;
+ rq_p = rq->curr;
+ if (p->deadline > rq_p->deadline)
+ p->deadline = rq_p->deadline;
+ p->time_slice += rq_p->time_slice;
if (p->time_slice > timeslice())
p->time_slice = timeslice();
+ time_slice_expired(rq_p, rq);
if (preempt && rq != p_rq)
- resched_curr(p_rq);
-out_unlock:
- grq_unlock_irqrestore(&flags);
+ resched_task(p_rq->curr);
+ double_rq_unlock(rq, p_rq);
+out_irq:
+ local_irq_restore(flags);
if (yielded > 0)
schedule();
@@ -4931,8 +5501,9 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
struct task_struct *p;
unsigned int time_slice;
unsigned long flags;
- int retval;
struct timespec t;
+ struct rq *rq;
+ int retval;
if (pid < 0)
return -EINVAL;
@@ -4947,9 +5518,9 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
if (retval)
goto out_unlock;
- grq_lock_irqsave(&flags);
+ rq = task_rq_lock(p, &flags);
time_slice = p->policy == SCHED_FIFO ? 0 : MS_TO_NS(task_timeslice(p));
- grq_unlock_irqrestore(&flags);
+ task_rq_unlock(rq, p, &flags);
rcu_read_unlock();
t = ns_to_timespec(time_slice);
@@ -5049,9 +5620,21 @@ void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_ma
void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
{
+ struct rq *rq = task_rq(p);
+
+ lockdep_assert_held(&p->pi_lock);
+
cpumask_copy(tsk_cpus_allowed(p), new_mask);
+
+ if (task_queued(p)) {
+ /*
+ * Because __kthread_bind() calls this on blocked tasks without
+ * holding rq->lock.
+ */
+ lockdep_assert_held(&rq->lock);
+ }
if (needs_other_cpu(p, task_cpu(p)))
- set_task_cpu(p, cpumask_any(tsk_cpus_allowed(p)));
+ set_task_cpu(p, valid_task_cpu(p));
}
#endif
@@ -5069,8 +5652,8 @@ void init_idle(struct task_struct *idle, int cpu)
unsigned long flags;
raw_spin_lock_irqsave(&idle->pi_lock, flags);
- time_lock_grq(rq);
- idle->last_ran = rq->clock_task;
+ raw_spin_lock(&rq->lock);
+ idle->last_ran = rq->niffies;
idle->state = TASK_RUNNING;
/* Setting prio to illegal value shouldn't matter when never queued */
idle->prio = PRIO_LIMIT;
@@ -5097,8 +5680,8 @@ void init_idle(struct task_struct *idle, int cpu)
rcu_read_unlock();
rq->curr = rq->idle = idle;
- idle->on_cpu = 1;
- grq_unlock();
+ idle->on_rq = TASK_ON_RQ_QUEUED;
+ raw_spin_unlock(&rq->lock);
raw_spin_unlock_irqrestore(&idle->pi_lock, flags);
/* Set the preempt count _outside_ the spinlocks! */
@@ -5136,59 +5719,14 @@ int task_can_attach(struct task_struct *p,
return ret;
}
-void wake_q_add(struct wake_q_head *head, struct task_struct *task)
-{
- struct wake_q_node *node = &task->wake_q;
-
- /*
- * Atomically grab the task, if ->wake_q is !nil already it means
- * its already queued (either by us or someone else) and will get the
- * wakeup due to that.
- *
- * This cmpxchg() implies a full barrier, which pairs with the write
- * barrier implied by the wakeup in wake_up_q().
- */
- if (cmpxchg(&node->next, NULL, WAKE_Q_TAIL))
- return;
-
- get_task_struct(task);
-
- /*
- * The head is context local, there can be no concurrency.
- */
- *head->lastp = node;
- head->lastp = &node->next;
-}
-
-void wake_up_q(struct wake_q_head *head)
-{
- struct wake_q_node *node = head->first;
-
- while (node != WAKE_Q_TAIL) {
- struct task_struct *task;
-
- task = container_of(node, struct task_struct, wake_q);
- BUG_ON(!task);
- /* task can safely be re-inserted now */
- node = node->next;
- task->wake_q.next = NULL;
-
- /*
- * wake_up_process() implies a wmb() to pair with the queueing
- * in wake_q_add() so as not to miss wakeups.
- */
- wake_up_process(task);
- put_task_struct(task);
- }
-}
-
void resched_cpu(int cpu)
{
+ struct rq *rq = cpu_rq(cpu);
unsigned long flags;
- grq_lock_irqsave(&flags);
+ rq_lock_irqsave(rq, &flags);
resched_task(cpu_curr(cpu));
- grq_unlock_irqrestore(&flags);
+ rq_unlock_irqrestore(rq, &flags);
}
#ifdef CONFIG_SMP
@@ -5262,7 +5800,7 @@ int get_nohz_timer_target(void)
continue;
if (!idle_cpu(i) && is_housekeeping_cpu(i)) {
- cpu = i;
+ cpu = i;
+ cpu = i;
goto unlock;
}
@@ -5291,8 +5829,10 @@ void wake_up_idle_cpu(int cpu)
if (cpu == smp_processor_id())
return;
- set_tsk_need_resched(cpu_rq(cpu)->idle);
- smp_send_reschedule(cpu);
+ if (set_nr_and_not_polling(cpu_rq(cpu)->idle))
+ smp_send_reschedule(cpu);
+ else
+ trace_sched_wake_idle_without_ipi(cpu);
}
void wake_up_nohz_cpu(int cpu)
@@ -5321,7 +5861,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
struct rq *rq;
int ret = 0;
- rq = task_grq_lock(p, &flags);
+ rq = task_rq_lock(p, &flags);
if (p->flags & PF_KTHREAD) {
/*
@@ -5366,22 +5906,30 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
if (cpumask_test_cpu(task_cpu(p), new_mask))
goto out;
- if (task_running(p)) {
+ if (task_running(rq, p)) {
/* Task is running on the wrong cpu now, reschedule it. */
if (rq == this_rq()) {
set_tsk_need_resched(p);
running_wrong = true;
} else
resched_task(p);
- } else
- set_task_cpu(p, cpumask_any_and(cpu_valid_mask, new_mask));
+ } else {
+ int dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask);
+ struct rq *dest_rq = cpu_rq(dest_cpu);
+
+ /* Switch rq locks here */
+ lock_second_rq(rq, dest_rq);
+ set_task_cpu(p, dest_cpu);
+ rq_unlock(rq);
+ rq = dest_rq;
+ }
out:
if (queued && !cpumask_subset(new_mask, &old_mask))
try_preempt(p, rq);
if (running_wrong)
preempt_disable();
- task_grq_unlock(p, &flags);
+ task_rq_unlock(rq, p, &flags);
if (running_wrong) {
__schedule(true);
@@ -5397,11 +5945,12 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
}
EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
-static bool sched_smp_initialized __read_mostly;
-
#ifdef CONFIG_HOTPLUG_CPU
-/* Run through task list and find tasks affined to the dead cpu, then remove
- * that cpu from the list, enable cpu0 and set the zerobound flag. */
+/*
+ * Run through task list and find tasks affined to the dead cpu, then remove
+ * that cpu from the list, enable cpu0 and set the zerobound flag. Must hold
+ * cpu 0 and src_cpu's runqueue locks.
+ */
static void bind_zero(int src_cpu)
{
struct task_struct *p, *t;
@@ -5412,15 +5961,17 @@ static void bind_zero(int src_cpu)
do_each_thread(t, p) {
if (cpumask_test_cpu(src_cpu, tsk_cpus_allowed(p))) {
- cpumask_clear_cpu(src_cpu, tsk_cpus_allowed(p));
- cpumask_set_cpu(0, tsk_cpus_allowed(p));
+ bool local = (task_cpu(p) == src_cpu);
+
+ /* task_running is the cpu stopper thread */
+ if (local && task_running(task_rq(p), p))
+ continue;
+ atomic_clear_cpu(src_cpu, tsk_cpus_allowed(p));
+ atomic_set_cpu(0, tsk_cpus_allowed(p));
p->zerobound = true;
bound++;
- if (task_cpu(p) == src_cpu) {
+ if (local)
set_task_cpu(p, 0);
- if (task_running(p))
- resched_task(p);
- }
}
} while_each_thread(t, p);
@@ -5834,7 +6385,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
struct root_domain *old_rd = NULL;
unsigned long flags;
- grq_lock_irqsave(&flags);
+ rq_lock_irqsave(rq, &flags);
if (rq->rd) {
old_rd = rq->rd;
@@ -5860,7 +6411,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
set_rq_online(rq);
- grq_unlock_irqrestore(&flags);
+ rq_unlock_irqrestore(rq, &flags);
if (old_rd)
call_rcu_sched(&old_rd->rcu, free_rootdomain);
@@ -6839,14 +7390,13 @@ int sched_cpu_activate(unsigned int cpu)
* 2) At runtime, if cpuset_cpu_active() fails to rebuild the
* domains.
*/
- grq_lock_irqsave(&flags);
+ rq_lock_irqsave(rq, &flags);
if (rq->rd) {
BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
set_rq_online(rq);
}
unbind_zero(cpu);
- grq.noc = num_online_cpus();
- grq_unlock_irqrestore(&flags);
+ rq_unlock_irqrestore(rq, &flags);
return 0;
}
@@ -6894,14 +7444,15 @@ int sched_cpu_dying(unsigned int cpu)
struct rq *rq = cpu_rq(cpu);
unsigned long flags;
- grq_lock_irqsave(&flags);
+ local_irq_save(flags);
+ double_rq_lock(rq, cpu_rq(0));
if (rq->rd) {
BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
set_rq_offline(rq);
}
bind_zero(cpu);
- grq.noc = num_online_cpus();
- grq_unlock_irqrestore(&flags);
+ double_rq_unlock(rq, cpu_rq(0));
+ local_irq_restore(flags);
return 0;
}
@@ -6958,9 +7509,8 @@ void __init sched_init_smp(void)
#ifdef CONFIG_SCHED_SMT
bool smt_threads = false;
#endif
- struct rq *rq;
-
cpumask_var_t non_isolated_cpus;
+ struct rq *rq;
alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
@@ -6985,7 +7535,8 @@ void __init sched_init_smp(void)
free_cpumask_var(non_isolated_cpus);
mutex_lock(&sched_domains_mutex);
- grq_lock_irq();
+ local_irq_disable();
+ lock_all_rqs();
/*
* Set up the relative cache distance of each online cpu from each
* other in a simple array for quick lookup. Locality is determined
@@ -7025,21 +7576,21 @@ void __init sched_init_smp(void)
}
#endif
#ifdef CONFIG_SCHED_SMT
- for_each_cpu(other_cpu, thread_cpumask(cpu))
- rq->cpu_locality[other_cpu] = 1;
if (cpumask_weight(thread_cpumask(cpu)) > 1) {
cpumask_copy(&rq->thread_mask, thread_cpumask(cpu));
cpumask_clear_cpu(cpu, &rq->thread_mask);
+ for_each_cpu(other_cpu, thread_cpumask(cpu))
+ rq->cpu_locality[other_cpu] = 1;
rq->siblings_idle = siblings_cpu_idle;
smt_threads = true;
}
#endif
}
for_each_possible_cpu(cpu) {
- int total_cpus = 0, locality;
+ int total_cpus = 1, locality;
rq = cpu_rq(cpu);
- for (locality = 0; locality <= 4; locality++) {
+ for (locality = 1; locality <= 4; locality++) {
for_each_possible_cpu(other_cpu) {
if (rq->cpu_locality[other_cpu] == locality)
rq->rq_order[total_cpus++] = cpu_rq(other_cpu);
@@ -7053,7 +7604,8 @@ void __init sched_init_smp(void)
smt_schedule = &smt_should_schedule;
}
#endif
- grq_unlock_irq();
+ unlock_all_rqs();
+ local_irq_enable();
mutex_unlock(&sched_domains_mutex);
for_each_online_cpu(cpu) {
@@ -7062,7 +7614,7 @@ void __init sched_init_smp(void)
for_each_online_cpu(other_cpu) {
if (other_cpu <= cpu)
continue;
- printk(KERN_DEBUG "BFS LOCALITY CPU %d to %d: %d\n", cpu, other_cpu, rq->cpu_locality[other_cpu]);
+ printk(KERN_DEBUG "MuQSS locality CPU %d to %d: %d\n", cpu, other_cpu, rq->cpu_locality[other_cpu]);
}
}
sched_smp_initialized = true;
@@ -7116,21 +7668,14 @@ void __init sched_init(void)
for (i = 1 ; i < NICE_WIDTH ; i++)
prio_ratios[i] = prio_ratios[i - 1] * 11 / 10;
- raw_spin_lock_init(&grq.lock);
- grq.nr_running = grq.nr_uninterruptible = grq.nr_switches = 0;
- grq.niffies = 0;
- grq.last_jiffy = jiffies;
- raw_spin_lock_init(&grq.iso_lock);
- grq.iso_ticks = 0;
- grq.iso_refractory = false;
- grq.noc = 1;
- skiplist_init(&grq.node);
- grq.sl = new_skiplist(&grq.node);
+ atomic_set(&grq.nr_running, 0);
+ atomic_set(&grq.nr_uninterruptible, 0);
+ atomic64_set(&grq.nr_switches, 0);
skiplist_node_init(&init_task.node);
#ifdef CONFIG_SMP
init_defrootdomain();
- grq.qnr = grq.idle_cpus = 0;
+ atomic_set(&grq.qnr, 0);
cpumask_clear(&grq.cpu_idle_map);
#else
uprq = &per_cpu(runqueues, 0);
@@ -7145,12 +7690,18 @@ void __init sched_init(void)
#endif /* CONFIG_CGROUP_SCHED */
for_each_possible_cpu(i) {
rq = cpu_rq(i);
- rq->grq_lock = &grq.lock;
- rq->user_pc = rq->nice_pc = rq->softirq_pc = rq->system_pc =
- rq->iowait_pc = rq->idle_pc = 0;
- rq->dither = false;
+ skiplist_init(&rq->node);
+ rq->sl = new_skiplist(&rq->node);
+ raw_spin_lock_init(&rq->lock);
+ rq->clock = rq->old_clock = rq->last_niffy = rq->niffies = 0;
+ rq->last_jiffy = jiffies;
+ rq->user_ns = rq->nice_ns = rq->softirq_ns = rq->system_ns =
+ rq->iowait_ns = rq->idle_ns = 0;
+ rq->dither = 0;
+ set_rq_task(rq, &init_task);
+ rq->iso_ticks = 0;
+ rq->iso_refractory = false;
#ifdef CONFIG_SMP
- rq->last_niffy = 0;
rq->sd = NULL;
rq->rd = NULL;
rq->online = false;
@@ -7302,9 +7853,9 @@ static inline void normalise_rt_tasks(void)
if (!rt_task(p) && !iso_task(p))
continue;
- rq = task_grq_lock(p, &flags);
+ rq = task_rq_lock(p, &flags);
__setscheduler(p, rq, SCHED_NORMAL, 0, false);
- task_grq_unlock(p, &flags);
+ task_rq_unlock(rq, p, &flags);
}
read_unlock(&tasklist_lock);
}
@@ -7367,6 +7918,25 @@ void set_curr_task(int cpu, struct task_struct *p)
/*
* Use precise platform statistics if available:
*/
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING
+
+#ifndef __ARCH_HAS_VTIME_TASK_SWITCH
+void vtime_common_task_switch(struct task_struct *prev)
+{
+ if (is_idle_task(prev))
+ vtime_account_idle(prev);
+ else
+ vtime_account_system(prev);
+
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
+ vtime_account_user(prev);
+#endif
+ arch_vtime_task_switch(prev);
+}
+#endif
+
+#endif /* CONFIG_VIRT_CPU_ACCOUNTING */
+
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
{
@@ -7395,20 +7965,26 @@ void vtime_account_system_irqsafe(struct task_struct *tsk)
}
EXPORT_SYMBOL_GPL(vtime_account_system_irqsafe);
-#ifndef __ARCH_HAS_VTIME_TASK_SWITCH
-void vtime_task_switch(struct task_struct *prev)
-{
- if (is_idle_task(prev))
- vtime_account_idle(prev);
+/*
+ * Archs that account the whole time spent in the idle task
+ * (outside irq) as idle time can rely on this and just implement
+ * vtime_account_system() and vtime_account_idle(). Archs that
+ * have other meaning of the idle time (s390 only includes the
+ * time spent by the CPU when it's in low power mode) must override
+ * vtime_account().
+ */
+#ifndef __ARCH_HAS_VTIME_ACCOUNT
+void vtime_account_irq_enter(struct task_struct *tsk)
+{
+ if (!in_interrupt() && is_idle_task(tsk))
+ vtime_account_idle(tsk);
else
- vtime_account_system(prev);
-
- vtime_account_user(prev);
- arch_vtime_task_switch(prev);
+ vtime_account_system(tsk);
}
-#endif
+EXPORT_SYMBOL_GPL(vtime_account_irq_enter);
+#endif /* __ARCH_HAS_VTIME_ACCOUNT */
-#else
+#else /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
/*
* Perform (stime * rtime) / total, but avoid multiplication overflow by
* losing precision when the numbers are big.
@@ -7530,7 +8106,7 @@ void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime
thread_group_cputime(p, &cputime);
cputime_adjust(&cputime, &p->signal->prev_cputime, ut, st);
}
-#endif
+#endif /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
void init_idle_bootup_task(struct task_struct *idle)
{}
diff --git a/kernel/sched/bfs_sched.h b/kernel/sched/MuQSS.h
index 00a16ba0a..4e3115dac 100644
--- a/kernel/sched/bfs_sched.h
+++ b/kernel/sched/MuQSS.h
@@ -1,9 +1,14 @@
#include <linux/sched.h>
#include <linux/cpuidle.h>
+#include <linux/skip_list.h>
#include <linux/stop_machine.h>
-#ifndef BFS_SCHED_H
-#define BFS_SCHED_H
+#ifndef MUQSS_SCHED_H
+#define MUQSS_SCHED_H
+
+/* task_struct::on_rq states: */
+#define TASK_ON_RQ_QUEUED 1
+#define TASK_ON_RQ_MIGRATING 2
/*
* This is the main, per-CPU runqueue data structure.
@@ -13,16 +18,21 @@ struct rq {
struct task_struct *curr, *idle, *stop;
struct mm_struct *prev_mm;
- /* Pointer to grq spinlock */
- raw_spinlock_t *grq_lock;
+ raw_spinlock_t lock;
- /* Stored data about rq->curr to work outside grq lock */
+ /* Stored data about rq->curr to work outside rq lock */
u64 rq_deadline;
- unsigned int rq_policy;
- int rq_time_slice;
- u64 rq_last_ran;
int rq_prio;
- int soft_affined; /* Running or queued tasks with this set as their rq */
+
+ /* Best queued id for use outside lock */
+ u64 best_key;
+
+ unsigned long last_scheduler_tick; /* Last jiffy this RQ ticked */
+ unsigned long last_jiffy; /* Last jiffy this RQ updated rq clock */
+ u64 niffies; /* Last time this RQ updated rq clock */
+ u64 last_niffy; /* Last niffies as updated by local clock */
+ u64 last_jiffy_niffies; /* Niffies @ last_jiffy */
+
u64 load_update; /* When we last updated load */
unsigned long load_avg; /* Rolling load average */
#ifdef CONFIG_SMT_NICE
@@ -30,12 +40,15 @@ struct rq {
int rq_smt_bias; /* Policy/nice level bias across smt siblings */
#endif
/* Accurate timekeeping data */
- u64 timekeep_clock;
- unsigned long user_pc, nice_pc, irq_pc, softirq_pc, system_pc,
- iowait_pc, idle_pc;
+ unsigned long user_ns, nice_ns, irq_ns, softirq_ns, system_ns,
+ iowait_ns, idle_ns;
atomic_t nr_iowait;
+ skiplist_node node;
+ skiplist *sl;
#ifdef CONFIG_SMP
+ struct task_struct *preempt; /* Preempt triggered on this task */
+
int cpu; /* cpu of this runqueue */
bool online;
@@ -43,6 +56,7 @@ struct rq {
struct sched_domain *sd;
int *cpu_locality; /* CPU relative cache distance */
struct rq **rq_order; /* RQs ordered by relative cache distance */
+
#ifdef CONFIG_SCHED_SMT
cpumask_t thread_mask;
bool (*siblings_idle)(struct rq *rq);
@@ -53,7 +67,6 @@ struct rq {
bool (*cache_idle)(struct rq *rq);
/* See if all cache siblings are idle */
#endif /* CONFIG_SCHED_MC */
- u64 last_niffy; /* Last time this RQ updated grq.niffies */
#endif /* CONFIG_SMP */
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
u64 prev_irq_time;
@@ -67,7 +80,10 @@ struct rq {
u64 clock, old_clock, last_tick;
u64 clock_task;
- bool dither;
+ int dither;
+
+ int iso_ticks;
+ bool iso_refractory;
#ifdef CONFIG_SCHEDSTATS
@@ -88,6 +104,11 @@ struct rq {
unsigned int ttwu_count;
unsigned int ttwu_local;
#endif /* CONFIG_SCHEDSTATS */
+
+#ifdef CONFIG_SMP
+ struct llist_head wake_list;
+#endif
+
#ifdef CONFIG_CPU_IDLE
/* Must be inspected within a rcu lock section */
struct cpuidle_state *idle_state;
@@ -111,6 +132,41 @@ DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
#define raw_rq() raw_cpu_ptr(&runqueues)
#endif /* CONFIG_SMP */
+/*
+ * {de,en}queue flags:
+ *
+ * DEQUEUE_SLEEP - task is no longer runnable
+ * ENQUEUE_WAKEUP - task just became runnable
+ *
+ * SAVE/RESTORE - an otherwise spurious dequeue/enqueue, done to ensure tasks
+ * are in a known state which allows modification. Such pairs
+ * should preserve as much state as possible.
+ *
+ * MOVE - paired with SAVE/RESTORE, explicitly does not preserve the location
+ * in the runqueue.
+ *
+ * ENQUEUE_HEAD - place at front of runqueue (tail if not specified)
+ * ENQUEUE_REPLENISH - CBS (replenish runtime and postpone deadline)
+ * ENQUEUE_MIGRATED - the task was migrated during wakeup
+ *
+ */
+
+#define DEQUEUE_SLEEP 0x01
+#define DEQUEUE_SAVE 0x02 /* matches ENQUEUE_RESTORE */
+#define DEQUEUE_MOVE 0x04 /* matches ENQUEUE_MOVE */
+
+#define ENQUEUE_WAKEUP 0x01
+#define ENQUEUE_RESTORE 0x02
+#define ENQUEUE_MOVE 0x04
+
+#define ENQUEUE_HEAD 0x08
+#define ENQUEUE_REPLENISH 0x10
+#ifdef CONFIG_SMP
+#define ENQUEUE_MIGRATED 0x20
+#else
+#define ENQUEUE_MIGRATED 0x00
+#endif
+
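
For reference, the SAVE/RESTORE pair is what the priority and nice paths in MuQSS.c use when a queued task's parameters change, so the task is reinserted into the skiplist with its new key; a condensed excerpt drawn from rt_mutex_setprio() earlier in this patch:

	p->prio = prio;				/* deadline/prio key changed */
	if (task_queued(p)) {
		dequeue_task(rq, p, DEQUEUE_SAVE);
		enqueue_task(rq, p, ENQUEUE_RESTORE);	/* requeue with new key */
		if (prio < oldprio)
			try_preempt(p, rq);
	}
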
static inline u64 __rq_clock_broken(struct rq *rq)
{
return READ_ONCE(rq->clock);
@@ -118,13 +174,13 @@ static inline u64 __rq_clock_broken(struct rq *rq)
static inline u64 rq_clock(struct rq *rq)
{
- lockdep_assert_held(rq->grq_lock);
+ lockdep_assert_held(&rq->lock);
return rq->clock;
}
static inline u64 rq_clock_task(struct rq *rq)
{
- lockdep_assert_held(rq->grq_lock);
+ lockdep_assert_held(&rq->lock);
return rq->clock_task;
}
@@ -157,17 +213,11 @@ static inline void unregister_sched_domain_sysctl(void)
}
#endif
-static inline void sched_ttwu_pending(void) { }
-
-static inline int task_on_rq_queued(struct task_struct *p)
-{
- return p->on_rq;
-}
-
#ifdef CONFIG_SMP
-
+extern void sched_ttwu_pending(void);
extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask);
-
+#else
+static inline void sched_ttwu_pending(void) { }
#endif
#ifdef CONFIG_CPU_IDLE
@@ -221,4 +271,4 @@ static inline void cpufreq_trigger(u64 time, unsigned long util)
#define arch_scale_freq_invariant() (false)
#endif
-#endif /* BFS_SCHED_H */
+#endif /* MUQSS_SCHED_H */
diff --git a/kernel/sched/cpufreq.c b/kernel/sched/cpufreq.c
index 2ebd7b0c4..2a644b6be 100644
--- a/kernel/sched/cpufreq.c
+++ b/kernel/sched/cpufreq.c
@@ -9,8 +9,8 @@
* published by the Free Software Foundation.
*/
-#ifdef CONFIG_SCHED_BFS
-#include "bfs_sched.h"
+#ifdef CONFIG_SCHED_MUQSS
+#include "MuQSS.h"
#else
#include "sched.h"
#endif
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index eba226d7c..a2bf1ea69 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -16,8 +16,8 @@
#include <linux/slab.h>
#include <trace/events/power.h>
-#ifdef CONFIG_SCHED_BFS
-#include "bfs_sched.h"
+#ifdef CONFIG_SCHED_MUQSS
+#include "MuQSS.h"
#else
#include "sched.h"
#endif
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 1e855dcbd..060b76d85 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -14,8 +14,8 @@
#include <trace/events/power.h>
-#ifdef CONFIG_SCHED_BFS
-#include "bfs_sched.h"
+#ifdef CONFIG_SCHED_MUQSS
+#include "MuQSS.h"
#else
#include "sched.h"
#endif
diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c
index 7466a0bb2..ba7b137c3 100644
--- a/kernel/sched/stats.c
+++ b/kernel/sched/stats.c
@@ -4,10 +4,10 @@
#include <linux/seq_file.h>
#include <linux/proc_fs.h>
-#ifndef CONFIG_SCHED_BFS
+#ifndef CONFIG_SCHED_MUQSS
#include "sched.h"
#else
-#include "bfs_sched.h"
+#include "MuQSS.h"
#endif
/*