author    | André Fabian Silva Delgado <emulatorman@parabola.nu> | 2016-10-22 19:31:08 -0300
committer | André Fabian Silva Delgado <emulatorman@parabola.nu> | 2016-10-22 19:31:08 -0300
commit    | 670027c507e99521d416994a18a498def9ef2ea3 (patch)
tree      | 74b4d761a9e7904a4f8aa4b58b2dc9801f22284d /kernel/sched
parent    | d0b2f91bede3bd5e3d24dd6803e56eee959c1797 (diff)
Linux-libre 4.8.3-gnupck-4.8.3-gnu
Diffstat (limited to 'kernel/sched')
-rw-r--r-- | kernel/sched/Makefile                                        |    4
-rw-r--r-- | kernel/sched/MuQSS.c (renamed from kernel/sched/bfs.c)       | 2494
-rw-r--r-- | kernel/sched/MuQSS.h (renamed from kernel/sched/bfs_sched.h) |  102
-rw-r--r-- | kernel/sched/cpufreq.c                                       |    4
-rw-r--r-- | kernel/sched/cpufreq_schedutil.c                             |    4
-rw-r--r-- | kernel/sched/idle.c                                          |    4
-rw-r--r-- | kernel/sched/stats.c                                         |    4
7 files changed, 1621 insertions, 995 deletions
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 8c1204fa3..a787aa942 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -15,8 +15,8 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer endif -ifdef CONFIG_SCHED_BFS -obj-y += bfs.o clock.o +ifdef CONFIG_SCHED_MUQSS +obj-y += MuQSS.o clock.o else obj-y += core.o loadavg.o clock.o cputime.o obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o diff --git a/kernel/sched/bfs.c b/kernel/sched/MuQSS.c index bb5bac4b2..6a656ad4b 100644 --- a/kernel/sched/bfs.c +++ b/kernel/sched/MuQSS.c @@ -1,5 +1,5 @@ /* - * kernel/sched/bfs.c, was kernel/sched.c + * kernel/sched/MuQSS.c, was kernel/sched.c * * Kernel scheduler and related syscalls * @@ -26,6 +26,8 @@ * Thomas Gleixner, Mike Kravetz * 2009-08-13 Brainfuck deadline scheduling policy by Con Kolivas deletes * a whole lot of those previous things. + * 2016-10-01 Multiple Queue Skiplist Scheduler scalable evolution of BFS + * scheduler by Con Kolivas. */ #include <linux/kasan.h> @@ -74,7 +76,7 @@ #include <linux/context_tracking.h> #include <linux/sched/prio.h> #include <linux/tick.h> -#include <linux/skip_lists.h> +#include <linux/skip_list.h> #include <asm/irq_regs.h> #include <asm/switch_to.h> @@ -92,11 +94,10 @@ #define CREATE_TRACE_POINTS #include <trace/events/sched.h> -#include "bfs_sched.h" +#include "MuQSS.h" #define rt_prio(prio) unlikely((prio) < MAX_RT_PRIO) #define rt_task(p) rt_prio((p)->prio) -#define rt_queue(rq) rt_prio((rq)->rq_prio) #define batch_task(p) (unlikely((p)->policy == SCHED_BATCH)) #define is_rt_policy(policy) ((policy) == SCHED_FIFO || \ (policy) == SCHED_RR) @@ -105,29 +106,26 @@ #define is_idle_policy(policy) ((policy) == SCHED_IDLEPRIO) #define idleprio_task(p) unlikely(is_idle_policy((p)->policy)) #define task_running_idle(p) unlikely((p)->prio == IDLE_PRIO) -#define idle_queue(rq) (unlikely(is_idle_policy((rq)->rq_policy))) #define is_iso_policy(policy) ((policy) == SCHED_ISO) #define iso_task(p) unlikely(is_iso_policy((p)->policy)) -#define iso_queue(rq) unlikely(is_iso_policy((rq)->rq_policy)) #define task_running_iso(p) unlikely((p)->prio == ISO_PRIO) -#define rq_running_iso(rq) ((rq)->rq_prio == ISO_PRIO) #define rq_idle(rq) ((rq)->rq_prio == PRIO_LIMIT) -#define ISO_PERIOD ((5 * HZ * grq.noc) + 1) +#define ISO_PERIOD (5 * HZ) -#define SCHED_PRIO(p) ((p) + MAX_RT_PRIO) #define STOP_PRIO (MAX_RT_PRIO - 1) /* * Some helpers for converting to/from various scales. Use shifts to get * approximate multiples of ten for less overhead. */ -#define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ)) -#define JIFFY_NS (1000000000 / HZ) -#define HALF_JIFFY_NS (1000000000 / HZ / 2) -#define HALF_JIFFY_US (1000000 / HZ / 2) +#define JIFFIES_TO_NS(TIME) ((TIME) * (1073741824 / HZ)) +#define JIFFY_NS (1073741824 / HZ) +#define NS_TO_JIFFIES(TIME) ((TIME) / JIFFY_NS) +#define HALF_JIFFY_NS (1073741824 / HZ / 2) +#define HALF_JIFFY_US (1048576 / HZ / 2) #define MS_TO_NS(TIME) ((TIME) << 20) #define MS_TO_US(TIME) ((TIME) << 10) #define NS_TO_MS(TIME) ((TIME) >> 20) @@ -137,7 +135,7 @@ void print_scheduler_version(void) { - printk(KERN_INFO "BFS CPU scheduler v0.512 by Con Kolivas.\n"); + printk(KERN_INFO "MuQSS CPU scheduler v0.114 by Con Kolivas.\n"); } /* @@ -182,30 +180,24 @@ static inline int timeslice(void) } /* - * The global runqueue data that all CPUs work off. Data is protected either - * by the global grq lock, or the discrete lock that precedes the data in this - * struct. 
+ * The global runqueue data that all CPUs work off. Contains either atomic + * variables and a cpu bitmap set atomically. */ struct global_rq { - raw_spinlock_t lock; - unsigned long nr_running; - unsigned long nr_uninterruptible; - unsigned long long nr_switches; - unsigned long qnr; /* queued not running */ +#ifdef CONFIG_SMP + atomic_t nr_running ____cacheline_aligned_in_smp; + atomic_t nr_uninterruptible ____cacheline_aligned_in_smp; + atomic64_t nr_switches ____cacheline_aligned_in_smp; + atomic_t qnr ____cacheline_aligned_in_smp; /* queued not running */ +#else + atomic_t nr_running ____cacheline_aligned; + atomic_t nr_uninterruptible ____cacheline_aligned; + atomic64_t nr_switches ____cacheline_aligned; + atomic_t qnr ____cacheline_aligned; /* queued not running */ +#endif #ifdef CONFIG_SMP cpumask_t cpu_idle_map; - bool idle_cpus; #endif - int noc; /* num_online_cpus stored and updated when it changes */ - u64 niffies; /* Nanosecond jiffies */ - unsigned long last_jiffy; /* Last jiffy we updated niffies */ - - raw_spinlock_t iso_lock; - int iso_ticks; - bool iso_refractory; - - skiplist_node node; - skiplist *sl; }; #ifdef CONFIG_SMP @@ -241,7 +233,11 @@ static struct root_domain def_root_domain; #endif /* CONFIG_SMP */ /* There can be only one */ -static struct global_rq grq; +#ifdef CONFIG_SMP +static struct global_rq grq ____cacheline_aligned_in_smp; +#else +static struct global_rq grq ____cacheline_aligned; +#endif static DEFINE_MUTEX(sched_hotcpu_mutex); @@ -276,77 +272,16 @@ int __weak arch_sd_sibling_asym_packing(void) struct rq *uprq; #endif /* CONFIG_SMP */ -static inline void update_rq_clock(struct rq *rq); - -/* - * Sanity check should sched_clock return bogus values. We make sure it does - * not appear to go backwards, and use jiffies to determine the maximum and - * minimum it could possibly have increased, and round down to the nearest - * jiffy when it falls outside this. - */ -static inline void niffy_diff(s64 *niff_diff, int jiff_diff) -{ - unsigned long min_diff, max_diff; - - if (jiff_diff > 1) - min_diff = JIFFIES_TO_NS(jiff_diff - 1); - else - min_diff = 1; - /* Round up to the nearest tick for maximum */ - max_diff = JIFFIES_TO_NS(jiff_diff + 1); - - if (unlikely(*niff_diff < min_diff || *niff_diff > max_diff)) - *niff_diff = min_diff; -} - #ifdef CONFIG_SMP static inline int cpu_of(struct rq *rq) { return rq->cpu; } - -/* - * Niffies are a globally increasing nanosecond counter. Whenever a runqueue - * clock is updated with the grq.lock held, it is an opportunity to update the - * niffies value. Any CPU can update it by adding how much its clock has - * increased since it last updated niffies, minus any added niffies by other - * CPUs. 
- */ -static inline void update_clocks(struct rq *rq) -{ - s64 ndiff; - long jdiff; - - update_rq_clock(rq); - ndiff = rq->clock - rq->old_clock; - /* old_clock is only updated when we are updating niffies */ - rq->old_clock = rq->clock; - ndiff -= grq.niffies - rq->last_niffy; - jdiff = jiffies - grq.last_jiffy; - niffy_diff(&ndiff, jdiff); - grq.last_jiffy += jdiff; - grq.niffies += ndiff; - rq->last_niffy = grq.niffies; -} #else /* CONFIG_SMP */ static inline int cpu_of(struct rq *rq) { return 0; } - -static inline void update_clocks(struct rq *rq) -{ - s64 ndiff; - long jdiff; - - update_rq_clock(rq); - ndiff = rq->clock - rq->old_clock; - rq->old_clock = rq->clock; - jdiff = jiffies - grq.last_jiffy; - niffy_diff(&ndiff, jdiff); - grq.last_jiffy += jdiff; - grq.niffies += ndiff; -} #endif #include "stats.h" @@ -362,10 +297,10 @@ static inline void update_clocks(struct rq *rq) #endif /* - * All common locking functions performed on grq.lock. rq->clock is local to + * All common locking functions performed on rq->lock. rq->clock is local to * the CPU accessing it so it can be modified just with interrupts disabled * when we're not updating niffies. - * Looking up task_rq must be done under grq.lock to be safe. + * Looking up task_rq must be done under rq->lock to be safe. */ static void update_rq_clock_task(struct rq *rq, s64 delta); @@ -379,102 +314,501 @@ static inline void update_rq_clock(struct rq *rq) update_rq_clock_task(rq, delta); } -static inline bool task_running(struct task_struct *p) +/* + * Niffies are a globally increasing nanosecond counter. They're only used by + * update_load_avg and time_slice_expired, however deadlines are based on them + * across CPUs. Update them whenever we will call one of those functions, and + * synchronise them across CPUs whenever we hold both runqueue locks. 
+ */ +static inline void update_clocks(struct rq *rq) { + s64 ndiff, minndiff; + long jdiff; + + update_rq_clock(rq); + ndiff = rq->clock - rq->old_clock; + rq->old_clock = rq->clock; + jdiff = jiffies - rq->last_jiffy; + + /* Subtract any niffies added by balancing with other rqs */ + ndiff -= rq->niffies - rq->last_niffy; + minndiff = JIFFIES_TO_NS(jdiff) - rq->niffies + rq->last_jiffy_niffies; + if (minndiff < 0) + minndiff = 0; + ndiff = max(ndiff, minndiff); + rq->niffies += ndiff; + rq->last_niffy = rq->niffies; + if (jdiff) { + rq->last_jiffy += jdiff; + rq->last_jiffy_niffies = rq->niffies; + } +} + +static inline int task_current(struct rq *rq, struct task_struct *p) +{ + return rq->curr == p; +} + +static inline int task_running(struct rq *rq, struct task_struct *p) +{ +#ifdef CONFIG_SMP return p->on_cpu; +#else + return task_current(rq, p); +#endif } -static inline void grq_lock(void) - __acquires(grq.lock) +static inline int task_on_rq_queued(struct task_struct *p) { - raw_spin_lock(&grq.lock); + return p->on_rq == TASK_ON_RQ_QUEUED; } -static inline void grq_unlock(void) - __releases(grq.lock) +static inline int task_on_rq_migrating(struct task_struct *p) { - raw_spin_unlock(&grq.lock); + return p->on_rq == TASK_ON_RQ_MIGRATING; } -static inline void grq_lock_irq(void) - __acquires(grq.lock) +static inline void rq_lock(struct rq *rq) + __acquires(rq->lock) { - raw_spin_lock_irq(&grq.lock); + raw_spin_lock(&rq->lock); } -static inline void time_lock_grq(struct rq *rq) +static inline int rq_trylock(struct rq *rq) + __acquires(rq->lock) { - grq_lock(); - update_clocks(rq); + return raw_spin_trylock(&rq->lock); +} + +static inline void rq_unlock(struct rq *rq) + __releases(rq->lock) +{ + raw_spin_unlock(&rq->lock); } -static inline void grq_unlock_irq(void) - __releases(grq.lock) +static inline struct rq *this_rq_lock(void) + __acquires(rq->lock) { - raw_spin_unlock_irq(&grq.lock); + struct rq *rq; + + local_irq_disable(); + rq = this_rq(); + raw_spin_lock(&rq->lock); + + return rq; } -static inline void grq_lock_irqsave(unsigned long *flags) - __acquires(grq.lock) +/* + * Any time we have two runqueues locked we use that as an opportunity to + * synchronise niffies to the highest value as idle ticks may have artificially + * kept niffies low on one CPU and the truth can only be later. + */ +static inline void synchronise_niffies(struct rq *rq1, struct rq *rq2) { - raw_spin_lock_irqsave(&grq.lock, *flags); + if (rq1->niffies > rq2->niffies) + rq2->niffies = rq1->niffies; + else + rq1->niffies = rq2->niffies; } -static inline void grq_unlock_irqrestore(unsigned long *flags) - __releases(grq.lock) +/* + * double_rq_lock - safely lock two runqueues + * + * Note this does not disable interrupts like task_rq_lock, + * you need to do so manually before calling. 
+ */ + +/* For when we know rq1 != rq2 */ +static inline void __double_rq_lock(struct rq *rq1, struct rq *rq2) + __acquires(rq1->lock) + __acquires(rq2->lock) { - raw_spin_unlock_irqrestore(&grq.lock, *flags); + if (rq1 < rq2) { + raw_spin_lock(&rq1->lock); + raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING); + } else { + raw_spin_lock(&rq2->lock); + raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING); + } } -static inline struct rq -*task_grq_lock(struct task_struct *p, unsigned long *flags) - __acquires(p->pi_lock) +static inline void double_rq_lock(struct rq *rq1, struct rq *rq2) + __acquires(rq1->lock) + __acquires(rq2->lock) +{ + BUG_ON(!irqs_disabled()); + if (rq1 == rq2) { + raw_spin_lock(&rq1->lock); + __acquire(rq2->lock); /* Fake it out ;) */ + } else + __double_rq_lock(rq1, rq2); + synchronise_niffies(rq1, rq2); +} + +/* + * double_rq_unlock - safely unlock two runqueues + * + * Note this does not restore interrupts like task_rq_unlock, + * you need to do so manually after calling. + */ +static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2) + __releases(rq1->lock) + __releases(rq2->lock) +{ + raw_spin_unlock(&rq1->lock); + if (rq1 != rq2) + raw_spin_unlock(&rq2->lock); + else + __release(rq2->lock); +} + +/* Must be sure rq1 != rq2 and irqs are disabled */ +static inline void lock_second_rq(struct rq *rq1, struct rq *rq2) + __releases(rq1->lock) + __acquires(rq1->lock) + __acquires(rq2->lock) +{ + BUG_ON(!irqs_disabled()); + if (unlikely(!raw_spin_trylock(&rq2->lock))) { + raw_spin_unlock(&rq1->lock); + __double_rq_lock(rq1, rq2); + } + synchronise_niffies(rq1, rq2); +} + +static inline void lock_all_rqs(void) +{ + int cpu; + + preempt_disable(); + for_each_possible_cpu(cpu) { + struct rq *rq = cpu_rq(cpu); + + do_raw_spin_lock(&rq->lock); + } +} + +static inline void unlock_all_rqs(void) +{ + int cpu; + + for_each_possible_cpu(cpu) { + struct rq *rq = cpu_rq(cpu); + + do_raw_spin_unlock(&rq->lock); + } + preempt_enable(); +} + +/* Specially nest trylock an rq */ +static inline bool trylock_rq(struct rq *this_rq, struct rq *rq) +{ + if (unlikely(!do_raw_spin_trylock(&rq->lock))) + return false; + spin_acquire(&rq->lock.dep_map, SINGLE_DEPTH_NESTING, 1, _RET_IP_); + synchronise_niffies(this_rq, rq); + return true; +} + +/* Unlock a specially nested trylocked rq */ +static inline void unlock_rq(struct rq *rq) +{ + spin_release(&rq->lock.dep_map, 1, _RET_IP_); + do_raw_spin_unlock(&rq->lock); +} + +static inline void rq_lock_irq(struct rq *rq) + __acquires(rq->lock) +{ + raw_spin_lock_irq(&rq->lock); +} + +static inline void rq_unlock_irq(struct rq *rq) + __releases(rq->lock) +{ + raw_spin_unlock_irq(&rq->lock); +} + +static inline void rq_lock_irqsave(struct rq *rq, unsigned long *flags) + __acquires(rq->lock) { - raw_spin_lock_irqsave(&p->pi_lock, *flags); - grq_lock(); - return task_rq(p); + raw_spin_lock_irqsave(&rq->lock, *flags); +} + +static inline void rq_unlock_irqrestore(struct rq *rq, unsigned long *flags) + __releases(rq->lock) +{ + raw_spin_unlock_irqrestore(&rq->lock, *flags); } static inline struct rq -*time_task_grq_lock(struct task_struct *p, unsigned long *flags) +*task_rq_lock(struct task_struct *p, unsigned long *flags) + __acquires(p->pi_lock) + __acquires(rq->lock) { - struct rq *rq = task_grq_lock(p, flags); + struct rq *rq; - update_clocks(rq); + while (42) { + raw_spin_lock_irqsave(&p->pi_lock, *flags); + rq = task_rq(p); + raw_spin_lock(&rq->lock); + if (likely(rq == task_rq(p))) + break; + raw_spin_unlock(&rq->lock); + 
raw_spin_unlock_irqrestore(&p->pi_lock, *flags); + } return rq; } -static inline void task_grq_unlock(struct task_struct *p, unsigned long *flags) +static inline void task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags) + __releases(rq->lock) __releases(p->pi_lock) { - grq_unlock(); + rq_unlock(rq); raw_spin_unlock_irqrestore(&p->pi_lock, *flags); } -static inline void time_grq_lock(struct rq *rq, unsigned long *flags) +static inline struct rq *__task_rq_lock(struct task_struct *p) + __acquires(rq->lock) +{ + struct rq *rq; + + lockdep_assert_held(&p->pi_lock); + + while (42) { + rq = task_rq(p); + raw_spin_lock(&rq->lock); + if (likely(rq == task_rq(p))) + break; + raw_spin_unlock(&rq->lock); + } + return rq; +} + +static inline void __task_rq_unlock(struct rq *rq) +{ + rq_unlock(rq); +} + +/* + * cmpxchg based fetch_or, macro so it works for different integer types + */ +#define fetch_or(ptr, mask) \ + ({ \ + typeof(ptr) _ptr = (ptr); \ + typeof(mask) _mask = (mask); \ + typeof(*_ptr) _old, _val = *_ptr; \ + \ + for (;;) { \ + _old = cmpxchg(_ptr, _val, _val | _mask); \ + if (_old == _val) \ + break; \ + _val = _old; \ + } \ + _old; \ +}) + +#if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG) +/* + * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG, + * this avoids any races wrt polling state changes and thereby avoids + * spurious IPIs. + */ +static bool set_nr_and_not_polling(struct task_struct *p) +{ + struct thread_info *ti = task_thread_info(p); + return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG); +} + +/* + * Atomically set TIF_NEED_RESCHED if TIF_POLLING_NRFLAG is set. + * + * If this returns true, then the idle task promises to call + * sched_ttwu_pending() and reschedule soon. + */ +static bool set_nr_if_polling(struct task_struct *p) +{ + struct thread_info *ti = task_thread_info(p); + typeof(ti->flags) old, val = READ_ONCE(ti->flags); + + for (;;) { + if (!(val & _TIF_POLLING_NRFLAG)) + return false; + if (val & _TIF_NEED_RESCHED) + return true; + old = cmpxchg(&ti->flags, val, val | _TIF_NEED_RESCHED); + if (old == val) + break; + val = old; + } + return true; +} + +#else +static bool set_nr_and_not_polling(struct task_struct *p) +{ + set_tsk_need_resched(p); + return true; +} + +#ifdef CONFIG_SMP +static bool set_nr_if_polling(struct task_struct *p) +{ + return false; +} +#endif +#endif + +void wake_q_add(struct wake_q_head *head, struct task_struct *task) { - local_irq_save(*flags); - time_lock_grq(rq); + struct wake_q_node *node = &task->wake_q; + + /* + * Atomically grab the task, if ->wake_q is !nil already it means + * its already queued (either by us or someone else) and will get the + * wakeup due to that. + * + * This cmpxchg() implies a full barrier, which pairs with the write + * barrier implied by the wakeup in wake_up_q(). + */ + if (cmpxchg(&node->next, NULL, WAKE_Q_TAIL)) + return; + + get_task_struct(task); + + /* + * The head is context local, there can be no concurrency. + */ + *head->lastp = node; + head->lastp = &node->next; +} + +void wake_up_q(struct wake_q_head *head) +{ + struct wake_q_node *node = head->first; + + while (node != WAKE_Q_TAIL) { + struct task_struct *task; + + task = container_of(node, struct task_struct, wake_q); + BUG_ON(!task); + /* task can safely be re-inserted now */ + node = node->next; + task->wake_q.next = NULL; + + /* + * wake_up_process() implies a wmb() to pair with the queueing + * in wake_q_add() so as not to miss wakeups. 
+ */ + wake_up_process(task); + put_task_struct(task); + } } static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) { + next->on_cpu = 1; +} + +/* + * resched_task - mark a task 'to be rescheduled now'. + * + * On UP this means the setting of the need_resched flag, on SMP it + * might also involve a cross-CPU call to trigger the scheduler on + * the target CPU. + */ +void resched_task(struct task_struct *p) +{ + int cpu; +#ifdef CONFIG_LOCKDEP + struct rq *rq = task_rq(p); + + lockdep_assert_held(&rq->lock); +#endif + if (test_tsk_need_resched(p)) + return; + + cpu = task_cpu(p); + if (cpu == smp_processor_id()) { + set_tsk_need_resched(p); + set_preempt_need_resched(); + return; + } + + if (set_nr_and_not_polling(p)) + smp_send_reschedule(cpu); + else + trace_sched_wake_idle_without_ipi(cpu); +} + +/* + * A task that is not running or queued will not have a node set. + * A task that is queued but not running will have a node set. + * A task that is currently running will have ->on_cpu set but no node set. + */ +static inline bool task_queued(struct task_struct *p) +{ + return !skiplist_node_empty(&p->node); } +static void enqueue_task(struct rq *rq, struct task_struct *p, int flags); +static inline void resched_if_idle(struct rq *rq); + static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) { +#ifdef CONFIG_SMP + /* + * After ->on_cpu is cleared, the task can be moved to a different CPU. + * We must ensure this doesn't happen until the switch is completely + * finished. + * + * In particular, the load of prev->state in finish_task_switch() must + * happen before this. + * + * Pairs with the smp_cond_load_acquire() in try_to_wake_up(). + */ + smp_store_release(&prev->on_cpu, 0); +#endif #ifdef CONFIG_DEBUG_SPINLOCK /* this is a valid case when another task releases the spinlock */ - grq.lock.owner = current; + rq->lock.owner = current; #endif /* * If we are tracking spinlock dependencies then we have to * fix up the runqueue lock - which gets 'carried over' from * prev into current: */ - spin_acquire(&grq.lock.dep_map, 0, 0, _THIS_IP_); + spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); - grq_unlock_irq(); +#ifdef CONFIG_SMP + /* + * If prev was marked as migrating to another CPU in return_task, drop + * the local runqueue lock but leave interrupts disabled and grab the + * remote lock we're migrating it to before enabling them. + */ + if (unlikely(task_on_rq_migrating(prev))) { + sched_info_dequeued(rq, prev); + /* + * We move the ownership of prev to the new cpu now. ttwu can't + * activate prev to the wrong cpu since it has to grab this + * runqueue in ttwu_remote. + */ + task_thread_info(prev)->cpu = prev->wake_cpu; + raw_spin_unlock(&rq->lock); + + raw_spin_lock(&prev->pi_lock); + rq = __task_rq_lock(prev); + /* Check that someone else hasn't already queued prev */ + if (likely(!task_queued(prev))) { + enqueue_task(rq, prev, 0); + prev->on_rq = TASK_ON_RQ_QUEUED; + /* Wake up the CPU if it's not already running */ + resched_if_idle(rq); + } + raw_spin_unlock(&prev->pi_lock); + } +#endif + raw_spin_unlock_irq(&rq->lock); } static inline bool deadline_before(u64 deadline, u64 time) @@ -482,11 +816,6 @@ static inline bool deadline_before(u64 deadline, u64 time) return (deadline < time); } -static inline bool deadline_after(u64 deadline, u64 time) -{ - return (deadline > time); -} - /* * Deadline is "now" in niffies + (offset by priority). Setting the deadline * is the key to everything. 
It distributes cpu fairly amongst tasks of the @@ -521,26 +850,54 @@ static inline int ms_longest_deadline_diff(void) return NS_TO_MS(longest_deadline_diff()); } +static inline int rq_load(struct rq *rq) +{ + return rq->sl->entries + !rq_idle(rq); +} + +static inline bool rq_local(struct rq *rq); + /* - * A task that is not running or queued will not have a node set. - * A task that is queued but not running will have a node set. - * A task that is currently running will have ->on_cpu set but no node set. + * Update the load average for feeding into cpu frequency governors. Use a + * rough estimate of a rolling average with ~ time constant of 32ms. + * 80/128 ~ 0.63. * 80 / 32768 / 128 == * 5 / 262144 + * Make sure a call to update_clocks has been made before calling this to get + * an updated rq->niffies. */ -static inline bool task_queued(struct task_struct *p) +static void update_load_avg(struct rq *rq) { - return !skiplist_node_empty(&p->node); + /* rq clock can go backwards so skip update if that happens */ + if (likely(rq->clock > rq->load_update)) { + unsigned long us_interval = (rq->clock - rq->load_update) >> 10; + long load, curload = rq_load(rq); + + load = rq->load_avg - (rq->load_avg * us_interval * 5 / 262144); + if (unlikely(load < 0)) + load = 0; + load += curload * curload * SCHED_CAPACITY_SCALE * us_interval * 5 / 262144; + rq->load_avg = load; + } else + return; + + rq->load_update = rq->clock; + if (likely(rq_local(rq))) + cpufreq_trigger(rq->niffies, rq->load_avg); } /* - * Removing from the global runqueue. Enter with grq locked. Deleting a task + * Removing from the runqueue. Enter with rq locked. Deleting a task * from the skip list is done via the stored node reference in the task struct * and does not require a full look up. Thus it occurs in O(k) time where k - * is the "level" of the list the task was stored at - usually < 4, max 16. + * is the "level" of the list the task was stored at - usually < 4, max 8. */ -static void dequeue_task(struct task_struct *p) +static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) { - skiplist_delete(grq.sl, &p->node); - sched_info_dequeued(task_rq(p), p); + skiplist_delete(rq->sl, &p->node); + rq->best_key = rq->node.next[0]->key; + update_clocks(rq); + if (!(flags & DEQUEUE_SAVE)) + sched_info_dequeued(task_rq(p), p); + update_load_avg(rq); } #ifdef CONFIG_PREEMPT_RCU @@ -566,15 +923,15 @@ static bool idleprio_suitable(struct task_struct *p) * To determine if a task of SCHED_ISO can run in pseudo-realtime, we check * that the iso_refractory flag is not set. */ -static bool isoprio_suitable(void) +static inline bool isoprio_suitable(struct rq *rq) { - return !grq.iso_refractory; + return !rq->iso_refractory; } /* - * Adding to the global runqueue. Enter with grq locked. + * Adding to the runqueue. Enter with rq locked. 
*/ -static void enqueue_task(struct task_struct *p, struct rq *rq) +static void enqueue_task(struct rq *rq, struct task_struct *p, int flags) { unsigned int randseed; u64 sl_id; @@ -582,7 +939,7 @@ static void enqueue_task(struct task_struct *p, struct rq *rq) if (!rt_task(p)) { /* Check it hasn't gotten rt from PI */ if ((idleprio_task(p) && idleprio_suitable(p)) || - (iso_task(p) && isoprio_suitable())) + (iso_task(p) && isoprio_suitable(rq))) p->prio = p->normal_prio; else p->prio = NORMAL_PRIO; @@ -604,9 +961,8 @@ static void enqueue_task(struct task_struct *p, struct rq *rq) else { sl_id = p->deadline; if (idleprio_task(p)) { - /* Set it to cope with 4 left shifts with locality_diff */ if (p->prio == IDLE_PRIO) - sl_id |= 0x00FF000000000000; + sl_id |= 0xF000000000000000; else sl_id += longest_deadline_diff(); } @@ -615,14 +971,13 @@ static void enqueue_task(struct task_struct *p, struct rq *rq) * Some architectures don't have better than microsecond resolution * so mask out ~microseconds as the random seed for skiplist insertion. */ - randseed = (grq.niffies >> 10) & 0xFFFFFFFF; - skiplist_insert(grq.sl, &p->node, sl_id, p, randseed); - sched_info_queued(rq, p); -} - -static inline void requeue_task(struct task_struct *p) -{ - sched_info_queued(task_rq(p), p); + update_clocks(rq); + if (!(flags & ENQUEUE_RESTORE)) + sched_info_queued(rq, p); + randseed = (rq->niffies >> 10) & 0xFFFFFFFF; + skiplist_insert(rq->sl, &p->node, sl_id, p, randseed); + rq->best_key = rq->node.next[0]->key; + update_load_avg(rq); } /* @@ -644,13 +999,6 @@ static inline int task_timeslice(struct task_struct *p) return (rr_interval * task_prio_ratio(p) / 128); } -static void resched_task(struct task_struct *p); - -static inline void resched_curr(struct rq *rq) -{ - resched_task(rq->curr); -} - /* * qnr is the "queued but not running" count which is the total number of * tasks on the global runqueue list waiting for cpu time but not actually @@ -658,24 +1006,31 @@ static inline void resched_curr(struct rq *rq) */ static inline void inc_qnr(void) { - grq.qnr++; + atomic_inc(&grq.qnr); } static inline void dec_qnr(void) { - grq.qnr--; + atomic_dec(&grq.qnr); } static inline int queued_notrunning(void) { - return grq.qnr; + return atomic_read(&grq.qnr); } -static unsigned long rq_load_avg(struct rq *rq) +#ifdef CONFIG_SMP +/* Entered with rq locked */ +static inline void resched_if_idle(struct rq *rq) { - return rq->soft_affined * SCHED_CAPACITY_SCALE; + if (rq_idle(rq)) + resched_task(rq->curr); } +static inline bool rq_local(struct rq *rq) +{ + return (rq->cpu == smp_processor_id()); +} #ifdef CONFIG_SMT_NICE static const cpumask_t *thread_cpumask(int cpu); @@ -751,35 +1106,70 @@ static bool smt_should_schedule(struct task_struct *p, struct rq *this_rq) #else /* CONFIG_SMT_NICE */ #define smt_schedule(p, this_rq) (true) #endif /* CONFIG_SMT_NICE */ -#ifdef CONFIG_SMP + +static inline void atomic_set_cpu(int cpu, cpumask_t *cpumask) +{ + set_bit(cpu, (volatile unsigned long *)cpumask); +} + /* * The cpu_idle_map stores a bitmap of all the CPUs currently idle to * allow easy lookup of whether any suitable idle CPUs are available. * It's cheaper to maintain a binary yes/no if there are any idle CPUs on the - * idle_cpus variable than to do a full bitmask check when we are busy. + * idle_cpus variable than to do a full bitmask check when we are busy. The + * bits are set atomically but read locklessly as occasional false positive / + * negative is harmless. 
*/ static inline void set_cpuidle_map(int cpu) { - if (likely(cpu_online(cpu))) { - cpumask_set_cpu(cpu, &grq.cpu_idle_map); - grq.idle_cpus = true; - } + if (likely(cpu_online(cpu))) + atomic_set_cpu(cpu, &grq.cpu_idle_map); +} + +static inline void atomic_clear_cpu(int cpu, cpumask_t *cpumask) +{ + clear_bit(cpu, (volatile unsigned long *)cpumask); } static inline void clear_cpuidle_map(int cpu) { - cpumask_clear_cpu(cpu, &grq.cpu_idle_map); - if (cpumask_empty(&grq.cpu_idle_map)) - grq.idle_cpus = false; + atomic_clear_cpu(cpu, &grq.cpu_idle_map); } static bool suitable_idle_cpus(struct task_struct *p) { - if (!grq.idle_cpus) - return false; return (cpumask_intersects(&p->cpus_allowed, &grq.cpu_idle_map)); } +/* + * Resched current on rq. We don't know if rq is local to this CPU nor if it + * is locked so we do not use an intermediate variable for the task to avoid + * having it dereferenced. + */ +static void resched_curr(struct rq *rq) +{ + int cpu; + + if (test_tsk_need_resched(rq->curr)) + return; + + rq->preempt = rq->curr; + cpu = rq->cpu; + + /* We're doing this without holding the rq lock if it's not task_rq */ + + if (cpu == smp_processor_id()) { + set_tsk_need_resched(rq->curr); + set_preempt_need_resched(); + return; + } + + if (set_nr_and_not_polling(rq->curr)) + smp_send_reschedule(cpu); + else + trace_sched_wake_idle_without_ipi(cpu); +} + #define CPUIDLE_DIFF_THREAD (1) #define CPUIDLE_DIFF_CORE (2) #define CPUIDLE_CACHE_BUSY (4) @@ -855,30 +1245,48 @@ bool cpus_share_cache(int this_cpu, int that_cpu) return (this_rq->cpu_locality[that_cpu] < 3); } -static bool resched_best_idle(struct task_struct *p) +/* As per resched_curr but only will resched idle task */ +static inline void resched_idle(struct rq *rq) +{ + if (test_tsk_need_resched(rq->idle)) + return; + + rq->preempt = rq->idle; + + set_tsk_need_resched(rq->idle); + + if (rq_local(rq)) { + set_preempt_need_resched(); + return; + } + + smp_send_reschedule(rq->cpu); +} + +static struct rq *resched_best_idle(struct task_struct *p, int cpu) { cpumask_t tmpmask; struct rq *rq; int best_cpu; cpumask_and(&tmpmask, &p->cpus_allowed, &grq.cpu_idle_map); - best_cpu = best_mask_cpu(task_cpu(p), task_rq(p), &tmpmask); + best_cpu = best_mask_cpu(cpu, task_rq(p), &tmpmask); rq = cpu_rq(best_cpu); if (!smt_schedule(p, rq)) - return false; - resched_curr(rq); - return true; + return NULL; + resched_idle(rq); + return rq; } static inline void resched_suitable_idle(struct task_struct *p) { if (suitable_idle_cpus(p)) - resched_best_idle(p); + resched_best_idle(p, task_cpu(p)); } -static inline int locality_diff(int cpu, struct rq *rq) +static inline struct rq *rq_order(struct rq *rq, int cpu) { - return rq->cpu_locality[cpu]; + return rq->rq_order[cpu]; } #else /* CONFIG_SMP */ static inline void set_cpuidle_map(int cpu) @@ -898,9 +1306,28 @@ static inline void resched_suitable_idle(struct task_struct *p) { } -static inline int locality_diff(int cpu, struct rq *rq) +static inline void resched_curr(struct rq *rq) { - return 0; + resched_task(rq->curr); +} + +static inline void resched_if_idle(struct rq *rq) +{ +} + +static inline bool rq_local(struct rq *rq) +{ + return true; +} + +static inline struct rq *rq_order(struct rq *rq, int cpu) +{ + return rq; +} + +static inline bool smt_schedule(struct task_struct *p, struct rq *rq) +{ + return true; } #endif /* CONFIG_SMP */ @@ -935,32 +1362,11 @@ static int effective_prio(struct task_struct *p) } /* - * Update the load average for feeding into cpu frequency governors. 
Use a - * rough estimate of a rolling average with ~ time constant of 32ms. - * 80/128 ~ 0.63. * 80 / 32768 / 128 == * 5 / 262144 - */ -static void update_load_avg(struct rq *rq) -{ - /* rq clock can go backwards so skip update if that happens */ - if (likely(rq->clock > rq->load_update)) { - unsigned long us_interval = (rq->clock - rq->load_update) >> 10; - long load; - - load = rq->load_avg - (rq->load_avg * us_interval * 5 / 262144); - if (unlikely(load < 0)) - load = 0; - load += rq->soft_affined * rq_load_avg(rq) * us_interval * 5 / 262144; - rq->load_avg = load; - } - rq->load_update = rq->clock; -} - -/* - * activate_task - move a task to the runqueue. Enter with grq locked. + * activate_task - move a task to the runqueue. Enter with rq locked. */ static void activate_task(struct task_struct *p, struct rq *rq) { - update_clocks(rq); + resched_if_idle(rq); /* * Sleep time is in units of nanosecs, so shift by 20 to get a @@ -970,134 +1376,137 @@ static void activate_task(struct task_struct *p, struct rq *rq) if (unlikely(prof_on == SLEEP_PROFILING)) { if (p->state == TASK_UNINTERRUPTIBLE) profile_hits(SLEEP_PROFILING, (void *)get_wchan(p), - (rq->clock_task - p->last_ran) >> 20); + (rq->niffies - p->last_ran) >> 20); } p->prio = effective_prio(p); if (task_contributes_to_load(p)) - grq.nr_uninterruptible--; - enqueue_task(p, rq); - rq->soft_affined++; - p->on_rq = 1; - grq.nr_running++; + atomic_dec(&grq.nr_uninterruptible); + + enqueue_task(rq, p, 0); + p->on_rq = TASK_ON_RQ_QUEUED; + atomic_inc(&grq.nr_running); inc_qnr(); - update_load_avg(rq); - cpufreq_trigger(grq.niffies, rq->load_avg); } /* - * deactivate_task - If it's running, it's not on the grq and we can just - * decrement the nr_running. Enter with grq locked. + * deactivate_task - If it's running, it's not on the runqueue and we can just + * decrement the nr_running. Enter with rq locked. */ static inline void deactivate_task(struct task_struct *p, struct rq *rq) { if (task_contributes_to_load(p)) - grq.nr_uninterruptible++; - rq->soft_affined--; + atomic_inc(&grq.nr_uninterruptible); + p->on_rq = 0; - grq.nr_running--; - update_load_avg(rq); - cpufreq_trigger(grq.niffies, rq->load_avg); + atomic_dec(&grq.nr_running); + sched_info_dequeued(rq, p); } #ifdef CONFIG_SMP void set_task_cpu(struct task_struct *p, unsigned int cpu) { + struct rq *rq = task_rq(p); + bool queued; + #ifdef CONFIG_LOCKDEP /* - * The caller should hold either p->pi_lock or grq lock, when changing - * a task's CPU. ->pi_lock for waking tasks, grq lock for runnable tasks. + * The caller should hold either p->pi_lock or rq->lock, when changing + * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks. * * Furthermore, all task_rq users should acquire both locks, see - * task_grq_lock(). + * task_rq_lock(). */ WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) || - lockdep_is_held(&grq.lock))); + lockdep_is_held(&task_rq(p)->lock))); #endif - if (task_cpu(p) == cpu) + if (p->wake_cpu == cpu) return; trace_sched_migrate_task(p, cpu); perf_event_task_migrate(p); /* - * After ->cpu is set up to a new value, task_grq_lock(p, ...) can be + * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be * successfully executed on another CPU. We must ensure that updates of * per-task data have been completed by this moment. */ smp_wmb(); - if (p->on_rq) { - struct rq *rq = task_rq(p); + if (task_running(rq, p)) { + /* + * We should only be calling this on a running task if we're + * holding rq lock. 
+ */ + lockdep_assert_held(&rq->lock); - rq->soft_affined--; - update_load_avg(rq); - rq = cpu_rq(cpu); - rq->soft_affined++; - update_load_avg(rq); + /* + * We can't change the task_thread_info cpu on a running task + * as p will still be protected by the rq lock of the cpu it + * is still running on so we set the wake_cpu for it to be + * lazily updated once off the cpu. + */ + p->wake_cpu = cpu; + return; } - task_thread_info(p)->cpu = cpu; + + if ((queued = task_queued(p))) + dequeue_task(rq, p, 0); + task_thread_info(p)->cpu = p->wake_cpu = cpu; + if (queued) + enqueue_task(cpu_rq(cpu), p, 0); } #endif /* CONFIG_SMP */ /* - * Move a task off the global queue and take it to a cpu for it will + * Move a task off the runqueue and take it to a cpu for it will * become the running task. */ -static inline void take_task(int cpu, struct task_struct *p) +static inline void take_task(struct rq *rq, int cpu, struct task_struct *p) { + struct rq *p_rq = task_rq(p); + + dequeue_task(p_rq, p, DEQUEUE_SAVE); + if (p_rq != rq) { + sched_info_dequeued(p_rq, p); + sched_info_queued(rq, p); + } set_task_cpu(p, cpu); - dequeue_task(p); dec_qnr(); } /* - * Returns a descheduling task to the grq runqueue unless it is being + * Returns a descheduling task to the runqueue unless it is being * deactivated. */ -static inline void return_task(struct task_struct *p, struct rq *rq, bool deactivate) +static inline void return_task(struct task_struct *p, struct rq *rq, + int cpu, bool deactivate) { if (deactivate) deactivate_task(p, rq); else { inc_qnr(); - enqueue_task(p, rq); +#ifdef CONFIG_SMP + /* + * set_task_cpu was called on the running task that doesn't + * want to deactivate so it has to be enqueued to a different + * CPU and we need its lock. Tag it to be moved with as the + * lock is dropped in finish_lock_switch. + */ + if (unlikely(p->wake_cpu != cpu)) + p->on_rq = TASK_ON_RQ_MIGRATING; + else +#endif + enqueue_task(rq, p, ENQUEUE_RESTORE); } } -/* Enter with grq lock held. We know p is on the local cpu */ +/* Enter with rq lock held. We know p is on the local cpu */ static inline void __set_tsk_resched(struct task_struct *p) { set_tsk_need_resched(p); set_preempt_need_resched(); } -/* - * resched_task - mark a task 'to be rescheduled now'. - * - * On UP this means the setting of the need_resched flag, on SMP it - * might also involve a cross-CPU call to trigger the scheduler on - * the target CPU. - */ -void resched_task(struct task_struct *p) -{ - int cpu; - - lockdep_assert_held(&grq.lock); - - if (test_tsk_need_resched(p)) - return; - - set_tsk_need_resched(p); - - cpu = task_cpu(p); - if (cpu == smp_processor_id()) { - set_preempt_need_resched(); - return; - } - - smp_send_reschedule(cpu); -} - /** * task_curr - is this task currently executing on a CPU? * @p: the task in question. @@ -1128,8 +1537,8 @@ inline int task_curr(const struct task_struct *p) */ unsigned long wait_task_inactive(struct task_struct *p, long match_state) { + int running, queued; unsigned long flags; - bool running, on_rq; unsigned long ncsw; struct rq *rq; @@ -1147,25 +1556,25 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state) * if the runqueue has changed and p is actually now * running somewhere else! */ - while (task_running(p) && p == rq->curr) { + while (task_running(rq, p)) { if (match_state && unlikely(p->state != match_state)) return 0; cpu_relax(); } /* - * Ok, time to look more closely! We need the grq + * Ok, time to look more closely! We need the rq * lock now, to be *sure*. 
If we're wrong, we'll * just go back and repeat. */ - rq = task_grq_lock(p, &flags); + rq = task_rq_lock(p, &flags); trace_sched_wait_task(p); - running = task_running(p); - on_rq = p->on_rq; + running = task_running(rq, p); + queued = task_on_rq_queued(p); ncsw = 0; if (!match_state || p->state == match_state) ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ - task_grq_unlock(p, &flags); + task_rq_unlock(rq, p, &flags); /* * If it changed from the expected state, bail out now. @@ -1193,7 +1602,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state) * running right now), it's preempted, and we should * yield - it could be a while. */ - if (unlikely(on_rq)) { + if (unlikely(queued)) { ktime_t to = ktime_set(0, NSEC_PER_SEC / HZ); set_current_state(TASK_UNINTERRUPTIBLE); @@ -1252,32 +1661,15 @@ can_preempt(struct task_struct *p, int prio, u64 deadline) return true; if (p->prio > prio) return false; - /* SCHED_NORMAL, BATCH and ISO will preempt based on deadline */ + if (p->policy == SCHED_BATCH) + return false; + /* SCHED_NORMAL and ISO will preempt based on deadline */ if (!deadline_before(p->deadline, deadline)) return false; return true; } #ifdef CONFIG_SMP -#define cpu_online_map (*(cpumask_t *)cpu_online_mask) -#ifdef CONFIG_HOTPLUG_CPU -/* - * Check to see if there is a task that is affined only to offline CPUs but - * still wants runtime. This happens to kernel threads during suspend/halt and - * disabling of CPUs. - */ -static inline bool online_cpus(struct task_struct *p) -{ - return (likely(cpumask_intersects(&cpu_online_map, &p->cpus_allowed))); -} -#else /* CONFIG_HOTPLUG_CPU */ -/* All available CPUs are always online without hotplug. */ -static inline bool online_cpus(struct task_struct *p) -{ - return true; -} -#endif - /* * Check to see if p can run on cpu, and if not, whether there are any online * CPUs it can run on instead. @@ -1288,13 +1680,14 @@ static inline bool needs_other_cpu(struct task_struct *p, int cpu) return true; return false; } +#define cpu_online_map (*(cpumask_t *)cpu_online_mask) static void try_preempt(struct task_struct *p, struct rq *this_rq) { - int i, this_entries = this_rq->soft_affined; + int i, this_entries = rq_load(this_rq); cpumask_t tmp; - if (suitable_idle_cpus(p) && resched_best_idle(p)) + if (suitable_idle_cpus(p) && resched_best_idle(p, task_cpu(p))) return; /* IDLEPRIO tasks never preempt anything but idle */ @@ -1303,26 +1696,15 @@ static void try_preempt(struct task_struct *p, struct rq *this_rq) cpumask_and(&tmp, &cpu_online_map, &p->cpus_allowed); - /* - * We iterate over CPUs in locality order using rq_order, finding the - * first one we can preempt if possible, thus staying closest in - * locality. - */ for (i = 0; i < num_possible_cpus(); i++) { struct rq *rq = this_rq->rq_order[i]; if (!cpumask_test_cpu(rq->cpu, &tmp)) continue; - if (!sched_interactive && rq != this_rq && rq->soft_affined <= this_entries) + if (!sched_interactive && rq != this_rq && rq_load(rq) <= this_entries) continue; if (smt_schedule(p, rq) && can_preempt(p, rq->rq_prio, rq->rq_deadline)) { - /* - * If we have decided this task should preempt this CPU, - * set the task's CPU to match thereby speeding up matching - * this task in earliest_deadline_task. 
- */ - set_task_cpu(p, rq->cpu); resched_curr(rq); return; } @@ -1352,6 +1734,13 @@ static inline int __set_cpus_allowed_ptr(struct task_struct *p, } #endif /* CONFIG_SMP */ +/* + * wake flags + */ +#define WF_SYNC 0x01 /* waker goes to sleep after wakeup */ +#define WF_FORK 0x02 /* child wakeup after fork */ +#define WF_MIGRATED 0x04 /* internal use, task got migrated */ + static void ttwu_stat(struct task_struct *p, int cpu, int wake_flags) { @@ -1382,27 +1771,96 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags) #endif /* CONFIG_SCHEDSTATS */ } -void wake_up_if_idle(int cpu) +static inline void ttwu_activate(struct rq *rq, struct task_struct *p) { - struct rq *rq = cpu_rq(cpu); - unsigned long flags; + activate_task(p, rq); - rcu_read_lock(); + /* if a worker is waking up, notify workqueue */ + if (p->flags & PF_WQ_WORKER) + wq_worker_waking_up(p, cpu_of(rq)); +} - if (!is_idle_task(rcu_dereference(rq->curr))) - goto out; +/* + * Mark the task runnable and perform wakeup-preemption. + */ +static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) +{ + /* + * Sync wakeups (i.e. those types of wakeups where the waker + * has indicated that it will leave the CPU in short order) + * don't trigger a preemption if there are no idle cpus, + * instead waiting for current to deschedule. + */ + if (wake_flags & WF_SYNC) + resched_suitable_idle(p); + else + try_preempt(p, rq); + p->state = TASK_RUNNING; + trace_sched_wakeup(p); +} - grq_lock_irqsave(&flags); - if (likely(is_idle_task(rq->curr))) - smp_send_reschedule(cpu); - /* Else cpu is not in idle, do nothing here */ - grq_unlock_irqrestore(&flags); +static void +ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags) +{ + lockdep_assert_held(&rq->lock); -out: - rcu_read_unlock(); +#ifdef CONFIG_SMP + if (p->sched_contributes_to_load) + atomic_dec(&grq.nr_uninterruptible); +#endif + + ttwu_activate(rq, p); + ttwu_do_wakeup(rq, p, wake_flags); +} + +/* + * Called in case the task @p isn't fully descheduled from its runqueue, + * in this case we must do a remote wakeup. Its a 'light' wakeup though, + * since all we need to do is flip p->state to TASK_RUNNING, since + * the task is still ->on_rq. + */ +static int ttwu_remote(struct task_struct *p, int wake_flags) +{ + struct rq *rq; + int ret = 0; + + rq = __task_rq_lock(p); + if (likely(task_on_rq_queued(p))) { + ttwu_do_wakeup(rq, p, wake_flags); + ret = 1; + } + __task_rq_unlock(rq); + + return ret; } #ifdef CONFIG_SMP +static bool sched_smp_initialized __read_mostly; + +void sched_ttwu_pending(void) +{ + struct rq *rq = this_rq(); + struct llist_node *llist = llist_del_all(&rq->wake_list); + struct task_struct *p; + unsigned long flags; + + if (!llist) + return; + + raw_spin_lock_irqsave(&rq->lock, flags); + + while (llist) { + int wake_flags = 0; + + p = llist_entry(llist, struct task_struct, wake_entry); + llist = llist_next(llist); + + ttwu_do_activate(rq, p, wake_flags); + } + + raw_spin_unlock_irqrestore(&rq->lock, flags); +} + void scheduler_ipi(void) { /* @@ -1411,45 +1869,152 @@ void scheduler_ipi(void) * this IPI. */ preempt_fold_need_resched(); -} -#endif -static inline void ttwu_activate(struct task_struct *p, struct rq *rq, - bool is_sync) -{ - activate_task(p, rq); + if (llist_empty(&this_rq()->wake_list) && (!idle_cpu(smp_processor_id()) || need_resched())) + return; /* - * Sync wakeups (i.e. 
those types of wakeups where the waker - * has indicated that it will leave the CPU in short order) - * don't trigger a preemption if there are no idle cpus, - * instead waiting for current to deschedule. + * Not all reschedule IPI handlers call irq_enter/irq_exit, since + * traditionally all their work was done from the interrupt return + * path. Now that we actually do some work, we need to make sure + * we do call them. + * + * Some archs already do call them, luckily irq_enter/exit nest + * properly. + * + * Arguably we should visit all archs and update all handlers, + * however a fair share of IPIs are still resched only so this would + * somewhat pessimize the simple resched case. */ - if (!is_sync || suitable_idle_cpus(p)) - try_preempt(p, rq); + irq_enter(); + sched_ttwu_pending(); + irq_exit(); } -static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq, - bool success) +static void ttwu_queue_remote(struct task_struct *p, int cpu, int wake_flags) { - trace_sched_wakeup(p); - p->state = TASK_RUNNING; + struct rq *rq = cpu_rq(cpu); - /* - * if a worker is waking up, notify workqueue. Note that on BFS, we - * don't really know what cpu it will be, so we fake it for - * wq_worker_waking_up :/ - */ - if ((p->flags & PF_WQ_WORKER) && success) - wq_worker_waking_up(p, cpu_of(rq)); + if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) { + if (!set_nr_if_polling(rq->idle)) + smp_send_reschedule(cpu); + else + trace_sched_wake_idle_without_ipi(cpu); + } +} + +void wake_up_if_idle(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + unsigned long flags; + + rcu_read_lock(); + + if (!is_idle_task(rcu_dereference(rq->curr))) + goto out; + + if (set_nr_if_polling(rq->idle)) { + trace_sched_wake_idle_without_ipi(cpu); + } else { + rq_lock_irqsave(rq, &flags); + if (likely(is_idle_task(rq->curr))) + smp_send_reschedule(cpu); + /* Else cpu is not in idle, do nothing here */ + rq_unlock_irqrestore(rq, &flags); + } + +out: + rcu_read_unlock(); +} + +static int valid_task_cpu(struct task_struct *p) +{ + cpumask_t valid_mask; + + if (p->flags & PF_KTHREAD) + cpumask_and(&valid_mask, tsk_cpus_allowed(p), cpu_online_mask); + else + cpumask_and(&valid_mask, tsk_cpus_allowed(p), cpu_active_mask); + + if (unlikely(!cpumask_weight(&valid_mask))) { + /* Hotplug boot threads do this before the CPU is up */ + WARN_ON(sched_smp_initialized); + return cpumask_any(tsk_cpus_allowed(p)); + } + return cpumask_any(&valid_mask); } /* - * wake flags + * For a task that's just being woken up we have a valuable balancing + * opportunity so choose the nearest cache most lightly loaded runqueue. + * Entered with rq locked and returns with the chosen runqueue locked. 
*/ -#define WF_SYNC 0x01 /* waker goes to sleep after wakeup */ -#define WF_FORK 0x02 /* child wakeup after fork */ -#define WF_MIGRATED 0x4 /* internal use, task got migrated */ +static inline int select_best_cpu(struct task_struct *p) +{ + unsigned int idlest = ~0U; + struct rq *rq = NULL; + int i; + + if (suitable_idle_cpus(p)) { + int cpu = task_cpu(p); + + if (unlikely(needs_other_cpu(p, cpu))) + cpu = valid_task_cpu(p); + rq = resched_best_idle(p, cpu); + if (likely(rq)) + return rq->cpu; + } + + for (i = 0; i < num_possible_cpus(); i++) { + struct rq *other_rq = task_rq(p)->rq_order[i]; + int entries; + + if (!other_rq->online) + continue; + if (needs_other_cpu(p, other_rq->cpu)) + continue; + entries = rq_load(other_rq); + if (entries >= idlest) + continue; + idlest = entries; + rq = other_rq; + } + if (unlikely(!rq)) + return smp_processor_id(); + return rq->cpu; +} +#else /* CONFIG_SMP */ +static int valid_task_cpu(struct task_struct *p) +{ + return 0; +} + +static inline int select_best_cpu(struct task_struct *p) +{ + return 0; +} + +static struct rq *resched_best_idle(struct task_struct *p, int cpu) +{ + return NULL; +} +#endif /* CONFIG_SMP */ + +static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) +{ + struct rq *rq = cpu_rq(cpu); + +#if defined(CONFIG_SMP) + if (!cpus_share_cache(smp_processor_id(), cpu)) { + sched_clock_cpu(cpu); /* sync clocks x-cpu */ + ttwu_queue_remote(p, cpu, wake_flags); + return; + } +#endif + rq_lock(rq); + ttwu_do_activate(rq, p, wake_flags); + rq_unlock(rq); +} /*** * try_to_wake_up - wake up a thread @@ -1466,13 +2031,11 @@ static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq, * Return: %true if @p was woken up, %false if it was already running. * or @state didn't match @p's state. */ -static bool try_to_wake_up(struct task_struct *p, unsigned int state, - int wake_flags) +static int +try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) { - bool success = false; unsigned long flags; - struct rq *rq; - int cpu; + int cpu, success = 0; /* * If we are going to wake up a thread waiting for CONDITION we @@ -1481,68 +2044,133 @@ static bool try_to_wake_up(struct task_struct *p, unsigned int state, * set_current_state() the waiting thread does. */ smp_mb__before_spinlock(); - - /* - * No need to do time_lock_grq as we only need to update the rq clock - * if we activate the task - */ - rq = task_grq_lock(p, &flags); - cpu = task_cpu(p); - + raw_spin_lock_irqsave(&p->pi_lock, flags); /* state is a volatile long, どうして、分からない */ if (!((unsigned int)p->state & state)) - goto out_unlock; + goto out; trace_sched_waking(p); - if (task_queued(p) || task_running(p)) - goto out_running; + success = 1; /* we're going to change ->state */ + cpu = task_cpu(p); - ttwu_activate(p, rq, wake_flags & WF_SYNC); - success = true; + /* + * Ensure we load p->on_rq _after_ p->state, otherwise it would + * be possible to, falsely, observe p->on_rq == 0 and get stuck + * in smp_cond_load_acquire() below. + * + * sched_ttwu_pending() try_to_wake_up() + * [S] p->on_rq = 1; [L] P->state + * UNLOCK rq->lock -----. + * \ + * +--- RMB + * schedule() / + * LOCK rq->lock -----' + * UNLOCK rq->lock + * + * [task p] + * [S] p->state = UNINTERRUPTIBLE [L] p->on_rq + * + * Pairs with the UNLOCK+LOCK on rq->lock from the + * last wakeup of our task and the schedule that got our task + * current. 
+ */ + smp_rmb(); + if (p->on_rq && ttwu_remote(p, wake_flags)) + goto stat; -out_running: - ttwu_post_activation(p, rq, success); -out_unlock: - task_grq_unlock(p, &flags); +#ifdef CONFIG_SMP + /* + * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be + * possible to, falsely, observe p->on_cpu == 0. + * + * One must be running (->on_cpu == 1) in order to remove oneself + * from the runqueue. + * + * [S] ->on_cpu = 1; [L] ->on_rq + * UNLOCK rq->lock + * RMB + * LOCK rq->lock + * [S] ->on_rq = 0; [L] ->on_cpu + * + * Pairs with the full barrier implied in the UNLOCK+LOCK on rq->lock + * from the consecutive calls to schedule(); the first switching to our + * task, the second putting it to sleep. + */ + smp_rmb(); + + /* + * If the owning (remote) cpu is still in the middle of schedule() with + * this task as prev, wait until its done referencing the task. + * + * Pairs with the smp_store_release() in finish_lock_switch(). + * + * This ensures that tasks getting woken will be fully ordered against + * their previous state and preserve Program Order. + */ + smp_cond_load_acquire(&p->on_cpu, !VAL); + + p->sched_contributes_to_load = !!task_contributes_to_load(p); + p->state = TASK_WAKING; + + cpu = select_best_cpu(p); + if (task_cpu(p) != cpu) + set_task_cpu(p, cpu); +#endif /* CONFIG_SMP */ + ttwu_queue(p, cpu, wake_flags); +stat: if (schedstat_enabled()) ttwu_stat(p, cpu, wake_flags); +out: + raw_spin_unlock_irqrestore(&p->pi_lock, flags); return success; } /** - * try_to_wake_up_local - try to wake up a local task with grq lock held + * try_to_wake_up_local - try to wake up a local task with rq lock held * @p: the thread to be awakened * * Put @p on the run-queue if it's not already there. The caller must - * ensure that grq is locked and, @p is not the current task. - * grq stays locked over invocation. + * ensure that rq is locked and, @p is not the current task. + * rq stays locked over invocation. */ static void try_to_wake_up_local(struct task_struct *p) { struct rq *rq = task_rq(p); - bool success = false; - lockdep_assert_held(&grq.lock); + if (WARN_ON_ONCE(rq != this_rq()) || + WARN_ON_ONCE(p == current)) + return; + + lockdep_assert_held(&rq->lock); + + if (!raw_spin_trylock(&p->pi_lock)) { + /* + * This is OK, because current is on_cpu, which avoids it being + * picked for load-balance and preemption/IRQs are still + * disabled avoiding further scheduler activity on it and we've + * not yet picked a replacement task. + */ + raw_spin_unlock(&rq->lock); + raw_spin_lock(&p->pi_lock); + raw_spin_lock(&rq->lock); + } if (!(p->state & TASK_NORMAL)) - return; + goto out; trace_sched_waking(p); - if (!task_queued(p)) { - if (likely(!task_running(p))) { - schedstat_inc(rq, ttwu_count); - schedstat_inc(rq, ttwu_local); - } - ttwu_activate(p, rq, false); - if (schedstat_enabled()) - ttwu_stat(p, smp_processor_id(), 0); - success = true; - } - ttwu_post_activation(p, rq, success); + if (!task_on_rq_queued(p)) + ttwu_activate(rq, p); + + ttwu_do_wakeup(rq, p, 0); + if (schedstat_enabled()) + ttwu_stat(p, smp_processor_id(), 0); +out: + raw_spin_unlock(&p->pi_lock); } /** @@ -1568,7 +2196,7 @@ int wake_up_state(struct task_struct *p, unsigned int state) return try_to_wake_up(p, state, 0); } -static void time_slice_expired(struct task_struct *p); +static void time_slice_expired(struct task_struct *p, struct rq *rq); /* * Perform scheduler related setup for a newly forked process p. 
@@ -1576,35 +2204,39 @@ static void time_slice_expired(struct task_struct *p); */ int sched_fork(unsigned long __maybe_unused clone_flags, struct task_struct *p) { + unsigned long flags; + int cpu = get_cpu(); + #ifdef CONFIG_PREEMPT_NOTIFIERS INIT_HLIST_HEAD(&p->preempt_notifiers); #endif /* + * We mark the process as NEW here. This guarantees that + * nobody will actually run it, and a signal or other external + * event cannot wake it up and insert it on the runqueue either. + */ + p->state = TASK_NEW; + + /* * The process state is set to the same value of the process executing * do_fork() code. That is running. This guarantees that nobody will * actually run it, and a signal or other external event cannot wake * it up and insert it on the runqueue either. */ - /* Should be reset in fork.c but done here for ease of bfs patching */ + /* Should be reset in fork.c but done here for ease of MuQSS patching */ + p->on_cpu = p->on_rq = p->utime = p->stime = p->utimescaled = p->stimescaled = p->sched_time = - p->stime_pc = - p->utime_pc = 0; + p->stime_ns = + p->utime_ns = 0; skiplist_node_init(&p->node); /* - * We mark the process as NEW here. This guarantees that - * nobody will actually run it, and a signal or other external - * event cannot wake it up and insert it on the runqueue either. - */ - p->state = TASK_NEW; - - /* * Revert to default priority/policy on fork if requested. */ if (unlikely(p->sched_reset_on_fork)) { @@ -1625,12 +2257,20 @@ int sched_fork(unsigned long __maybe_unused clone_flags, struct task_struct *p) p->sched_reset_on_fork = 0; } + /* + * Silence PROVE_RCU. + */ + raw_spin_lock_irqsave(&p->pi_lock, flags); + set_task_cpu(p, cpu); + raw_spin_unlock_irqrestore(&p->pi_lock, flags); + #ifdef CONFIG_SCHED_INFO if (unlikely(sched_info_on())) memset(&p->sched_info, 0, sizeof(p->sched_info)); #endif - p->on_cpu = false; init_task_preempt_count(p); + + put_cpu(); return 0; } @@ -1725,24 +2365,19 @@ void wake_up_new_task(struct task_struct *p) unsigned long flags; parent = p->parent; - rq = task_grq_lock(p, &flags); - if (unlikely(needs_other_cpu(p, task_cpu(p)))) - set_task_cpu(p, cpumask_any(tsk_cpus_allowed(p))); - rq_curr = rq->curr; - p->state = TASK_RUNNING; - - /* - * Reinit new task deadline as its creator deadline could have changed - * since call to dup_task_struct(). - */ - p->deadline = rq->rq_deadline; - /* The new task might not be able to run on the same CPU as rq->curr */ + raw_spin_lock_irqsave(&p->pi_lock, flags); + p->state = TASK_RUNNING; + /* Task_rq can't change yet on a new task */ + new_rq = rq = task_rq(p); if (unlikely(needs_other_cpu(p, task_cpu(p)))) { - set_task_cpu(p, cpumask_any(tsk_cpus_allowed(p))); + set_task_cpu(p, valid_task_cpu(p)); new_rq = task_rq(p); - } else - new_rq = rq; + } + + double_rq_lock(rq, new_rq); + update_clocks(rq); + rq_curr = rq->curr; /* * Make sure we do not leak PI boosting priority to the child. @@ -1756,30 +2391,29 @@ void wake_up_new_task(struct task_struct *p) * Share the timeslice between parent and child, thus the * total amount of pending timeslices in the system doesn't change, * resulting in more scheduling fairness. If it's negative, it won't - * matter since that's the same as being 0. current's time_slice is - * actually in rq_time_slice when it's running, as is its last_ran - * value. rq->rq_deadline is only modified within schedule() so it - * is always equal to current->deadline. + * matter since that's the same as being 0. 
rq->rq_deadline is only + * modified within schedule() so it is always equal to + * current->deadline. */ - p->last_ran = rq->rq_last_ran; + p->last_ran = rq_curr->last_ran; if (likely(rq_curr->policy != SCHED_FIFO)) { - rq->rq_time_slice /= 2; - if (unlikely(rq->rq_time_slice < RESCHED_US)) { + rq_curr->time_slice /= 2; + if (unlikely(rq_curr->time_slice < RESCHED_US)) { /* * Forking task has run out of timeslice. Reschedule it and * start its child with a new time slice and deadline. The * child will end up running first because its deadline will * be slightly earlier. */ - rq->rq_time_slice = 0; + rq_curr->time_slice = 0; __set_tsk_resched(rq_curr); - time_slice_expired(p); + time_slice_expired(p, new_rq); if (suitable_idle_cpus(p)) - resched_best_idle(p); + resched_best_idle(p, task_cpu(p)); else if (unlikely(rq != new_rq)) try_preempt(p, new_rq); } else { - p->time_slice = rq->rq_time_slice; + p->time_slice = rq_curr->time_slice; if (rq_curr == parent && rq == new_rq && !suitable_idle_cpus(p)) { /* * The VM isn't cloned, so we're in a good position to @@ -1791,10 +2425,11 @@ void wake_up_new_task(struct task_struct *p) try_preempt(p, new_rq); } } else { - time_slice_expired(p); + time_slice_expired(p, new_rq); try_preempt(p, new_rq); } - task_grq_unlock(p, &flags); + double_rq_unlock(rq, new_rq); + raw_spin_unlock_irqrestore(&p->pi_lock, flags); } #ifdef CONFIG_PREEMPT_NOTIFIERS @@ -1928,7 +2563,7 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev, * because prev may have moved to another CPU. */ static struct rq *finish_task_switch(struct task_struct *prev) - __releases(grq.lock) + __releases(rq->lock) { struct rq *rq = this_rq(); struct mm_struct *mm = rq->prev_mm; @@ -1988,7 +2623,7 @@ static struct rq *finish_task_switch(struct task_struct *prev) * @prev: the thread we just switched away from. */ asmlinkage __visible void schedule_tail(struct task_struct *prev) - __releases(grq.lock) + __releases(rq->lock) { struct rq *rq; @@ -2045,7 +2680,7 @@ context_switch(struct rq *rq, struct task_struct *prev, * of the scheduler it's an obvious special-case), so we * do an early lockdep release here: */ - spin_release(&grq.lock.dep_map, 1, _THIS_IP_); + spin_release(&rq->lock.dep_map, 1, _THIS_IP_); /* Here we just switch the register state and the stack. */ switch_to(prev, next, prev); @@ -2058,26 +2693,16 @@ context_switch(struct rq *rq, struct task_struct *prev, * nr_running, nr_uninterruptible and nr_context_switches: * * externally visible scheduler statistics: current number of runnable - * threads, total number of context switches performed since bootup. All are - * measured without grabbing the grq lock but the occasional inaccurate result - * doesn't matter so long as it's positive. + * threads, total number of context switches performed since bootup. 
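
The timeslice handling in wake_up_new_task() above keeps the total amount of pending timeslice constant across fork: the parent keeps half of what it had, the child inherits the other half, and a parent left with less than the reschedule threshold is expired so the child starts on a fresh slice and deadline. A standalone sketch of that decision follows; the threshold and full-slice values are illustrative assumptions, not the patch's constants.

#include <stdio.h>
#include <stdbool.h>

#define RESCHED_US      128     /* assumed reschedule threshold, in usecs */
#define TIMESLICE_US    6000    /* assumed full timeslice, in usecs */

struct toy_task { int time_slice; bool need_resched; };

static void fork_timeslice(struct toy_task *parent, struct toy_task *child)
{
        parent->time_slice /= 2;
        if (parent->time_slice < RESCHED_US) {
                /* Parent ran out: reschedule it, give the child a full slice. */
                parent->time_slice = 0;
                parent->need_resched = true;
                child->time_slice = TIMESLICE_US;
        } else {
                child->time_slice = parent->time_slice;
        }
}

int main(void)
{
        struct toy_task parent = { .time_slice = 5000 }, child = { 0 };

        fork_timeslice(&parent, &child);
        printf("parent=%dus child=%dus resched=%d\n",
               parent.time_slice, child.time_slice, parent.need_resched);
        return 0;
}

In the patch the fresh slice and deadline come from time_slice_expired(), which is also what makes the child's deadline slightly earlier so it tends to run first.
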
*/ unsigned long nr_running(void) { - long nr = grq.nr_running; - - if (unlikely(nr < 0)) - nr = 0; - return (unsigned long)nr; + return atomic_read(&grq.nr_running); } static unsigned long nr_uninterruptible(void) { - long nu = grq.nr_uninterruptible; - - if (unlikely(nu < 0)) - nu = 0; - return nu; + return atomic_read(&grq.nr_uninterruptible); } /* @@ -2095,7 +2720,9 @@ static unsigned long nr_uninterruptible(void) */ bool single_task_running(void) { - if (cpu_rq(smp_processor_id())->soft_affined == 1) + struct rq *rq = cpu_rq(smp_processor_id()); + + if (rq_load(rq) == 1) return true; else return false; @@ -2104,12 +2731,7 @@ EXPORT_SYMBOL(single_task_running); unsigned long long nr_context_switches(void) { - long long ns = grq.nr_switches; - - /* This is of course impossible */ - if (unlikely(ns < 0)) - ns = 1; - return (unsigned long long)ns; + return (unsigned long long)atomic64_read(&grq.nr_switches); } unsigned long nr_iowait(void) @@ -2133,15 +2755,16 @@ unsigned long nr_active(void) return nr_running() + nr_uninterruptible(); } -/* Beyond a task running on this CPU, load is equal everywhere on BFS, so we - * base it on the number of running or queued tasks with their ->rq pointer - * set to this cpu as being the CPU they're more likely to run on. */ +/* + * I/O wait is the number of running or queued tasks with their ->rq pointer + * set to this cpu as being the CPU they're more likely to run on. + */ void get_iowait_load(unsigned long *nr_waiters, unsigned long *load) { struct rq *rq = this_rq(); *nr_waiters = atomic_read(&rq->nr_iowait); - *load = rq->soft_affined; + *load = rq_load(rq); } /* Variables and functions for calc_load */ @@ -2364,7 +2987,6 @@ static void update_rq_clock_task(struct rq *rq, s64 delta) delta -= steal; } #endif - rq->clock_task += delta; } @@ -2456,86 +3078,89 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) } /* - * On each tick, see what percentage of that tick was attributed to each - * component and add the percentage to the _pc values. Once a _pc value has - * accumulated one tick's worth, account for that. This means the total - * percentage of load components will always be 128 (pseudo 100) per tick. + * On each tick, add the number of nanoseconds to the unbanked variables and + * once one tick's worth has accumulated, account it allowing for accurate + * sub-tick accounting and totals. 
*/ -static void pc_idle_time(struct rq *rq, struct task_struct *idle, unsigned long pc) +static void pc_idle_time(struct rq *rq, struct task_struct *idle, unsigned long ns) { u64 *cpustat = kcpustat_this_cpu->cpustat; + unsigned long ticks; if (atomic_read(&rq->nr_iowait) > 0) { - rq->iowait_pc += pc; - if (rq->iowait_pc >= 128) { - cpustat[CPUTIME_IOWAIT] += (__force u64)cputime_one_jiffy * rq->iowait_pc / 128; - rq->iowait_pc %= 128; + rq->iowait_ns += ns; + if (rq->iowait_ns >= JIFFY_NS) { + ticks = NS_TO_JIFFIES(rq->iowait_ns); + cpustat[CPUTIME_IOWAIT] += (__force u64)cputime_one_jiffy * ticks; + rq->iowait_ns %= JIFFY_NS; } } else { - rq->idle_pc += pc; - if (rq->idle_pc >= 128) { - cpustat[CPUTIME_IDLE] += (__force u64)cputime_one_jiffy * rq->idle_pc / 128; - rq->idle_pc %= 128; + rq->idle_ns += ns; + if (rq->idle_ns >= JIFFY_NS) { + ticks = NS_TO_JIFFIES(rq->idle_ns); + cpustat[CPUTIME_IDLE] += (__force u64)cputime_one_jiffy * ticks; + rq->idle_ns %= JIFFY_NS; } } acct_update_integrals(idle); } -static void -pc_system_time(struct rq *rq, struct task_struct *p, int hardirq_offset, - unsigned long pc, unsigned long ns) +static void pc_system_time(struct rq *rq, struct task_struct *p, + int hardirq_offset, unsigned long ns) { - u64 *cpustat = kcpustat_this_cpu->cpustat; cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); + u64 *cpustat = kcpustat_this_cpu->cpustat; + unsigned long ticks; - p->stime_pc += pc; - if (p->stime_pc >= 128) { - int jiffs = p->stime_pc / 128; - - p->stime_pc %= 128; - p->stime += (__force u64)cputime_one_jiffy * jiffs; - p->stimescaled += one_jiffy_scaled * jiffs; - account_group_system_time(p, cputime_one_jiffy * jiffs); + p->stime_ns += ns; + if (p->stime_ns >= JIFFY_NS) { + ticks = NS_TO_JIFFIES(p->stime_ns); + p->stime_ns %= JIFFY_NS; + p->stime += (__force u64)cputime_one_jiffy * ticks; + p->stimescaled += one_jiffy_scaled * ticks; + account_group_system_time(p, cputime_one_jiffy * ticks); } p->sched_time += ns; account_group_exec_runtime(p, ns); if (hardirq_count() - hardirq_offset) { - rq->irq_pc += pc; - if (rq->irq_pc >= 128) { - cpustat[CPUTIME_IRQ] += (__force u64)cputime_one_jiffy * rq->irq_pc / 128; - rq->irq_pc %= 128; + rq->irq_ns += ns; + if (rq->irq_ns >= JIFFY_NS) { + ticks = NS_TO_JIFFIES(rq->irq_ns); + cpustat[CPUTIME_IRQ] += (__force u64)cputime_one_jiffy * ticks; + rq->irq_ns %= JIFFY_NS; } } else if (in_serving_softirq()) { - rq->softirq_pc += pc; - if (rq->softirq_pc >= 128) { - cpustat[CPUTIME_SOFTIRQ] += (__force u64)cputime_one_jiffy * rq->softirq_pc / 128; - rq->softirq_pc %= 128; + rq->softirq_ns += ns; + if (rq->softirq_ns >= JIFFY_NS) { + ticks = NS_TO_JIFFIES(rq->softirq_ns); + cpustat[CPUTIME_SOFTIRQ] += (__force u64)cputime_one_jiffy * ticks; + rq->softirq_ns %= JIFFY_NS; } } else { - rq->system_pc += pc; - if (rq->system_pc >= 128) { - cpustat[CPUTIME_SYSTEM] += (__force u64)cputime_one_jiffy * rq->system_pc / 128; - rq->system_pc %= 128; + rq->system_ns += ns; + if (rq->system_ns >= JIFFY_NS) { + ticks = NS_TO_JIFFIES(rq->system_ns); + cpustat[CPUTIME_SYSTEM] += (__force u64)cputime_one_jiffy * ticks; + rq->system_ns %= JIFFY_NS; } } acct_update_integrals(p); } -static void pc_user_time(struct rq *rq, struct task_struct *p, - unsigned long pc, unsigned long ns) +static void pc_user_time(struct rq *rq, struct task_struct *p, unsigned long ns) { - u64 *cpustat = kcpustat_this_cpu->cpustat; cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); + u64 *cpustat = kcpustat_this_cpu->cpustat; + unsigned 
long ticks; - p->utime_pc += pc; - if (p->utime_pc >= 128) { - int jiffs = p->utime_pc / 128; - - p->utime_pc %= 128; - p->utime += (__force u64)cputime_one_jiffy * jiffs; - p->utimescaled += one_jiffy_scaled * jiffs; - account_group_user_time(p, cputime_one_jiffy * jiffs); + p->utime_ns += ns; + if (p->utime_ns >= JIFFY_NS) { + ticks = NS_TO_JIFFIES(p->utime_ns); + p->utime_ns %= JIFFY_NS; + p->utime += (__force u64)cputime_one_jiffy * ticks; + p->utimescaled += one_jiffy_scaled * ticks; + account_group_user_time(p, cputime_one_jiffy * ticks); } p->sched_time += ns; account_group_exec_runtime(p, ns); @@ -2545,36 +3170,33 @@ static void pc_user_time(struct rq *rq, struct task_struct *p, * ksoftirqd time do not get accounted in cpu_softirq_time. * So, we have to handle it separately here. */ - rq->softirq_pc += pc; - if (rq->softirq_pc >= 128) { - cpustat[CPUTIME_SOFTIRQ] += (__force u64)cputime_one_jiffy * rq->softirq_pc / 128; - rq->softirq_pc %= 128; + rq->softirq_ns += ns; + if (rq->softirq_ns >= JIFFY_NS) { + ticks = NS_TO_JIFFIES(rq->softirq_ns); + cpustat[CPUTIME_SOFTIRQ] += (__force u64)cputime_one_jiffy * ticks; + rq->softirq_ns %= JIFFY_NS; } } if (task_nice(p) > 0 || idleprio_task(p)) { - rq->nice_pc += pc; - if (rq->nice_pc >= 128) { - cpustat[CPUTIME_NICE] += (__force u64)cputime_one_jiffy * rq->nice_pc / 128; - rq->nice_pc %= 128; + rq->nice_ns += ns; + if (rq->nice_ns >= JIFFY_NS) { + ticks = NS_TO_JIFFIES(rq->nice_ns); + cpustat[CPUTIME_NICE] += (__force u64)cputime_one_jiffy * ticks; + rq->nice_ns %= JIFFY_NS; } } else { - rq->user_pc += pc; - if (rq->user_pc >= 128) { - cpustat[CPUTIME_USER] += (__force u64)cputime_one_jiffy * rq->user_pc / 128; - rq->user_pc %= 128; + rq->user_ns += ns; + if (rq->user_ns >= JIFFY_NS) { + ticks = NS_TO_JIFFIES(rq->user_ns); + cpustat[CPUTIME_USER] += (__force u64)cputime_one_jiffy * ticks; + rq->user_ns %= JIFFY_NS; } } acct_update_integrals(p); } /* - * Convert nanoseconds to pseudo percentage of one tick. Use 128 for fast - * shifts instead of 100 - */ -#define NS_TO_PC(NS) (NS * 128 / JIFFY_NS) - -/* * This is called on clock ticks. * Bank in p->sched_time the ns elapsed since the last tick or switch. * CPU scheduler quota accounting is also performed here in microseconds. 
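
The pc_*_time() helpers above all follow the same banking pattern: nanoseconds are accumulated per bucket, whole jiffies are pushed into cpustat once at least one tick's worth is present, and the sub-tick remainder is carried forward. A self-contained model of that pattern is below; HZ and JIFFY_NS here are chosen for illustration rather than taken from the patch.

#include <stdio.h>
#include <stdint.h>

#define HZ              1000
#define JIFFY_NS        (1000000000ULL / HZ)

struct bucket { uint64_t banked_ns; uint64_t accounted_ticks; };

static void account_ns(struct bucket *b, uint64_t ns)
{
        b->banked_ns += ns;
        if (b->banked_ns >= JIFFY_NS) {
                /* Account only whole ticks, keep the remainder banked. */
                b->accounted_ticks += b->banked_ns / JIFFY_NS;
                b->banked_ns %= JIFFY_NS;
        }
}

int main(void)
{
        struct bucket user = { 0, 0 };

        account_ns(&user, 600000);      /* 0.6 of a jiffy: banked only */
        account_ns(&user, 900000);      /* crosses a jiffy boundary */
        printf("ticks=%llu remainder=%lluns\n",
               (unsigned long long)user.accounted_ticks,
               (unsigned long long)user.banked_ns);
        return 0;
}
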
@@ -2582,38 +3204,29 @@ static void pc_user_time(struct rq *rq, struct task_struct *p, static void update_cpu_clock_tick(struct rq *rq, struct task_struct *p) { - long account_ns = rq->clock_task - rq->rq_last_ran; + s64 account_ns = rq->niffies - p->last_ran; struct task_struct *idle = rq->idle; - unsigned long account_pc; - if (unlikely(account_ns < 0) || steal_account_process_tick()) + if (steal_account_process_tick()) goto ts_account; - account_pc = NS_TO_PC(account_ns); - /* Accurate tick timekeeping */ if (user_mode(get_irq_regs())) - pc_user_time(rq, p, account_pc, account_ns); - else if (p != idle || (irq_count() != HARDIRQ_OFFSET)) - pc_system_time(rq, p, HARDIRQ_OFFSET, - account_pc, account_ns); - else - pc_idle_time(rq, idle, account_pc); + pc_user_time(rq, p, account_ns); + else if (p != idle || (irq_count() != HARDIRQ_OFFSET)) { + pc_system_time(rq, p, HARDIRQ_OFFSET, account_ns); + } else + pc_idle_time(rq, idle, account_ns); if (sched_clock_irqtime) irqtime_account_hi_si(); ts_account: /* time_slice accounting is done in usecs to avoid overflow on 32bit */ - if (rq->rq_policy != SCHED_FIFO && p != idle) { - s64 time_diff = rq->clock - rq->timekeep_clock; - - niffy_diff(&time_diff, 1); - rq->rq_time_slice -= NS_TO_US(time_diff); - } + if (p->policy != SCHED_FIFO && p != idle) + p->time_slice -= NS_TO_US(account_ns); - rq->rq_last_ran = rq->clock_task; - rq->timekeep_clock = rq->clock; + p->last_ran = rq->niffies; } /* @@ -2624,40 +3237,25 @@ ts_account: static void update_cpu_clock_switch(struct rq *rq, struct task_struct *p) { - long account_ns = rq->clock_task - rq->rq_last_ran; + s64 account_ns = rq->niffies - p->last_ran; struct task_struct *idle = rq->idle; - unsigned long account_pc; - - if (unlikely(account_ns < 0)) - goto ts_account; - - account_pc = NS_TO_PC(account_ns); /* Accurate subtick timekeeping */ - if (p != idle) { - pc_user_time(rq, p, account_pc, account_ns); - } + if (p != idle) + pc_user_time(rq, p, account_ns); else - pc_idle_time(rq, idle, account_pc); + pc_idle_time(rq, idle, account_ns); -ts_account: /* time_slice accounting is done in usecs to avoid overflow on 32bit */ - if (rq->rq_policy != SCHED_FIFO && p != idle) { - s64 time_diff = rq->clock - rq->timekeep_clock; - - niffy_diff(&time_diff, 1); - rq->rq_time_slice -= NS_TO_US(time_diff); - } - - rq->rq_last_ran = rq->clock_task; - rq->timekeep_clock = rq->clock; + if (p->policy != SCHED_FIFO && p != idle) + p->time_slice -= NS_TO_US(account_ns); } /* * Return any ns on the sched_clock that have not yet been accounted in * @p in case that task is currently running. * - * Called with task_grq_lock() held. + * Called with task_rq_lock(p) held. */ static inline u64 do_task_delta_exec(struct task_struct *p, struct rq *rq) { @@ -2668,11 +3266,9 @@ static inline u64 do_task_delta_exec(struct task_struct *p, struct rq *rq) * project cycles that may never be accounted to this * thread, breaking clock_gettime(). */ - if (p == rq->curr && p->on_rq) { + if (p == rq->curr && task_on_rq_queued(p)) { update_clocks(rq); - ns = rq->clock_task - rq->rq_last_ran; - if (unlikely((s64)ns < 0)) - ns = 0; + ns = rq->niffies - p->last_ran; } return ns; @@ -2702,13 +3298,13 @@ unsigned long long task_sched_runtime(struct task_struct *p) * If we see ->on_cpu without ->on_rq, the task is leaving, and has * been accounted, so we're correct here as well. 
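
do_task_delta_exec() and task_sched_runtime() above compose a task's total runtime from two parts: the nanoseconds already banked in p->sched_time and, when the task is currently on a CPU, whatever has elapsed on its runqueue clock since p->last_ran. A small standalone model follows; the field names mirror the patch but this is an illustration, not kernel code.

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

struct toy_task {
        uint64_t sched_time;    /* ns already accounted */
        uint64_t last_ran;      /* rq clock value at last accounting point */
        bool running;
};

static uint64_t task_runtime_ns(const struct toy_task *p, uint64_t rq_niffies)
{
        uint64_t ns = p->sched_time;

        if (p->running)
                ns += rq_niffies - p->last_ran;         /* not yet banked */
        return ns;
}

int main(void)
{
        struct toy_task t = { .sched_time = 1000000, .last_ran = 5000, .running = true };

        printf("runtime=%lluns\n", (unsigned long long)task_runtime_ns(&t, 9000));
        return 0;
}

The same subtraction is what update_cpu_clock_tick() banks on every tick, so the unbanked remainder stays bounded by roughly one tick or one context switch.
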
*/ - if (!p->on_cpu || !p->on_rq) + if (!p->on_cpu || !task_on_rq_queued(p)) return tsk_seruntime(p); #endif - rq = task_grq_lock(p, &flags); + rq = task_rq_lock(p, &flags); ns = p->sched_time + do_task_delta_exec(p, rq); - task_grq_unlock(p, &flags); + task_rq_unlock(rq, p, &flags); return ns; } @@ -2841,37 +3437,12 @@ void account_idle_ticks(unsigned long ticks) } #endif -static inline void grq_iso_lock(void) - __acquires(grq.iso_lock) -{ - raw_spin_lock(&grq.iso_lock); -} - -static inline void grq_iso_unlock(void) - __releases(grq.iso_lock) -{ - raw_spin_unlock(&grq.iso_lock); -} - /* * Functions to test for when SCHED_ISO tasks have used their allocated - * quota as real time scheduling and convert them back to SCHED_NORMAL. - * Where possible, the data is tested lockless, to avoid grabbing iso_lock - * because the occasional inaccurate result won't matter. However the - * tick data is only ever modified under lock. iso_refractory is only simply - * set to 0 or 1 so it's not worth grabbing the lock yet again for that. + * quota as real time scheduling and convert them back to SCHED_NORMAL. All + * data is modified only by the local runqueue during scheduler_tick with + * interrupts disabled. */ -static bool set_iso_refractory(void) -{ - grq.iso_refractory = true; - return grq.iso_refractory; -} - -static bool clear_iso_refractory(void) -{ - grq.iso_refractory = false; - return grq.iso_refractory; -} /* * Test if SCHED_ISO tasks have run longer than their alloted period as RT @@ -2879,97 +3450,82 @@ static bool clear_iso_refractory(void) * for unsetting the flag. 115/128 is ~90/100 as a fast shift instead of a * slow division. */ -static bool test_ret_isorefractory(struct rq *rq) +static inline void iso_tick(struct rq *rq) { - if (likely(!grq.iso_refractory)) { - if (grq.iso_ticks > ISO_PERIOD * sched_iso_cpu) - return set_iso_refractory(); - } else { - if (grq.iso_ticks < ISO_PERIOD * (sched_iso_cpu * 115 / 128)) - return clear_iso_refractory(); + rq->iso_ticks = rq->iso_ticks * (ISO_PERIOD - 1) / ISO_PERIOD; + rq->iso_ticks += 100; + if (rq->iso_ticks > ISO_PERIOD * sched_iso_cpu) { + rq->iso_refractory = true; + if (unlikely(rq->iso_ticks > ISO_PERIOD * 100)) + rq->iso_ticks = ISO_PERIOD * 100; } - return grq.iso_refractory; -} - -static void iso_tick(void) -{ - grq_iso_lock(); - grq.iso_ticks += 100; - grq_iso_unlock(); } /* No SCHED_ISO task was running so decrease rq->iso_ticks */ -static inline void no_iso_tick(void) -{ - if (grq.iso_ticks) { - grq_iso_lock(); - grq.iso_ticks -= grq.iso_ticks / ISO_PERIOD + 1; - if (unlikely(grq.iso_refractory && grq.iso_ticks < - ISO_PERIOD * (sched_iso_cpu * 115 / 128))) - clear_iso_refractory(); - grq_iso_unlock(); +static inline void no_iso_tick(struct rq *rq, int ticks) +{ + if (rq->iso_ticks > 0 || rq->iso_refractory) { + rq->iso_ticks = rq->iso_ticks * (ISO_PERIOD - ticks) / ISO_PERIOD; + if (rq->iso_ticks < ISO_PERIOD * (sched_iso_cpu * 115 / 128)) { + rq->iso_refractory = false; + if (unlikely(rq->iso_ticks < 0)) + rq->iso_ticks = 0; + } } } /* This manages tasks that have run out of timeslice during a scheduler_tick */ static void task_running_tick(struct rq *rq) { - struct task_struct *p; + struct task_struct *p = rq->curr; /* * If a SCHED_ISO task is running we increment the iso_ticks. In * order to prevent SCHED_ISO tasks from causing starvation in the * presence of true RT tasks we account those as iso_ticks as well. 
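
iso_tick()/no_iso_tick() above keep a per-runqueue decaying average of how much CPU SCHED_ISO tasks have used over the last ISO_PERIOD, setting iso_refractory when the configured limit is exceeded and clearing it with roughly 90% hysteresis (the 115/128 shift trick). The standalone model below mirrors that state machine; HZ and the sched_iso_cpu default are assumptions for illustration.

#include <stdio.h>
#include <stdbool.h>

#define HZ              250
#define ISO_PERIOD      (5 * HZ)

static int iso_ticks;
static bool iso_refractory;
static int sched_iso_cpu = 70;  /* assumed default: % cpu allowed as ISO */

static void iso_tick(void)
{
        iso_ticks = iso_ticks * (ISO_PERIOD - 1) / ISO_PERIOD + 100;
        if (iso_ticks > ISO_PERIOD * sched_iso_cpu) {
                iso_refractory = true;
                if (iso_ticks > ISO_PERIOD * 100)
                        iso_ticks = ISO_PERIOD * 100;   /* cap at 100% */
        }
}

static void no_iso_tick(int ticks)
{
        if (iso_ticks > 0 || iso_refractory) {
                iso_ticks = iso_ticks * (ISO_PERIOD - ticks) / ISO_PERIOD;
                if (iso_ticks < ISO_PERIOD * (sched_iso_cpu * 115 / 128)) {
                        iso_refractory = false;
                        if (iso_ticks < 0)
                                iso_ticks = 0;
                }
        }
}

int main(void)
{
        for (int i = 0; i < 10 * ISO_PERIOD; i++)
                iso_tick();                     /* ISO runs flat out */
        printf("refractory after saturation: %d\n", iso_refractory);
        for (int i = 0; i < ISO_PERIOD; i++)
                no_iso_tick(1);                 /* nothing ISO running */
        printf("refractory after a quiet period: %d\n", iso_refractory);
        return 0;
}

While iso_refractory is set, task_running_tick() below zeroes the running ISO task's time_slice so it drops back to competing as SCHED_NORMAL.
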
*/ - if ((rt_queue(rq) || (iso_queue(rq) && !grq.iso_refractory))) { - if (grq.iso_ticks <= (ISO_PERIOD * 128) - 128) - iso_tick(); - } else - no_iso_tick(); + if (rt_task(p) || task_running_iso(p)) + iso_tick(rq); + else + no_iso_tick(rq, 1); - if (iso_queue(rq)) { - if (unlikely(test_ret_isorefractory(rq))) { - if (rq_running_iso(rq)) { + /* SCHED_FIFO tasks never run out of timeslice. */ + if (p->policy == SCHED_FIFO) + return; + + if (iso_task(p)) { + if (task_running_iso(p)) { + if (rq->iso_refractory) { /* * SCHED_ISO task is running as RT and limit * has been hit. Force it to reschedule as * SCHED_NORMAL by zeroing its time_slice */ - rq->rq_time_slice = 0; + p->time_slice = 0; } + } else if (!rq->iso_refractory) { + /* Can now run again ISO. Reschedule to pick up prio */ + goto out_resched; } } - /* SCHED_FIFO tasks never run out of timeslice. */ - if (rq->rq_policy == SCHED_FIFO) - return; /* * Tasks that were scheduled in the first half of a tick are not * allowed to run into the 2nd half of the next tick if they will * run out of time slice in the interim. Otherwise, if they have * less than RESCHED_US μs of time slice left they will be rescheduled. */ - if (rq->dither) { - if (rq->rq_time_slice > HALF_JIFFY_US) - return; - else - rq->rq_time_slice = 0; - } else if (rq->rq_time_slice >= RESCHED_US) - return; - - /* p->time_slice < RESCHED_US. We only modify task_struct under grq lock */ - p = rq->curr; - - grq_lock(); - requeue_task(p); - resched_task(p); - grq_unlock(); + if (p->time_slice - rq->dither >= RESCHED_US) + return; +out_resched: + rq_lock(rq); + __set_tsk_resched(p); + rq_unlock(rq); } /* * This function gets called by the timer code, with HZ frequency. - * We call it with interrupts disabled. The data modified is all - * local to struct rq so we don't need to grab grq lock. + * We call it with interrupts disabled. */ void scheduler_tick(void) { @@ -2977,15 +3533,14 @@ void scheduler_tick(void) struct rq *rq = cpu_rq(cpu); sched_clock_tick(); - /* grq lock not grabbed, so only update rq clock */ update_rq_clock(rq); - update_cpu_clock_tick(rq, rq->curr); update_load_avg(rq); - cpufreq_trigger(grq.niffies, rq->load_avg); + update_cpu_clock_tick(rq, rq->curr); if (!rq_idle(rq)) task_running_tick(rq); else - no_iso_tick(); + no_iso_tick(rq, rq->last_scheduler_tick - rq->last_jiffy); + rq->last_scheduler_tick = rq->last_jiffy; rq->last_tick = rq->clock; perf_event_task_tick(); } @@ -3068,12 +3623,13 @@ static inline void preempt_latency_stop(int val) { } /* * The time_slice is only refilled when it is empty and that is when we set a - * new deadline. + * new deadline. Make sure update_clocks has been called recently to update + * rq->niffies. */ -static void time_slice_expired(struct task_struct *p) +static void time_slice_expired(struct task_struct *p, struct rq *rq) { p->time_slice = timeslice(); - p->deadline = grq.niffies + task_deadline_diff(p); + p->deadline = rq->niffies + task_deadline_diff(p); #ifdef CONFIG_SMT_NICE if (!p->mm) p->smt_bias = 0; @@ -3100,107 +3656,107 @@ static void time_slice_expired(struct task_struct *p) * SCHED_NORMAL tasks. */ -static inline void check_deadline(struct task_struct *p) +static inline void check_deadline(struct task_struct *p, struct rq *rq) { if (p->time_slice < RESCHED_US || batch_task(p)) - time_slice_expired(p); + time_slice_expired(p, rq); } #define BITOP_WORD(nr) ((nr) / BITS_PER_LONG) /* - * Scheduler queue bitmap specific find next bit. 
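
time_slice_expired() above is where both halves of a task's scheduling key are refilled: a full timeslice and a virtual deadline of rq->niffies plus a priority-dependent offset. That offset comes from the prio_ratios table, initialised later in this patch to grow about 10% per nice level, so nicer tasks receive deadlines further in the future and are picked later by the earliest-deadline selection. The constants and scaling in the sketch below are assumptions chosen only to show the shape of the calculation, not the patch's exact task_deadline_diff().

#include <stdio.h>
#include <stdint.h>

#define NICE_WIDTH      40
#define RR_INTERVAL_MS  6       /* assumed round-robin interval */

static int prio_ratios[NICE_WIDTH];

static void init_ratios(void)
{
        prio_ratios[0] = 128;   /* assumed base ratio */
        for (int i = 1; i < NICE_WIDTH; i++)
                prio_ratios[i] = prio_ratios[i - 1] * 11 / 10;
}

/* Deadline offset in ns: the rr interval scaled by the task's ratio. */
static uint64_t deadline_diff_ns(int nice_idx)
{
        return (uint64_t)RR_INTERVAL_MS * 1000000 * prio_ratios[nice_idx] / 128;
}

int main(void)
{
        init_ratios();
        printf("nice -20: +%llums  nice 0: +%llums  nice 19: +%llums\n",
               (unsigned long long)(deadline_diff_ns(0) / 1000000),
               (unsigned long long)(deadline_diff_ns(20) / 1000000),
               (unsigned long long)(deadline_diff_ns(39) / 1000000));
        return 0;
}

check_deadline() then simply refills whenever less than RESCHED_US of slice remains or the task is SCHED_BATCH.
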
- */ -static inline unsigned long -next_sched_bit(const unsigned long *addr, unsigned long offset) -{ - const unsigned long *p; - unsigned long result; - unsigned long size; - unsigned long tmp; - - size = PRIO_LIMIT; - if (offset >= size) - return size; - - p = addr + BITOP_WORD(offset); - result = offset & ~(BITS_PER_LONG-1); - size -= result; - offset %= BITS_PER_LONG; - if (offset) { - tmp = *(p++); - tmp &= (~0UL << offset); - if (size < BITS_PER_LONG) - goto found_first; - if (tmp) - goto found_middle; - size -= BITS_PER_LONG; - result += BITS_PER_LONG; - } - while (size & ~(BITS_PER_LONG-1)) { - if ((tmp = *(p++))) - goto found_middle; - result += BITS_PER_LONG; - size -= BITS_PER_LONG; - } - if (!size) - return result; - tmp = *p; - -found_first: - tmp &= (~0UL >> (BITS_PER_LONG - size)); - if (tmp == 0UL) /* Are any bits set? */ - return result + size; /* Nope. */ -found_middle: - return result + __ffs(tmp); -} - -/* * Task selection with skiplists is a simple matter of picking off the first - * task in the sorted list, an O(1) operation. The only time it takes longer - * is if tasks do not have suitable affinity and then we iterate over entries - * till we find the first that does. Worst case here is no tasks with suitable - * affinity and taking O(n). + * task in the sorted list, an O(1) operation. The lookup is amortised O(1) + * being bound to the number of processors. + * + * Runqueues are selectively locked based on their unlocked data and then + * unlocked if not needed. At most 3 locks will be held at any time and are + * released as soon as they're no longer needed. All balancing between CPUs + * is thus done here in an extremely simple first come best fit manner. + * + * This iterates over runqueues in cache locality order. In interactive mode + * it iterates over all CPUs and finds the task with the best key/deadline. + * In non-interactive mode it will only take a task if it's from the current + * runqueue or a runqueue with more tasks than the current one with a better + * key/deadline. */ static inline struct task_struct *earliest_deadline_task(struct rq *rq, int cpu, struct task_struct *idle) { - skiplist_node *node = &grq.node; struct task_struct *edt = idle; - u64 earliest_deadline = ~0ULL; + struct rq *locked = NULL; + int i, best_entries = 0; + u64 best_key = ~0ULL; - while ((node = node->next[0]) != &grq.node) { - struct task_struct *p = node->value; + for (i = 0; i < num_possible_cpus(); i++) { + struct rq *other_rq = rq_order(rq, i); + int entries = other_rq->sl->entries; + struct task_struct *p; + u64 key; - /* Make sure affinity is ok */ - if (needs_other_cpu(p, cpu)) + /* + * Check for queued entres lockless first. The local runqueue + * is locked so entries will always be accurate. 
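
The selection loop described above can be reduced to a small policy: walk the runqueues in cache-locality order, skip queues that cannot improve on what has already been found, and remember the queue exposing the earliest key. The sketch below keeps only that policy; the trylocking, re-checking, affinity and SMT filtering of the real loop are deliberately left out, and the types are illustrative.

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

struct toy_rq { int entries; uint64_t best_key; };

static int pick_rq(struct toy_rq *order[], int nr, bool interactive)
{
        uint64_t best_key = UINT64_MAX;
        int best_entries = 0, best = -1;

        for (int i = 0; i < nr; i++) {
                struct toy_rq *rq = order[i];

                if (!interactive) {
                        if (rq->entries <= best_entries)
                                continue;       /* must be a longer queue */
                } else if (!rq->entries) {
                        continue;
                }
                if (rq->best_key >= best_key)
                        continue;               /* nothing earlier here */
                best_entries = rq->entries;
                best_key = rq->best_key;
                best = i;
        }
        return best;    /* index into locality order, -1 if all empty */
}

int main(void)
{
        struct toy_rq local = { 1, 500 }, near = { 3, 400 }, far = { 2, 100 };
        struct toy_rq *order[] = { &local, &near, &far };

        printf("interactive pick: %d\n", pick_rq(order, 3, true));     /* 2: earliest key */
        printf("non-interactive pick: %d\n", pick_rq(order, 3, false)); /* 1: far queue too short */
        return 0;
}

In interactive mode every non-empty queue is examined, which favours latency; in non-interactive mode a remote queue must also be longer than the best seen so far, which favours staying local.
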
+ */ + if (!sched_interactive) { + if (entries <= best_entries) + continue; + } else if (!entries) continue; - if (!smt_schedule(p, rq)) - continue; + /* if (i) implies other_rq != rq */ + if (i) { + /* Check for best id queued lockless first */ + if (other_rq->best_key >= best_key) + continue; - if (!sched_interactive) { - int tcpu; + if (unlikely(!trylock_rq(rq, other_rq))) + continue; - if ((tcpu = task_cpu(p)) != cpu) { - u64 dl = p->deadline << locality_diff(tcpu, rq); + /* Need to reevaluate entries after locking */ + entries = other_rq->sl->entries; + if (unlikely(!entries)) { + unlock_rq(other_rq); + continue; + } + } + key = other_rq->node.next[0]->key; + /* Reevaluate key after locking */ + if (unlikely(key >= best_key)) { + if (i) + unlock_rq(other_rq); + continue; + } - if (!deadline_before(dl, earliest_deadline)) - continue; - earliest_deadline = dl; - edt = p; - /* We continue even though we've found the earliest - * deadline task as the locality offset means there - * may be a better candidate after it. */ + p = other_rq->node.next[0]->value; + if (!smt_schedule(p, rq)) { + if (i) + unlock_rq(other_rq); + continue; + } + + /* Make sure affinity is ok */ + if (i) { + if (needs_other_cpu(p, cpu)) { + unlock_rq(other_rq); continue; } + if (locked) + unlock_rq(locked); + locked = other_rq; } - /* We've encountered the best deadline local task */ + + best_entries = entries; + best_key = key; edt = p; - break; } + if (likely(edt != idle)) - take_task(cpu, edt); + take_task(rq, cpu, edt); + + if (locked) + unlock_rq(locked); + return edt; } @@ -3226,9 +3782,6 @@ static noinline void __schedule_bug(struct task_struct *prev) pr_cont("\n"); } #endif - if (panic_on_warn) - panic("scheduling while atomic\n"); - dump_stack(); add_taint(TAINT_WARN, LOCKDEP_STILL_OK); } @@ -3256,15 +3809,11 @@ static inline void schedule_debug(struct task_struct *prev) /* * The currently running task's information is all stored in rq local data - * which is only modified by the local CPU, thereby allowing the data to be - * changed without grabbing the grq lock. + * which is only modified by the local CPU. */ static inline void set_rq_task(struct rq *rq, struct task_struct *p) { - rq->rq_time_slice = p->time_slice; rq->rq_deadline = p->deadline; - rq->rq_last_ran = p->last_ran = rq->clock_task; - rq->rq_policy = p->policy; rq->rq_prio = p->prio; #ifdef CONFIG_SMT_NICE rq->rq_mm = p->mm; @@ -3272,15 +3821,6 @@ static inline void set_rq_task(struct rq *rq, struct task_struct *p) #endif } -static void reset_rq_task(struct rq *rq, struct task_struct *p) -{ - rq->rq_policy = p->policy; - rq->rq_prio = p->prio; -#ifdef CONFIG_SMT_NICE - rq->rq_smt_bias = p->smt_bias; -#endif -} - #ifdef CONFIG_SMT_NICE static void check_no_siblings(struct rq __maybe_unused *this_rq) {} static void wake_no_siblings(struct rq __maybe_unused *this_rq) {} @@ -3381,11 +3921,13 @@ static void __sched notrace __schedule(bool preempt) unsigned long *switch_count; bool deactivate = false; struct rq *rq; + u64 niffies; int cpu; cpu = smp_processor_id(); rq = cpu_rq(cpu); prev = rq->curr; + idle = rq->idle; /* * do_exit() calls schedule() with preemption disabled as an exception; @@ -3409,7 +3951,23 @@ static void __sched notrace __schedule(bool preempt) * done by the caller to avoid the race with signal_wake_up(). */ smp_mb__before_spinlock(); - grq_lock(); + rq_lock(rq); +#ifdef CONFIG_SMP + if (rq->preempt) { + /* + * Make sure resched_curr hasn't triggered a preemption + * locklessly on a task that has since scheduled away. 
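
The remote-queue checks above follow a "peek lockless, then re-check under the lock" discipline: best_key and the entry count are read without the remote lock purely as a hint to skip uninteresting queues cheaply, and anything that looks promising is re-read after a successful trylock before being trusted. A userspace fragment of the same discipline follows; a real implementation would read the unlocked fields with atomic accessors, plain reads are used here only for brevity, and the names are illustrative.

#include <pthread.h>
#include <stdbool.h>
#include <stdint.h>

struct remote_queue {
        pthread_mutex_t lock;
        int entries;
        uint64_t best_key;
};

/* Returns true with q->lock held if q still beats best_key, else false. */
static bool try_claim_better(struct remote_queue *q, uint64_t best_key)
{
        if (q->best_key >= best_key)            /* lockless hint: nothing better */
                return false;
        if (pthread_mutex_trylock(&q->lock) != 0)
                return false;                   /* busy: skip rather than wait */
        /* Re-check now that the data cannot change under us. */
        if (!q->entries || q->best_key >= best_key) {
                pthread_mutex_unlock(&q->lock);
                return false;
        }
        return true;
}
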
Spurious + * wakeup of idle is okay though. + */ + if (unlikely(preempt && prev != idle && !test_tsk_need_resched(prev))) { + rq->preempt = NULL; + clear_preempt_need_resched(); + rq_unlock_irq(rq); + return; + } + rq->preempt = NULL; + } +#endif switch_count = &prev->nivcsw; if (!preempt && prev->state) { @@ -3431,7 +3989,7 @@ static void __sched notrace __schedule(bool preempt) to_wakeup = wq_worker_sleeping(prev); if (to_wakeup) { /* This shouldn't happen, but does */ - if (unlikely(to_wakeup == prev)) + if (WARN_ONCE((to_wakeup == prev), "Waking up prev as worker\n")) deactivate = false; else try_to_wake_up_local(to_wakeup); @@ -3441,64 +3999,64 @@ static void __sched notrace __schedule(bool preempt) switch_count = &prev->nvcsw; } + /* + * Store the niffy value here for use by the next task's last_ran + * below to avoid losing niffies due to update_clocks being called + * again after this point. + */ update_clocks(rq); + niffies = rq->niffies; update_cpu_clock_switch(rq, prev); if (rq->clock - rq->last_tick > HALF_JIFFY_NS) - rq->dither = false; + rq->dither = 0; else - rq->dither = true; + rq->dither = HALF_JIFFY_US; clear_tsk_need_resched(prev); clear_preempt_need_resched(); - idle = rq->idle; if (idle != prev) { - /* Update all the information stored on struct rq */ - prev->time_slice = rq->rq_time_slice; - prev->deadline = rq->rq_deadline; - check_deadline(prev); - prev->last_ran = rq->clock_task; - return_task(prev, rq, deactivate); + check_deadline(prev, rq); + return_task(prev, rq, cpu, deactivate); } if (unlikely(!queued_notrunning())) { - /* - * This CPU is now truly idle as opposed to when idle is - * scheduled as a high priority task in its own right. - */ next = idle; schedstat_inc(rq, sched_goidle); set_cpuidle_map(cpu); + update_load_avg(rq); } else { next = earliest_deadline_task(rq, cpu, idle); if (likely(next->prio != PRIO_LIMIT)) clear_cpuidle_map(cpu); - else + else { set_cpuidle_map(cpu); + update_load_avg(rq); + } } + set_rq_task(rq, next); + next->last_ran = niffies; + if (likely(prev != next)) { /* * Don't reschedule an idle task or deactivated tasks */ if (prev != idle && !deactivate) resched_suitable_idle(prev); - set_rq_task(rq, next); if (next != idle) check_siblings(rq); else wake_siblings(rq); - grq.nr_switches++; - prev->on_cpu = false; - next->on_cpu = true; + atomic64_inc(&grq.nr_switches); rq->curr = next; ++*switch_count; trace_sched_switch(preempt, prev, next); - rq = context_switch(rq, prev, next); /* unlocks the grq */ + rq = context_switch(rq, prev, next); /* unlocks the rq */ } else { check_siblings(rq); - grq_unlock_irq(); + rq_unlock_irq(rq); } } @@ -3713,13 +4271,12 @@ EXPORT_SYMBOL(default_wake_function); */ void rt_mutex_setprio(struct task_struct *p, int prio) { - unsigned long flags; struct rq *rq; int oldprio; BUG_ON(prio < 0 || prio > MAX_PRIO); - rq = task_grq_lock(p, &flags); + rq = __task_rq_lock(p); /* * Idle task boosting is a nono in general. 
There is one @@ -3742,17 +4299,17 @@ void rt_mutex_setprio(struct task_struct *p, int prio) trace_sched_pi_setprio(p, prio); oldprio = p->prio; p->prio = prio; - if (task_running(p)){ + if (task_running(rq, p)){ if (prio > oldprio) resched_task(p); } else if (task_queued(p)) { - dequeue_task(p); - enqueue_task(p, rq); + dequeue_task(rq, p, DEQUEUE_SAVE); + enqueue_task(rq, p, ENQUEUE_RESTORE); if (prio < oldprio) try_preempt(p, rq); } out_unlock: - task_grq_unlock(p, &flags); + __task_rq_unlock(rq); } #endif @@ -3779,7 +4336,7 @@ void set_user_nice(struct task_struct *p, long nice) * We have to be careful, if called from sys_setpriority(), * the task might be in the middle of scheduling on another CPU. */ - rq = time_task_grq_lock(p, &flags); + rq = task_rq_lock(p, &flags); /* * The RT priorities are set via sched_setscheduler(), but we still * allow the 'normal' nice value to be set - but as expected @@ -3797,17 +4354,17 @@ void set_user_nice(struct task_struct *p, long nice) p->prio = effective_prio(p); if (task_queued(p)) { - dequeue_task(p); - enqueue_task(p, rq); + dequeue_task(rq, p, DEQUEUE_SAVE); + enqueue_task(rq, p, ENQUEUE_RESTORE); if (new_static < old_static) try_preempt(p, rq); - } else if (task_running(p)) { - reset_rq_task(rq, p); + } else if (task_running(rq, p)) { + set_rq_task(rq, p); if (old_static < new_static) resched_task(p); } out_unlock: - task_grq_unlock(p, &flags); + task_rq_unlock(rq, p, &flags); } EXPORT_SYMBOL(set_user_nice); @@ -3878,7 +4435,7 @@ int task_prio(const struct task_struct *p) goto out; /* Convert to ms to avoid overflows */ - delta = NS_TO_MS(p->deadline - grq.niffies); + delta = NS_TO_MS(p->deadline - task_rq(p)->niffies); delta = delta * 40 / ms_longest_deadline_diff(); if (delta > 0 && delta <= 80) prio += delta; @@ -3921,7 +4478,7 @@ static inline struct task_struct *find_process_by_pid(pid_t pid) return pid ? find_task_by_vpid(pid) : current; } -/* Actually do priority change: must hold grq lock. */ +/* Actually do priority change: must hold rq lock. */ static void __setscheduler(struct task_struct *p, struct rq *rq, int policy, int prio, bool keep_boost) { @@ -3948,12 +4505,12 @@ static void __setscheduler(struct task_struct *p, struct rq *rq, int policy, } else p->prio = p->normal_prio; - if (task_running(p)) { - reset_rq_task(rq, p); + if (task_running(rq, p)) { + set_rq_task(rq, p); resched_task(p); } else if (task_queued(p)) { - dequeue_task(p); - enqueue_task(p, rq); + dequeue_task(rq, p, DEQUEUE_SAVE); + enqueue_task(rq, p, ENQUEUE_RESTORE); if (p->prio < oldprio || p->rt_priority > oldrtprio) try_preempt(p, rq); } @@ -4055,7 +4612,7 @@ recheck: case SCHED_ISO: if (policy == SCHED_ISO) goto out; - if (policy == SCHED_NORMAL) + if (policy != SCHED_NORMAL) return -EPERM; break; case SCHED_BATCH: @@ -4092,16 +4649,16 @@ recheck: * make sure no PI-waiters arrive (or leave) while we are * changing the priority of the task: * - * To be able to change p->policy safely, the grunqueue lock must be + * To be able to change p->policy safely, the runqueue lock must be * held. 
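
rt_mutex_setprio() and set_user_nice() above share one pattern for applying a priority change: a task that is queued must be dequeued and re-enqueued so the sorted structure sees its new key, while a task that is currently running only needs a reschedule (or a try_preempt() when it was boosted). The sketch below models just that branch logic; the queue helpers are empty stand-ins, not the patch's skiplist operations, and lower prio numbers mean higher priority as in the kernel.

#include <stdbool.h>

struct toy_task { int prio; bool queued; bool running; bool need_resched; };

/* Empty stand-ins for removing/inserting the task in the sorted run list. */
static void toy_dequeue(struct toy_task *p) { (void)p; }
static void toy_enqueue(struct toy_task *p) { (void)p; }

static void set_priority(struct toy_task *p, int prio)
{
        int oldprio = p->prio;

        p->prio = prio;
        if (p->running) {
                if (prio > oldprio)             /* deprioritised while on cpu */
                        p->need_resched = true;
        } else if (p->queued) {
                toy_dequeue(p);                 /* position depends on the key */
                toy_enqueue(p);
                if (prio < oldprio)             /* boosted: may preempt now */
                        p->need_resched = true; /* stands in for try_preempt() */
        }
}
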
*/ - rq = task_grq_lock(p, &flags); + rq = task_rq_lock(p, &flags); /* * Changing the policy of the stop threads its a very bad idea */ if (p == rq->stop) { - task_grq_unlock(p, &flags); + task_rq_unlock(rq, p, &flags); return -EINVAL; } @@ -4110,22 +4667,20 @@ recheck: */ if (unlikely(policy == p->policy && (!is_rt_policy(policy) || param->sched_priority == p->rt_priority))) { - - task_grq_unlock(p, &flags); + task_rq_unlock(rq, p, &flags); return 0; } /* recheck policy now with rq lock held */ if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { policy = oldpolicy = -1; - task_grq_unlock(p, &flags); + task_rq_unlock(rq, p, &flags); goto recheck; } - update_clocks(rq); p->sched_reset_on_fork = reset_on_fork; __setscheduler(p, rq, policy, param->sched_priority, pi); - task_grq_unlock(p, &flags); + task_rq_unlock(rq, p, &flags); if (pi) rt_mutex_adjust_pi(p); @@ -4625,9 +5180,9 @@ long sched_getaffinity(pid_t pid, cpumask_t *mask) if (retval) goto out_unlock; - grq_lock_irqsave(&flags); + raw_spin_lock_irqsave(&p->pi_lock, flags); cpumask_and(mask, tsk_cpus_allowed(p), cpu_active_mask); - grq_unlock_irqrestore(&flags); + raw_spin_unlock_irqrestore(&p->pi_lock, flags); out_unlock: rcu_read_unlock(); @@ -4642,8 +5197,7 @@ out_unlock: * @len: length in bytes of the bitmask pointed to by user_mask_ptr * @user_mask_ptr: user-space pointer to hold the current cpu mask * - * Return: size of CPU mask copied to user_mask_ptr on success. An - * error code otherwise. + * Return: 0 on success. An error code otherwise. */ SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, unsigned long __user *, user_mask_ptr) @@ -4685,19 +5239,20 @@ SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, SYSCALL_DEFINE0(sched_yield) { struct task_struct *p; + struct rq *rq; p = current; - grq_lock_irq(); + rq = this_rq_lock(); + time_slice_expired(p, rq); schedstat_inc(task_rq(p), yld_count); - requeue_task(p); /* * Since we are going to call schedule() anyway, there's * no need to preempt or enable interrupts: */ - __release(grq.lock); - spin_release(&grq.lock.dep_map, 1, _THIS_IP_); - do_raw_spin_unlock(&grq.lock); + __release(rq->lock); + spin_release(&rq->lock.dep_map, 1, _THIS_IP_); + do_raw_spin_unlock(&rq->lock); sched_preempt_enable_no_resched(); schedule(); @@ -4803,29 +5358,44 @@ EXPORT_SYMBOL(yield); */ int __sched yield_to(struct task_struct *p, bool preempt) { + struct task_struct *rq_p; struct rq *rq, *p_rq; unsigned long flags; int yielded = 0; + local_irq_save(flags); rq = this_rq(); - grq_lock_irqsave(&flags); - if (task_running(p) || p->state) { + +again: + p_rq = task_rq(p); + /* + * If we're the only runnable task on the rq and target rq also + * has only one task, there's absolutely no point in yielding. 
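
The reworked yield_to() in the hunk that follows donates the yielder's remaining timeslice to the target, lets the target inherit the earlier of the two deadlines, and then expires the yielder so it restarts with a fresh slice and a later deadline. The standalone model below shows that exchange; the slice length and the way the yielder's new deadline is derived are simplifications of the patch's timeslice()/time_slice_expired(), not the real formulas.

#include <stdio.h>
#include <stdint.h>

#define TIMESLICE_US    6000    /* assumed full slice */

struct toy_task { int time_slice; uint64_t deadline; };

static void yield_to(struct toy_task *from, struct toy_task *to, uint64_t now)
{
        if (to->deadline > from->deadline)
                to->deadline = from->deadline;          /* inherit earlier deadline */
        to->time_slice += from->time_slice;
        if (to->time_slice > TIMESLICE_US)
                to->time_slice = TIMESLICE_US;          /* never exceed a full slice */
        /* The yielder starts over: fresh slice, deadline pushed past "now". */
        from->time_slice = TIMESLICE_US;
        from->deadline = now + TIMESLICE_US;
}

int main(void)
{
        struct toy_task a = { 2500, 100 }, b = { 1000, 400 };

        yield_to(&a, &b, 1000);
        printf("target: slice=%dus deadline=%llu\n", b.time_slice,
               (unsigned long long)b.deadline);
        return 0;
}
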
+ */ + if (task_running(p_rq, p) || p->state) { yielded = -ESRCH; - goto out_unlock; + goto out_irq; + } + + double_rq_lock(rq, p_rq); + if (unlikely(task_rq(p) != p_rq)) { + double_rq_unlock(rq, p_rq); + goto again; } - p_rq = task_rq(p); yielded = 1; - if (p->deadline > rq->rq_deadline) - p->deadline = rq->rq_deadline; - p->time_slice += rq->rq_time_slice; - rq->rq_time_slice = 0; + rq_p = rq->curr; + if (p->deadline > rq_p->deadline) + p->deadline = rq_p->deadline; + p->time_slice += rq_p->time_slice; if (p->time_slice > timeslice()) p->time_slice = timeslice(); + time_slice_expired(rq_p, rq); if (preempt && rq != p_rq) - resched_curr(p_rq); -out_unlock: - grq_unlock_irqrestore(&flags); + resched_task(p_rq->curr); + double_rq_unlock(rq, p_rq); +out_irq: + local_irq_restore(flags); if (yielded > 0) schedule(); @@ -4931,8 +5501,9 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, struct task_struct *p; unsigned int time_slice; unsigned long flags; - int retval; struct timespec t; + struct rq *rq; + int retval; if (pid < 0) return -EINVAL; @@ -4947,9 +5518,9 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, if (retval) goto out_unlock; - grq_lock_irqsave(&flags); + rq = task_rq_lock(p, &flags); time_slice = p->policy == SCHED_FIFO ? 0 : MS_TO_NS(task_timeslice(p)); - grq_unlock_irqrestore(&flags); + task_rq_unlock(rq, p, &flags); rcu_read_unlock(); t = ns_to_timespec(time_slice); @@ -5049,9 +5620,21 @@ void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_ma void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) { + struct rq *rq = task_rq(p); + + lockdep_assert_held(&p->pi_lock); + cpumask_copy(tsk_cpus_allowed(p), new_mask); + + if (task_queued(p)) { + /* + * Because __kthread_bind() calls this on blocked tasks without + * holding rq->lock. + */ + lockdep_assert_held(&rq->lock); + } if (needs_other_cpu(p, task_cpu(p))) - set_task_cpu(p, cpumask_any(tsk_cpus_allowed(p))); + set_task_cpu(p, valid_task_cpu(p)); } #endif @@ -5069,8 +5652,8 @@ void init_idle(struct task_struct *idle, int cpu) unsigned long flags; raw_spin_lock_irqsave(&idle->pi_lock, flags); - time_lock_grq(rq); - idle->last_ran = rq->clock_task; + raw_spin_lock(&rq->lock); + idle->last_ran = rq->niffies; idle->state = TASK_RUNNING; /* Setting prio to illegal value shouldn't matter when never queued */ idle->prio = PRIO_LIMIT; @@ -5097,8 +5680,8 @@ void init_idle(struct task_struct *idle, int cpu) rcu_read_unlock(); rq->curr = rq->idle = idle; - idle->on_cpu = 1; - grq_unlock(); + idle->on_rq = TASK_ON_RQ_QUEUED; + raw_spin_unlock(&rq->lock); raw_spin_unlock_irqrestore(&idle->pi_lock, flags); /* Set the preempt count _outside_ the spinlocks! */ @@ -5136,59 +5719,14 @@ int task_can_attach(struct task_struct *p, return ret; } -void wake_q_add(struct wake_q_head *head, struct task_struct *task) -{ - struct wake_q_node *node = &task->wake_q; - - /* - * Atomically grab the task, if ->wake_q is !nil already it means - * its already queued (either by us or someone else) and will get the - * wakeup due to that. - * - * This cmpxchg() implies a full barrier, which pairs with the write - * barrier implied by the wakeup in wake_up_q(). - */ - if (cmpxchg(&node->next, NULL, WAKE_Q_TAIL)) - return; - - get_task_struct(task); - - /* - * The head is context local, there can be no concurrency. 
- */ - *head->lastp = node; - head->lastp = &node->next; -} - -void wake_up_q(struct wake_q_head *head) -{ - struct wake_q_node *node = head->first; - - while (node != WAKE_Q_TAIL) { - struct task_struct *task; - - task = container_of(node, struct task_struct, wake_q); - BUG_ON(!task); - /* task can safely be re-inserted now */ - node = node->next; - task->wake_q.next = NULL; - - /* - * wake_up_process() implies a wmb() to pair with the queueing - * in wake_q_add() so as not to miss wakeups. - */ - wake_up_process(task); - put_task_struct(task); - } -} - void resched_cpu(int cpu) { + struct rq *rq = cpu_rq(cpu); unsigned long flags; - grq_lock_irqsave(&flags); + rq_lock_irqsave(rq, &flags); resched_task(cpu_curr(cpu)); - grq_unlock_irqrestore(&flags); + rq_unlock_irqrestore(rq, &flags); } #ifdef CONFIG_SMP @@ -5262,7 +5800,7 @@ int get_nohz_timer_target(void) continue; if (!idle_cpu(i) && is_housekeeping_cpu(i)) { - cpu = i; + cpu = i; cpu = i; goto unlock; } @@ -5291,8 +5829,10 @@ void wake_up_idle_cpu(int cpu) if (cpu == smp_processor_id()) return; - set_tsk_need_resched(cpu_rq(cpu)->idle); - smp_send_reschedule(cpu); + if (set_nr_and_not_polling(cpu_rq(cpu)->idle)) + smp_send_reschedule(cpu); + else + trace_sched_wake_idle_without_ipi(cpu); } void wake_up_nohz_cpu(int cpu) @@ -5321,7 +5861,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, struct rq *rq; int ret = 0; - rq = task_grq_lock(p, &flags); + rq = task_rq_lock(p, &flags); if (p->flags & PF_KTHREAD) { /* @@ -5366,22 +5906,30 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, if (cpumask_test_cpu(task_cpu(p), new_mask)) goto out; - if (task_running(p)) { + if (task_running(rq, p)) { /* Task is running on the wrong cpu now, reschedule it. */ if (rq == this_rq()) { set_tsk_need_resched(p); running_wrong = true; } else resched_task(p); - } else - set_task_cpu(p, cpumask_any_and(cpu_valid_mask, new_mask)); + } else { + int dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask); + struct rq *dest_rq = cpu_rq(dest_cpu); + + /* Switch rq locks here */ + lock_second_rq(rq, dest_rq); + set_task_cpu(p, dest_cpu); + rq_unlock(rq); + rq = dest_rq; + } out: if (queued && !cpumask_subset(new_mask, &old_mask)) try_preempt(p, rq); if (running_wrong) preempt_disable(); - task_grq_unlock(p, &flags); + task_rq_unlock(rq, p, &flags); if (running_wrong) { __schedule(true); @@ -5397,11 +5945,12 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) } EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); -static bool sched_smp_initialized __read_mostly; - #ifdef CONFIG_HOTPLUG_CPU -/* Run through task list and find tasks affined to the dead cpu, then remove - * that cpu from the list, enable cpu0 and set the zerobound flag. */ +/* + * Run through task list and find tasks affined to the dead cpu, then remove + * that cpu from the list, enable cpu0 and set the zerobound flag. Must hold + * cpu 0 and src_cpu's runqueue locks. 
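
Several paths above now have to hold two runqueue locks at once: __set_cpus_allowed_ptr() switches from the source to the destination runqueue lock when it moves a task, and bind_zero() is documented as needing both cpu 0's and the source cpu's locks. The usual way to make such double locking deadlock-free is to impose a single global order; the sketch below does it by cpu index, in the spirit of the patch's double_rq_lock()/lock_second_rq() helpers, whose exact ordering rule is not shown in this hunk, so treat the rule here as an assumption.

#include <pthread.h>

struct toy_rq { int cpu; pthread_mutex_t lock; };

static void toy_double_rq_lock(struct toy_rq *a, struct toy_rq *b)
{
        if (a == b) {
                pthread_mutex_lock(&a->lock);
                return;
        }
        /* Always take the lower-numbered cpu's lock first. */
        if (a->cpu < b->cpu) {
                pthread_mutex_lock(&a->lock);
                pthread_mutex_lock(&b->lock);
        } else {
                pthread_mutex_lock(&b->lock);
                pthread_mutex_lock(&a->lock);
        }
}

static void toy_double_rq_unlock(struct toy_rq *a, struct toy_rq *b)
{
        pthread_mutex_unlock(&a->lock);
        if (a != b)
                pthread_mutex_unlock(&b->lock);
}

Because every path agrees on the same order, two CPUs migrating tasks in opposite directions cannot each hold one lock while waiting for the other.
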
+ */ static void bind_zero(int src_cpu) { struct task_struct *p, *t; @@ -5412,15 +5961,17 @@ static void bind_zero(int src_cpu) do_each_thread(t, p) { if (cpumask_test_cpu(src_cpu, tsk_cpus_allowed(p))) { - cpumask_clear_cpu(src_cpu, tsk_cpus_allowed(p)); - cpumask_set_cpu(0, tsk_cpus_allowed(p)); + bool local = (task_cpu(p) == src_cpu); + + /* task_running is the cpu stopper thread */ + if (local && task_running(task_rq(p), p)) + continue; + atomic_clear_cpu(src_cpu, tsk_cpus_allowed(p)); + atomic_set_cpu(0, tsk_cpus_allowed(p)); p->zerobound = true; bound++; - if (task_cpu(p) == src_cpu) { + if (local) set_task_cpu(p, 0); - if (task_running(p)) - resched_task(p); - } } } while_each_thread(t, p); @@ -5834,7 +6385,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd) struct root_domain *old_rd = NULL; unsigned long flags; - grq_lock_irqsave(&flags); + rq_lock_irqsave(rq, &flags); if (rq->rd) { old_rd = rq->rd; @@ -5860,7 +6411,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd) if (cpumask_test_cpu(rq->cpu, cpu_active_mask)) set_rq_online(rq); - grq_unlock_irqrestore(&flags); + rq_unlock_irqrestore(rq, &flags); if (old_rd) call_rcu_sched(&old_rd->rcu, free_rootdomain); @@ -6839,14 +7390,13 @@ int sched_cpu_activate(unsigned int cpu) * 2) At runtime, if cpuset_cpu_active() fails to rebuild the * domains. */ - grq_lock_irqsave(&flags); + rq_lock_irqsave(rq, &flags); if (rq->rd) { BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); set_rq_online(rq); } unbind_zero(cpu); - grq.noc = num_online_cpus(); - grq_unlock_irqrestore(&flags); + rq_unlock_irqrestore(rq, &flags); return 0; } @@ -6894,14 +7444,15 @@ int sched_cpu_dying(unsigned int cpu) struct rq *rq = cpu_rq(cpu); unsigned long flags; - grq_lock_irqsave(&flags); + local_irq_save(flags); + double_rq_lock(rq, cpu_rq(0)); if (rq->rd) { BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); set_rq_offline(rq); } bind_zero(cpu); - grq.noc = num_online_cpus(); - grq_unlock_irqrestore(&flags); + double_rq_unlock(rq, cpu_rq(0)); + local_irq_restore(flags); return 0; } @@ -6958,9 +7509,8 @@ void __init sched_init_smp(void) #ifdef CONFIG_SCHED_SMT bool smt_threads = false; #endif - struct rq *rq; - cpumask_var_t non_isolated_cpus; + struct rq *rq; alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); alloc_cpumask_var(&fallback_doms, GFP_KERNEL); @@ -6985,7 +7535,8 @@ void __init sched_init_smp(void) free_cpumask_var(non_isolated_cpus); mutex_lock(&sched_domains_mutex); - grq_lock_irq(); + local_irq_disable(); + lock_all_rqs(); /* * Set up the relative cache distance of each online cpu from each * other in a simple array for quick lookup. 
Locality is determined @@ -7025,21 +7576,21 @@ void __init sched_init_smp(void) } #endif #ifdef CONFIG_SCHED_SMT - for_each_cpu(other_cpu, thread_cpumask(cpu)) - rq->cpu_locality[other_cpu] = 1; if (cpumask_weight(thread_cpumask(cpu)) > 1) { cpumask_copy(&rq->thread_mask, thread_cpumask(cpu)); cpumask_clear_cpu(cpu, &rq->thread_mask); + for_each_cpu(other_cpu, thread_cpumask(cpu)) + rq->cpu_locality[other_cpu] = 1; rq->siblings_idle = siblings_cpu_idle; smt_threads = true; } #endif } for_each_possible_cpu(cpu) { - int total_cpus = 0, locality; + int total_cpus = 1, locality; rq = cpu_rq(cpu); - for (locality = 0; locality <= 4; locality++) { + for (locality = 1; locality <= 4; locality++) { for_each_possible_cpu(other_cpu) { if (rq->cpu_locality[other_cpu] == locality) rq->rq_order[total_cpus++] = cpu_rq(other_cpu); @@ -7053,7 +7604,8 @@ void __init sched_init_smp(void) smt_schedule = &smt_should_schedule; } #endif - grq_unlock_irq(); + unlock_all_rqs(); + local_irq_enable(); mutex_unlock(&sched_domains_mutex); for_each_online_cpu(cpu) { @@ -7062,7 +7614,7 @@ void __init sched_init_smp(void) for_each_online_cpu(other_cpu) { if (other_cpu <= cpu) continue; - printk(KERN_DEBUG "BFS LOCALITY CPU %d to %d: %d\n", cpu, other_cpu, rq->cpu_locality[other_cpu]); + printk(KERN_DEBUG "MuQSS locality CPU %d to %d: %d\n", cpu, other_cpu, rq->cpu_locality[other_cpu]); } } sched_smp_initialized = true; @@ -7116,21 +7668,14 @@ void __init sched_init(void) for (i = 1 ; i < NICE_WIDTH ; i++) prio_ratios[i] = prio_ratios[i - 1] * 11 / 10; - raw_spin_lock_init(&grq.lock); - grq.nr_running = grq.nr_uninterruptible = grq.nr_switches = 0; - grq.niffies = 0; - grq.last_jiffy = jiffies; - raw_spin_lock_init(&grq.iso_lock); - grq.iso_ticks = 0; - grq.iso_refractory = false; - grq.noc = 1; - skiplist_init(&grq.node); - grq.sl = new_skiplist(&grq.node); + atomic_set(&grq.nr_running, 0); + atomic_set(&grq.nr_uninterruptible, 0); + atomic64_set(&grq.nr_switches, 0); skiplist_node_init(&init_task.node); #ifdef CONFIG_SMP init_defrootdomain(); - grq.qnr = grq.idle_cpus = 0; + atomic_set(&grq.qnr, 0); cpumask_clear(&grq.cpu_idle_map); #else uprq = &per_cpu(runqueues, 0); @@ -7145,12 +7690,18 @@ void __init sched_init(void) #endif /* CONFIG_CGROUP_SCHED */ for_each_possible_cpu(i) { rq = cpu_rq(i); - rq->grq_lock = &grq.lock; - rq->user_pc = rq->nice_pc = rq->softirq_pc = rq->system_pc = - rq->iowait_pc = rq->idle_pc = 0; - rq->dither = false; + skiplist_init(&rq->node); + rq->sl = new_skiplist(&rq->node); + raw_spin_lock_init(&rq->lock); + rq->clock = rq->old_clock = rq->last_niffy = rq->niffies = 0; + rq->last_jiffy = jiffies; + rq->user_ns = rq->nice_ns = rq->softirq_ns = rq->system_ns = + rq->iowait_ns = rq->idle_ns = 0; + rq->dither = 0; + set_rq_task(rq, &init_task); + rq->iso_ticks = 0; + rq->iso_refractory = false; #ifdef CONFIG_SMP - rq->last_niffy = 0; rq->sd = NULL; rq->rd = NULL; rq->online = false; @@ -7302,9 +7853,9 @@ static inline void normalise_rt_tasks(void) if (!rt_task(p) && !iso_task(p)) continue; - rq = task_grq_lock(p, &flags); + rq = task_rq_lock(p, &flags); __setscheduler(p, rq, SCHED_NORMAL, 0, false); - task_grq_unlock(p, &flags); + task_rq_unlock(rq, p, &flags); } read_unlock(&tasklist_lock); } @@ -7367,6 +7918,25 @@ void set_curr_task(int cpu, struct task_struct *p) /* * Use precise platform statistics if available: */ +#ifdef CONFIG_VIRT_CPU_ACCOUNTING + +#ifndef __ARCH_HAS_VTIME_TASK_SWITCH +void vtime_common_task_switch(struct task_struct *prev) +{ + if (is_idle_task(prev)) + 
vtime_account_idle(prev); + else + vtime_account_system(prev); + +#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE + vtime_account_user(prev); +#endif + arch_vtime_task_switch(prev); +} +#endif + +#endif /* CONFIG_VIRT_CPU_ACCOUNTING */ + #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) { @@ -7395,20 +7965,26 @@ void vtime_account_system_irqsafe(struct task_struct *tsk) } EXPORT_SYMBOL_GPL(vtime_account_system_irqsafe); -#ifndef __ARCH_HAS_VTIME_TASK_SWITCH -void vtime_task_switch(struct task_struct *prev) -{ - if (is_idle_task(prev)) - vtime_account_idle(prev); +/* + * Archs that account the whole time spent in the idle task + * (outside irq) as idle time can rely on this and just implement + * vtime_account_system() and vtime_account_idle(). Archs that + * have other meaning of the idle time (s390 only includes the + * time spent by the CPU when it's in low power mode) must override + * vtime_account(). + */ +#ifndef __ARCH_HAS_VTIME_ACCOUNT +void vtime_account_irq_enter(struct task_struct *tsk) +{ + if (!in_interrupt() && is_idle_task(tsk)) + vtime_account_idle(tsk); else - vtime_account_system(prev); - - vtime_account_user(prev); - arch_vtime_task_switch(prev); + vtime_account_system(tsk); } -#endif +EXPORT_SYMBOL_GPL(vtime_account_irq_enter); +#endif /* __ARCH_HAS_VTIME_ACCOUNT */ -#else +#else /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ /* * Perform (stime * rtime) / total, but avoid multiplication overflow by * losing precision when the numbers are big. @@ -7530,7 +8106,7 @@ void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime thread_group_cputime(p, &cputime); cputime_adjust(&cputime, &p->signal->prev_cputime, ut, st); } -#endif +#endif /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ void init_idle_bootup_task(struct task_struct *idle) {} diff --git a/kernel/sched/bfs_sched.h b/kernel/sched/MuQSS.h index 00a16ba0a..4e3115dac 100644 --- a/kernel/sched/bfs_sched.h +++ b/kernel/sched/MuQSS.h @@ -1,9 +1,14 @@ #include <linux/sched.h> #include <linux/cpuidle.h> +#include <linux/skip_list.h> #include <linux/stop_machine.h> -#ifndef BFS_SCHED_H -#define BFS_SCHED_H +#ifndef MUQSS_SCHED_H +#define MUQSS_SCHED_H + +/* task_struct::on_rq states: */ +#define TASK_ON_RQ_QUEUED 1 +#define TASK_ON_RQ_MIGRATING 2 /* * This is the main, per-CPU runqueue data structure. 
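
A little earlier in this hunk the cputime adjustment code is introduced with the comment about performing (stime * rtime) / total while avoiding multiplication overflow by sacrificing precision. The idea is that only the ratio stime/total matters, so low-order bits can be shed from both until the product fits in 64 bits. The function below is a simplified standalone version of that idea; it assumes total >= stime (total being stime plus utime) and is not the kernel's scale_stime(), which is more careful about where it drops bits.

#include <stdio.h>
#include <stdint.h>

/* Scale stime by rtime/total without 128-bit arithmetic. Assumes total >= stime. */
static uint64_t scale_time(uint64_t stime, uint64_t rtime, uint64_t total)
{
        if (!total)
                return rtime;
        /*
         * Shed low bits from stime and total together: their ratio is
         * approximately preserved while the product stime * rtime shrinks
         * until it can no longer overflow 64 bits.
         */
        while (rtime && stime > UINT64_MAX / rtime) {
                stime >>= 1;
                total >>= 1;
        }
        return stime * rtime / total;
}

int main(void)
{
        /* Two thirds of the measured runtime was system time. */
        printf("%llu\n", (unsigned long long)scale_time(200, 900, 300));
        return 0;
}
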
@@ -13,16 +18,21 @@ struct rq { struct task_struct *curr, *idle, *stop; struct mm_struct *prev_mm; - /* Pointer to grq spinlock */ - raw_spinlock_t *grq_lock; + raw_spinlock_t lock; - /* Stored data about rq->curr to work outside grq lock */ + /* Stored data about rq->curr to work outside rq lock */ u64 rq_deadline; - unsigned int rq_policy; - int rq_time_slice; - u64 rq_last_ran; int rq_prio; - int soft_affined; /* Running or queued tasks with this set as their rq */ + + /* Best queued id for use outside lock */ + u64 best_key; + + unsigned long last_scheduler_tick; /* Last jiffy this RQ ticked */ + unsigned long last_jiffy; /* Last jiffy this RQ updated rq clock */ + u64 niffies; /* Last time this RQ updated rq clock */ + u64 last_niffy; /* Last niffies as updated by local clock */ + u64 last_jiffy_niffies; /* Niffies @ last_jiffy */ + u64 load_update; /* When we last updated load */ unsigned long load_avg; /* Rolling load average */ #ifdef CONFIG_SMT_NICE @@ -30,12 +40,15 @@ struct rq { int rq_smt_bias; /* Policy/nice level bias across smt siblings */ #endif /* Accurate timekeeping data */ - u64 timekeep_clock; - unsigned long user_pc, nice_pc, irq_pc, softirq_pc, system_pc, - iowait_pc, idle_pc; + unsigned long user_ns, nice_ns, irq_ns, softirq_ns, system_ns, + iowait_ns, idle_ns; atomic_t nr_iowait; + skiplist_node node; + skiplist *sl; #ifdef CONFIG_SMP + struct task_struct *preempt; /* Preempt triggered on this task */ + int cpu; /* cpu of this runqueue */ bool online; @@ -43,6 +56,7 @@ struct rq { struct sched_domain *sd; int *cpu_locality; /* CPU relative cache distance */ struct rq **rq_order; /* RQs ordered by relative cache distance */ + #ifdef CONFIG_SCHED_SMT cpumask_t thread_mask; bool (*siblings_idle)(struct rq *rq); @@ -53,7 +67,6 @@ struct rq { bool (*cache_idle)(struct rq *rq); /* See if all cache siblings are idle */ #endif /* CONFIG_SCHED_MC */ - u64 last_niffy; /* Last time this RQ updated grq.niffies */ #endif /* CONFIG_SMP */ #ifdef CONFIG_IRQ_TIME_ACCOUNTING u64 prev_irq_time; @@ -67,7 +80,10 @@ struct rq { u64 clock, old_clock, last_tick; u64 clock_task; - bool dither; + int dither; + + int iso_ticks; + bool iso_refractory; #ifdef CONFIG_SCHEDSTATS @@ -88,6 +104,11 @@ struct rq { unsigned int ttwu_count; unsigned int ttwu_local; #endif /* CONFIG_SCHEDSTATS */ + +#ifdef CONFIG_SMP + struct llist_head wake_list; +#endif + #ifdef CONFIG_CPU_IDLE /* Must be inspected within a rcu lock section */ struct cpuidle_state *idle_state; @@ -111,6 +132,41 @@ DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); #define raw_rq() raw_cpu_ptr(&runqueues) #endif /* CONFIG_SMP */ +/* + * {de,en}queue flags: + * + * DEQUEUE_SLEEP - task is no longer runnable + * ENQUEUE_WAKEUP - task just became runnable + * + * SAVE/RESTORE - an otherwise spurious dequeue/enqueue, done to ensure tasks + * are in a known state which allows modification. Such pairs + * should preserve as much state as possible. + * + * MOVE - paired with SAVE/RESTORE, explicitly does not preserve the location + * in the runqueue. 
+ * + * ENQUEUE_HEAD - place at front of runqueue (tail if not specified) + * ENQUEUE_REPLENISH - CBS (replenish runtime and postpone deadline) + * ENQUEUE_MIGRATED - the task was migrated during wakeup + * + */ + +#define DEQUEUE_SLEEP 0x01 +#define DEQUEUE_SAVE 0x02 /* matches ENQUEUE_RESTORE */ +#define DEQUEUE_MOVE 0x04 /* matches ENQUEUE_MOVE */ + +#define ENQUEUE_WAKEUP 0x01 +#define ENQUEUE_RESTORE 0x02 +#define ENQUEUE_MOVE 0x04 + +#define ENQUEUE_HEAD 0x08 +#define ENQUEUE_REPLENISH 0x10 +#ifdef CONFIG_SMP +#define ENQUEUE_MIGRATED 0x20 +#else +#define ENQUEUE_MIGRATED 0x00 +#endif + static inline u64 __rq_clock_broken(struct rq *rq) { return READ_ONCE(rq->clock); @@ -118,13 +174,13 @@ static inline u64 __rq_clock_broken(struct rq *rq) static inline u64 rq_clock(struct rq *rq) { - lockdep_assert_held(rq->grq_lock); + lockdep_assert_held(&rq->lock); return rq->clock; } static inline u64 rq_clock_task(struct rq *rq) { - lockdep_assert_held(rq->grq_lock); + lockdep_assert_held(&rq->lock); return rq->clock_task; } @@ -157,17 +213,11 @@ static inline void unregister_sched_domain_sysctl(void) } #endif -static inline void sched_ttwu_pending(void) { } - -static inline int task_on_rq_queued(struct task_struct *p) -{ - return p->on_rq; -} - #ifdef CONFIG_SMP - +extern void sched_ttwu_pending(void); extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask); - +#else +static inline void sched_ttwu_pending(void) { } #endif #ifdef CONFIG_CPU_IDLE @@ -221,4 +271,4 @@ static inline void cpufreq_trigger(u64 time, unsigned long util) #define arch_scale_freq_invariant() (false) #endif -#endif /* BFS_SCHED_H */ +#endif /* MUQSS_SCHED_H */ diff --git a/kernel/sched/cpufreq.c b/kernel/sched/cpufreq.c index 2ebd7b0c4..2a644b6be 100644 --- a/kernel/sched/cpufreq.c +++ b/kernel/sched/cpufreq.c @@ -9,8 +9,8 @@ * published by the Free Software Foundation. */ -#ifdef CONFIG_SCHED_BFS -#include "bfs_sched.h" +#ifdef CONFIG_SCHED_MUQSS +#include "MuQSS.h" #else #include "sched.h" #endif diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index eba226d7c..a2bf1ea69 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -16,8 +16,8 @@ #include <linux/slab.h> #include <trace/events/power.h> -#ifdef CONFIG_SCHED_BFS -#include "bfs_sched.h" +#ifdef CONFIG_SCHED_MUQSS +#include "MuQSS.h" #else #include "sched.h" #endif diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 1e855dcbd..060b76d85 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -14,8 +14,8 @@ #include <trace/events/power.h> -#ifdef CONFIG_SCHED_BFS -#include "bfs_sched.h" +#ifdef CONFIG_SCHED_MUQSS +#include "MuQSS.h" #else #include "sched.h" #endif diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c index 7466a0bb2..ba7b137c3 100644 --- a/kernel/sched/stats.c +++ b/kernel/sched/stats.c @@ -4,10 +4,10 @@ #include <linux/seq_file.h> #include <linux/proc_fs.h> -#ifndef CONFIG_SCHED_BFS +#ifndef CONFIG_SCHED_MUQSS #include "sched.h" #else -#include "bfs_sched.h" +#include "MuQSS.h" #endif /* |