path: root/kernel/sched/MuQSS.c
Diffstat (limited to 'kernel/sched/MuQSS.c')
-rw-r--r--  kernel/sched/MuQSS.c | 799
1 file changed, 227 insertions(+), 572 deletions(-)
diff --git a/kernel/sched/MuQSS.c b/kernel/sched/MuQSS.c
index bd9b1f13f..cf7a95286 100644
--- a/kernel/sched/MuQSS.c
+++ b/kernel/sched/MuQSS.c
@@ -123,6 +123,7 @@
*/
#define JIFFIES_TO_NS(TIME) ((TIME) * (1073741824 / HZ))
#define JIFFY_NS (1073741824 / HZ)
+#define JIFFY_US (1048576 / HZ)
#define NS_TO_JIFFIES(TIME) ((TIME) / JIFFY_NS)
#define HALF_JIFFY_NS (1073741824 / HZ / 2)
#define HALF_JIFFY_US (1048576 / HZ / 2)
@@ -130,12 +131,13 @@
#define MS_TO_US(TIME) ((TIME) << 10)
#define NS_TO_MS(TIME) ((TIME) >> 20)
#define NS_TO_US(TIME) ((TIME) >> 10)
+#define US_TO_NS(TIME) ((TIME) << 10)
#define RESCHED_US (100) /* Reschedule if less than this many μs left */
void print_scheduler_version(void)
{
- printk(KERN_INFO "MuQSS CPU scheduler v0.115 by Con Kolivas.\n");
+ printk(KERN_INFO "MuQSS CPU scheduler v0.120 by Con Kolivas.\n");
}
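/*
 * Illustrative note (not part of the patch): the time-conversion macros above
 * trade a little accuracy for speed by using power-of-two scale factors, so
 * they compile down to shifts and divides by constants: 1073741824 is 2^30
 * (~1e9 ns per second), 1048576 is 2^20 (~1e6 us per second), and << 10 /
 * >> 10 stand in for *1000 and /1000. A minimal user-space sketch of the
 * arithmetic, assuming HZ=1000:
 */
#include <stdio.h>

#define HZ 1000
#define JIFFY_NS (1073741824 / HZ)      /* ~1073741 "ns", vs exactly 1000000 */
#define JIFFY_US (1048576 / HZ)         /* ~1048 "us", vs exactly 1000 */
#define US_TO_NS(t) ((t) << 10)         /* ~*1000: 100 us -> 102400 "ns" */

int main(void)
{
	printf("JIFFY_NS=%d JIFFY_US=%d US_TO_NS(100)=%d\n",
	       JIFFY_NS, JIFFY_US, US_TO_NS(100));
	return 0;
}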
/*
@@ -179,9 +181,26 @@ static inline int timeslice(void)
return MS_TO_US(rr_interval);
}
+static bool sched_smp_initialized __read_mostly;
+
+/*
+ * The global runqueue data that all CPUs work off. Contains atomic
+ * variables and a cpu bitmap set atomically.
+ */
+struct global_rq {
#ifdef CONFIG_SMP
-static cpumask_t cpu_idle_map ____cacheline_aligned_in_smp;
+ atomic_t nr_running ____cacheline_aligned_in_smp;
+ atomic_t nr_uninterruptible ____cacheline_aligned_in_smp;
+ atomic64_t nr_switches ____cacheline_aligned_in_smp;
+ cpumask_t cpu_idle_map ____cacheline_aligned_in_smp;
+#else
+ atomic_t nr_running ____cacheline_aligned;
+ atomic_t nr_uninterruptible ____cacheline_aligned;
+ atomic64_t nr_switches ____cacheline_aligned;
+#endif
+};
+#ifdef CONFIG_SMP
/*
* We add the notion of a root-domain which will be used to define per-domain
* variables. Each exclusive cpuset essentially defines an island domain by
@@ -213,6 +232,13 @@ static struct root_domain def_root_domain;
#endif /* CONFIG_SMP */
+/* There can be only one */
+#ifdef CONFIG_SMP
+static struct global_rq grq ____cacheline_aligned_in_smp;
+#else
+static struct global_rq grq ____cacheline_aligned;
+#endif
+
static DEFINE_MUTEX(sched_hotcpu_mutex);
/* cpus with isolated domains */
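/*
 * Illustrative sketch (not part of the patch): the per-runqueue
 * nr_running/nr_uninterruptible/nr_switches counters are folded into one
 * cache-aligned global struct that activate_task(), deactivate_task() and
 * __schedule() update atomically, so the externally visible statistics
 * become single atomic reads instead of per-CPU sums. A minimal user-space
 * model of that idea, using C11 atomics rather than the kernel's atomic_t:
 */
#include <stdatomic.h>
#include <stdio.h>

static struct {
	atomic_int nr_running;
	atomic_int nr_uninterruptible;
	atomic_llong nr_switches;
} grq_model;

static void model_activate(int contributes_to_load)
{
	if (contributes_to_load)
		atomic_fetch_sub(&grq_model.nr_uninterruptible, 1);
	atomic_fetch_add(&grq_model.nr_running, 1);
}

static int model_nr_active(void)
{
	return atomic_load(&grq_model.nr_running) +
	       atomic_load(&grq_model.nr_uninterruptible);
}

int main(void)
{
	atomic_fetch_add(&grq_model.nr_uninterruptible, 1);  /* task went to sleep */
	model_activate(1);                                   /* ...and is woken */
	atomic_fetch_add(&grq_model.nr_switches, 1);
	printf("nr_active=%d switches=%lld\n", model_nr_active(),
	       (long long)atomic_load(&grq_model.nr_switches));
	return 0;
}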
@@ -768,7 +794,6 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
*/
if (unlikely(task_on_rq_migrating(prev))) {
sched_info_dequeued(rq, prev);
- rq->nr_running--;
/*
* We move the ownership of prev to the new cpu now. ttwu can't
* activate prev to the wrong cpu since it has to grab this
@@ -779,7 +804,6 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
raw_spin_lock(&prev->pi_lock);
rq = __task_rq_lock(prev);
- rq->nr_running++;
/* Check that someone else hasn't already queued prev */
if (likely(!task_queued(prev))) {
enqueue_task(rq, prev, 0);
@@ -834,7 +858,7 @@ static inline int ms_longest_deadline_diff(void)
static inline int rq_load(struct rq *rq)
{
- return rq->nr_running;
+ return rq->sl->entries + !rq_idle(rq);
}
static inline bool rq_local(struct rq *rq);
@@ -848,20 +872,24 @@ static inline bool rq_local(struct rq *rq);
*/
static void update_load_avg(struct rq *rq)
{
- /* rq clock can go backwards so skip update if that happens */
- if (likely(rq->clock > rq->load_update)) {
- unsigned long us_interval = (rq->clock - rq->load_update) >> 10;
- long load, curload = rq_load(rq);
+ unsigned long us_interval;
+ long load, curload;
- load = rq->load_avg - (rq->load_avg * us_interval * 5 / 262144);
- if (unlikely(load < 0))
- load = 0;
- load += curload * curload * SCHED_CAPACITY_SCALE * us_interval * 5 / 262144;
- rq->load_avg = load;
- } else
+ if (unlikely(rq->niffies <= rq->load_update))
return;
- rq->load_update = rq->clock;
+ us_interval = NS_TO_US(rq->niffies - rq->load_update);
+ curload = rq_load(rq);
+ load = rq->load_avg - (rq->load_avg * us_interval * 5 / 262144);
+ if (unlikely(load < 0))
+ load = 0;
+ load += curload * curload * SCHED_CAPACITY_SCALE * us_interval * 5 / 262144;
+ /* If this CPU has all the load, make it ramp up quickly */
+ if (curload > load && curload >= atomic_read(&grq.nr_running))
+ load = curload;
+ rq->load_avg = load;
+
+ rq->load_update = rq->niffies;
if (likely(rq_local(rq)))
cpufreq_trigger(rq->niffies, rq->load_avg);
}
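/*
 * Illustrative arithmetic (not part of the patch): in the rewritten
 * update_load_avg() the decay factor us_interval * 5 / 262144 uses
 * 262144 = 2^18, i.e. roughly 1.9% of the old average per millisecond, so
 * the old value is fully replaced after about 52 ms; the new contribution
 * weights the squared runqueue load by SCHED_CAPACITY_SCALE (1024 in
 * mainline, assumed here) over the same window. A runnable check of one
 * 1 ms update with two runnable tasks:
 */
#include <stdio.h>

int main(void)
{
	long load_avg = 1024, curload = 2, load;
	unsigned long us_interval = 1000;   /* 1 ms since the last update */

	/* old average loses us_interval * 5 / 2^18, ~1.9% per millisecond */
	load = load_avg - (load_avg * us_interval * 5 / 262144);
	/* new contribution: curload^2 * SCHED_CAPACITY_SCALE over the window */
	load += curload * curload * 1024 * us_interval * 5 / 262144;
	printf("load_avg: %ld -> %ld\n", load_avg, load);
	return 0;
}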
@@ -1085,7 +1113,7 @@ static inline void atomic_set_cpu(int cpu, cpumask_t *cpumask)
static inline void set_cpuidle_map(int cpu)
{
if (likely(cpu_online(cpu)))
- atomic_set_cpu(cpu, &cpu_idle_map);
+ atomic_set_cpu(cpu, &grq.cpu_idle_map);
}
static inline void atomic_clear_cpu(int cpu, cpumask_t *cpumask)
@@ -1095,12 +1123,12 @@ static inline void atomic_clear_cpu(int cpu, cpumask_t *cpumask)
static inline void clear_cpuidle_map(int cpu)
{
- atomic_clear_cpu(cpu, &cpu_idle_map);
+ atomic_clear_cpu(cpu, &grq.cpu_idle_map);
}
static bool suitable_idle_cpus(struct task_struct *p)
{
- return (cpumask_intersects(&p->cpus_allowed, &cpu_idle_map));
+ return (cpumask_intersects(&p->cpus_allowed, &grq.cpu_idle_map));
}
/*
@@ -1231,7 +1259,7 @@ static struct rq *resched_best_idle(struct task_struct *p, int cpu)
struct rq *rq;
int best_cpu;
- cpumask_and(&tmpmask, &p->cpus_allowed, &cpu_idle_map);
+ cpumask_and(&tmpmask, &p->cpus_allowed, &grq.cpu_idle_map);
best_cpu = best_mask_cpu(cpu, task_rq(p), &tmpmask);
rq = cpu_rq(best_cpu);
if (!smt_schedule(p, rq))
@@ -1343,11 +1371,11 @@ static void activate_task(struct task_struct *p, struct rq *rq)
p->prio = effective_prio(p);
if (task_contributes_to_load(p))
- rq->nr_uninterruptible--;
+ atomic_dec(&grq.nr_uninterruptible);
enqueue_task(rq, p, 0);
p->on_rq = TASK_ON_RQ_QUEUED;
- rq->nr_running++;
+ atomic_inc(&grq.nr_running);
}
/*
@@ -1357,10 +1385,10 @@ static void activate_task(struct task_struct *p, struct rq *rq)
static inline void deactivate_task(struct task_struct *p, struct rq *rq)
{
if (task_contributes_to_load(p))
- rq->nr_uninterruptible++;
+ atomic_inc(&grq.nr_uninterruptible);
p->on_rq = 0;
- rq->nr_running--;
+ atomic_dec(&grq.nr_running);
sched_info_dequeued(rq, p);
}
@@ -1381,7 +1409,7 @@ void set_task_cpu(struct task_struct *p, unsigned int cpu)
WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
lockdep_is_held(&task_rq(p)->lock)));
#endif
- if (p->wake_cpu == cpu)
+ if (task_cpu(p) == cpu)
return;
trace_sched_migrate_task(p, cpu);
perf_event_task_migrate(p);
@@ -1428,9 +1456,7 @@ static inline void take_task(struct rq *rq, int cpu, struct task_struct *p)
dequeue_task(p_rq, p, DEQUEUE_SAVE);
if (p_rq != rq) {
- p_rq->nr_running--;
sched_info_dequeued(p_rq, p);
- rq->nr_running++;
sched_info_queued(rq, p);
}
set_task_cpu(p, cpu);
@@ -1767,7 +1793,7 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags)
#ifdef CONFIG_SMP
if (p->sched_contributes_to_load)
- rq->nr_uninterruptible--;
+ atomic_dec(&grq.nr_uninterruptible);
#endif
ttwu_activate(rq, p);
@@ -1796,8 +1822,6 @@ static int ttwu_remote(struct task_struct *p, int wake_flags)
}
#ifdef CONFIG_SMP
-static bool sched_smp_initialized __read_mostly;
-
void sched_ttwu_pending(void)
{
struct rq *rq = this_rq();
@@ -2312,6 +2336,16 @@ int sysctl_schedstats(struct ctl_table *table, int write,
static inline void init_schedstats(void) {}
#endif /* CONFIG_SCHEDSTATS */
+static void update_cpu_clock_switch(struct rq *rq, struct task_struct *p);
+
+static void account_task_cpu(struct rq *rq, struct task_struct *p)
+{
+ update_clocks(rq);
+ /* This isn't really a context switch but accounting is the same */
+ update_cpu_clock_switch(rq, p);
+ p->last_ran = rq->niffies;
+}
+
/*
* wake_up_new_task - wake up a newly created task for the first time.
*
@@ -2337,7 +2371,6 @@ void wake_up_new_task(struct task_struct *p)
}
double_rq_lock(rq, new_rq);
- update_clocks(rq);
rq_curr = rq->curr;
/*
@@ -2345,7 +2378,6 @@ void wake_up_new_task(struct task_struct *p)
*/
p->prio = rq_curr->normal_prio;
- activate_task(p, rq);
trace_sched_wakeup_new(p);
/*
@@ -2356,17 +2388,17 @@ void wake_up_new_task(struct task_struct *p)
* modified within schedule() so it is always equal to
* current->deadline.
*/
+ account_task_cpu(rq, rq_curr);
p->last_ran = rq_curr->last_ran;
if (likely(rq_curr->policy != SCHED_FIFO)) {
rq_curr->time_slice /= 2;
- if (unlikely(rq_curr->time_slice < RESCHED_US)) {
+ if (rq_curr->time_slice < RESCHED_US) {
/*
* Forking task has run out of timeslice. Reschedule it and
* start its child with a new time slice and deadline. The
* child will end up running first because its deadline will
* be slightly earlier.
*/
- rq_curr->time_slice = 0;
__set_tsk_resched(rq_curr);
time_slice_expired(p, new_rq);
if (suitable_idle_cpus(p))
@@ -2389,6 +2421,7 @@ void wake_up_new_task(struct task_struct *p)
time_slice_expired(p, new_rq);
try_preempt(p, new_rq);
}
+ activate_task(p, new_rq);
double_rq_unlock(rq, new_rq);
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
}
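/*
 * Illustrative sketch (not part of the patch): in wake_up_new_task() the
 * parent keeps half of its remaining timeslice while the child starts with a
 * full slice via time_slice_expired(); if the parent's half drops below
 * RESCHED_US (100 us) the parent is marked for reschedule so the child runs
 * first. A runnable user-space model of that split, assuming the usual MuQSS
 * default of rr_interval = 6 ms:
 */
#include <stdio.h>

#define RESCHED_US 100
#define MS_TO_US(t) ((t) << 10)

int main(void)
{
	int rr_interval = 6;                     /* assumed default, in ms */
	int child_slice = MS_TO_US(rr_interval); /* fresh slice: 6144 "us" */
	int parent_slice = 150;                  /* what the parent has left, in us */

	parent_slice /= 2;                       /* fork halves the remainder */
	if (parent_slice < RESCHED_US)
		printf("parent (%d us) resched, child (%d us) runs first\n",
		       parent_slice, child_slice);
	else
		printf("parent keeps %d us, child gets %d us\n",
		       parent_slice, child_slice);
	return 0;
}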
@@ -2651,6 +2684,22 @@ context_switch(struct rq *rq, struct task_struct *prev,
}
/*
+ * nr_running, nr_uninterruptible and nr_context_switches:
+ *
+ * externally visible scheduler statistics: current number of runnable
+ * threads, total number of context switches performed since bootup.
+ */
+unsigned long nr_running(void)
+{
+ return atomic_read(&grq.nr_running);
+}
+
+static unsigned long nr_uninterruptible(void)
+{
+ return atomic_read(&grq.nr_uninterruptible);
+}
+
+/*
* Check if only the current task is running on the cpu.
*
* Caution: this function does not check that the caller has disabled
@@ -2674,31 +2723,9 @@ bool single_task_running(void)
}
EXPORT_SYMBOL(single_task_running);
-/*
- * nr_running, nr_uninterruptible and nr_context_switches:
- *
- * externally visible scheduler statistics: current number of runnable
- * threads, total number of context switches performed since bootup.
- */
unsigned long long nr_context_switches(void)
{
- long long sum = 0;
- int i;
-
- for_each_possible_cpu(i)
- sum += cpu_rq(i)->nr_switches;
-
- return sum;
-}
-
-unsigned long nr_running(void)
-{
- long i, sum = 0;
-
- for_each_online_cpu(i)
- sum += cpu_rq(i)->nr_running;
-
- return sum;
+ return (unsigned long long)atomic64_read(&grq.nr_switches);
}
unsigned long nr_iowait(void)
@@ -2719,14 +2746,7 @@ unsigned long nr_iowait_cpu(int cpu)
unsigned long nr_active(void)
{
- long i, sum = 0;
-
- for_each_online_cpu(i) {
- sum += cpu_rq(i)->nr_running;
- sum += cpu_rq(i)->nr_uninterruptible;
- }
-
- return sum;
+ return nr_running() + nr_uninterruptible();
}
/*
@@ -2797,116 +2817,6 @@ DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat);
EXPORT_PER_CPU_SYMBOL(kstat);
EXPORT_PER_CPU_SYMBOL(kernel_cpustat);
-#ifdef CONFIG_IRQ_TIME_ACCOUNTING
-
-/*
- * There are no locks covering percpu hardirq/softirq time.
- * They are only modified in account_system_vtime, on corresponding CPU
- * with interrupts disabled. So, writes are safe.
- * They are read and saved off onto struct rq in update_rq_clock().
- * This may result in other CPU reading this CPU's irq time and can
- * race with irq/account_system_vtime on this CPU. We would either get old
- * or new value with a side effect of accounting a slice of irq time to wrong
- * task when irq is in progress while we read rq->clock. That is a worthy
- * compromise in place of having locks on each irq in account_system_time.
- */
-static DEFINE_PER_CPU(u64, cpu_hardirq_time);
-static DEFINE_PER_CPU(u64, cpu_softirq_time);
-
-static DEFINE_PER_CPU(u64, irq_start_time);
-static int sched_clock_irqtime;
-
-void enable_sched_clock_irqtime(void)
-{
- sched_clock_irqtime = 1;
-}
-
-void disable_sched_clock_irqtime(void)
-{
- sched_clock_irqtime = 0;
-}
-
-#ifndef CONFIG_64BIT
-static DEFINE_PER_CPU(seqcount_t, irq_time_seq);
-
-static inline void irq_time_write_begin(void)
-{
- __this_cpu_inc(irq_time_seq.sequence);
- smp_wmb();
-}
-
-static inline void irq_time_write_end(void)
-{
- smp_wmb();
- __this_cpu_inc(irq_time_seq.sequence);
-}
-
-static inline u64 irq_time_read(int cpu)
-{
- u64 irq_time;
- unsigned seq;
-
- do {
- seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu));
- irq_time = per_cpu(cpu_softirq_time, cpu) +
- per_cpu(cpu_hardirq_time, cpu);
- } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq));
-
- return irq_time;
-}
-#else /* CONFIG_64BIT */
-static inline void irq_time_write_begin(void)
-{
-}
-
-static inline void irq_time_write_end(void)
-{
-}
-
-static inline u64 irq_time_read(int cpu)
-{
- return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu);
-}
-#endif /* CONFIG_64BIT */
-
-/*
- * Called before incrementing preempt_count on {soft,}irq_enter
- * and before decrementing preempt_count on {soft,}irq_exit.
- */
-void irqtime_account_irq(struct task_struct *curr)
-{
- unsigned long flags;
- s64 delta;
- int cpu;
-
- if (!sched_clock_irqtime)
- return;
-
- local_irq_save(flags);
-
- cpu = smp_processor_id();
- delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time);
- __this_cpu_add(irq_start_time, delta);
-
- irq_time_write_begin();
- /*
- * We do not account for softirq time from ksoftirqd here.
- * We want to continue accounting softirq time to ksoftirqd thread
- * in that case, so as not to confuse scheduler with a special task
- * that do not consume any time, but still wants to run.
- */
- if (hardirq_count())
- __this_cpu_add(cpu_hardirq_time, delta);
- else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
- __this_cpu_add(cpu_softirq_time, delta);
-
- irq_time_write_end();
- local_irq_restore(flags);
-}
-EXPORT_SYMBOL_GPL(irqtime_account_irq);
-
-#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
-
#ifdef CONFIG_PARAVIRT
static inline u64 steal_ticks(u64 steal)
{
@@ -2968,89 +2878,6 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
# define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs)
#endif
-#ifdef CONFIG_IRQ_TIME_ACCOUNTING
-static void irqtime_account_hi_si(void)
-{
- u64 *cpustat = kcpustat_this_cpu->cpustat;
- u64 latest_ns;
-
- latest_ns = nsecs_to_cputime64(this_cpu_read(cpu_hardirq_time));
- if (latest_ns > cpustat[CPUTIME_IRQ])
- cpustat[CPUTIME_IRQ] += (__force u64)cputime_one_jiffy;
-
- latest_ns = nsecs_to_cputime64(this_cpu_read(cpu_softirq_time));
- if (latest_ns > cpustat[CPUTIME_SOFTIRQ])
- cpustat[CPUTIME_SOFTIRQ] += (__force u64)cputime_one_jiffy;
-}
-#else /* CONFIG_IRQ_TIME_ACCOUNTING */
-
-#define sched_clock_irqtime (0)
-
-static inline void irqtime_account_hi_si(void)
-{
-}
-#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
-
-static __always_inline bool steal_account_process_tick(void)
-{
-#ifdef CONFIG_PARAVIRT
- if (static_key_false(&paravirt_steal_enabled)) {
- u64 steal;
- cputime_t steal_ct;
-
- steal = paravirt_steal_clock(smp_processor_id());
- steal -= this_rq()->prev_steal_time;
-
- /*
- * cputime_t may be less precise than nsecs (eg: if it's
- * based on jiffies). Lets cast the result to cputime
- * granularity and account the rest on the next rounds.
- */
- steal_ct = nsecs_to_cputime(steal);
- this_rq()->prev_steal_time += cputime_to_nsecs(steal_ct);
-
- account_steal_time(steal_ct);
- return steal_ct;
- }
-#endif
- return false;
-}
-
-/*
- * Accumulate raw cputime values of dead tasks (sig->[us]time) and live
- * tasks (sum on group iteration) belonging to @tsk's group.
- */
-void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
-{
- struct signal_struct *sig = tsk->signal;
- cputime_t utime, stime;
- struct task_struct *t;
- unsigned int seq, nextseq;
- unsigned long flags;
-
- rcu_read_lock();
- /* Attempt a lockless read on the first round. */
- nextseq = 0;
- do {
- seq = nextseq;
- flags = read_seqbegin_or_lock_irqsave(&sig->stats_lock, &seq);
- times->utime = sig->utime;
- times->stime = sig->stime;
- times->sum_exec_runtime = sig->sum_sched_runtime;
-
- for_each_thread(tsk, t) {
- task_cputime(t, &utime, &stime);
- times->utime += utime;
- times->stime += stime;
- times->sum_exec_runtime += task_sched_runtime(t);
- }
- /* If lockless access failed, take the lock. */
- nextseq = 1;
- } while (need_seqretry(&sig->stats_lock, seq));
- done_seqretry_irqrestore(&sig->stats_lock, seq, flags);
- rcu_read_unlock();
-}
-
/*
* On each tick, add the number of nanoseconds to the unbanked variables and
* once one tick's worth has accumulated, account it allowing for accurate
@@ -3175,15 +3002,11 @@ static void pc_user_time(struct rq *rq, struct task_struct *p, unsigned long ns)
* Bank in p->sched_time the ns elapsed since the last tick or switch.
* CPU scheduler quota accounting is also performed here in microseconds.
*/
-static void
-update_cpu_clock_tick(struct rq *rq, struct task_struct *p)
+static void update_cpu_clock_tick(struct rq *rq, struct task_struct *p)
{
s64 account_ns = rq->niffies - p->last_ran;
struct task_struct *idle = rq->idle;
- if (steal_account_process_tick())
- goto ts_account;
-
/* Accurate tick timekeeping */
if (user_mode(get_irq_regs()))
pc_user_time(rq, p, account_ns);
@@ -3192,10 +3015,6 @@ update_cpu_clock_tick(struct rq *rq, struct task_struct *p)
} else
pc_idle_time(rq, idle, account_ns);
- if (sched_clock_irqtime)
- irqtime_account_hi_si();
-
-ts_account:
/* time_slice accounting is done in usecs to avoid overflow on 32bit */
if (p->policy != SCHED_FIFO && p != idle)
p->time_slice -= NS_TO_US(account_ns);
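/*
 * Illustrative arithmetic (not part of the patch): the comment above is why
 * time_slice stays in microseconds. A signed 32-bit value holds only ~2.1
 * seconds worth of nanoseconds, but ~35 minutes worth of microseconds, so
 * the per-task slice can remain a plain int while accounting deltas arrive
 * in nanoseconds. A runnable check of one ~1 ms charge:
 */
#include <limits.h>
#include <stdio.h>

#define NS_TO_US(t) ((t) >> 10)

int main(void)
{
	long long account_ns = 1048576;     /* ~1 ms charged since the last update */
	int time_slice = 6144;              /* remaining slice, in us */

	time_slice -= NS_TO_US(account_ns); /* 1048576 >> 10 == 1024 us */
	printf("INT_MAX as ns: %.2f s; slice left: %d us\n",
	       INT_MAX / 1e9, time_slice);
	return 0;
}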
@@ -3208,8 +3027,7 @@ ts_account:
* Bank in p->sched_time the ns elapsed since the last tick or switch.
* CPU scheduler quota accounting is also performed here in microseconds.
*/
-static void
-update_cpu_clock_switch(struct rq *rq, struct task_struct *p)
+static void update_cpu_clock_switch(struct rq *rq, struct task_struct *p)
{
s64 account_ns = rq->niffies - p->last_ran;
struct task_struct *idle = rq->idle;
@@ -3283,133 +3101,86 @@ unsigned long long task_sched_runtime(struct task_struct *p)
return ns;
}
-/* Compatibility crap */
-void account_user_time(struct task_struct *p, cputime_t cputime,
- cputime_t cputime_scaled)
-{
-}
-
-void account_idle_time(cputime_t cputime)
+#ifdef CONFIG_HIGH_RES_TIMERS
+static inline int hrexpiry_enabled(struct rq *rq)
{
+ if (unlikely(!cpu_active(cpu_of(rq)) || !sched_smp_initialized))
+ return 0;
+ return hrtimer_is_hres_active(&rq->hrexpiry_timer);
}
/*
- * Account guest cpu time to a process.
- * @p: the process that the cpu time gets accounted to
- * @cputime: the cpu time spent in virtual machine since the last update
- * @cputime_scaled: cputime scaled by cpu frequency
+ * Use HR-timers to deliver accurate preemption points.
*/
-static void account_guest_time(struct task_struct *p, cputime_t cputime,
- cputime_t cputime_scaled)
+static void hrexpiry_clear(struct rq *rq)
{
- u64 *cpustat = kcpustat_this_cpu->cpustat;
-
- /* Add guest time to process. */
- p->utime += (__force u64)cputime;
- p->utimescaled += (__force u64)cputime_scaled;
- account_group_user_time(p, cputime);
- p->gtime += (__force u64)cputime;
-
- /* Add guest time to cpustat. */
- if (task_nice(p) > 0) {
- cpustat[CPUTIME_NICE] += (__force u64)cputime;
- cpustat[CPUTIME_GUEST_NICE] += (__force u64)cputime;
- } else {
- cpustat[CPUTIME_USER] += (__force u64)cputime;
- cpustat[CPUTIME_GUEST] += (__force u64)cputime;
- }
+ if (!hrexpiry_enabled(rq))
+ return;
+ if (hrtimer_active(&rq->hrexpiry_timer))
+ hrtimer_cancel(&rq->hrexpiry_timer);
}
/*
- * Account system cpu time to a process and desired cpustat field
- * @p: the process that the cpu time gets accounted to
- * @cputime: the cpu time spent in kernel space since the last update
- * @cputime_scaled: cputime scaled by cpu frequency
- * @target_cputime64: pointer to cpustat field that has to be updated
+ * High-resolution time_slice expiry.
+ * Runs from hardirq context with interrupts disabled.
*/
-static inline
-void __account_system_time(struct task_struct *p, cputime_t cputime,
- cputime_t cputime_scaled, cputime64_t *target_cputime64)
+static enum hrtimer_restart hrexpiry(struct hrtimer *timer)
{
- /* Add system time to process. */
- p->stime += (__force u64)cputime;
- p->stimescaled += (__force u64)cputime_scaled;
- account_group_system_time(p, cputime);
-
- /* Add system time to cpustat. */
- *target_cputime64 += (__force u64)cputime;
-
- /* Account for system time used */
- acct_update_integrals(p);
-}
+ struct rq *rq = container_of(timer, struct rq, hrexpiry_timer);
+ struct task_struct *p;
-/*
- * Account system cpu time to a process.
- * @p: the process that the cpu time gets accounted to
- * @hardirq_offset: the offset to subtract from hardirq_count()
- * @cputime: the cpu time spent in kernel space since the last update
- * @cputime_scaled: cputime scaled by cpu frequency
- * This is for guest only now.
- */
-void account_system_time(struct task_struct *p, int hardirq_offset,
- cputime_t cputime, cputime_t cputime_scaled)
-{
+ /* This can happen during CPU hotplug / resume */
+ if (unlikely(cpu_of(rq) != smp_processor_id()))
+ goto out;
- if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0))
- account_guest_time(p, cputime, cputime_scaled);
+ /*
+ * We're doing this without the runqueue lock but this should always
+ * be run on the local CPU. Time slice should run out in __schedule
+ * but we set it to zero here in case niffies is slightly less.
+ */
+ p = rq->curr;
+ p->time_slice = 0;
+ __set_tsk_resched(p);
+out:
+ return HRTIMER_NORESTART;
}
/*
- * Account for involuntary wait time.
- * @steal: the cpu time spent in involuntary wait
+ * Called to set the hrexpiry timer state.
+ *
+ * Called with irqs disabled from the local CPU only.
*/
-void account_steal_time(cputime_t cputime)
+static void hrexpiry_start(struct rq *rq, u64 delay)
{
- u64 *cpustat = kcpustat_this_cpu->cpustat;
+ if (!hrexpiry_enabled(rq))
+ return;
- cpustat[CPUTIME_STEAL] += (__force u64)cputime;
+ hrtimer_start(&rq->hrexpiry_timer, ns_to_ktime(delay),
+ HRTIMER_MODE_REL_PINNED);
}
-/*
- * Account for idle time.
- * @cputime: the cpu time spent in idle wait
- */
-static void account_idle_times(cputime_t cputime)
+static void init_rq_hrexpiry(struct rq *rq)
{
- u64 *cpustat = kcpustat_this_cpu->cpustat;
- struct rq *rq = this_rq();
-
- if (atomic_read(&rq->nr_iowait) > 0)
- cpustat[CPUTIME_IOWAIT] += (__force u64)cputime;
- else
- cpustat[CPUTIME_IDLE] += (__force u64)cputime;
+ hrtimer_init(&rq->hrexpiry_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ rq->hrexpiry_timer.function = hrexpiry;
}
-#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
-
-void account_process_tick(struct task_struct *p, int user_tick)
+static inline int rq_dither(struct rq *rq)
{
+ if (!hrexpiry_enabled(rq))
+ return HALF_JIFFY_US;
+ return 0;
}
-
-/*
- * Account multiple ticks of steal time.
- * @p: the process from which the cpu time has been stolen
- * @ticks: number of stolen ticks
- */
-void account_steal_ticks(unsigned long ticks)
+#else /* CONFIG_HIGH_RES_TIMERS */
+static inline void init_rq_hrexpiry(struct rq *rq)
{
- account_steal_time(jiffies_to_cputime(ticks));
}
-/*
- * Account multiple ticks of idle time.
- * @ticks: number of stolen ticks
- */
-void account_idle_ticks(unsigned long ticks)
+static inline int rq_dither(struct rq *rq)
{
- account_idle_times(jiffies_to_cputime(ticks));
+ return HALF_JIFFY_US;
}
-#endif
+#endif /* CONFIG_HIGH_RES_TIMERS */
/*
* Functions to test for when SCHED_ISO tasks have used their allocated
@@ -3488,6 +3259,8 @@ static void task_running_tick(struct rq *rq)
* allowed to run into the 2nd half of the next tick if they will
* run out of time slice in the interim. Otherwise, if they have
* less than RESCHED_US μs of time slice left they will be rescheduled.
+ * Dither is used as a backup for when hrexpiry is disabled or high-res
+ * timers are not configured in.
*/
if (p->time_slice - rq->dither >= RESCHED_US)
return;
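/*
 * Illustrative sketch (not part of the patch): this check in
 * task_running_tick() is where the dither backup matters. With high-res
 * timers the hrexpiry timer delivers the exact expiry and rq->dither is 0;
 * without them, dither widens the window to HALF_JIFFY_US so a task that
 * cannot survive until the next tick is rescheduled now. A runnable
 * user-space model of the decision, assuming HZ=1000:
 */
#include <stdbool.h>
#include <stdio.h>

#define HZ 1000
#define HALF_JIFFY_US (1048576 / HZ / 2)    /* ~524 us at HZ=1000 */
#define RESCHED_US 100

static bool resched_at_tick(int time_slice_us, bool hres_active)
{
	int dither = hres_active ? 0 : HALF_JIFFY_US;

	return time_slice_us - dither < RESCHED_US;
}

int main(void)
{
	printf("300 us left, no hres: %d\n", resched_at_tick(300, false)); /* 1 */
	printf("300 us left, hres:    %d\n", resched_at_tick(300, true));  /* 0 */
	return 0;
}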
@@ -3497,6 +3270,60 @@ out_resched:
rq_unlock(rq);
}
+#ifdef CONFIG_NO_HZ_FULL
+/*
+ * We can stop the timer tick any time highres timers are active since
+ * we rely entirely on highres timeouts for task expiry rescheduling.
+ */
+static void sched_stop_tick(struct rq *rq, int cpu)
+{
+ if (!hrexpiry_enabled(rq))
+ return;
+ if (!tick_nohz_full_enabled())
+ return;
+ if (!tick_nohz_full_cpu(cpu))
+ return;
+ tick_nohz_dep_clear_cpu(cpu, TICK_DEP_BIT_SCHED);
+}
+
+static inline void sched_start_tick(struct rq *rq, int cpu)
+{
+ tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED);
+}
+
+/**
+ * scheduler_tick_max_deferment
+ *
+ * Keep at least one tick per second when a single
+ * active task is running.
+ *
+ * This makes sure that uptime continues to move forward, even
+ * with a very low granularity.
+ *
+ * Return: Maximum deferment in nanoseconds.
+ */
+u64 scheduler_tick_max_deferment(void)
+{
+ struct rq *rq = this_rq();
+ unsigned long next, now = READ_ONCE(jiffies);
+
+ next = rq->last_jiffy + HZ;
+
+ if (time_before_eq(next, now))
+ return 0;
+
+ return jiffies_to_nsecs(next - now);
+}
+#else
+static inline void sched_stop_tick(struct rq *rq, int cpu)
+{
+}
+
+static inline void sched_start_tick(struct rq *rq, int cpu)
+{
+}
+#endif
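/*
 * Illustrative arithmetic (not part of the patch): with NO_HZ_FULL the tick
 * may be deferred until one second (HZ jiffies) after the last scheduler
 * jiffy seen on this runqueue, which is what scheduler_tick_max_deferment()
 * above computes. A runnable user-space model, assuming HZ=250; the kernel
 * version uses time_before_eq() to stay correct across jiffies wraparound:
 */
#include <stdio.h>

#define HZ 250
#define NSEC_PER_JIFFY (1000000000ULL / HZ)

static unsigned long long max_deferment_ns(unsigned long last_jiffy,
					   unsigned long now)
{
	unsigned long next = last_jiffy + HZ;

	if (next <= now)
		return 0;            /* already a second behind: do not defer */
	return (next - now) * NSEC_PER_JIFFY;
}

int main(void)
{
	/* last scheduler jiffy was 100 jiffies (400 ms at HZ=250) ago */
	printf("defer up to %llu ms\n", max_deferment_ns(1000, 1100) / 1000000ULL);
	return 0;
}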
+
/*
* This function gets called by the timer code, with HZ frequency.
* We call it with interrupts disabled.
@@ -3507,7 +3334,7 @@ void scheduler_tick(void)
struct rq *rq = cpu_rq(cpu);
sched_clock_tick();
- update_rq_clock(rq);
+ update_clocks(rq);
update_load_avg(rq);
update_cpu_clock_tick(rq, rq->curr);
if (!rq_idle(rq))
@@ -3517,6 +3344,7 @@ void scheduler_tick(void)
rq->last_scheduler_tick = rq->last_jiffy;
rq->last_tick = rq->clock;
perf_event_task_tick();
+ sched_stop_tick(rq, cpu);
}
#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
@@ -3799,6 +3627,17 @@ static inline void schedule_debug(struct task_struct *prev)
*/
static inline void set_rq_task(struct rq *rq, struct task_struct *p)
{
+#ifdef CONFIG_HIGH_RES_TIMERS
+ if (p == rq->idle || p->policy == SCHED_FIFO)
+ hrexpiry_clear(rq);
+ else
+ hrexpiry_start(rq, US_TO_NS(p->time_slice));
+#endif /* CONFIG_HIGH_RES_TIMERS */
+ if (rq->clock - rq->last_tick > HALF_JIFFY_NS)
+ rq->dither = 0;
+ else
+ rq->dither = rq_dither(rq);
+
rq->rq_deadline = p->deadline;
rq->rq_prio = p->prio;
#ifdef CONFIG_SMT_NICE
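/*
 * Illustrative sketch (not part of the patch): set_rq_task() now decides, at
 * every context switch, where the next preemption point comes from: idle and
 * SCHED_FIFO tasks have no slice to enforce so the hrexpiry timer is cleared,
 * otherwise the remaining microsecond slice is converted with US_TO_NS() into
 * the delay for the pinned hrtimer, with the tick plus dither as fallback
 * when high-res timers are unavailable. A user-space model of that choice;
 * the enum and helper names here are hypothetical:
 */
#include <stdbool.h>
#include <stdio.h>

#define US_TO_NS(t) ((t) << 10)

enum expiry { EXPIRY_NONE, EXPIRY_HRTIMER, EXPIRY_TICK };

static enum expiry pick_expiry(bool is_idle, bool is_fifo, bool hres,
			       int slice_us, long long *delay_ns)
{
	if (is_idle || is_fifo)
		return EXPIRY_NONE;          /* hrexpiry_clear(): nothing to arm */
	if (!hres)
		return EXPIRY_TICK;          /* fall back to tick + dither */
	*delay_ns = US_TO_NS(slice_us);      /* hrexpiry_start(rq, US_TO_NS(slice)) */
	return EXPIRY_HRTIMER;
}

int main(void)
{
	long long delay = 0;

	if (pick_expiry(false, false, true, 6144, &delay) == EXPIRY_HRTIMER)
		printf("arm hrexpiry for %lld ns (~%.1f ms)\n", delay, delay / 1e6);
	return 0;
}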
@@ -3980,10 +3819,6 @@ static void __sched notrace __schedule(bool preempt)
update_clocks(rq);
niffies = rq->niffies;
update_cpu_clock_switch(rq, prev);
- if (rq->clock - rq->last_tick > HALF_JIFFY_NS)
- rq->dither = 0;
- else
- rq->dither = HALF_JIFFY_US;
clear_tsk_need_resched(prev);
clear_preempt_need_resched();
@@ -3994,15 +3829,15 @@ static void __sched notrace __schedule(bool preempt)
}
next = earliest_deadline_task(rq, cpu, idle);
- if (likely(next->prio != PRIO_LIMIT)) {
+ if (likely(next->prio != PRIO_LIMIT))
clear_cpuidle_map(cpu);
- next->last_ran = niffies;
- } else {
+ else {
set_cpuidle_map(cpu);
update_load_avg(rq);
}
set_rq_task(rq, next);
+ next->last_ran = niffies;
if (likely(prev != next)) {
/*
@@ -4014,14 +3849,16 @@ static void __sched notrace __schedule(bool preempt)
check_siblings(rq);
else
wake_siblings(rq);
- rq->nr_switches++;
+ atomic64_inc(&grq.nr_switches);
rq->curr = next;
++*switch_count;
trace_sched_switch(preempt, prev, next);
rq = context_switch(rq, prev, next); /* unlocks the rq */
- } else
+ } else {
+ check_siblings(rq);
rq_unlock_irq(rq);
+ }
}
static inline void sched_submit_work(struct task_struct *tsk)
@@ -5607,8 +5444,12 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
{
__do_set_cpus_allowed(p, new_mask);
if (needs_other_cpu(p, task_cpu(p))) {
+ struct rq *rq;
+
set_task_cpu(p, valid_task_cpu(p));
+ rq = __task_rq_lock(p);
resched_task(p);
+ __task_rq_unlock(rq);
}
}
@@ -5641,6 +5482,7 @@ void init_idle(struct task_struct *idle, int cpu)
raw_spin_lock_irqsave(&idle->pi_lock, flags);
raw_spin_lock(&rq->lock);
idle->last_ran = rq->niffies;
+ time_slice_expired(idle, rq);
idle->state = TASK_RUNNING;
/* Setting prio to illegal value shouldn't matter when never queued */
idle->prio = PRIO_LIMIT;
@@ -7439,6 +7281,8 @@ int sched_cpu_dying(unsigned int cpu)
}
bind_zero(cpu);
double_rq_unlock(rq, cpu_rq(0));
+ sched_start_tick(rq, cpu);
+ hrexpiry_clear(rq);
local_irq_restore(flags);
return 0;
@@ -7463,7 +7307,7 @@ static const cpumask_t *thread_cpumask(int cpu)
/* All this CPU's SMT siblings are idle */
static bool siblings_cpu_idle(struct rq *rq)
{
- return cpumask_subset(&rq->thread_mask, &cpu_idle_map);
+ return cpumask_subset(&rq->thread_mask, &grq.cpu_idle_map);
}
#endif
#ifdef CONFIG_SCHED_MC
@@ -7474,7 +7318,7 @@ static const cpumask_t *core_cpumask(int cpu)
/* All this CPU's shared cache siblings are idle */
static bool cache_cpu_idle(struct rq *rq)
{
- return cpumask_subset(&rq->core_mask, &cpu_idle_map);
+ return cpumask_subset(&rq->core_mask, &grq.cpu_idle_map);
}
#endif
@@ -7609,6 +7453,7 @@ void __init sched_init_smp(void)
#else
void __init sched_init_smp(void)
{
+ sched_smp_initialized = true;
}
#endif /* CONFIG_SMP */
@@ -7655,11 +7500,14 @@ void __init sched_init(void)
for (i = 1 ; i < NICE_WIDTH ; i++)
prio_ratios[i] = prio_ratios[i - 1] * 11 / 10;
+ atomic_set(&grq.nr_running, 0);
+ atomic_set(&grq.nr_uninterruptible, 0);
+ atomic64_set(&grq.nr_switches, 0);
skiplist_node_init(&init_task.node);
#ifdef CONFIG_SMP
init_defrootdomain();
- cpumask_clear(&cpu_idle_map);
+ cpumask_clear(&grq.cpu_idle_map);
#else
uprq = &per_cpu(runqueues, 0);
#endif
@@ -7673,7 +7521,6 @@ void __init sched_init(void)
#endif /* CONFIG_CGROUP_SCHED */
for_each_possible_cpu(i) {
rq = cpu_rq(i);
- rq->nr_running = rq->nr_uninterruptible = rq->nr_switches = 0;
skiplist_init(&rq->node);
rq->sl = new_skiplist(&rq->node);
raw_spin_lock_init(&rq->lock);
@@ -7692,6 +7539,7 @@ void __init sched_init(void)
rq->cpu = i;
rq_attach_root(rq, &def_root_domain);
#endif
+ init_rq_hrexpiry(rq);
atomic_set(&rq->nr_iowait, 0);
}
@@ -7899,199 +7747,6 @@ void set_curr_task(int cpu, struct task_struct *p)
#endif
-/*
- * Use precise platform statistics if available:
- */
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING
-
-#ifndef __ARCH_HAS_VTIME_TASK_SWITCH
-void vtime_common_task_switch(struct task_struct *prev)
-{
- if (is_idle_task(prev))
- vtime_account_idle(prev);
- else
- vtime_account_system(prev);
-
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
- vtime_account_user(prev);
-#endif
- arch_vtime_task_switch(prev);
-}
-#endif
-
-#endif /* CONFIG_VIRT_CPU_ACCOUNTING */
-
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
-void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
-{
- *ut = p->utime;
- *st = p->stime;
-}
-EXPORT_SYMBOL_GPL(task_cputime_adjusted);
-
-void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
-{
- struct task_cputime cputime;
-
- thread_group_cputime(p, &cputime);
-
- *ut = cputime.utime;
- *st = cputime.stime;
-}
-
-void vtime_account_system_irqsafe(struct task_struct *tsk)
-{
- unsigned long flags;
-
- local_irq_save(flags);
- vtime_account_system(tsk);
- local_irq_restore(flags);
-}
-EXPORT_SYMBOL_GPL(vtime_account_system_irqsafe);
-
-/*
- * Archs that account the whole time spent in the idle task
- * (outside irq) as idle time can rely on this and just implement
- * vtime_account_system() and vtime_account_idle(). Archs that
- * have other meaning of the idle time (s390 only includes the
- * time spent by the CPU when it's in low power mode) must override
- * vtime_account().
- */
-#ifndef __ARCH_HAS_VTIME_ACCOUNT
-void vtime_account_irq_enter(struct task_struct *tsk)
-{
- if (!in_interrupt() && is_idle_task(tsk))
- vtime_account_idle(tsk);
- else
- vtime_account_system(tsk);
-}
-EXPORT_SYMBOL_GPL(vtime_account_irq_enter);
-#endif /* __ARCH_HAS_VTIME_ACCOUNT */
-
-#else /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
-/*
- * Perform (stime * rtime) / total, but avoid multiplication overflow by
- * losing precision when the numbers are big.
- */
-static cputime_t scale_stime(u64 stime, u64 rtime, u64 total)
-{
- u64 scaled;
-
- for (;;) {
- /* Make sure "rtime" is the bigger of stime/rtime */
- if (stime > rtime) {
- u64 tmp = rtime; rtime = stime; stime = tmp;
- }
-
- /* Make sure 'total' fits in 32 bits */
- if (total >> 32)
- goto drop_precision;
-
- /* Does rtime (and thus stime) fit in 32 bits? */
- if (!(rtime >> 32))
- break;
-
- /* Can we just balance rtime/stime rather than dropping bits? */
- if (stime >> 31)
- goto drop_precision;
-
- /* We can grow stime and shrink rtime and try to make them both fit */
- stime <<= 1;
- rtime >>= 1;
- continue;
-
-drop_precision:
- /* We drop from rtime, it has more bits than stime */
- rtime >>= 1;
- total >>= 1;
- }
-
- /*
- * Make sure gcc understands that this is a 32x32->64 multiply,
- * followed by a 64/32->64 divide.
- */
- scaled = div_u64((u64) (u32) stime * (u64) (u32) rtime, (u32)total);
- return (__force cputime_t) scaled;
-}
-
-/*
- * Adjust tick based cputime random precision against scheduler
- * runtime accounting.
- */
-static void cputime_adjust(struct task_cputime *curr,
- struct prev_cputime *prev,
- cputime_t *ut, cputime_t *st)
-{
- cputime_t rtime, stime, utime, total;
-
- stime = curr->stime;
- total = stime + curr->utime;
-
- /*
- * Tick based cputime accounting depend on random scheduling
- * timeslices of a task to be interrupted or not by the timer.
- * Depending on these circumstances, the number of these interrupts
- * may be over or under-optimistic, matching the real user and system
- * cputime with a variable precision.
- *
- * Fix this by scaling these tick based values against the total
- * runtime accounted by the CFS scheduler.
- */
- rtime = nsecs_to_cputime(curr->sum_exec_runtime);
-
- /*
- * Update userspace visible utime/stime values only if actual execution
- * time is bigger than already exported. Note that can happen, that we
- * provided bigger values due to scaling inaccuracy on big numbers.
- */
- if (prev->stime + prev->utime >= rtime)
- goto out;
-
- if (total) {
- stime = scale_stime((__force u64)stime,
- (__force u64)rtime, (__force u64)total);
- utime = rtime - stime;
- } else {
- stime = rtime;
- utime = 0;
- }
-
- /*
- * If the tick based count grows faster than the scheduler one,
- * the result of the scaling may go backward.
- * Let's enforce monotonicity.
- */
- prev->stime = max(prev->stime, stime);
- prev->utime = max(prev->utime, utime);
-
-out:
- *ut = prev->utime;
- *st = prev->stime;
-}
-
-void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
-{
- struct task_cputime cputime = {
- .sum_exec_runtime = tsk_seruntime(p),
- };
-
- task_cputime(p, &cputime.utime, &cputime.stime);
- cputime_adjust(&cputime, &p->prev_cputime, ut, st);
-}
-EXPORT_SYMBOL_GPL(task_cputime_adjusted);
-
-/*
- * Must be called with siglock held.
- */
-void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
-{
- struct task_cputime cputime;
-
- thread_group_cputime(p, &cputime);
- cputime_adjust(&cputime, &p->signal->prev_cputime, ut, st);
-}
-#endif /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
-
void init_idle_bootup_task(struct task_struct *idle)
{}