From 1eae9639aac0f8de4d284f567ec722a822b52513 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Fabian=20Silva=20Delgado?= Date: Tue, 1 Nov 2016 14:27:38 -0300 Subject: Linux-libre 4.8.6-gnu --- kernel/Kconfig.hz | 1 + kernel/irq/generic-chip.c | 21 ++ kernel/sched/Makefile | 4 +- kernel/sched/MuQSS.c | 799 +++++++++++++--------------------------------- kernel/sched/MuQSS.h | 57 +++- kernel/sched/cputime.c | 7 +- kernel/sched/fair.c | 38 ++- kernel/time/Kconfig | 2 +- kernel/time/clockevents.c | 5 + 9 files changed, 347 insertions(+), 587 deletions(-) (limited to 'kernel') diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz index 2a202a846..ecde22d15 100644 --- a/kernel/Kconfig.hz +++ b/kernel/Kconfig.hz @@ -5,6 +5,7 @@ choice prompt "Timer frequency" default HZ_250 + default HZ_100 if SCHED_MUQSS help Allows the configuration of the timer frequency. It is customary to have the timer interrupt run at 1000 Hz but 100 Hz may be more diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c index abd286afb..a4775f345 100644 --- a/kernel/irq/generic-chip.c +++ b/kernel/irq/generic-chip.c @@ -411,8 +411,29 @@ int irq_map_generic_chip(struct irq_domain *d, unsigned int virq, } EXPORT_SYMBOL_GPL(irq_map_generic_chip); +static void irq_unmap_generic_chip(struct irq_domain *d, unsigned int virq) +{ + struct irq_data *data = irq_domain_get_irq_data(d, virq); + struct irq_domain_chip_generic *dgc = d->gc; + unsigned int hw_irq = data->hwirq; + struct irq_chip_generic *gc; + int irq_idx; + + gc = irq_get_domain_generic_chip(d, hw_irq); + if (!gc) + return; + + irq_idx = hw_irq % dgc->irqs_per_chip; + + clear_bit(irq_idx, &gc->installed); + irq_domain_set_info(d, virq, hw_irq, &no_irq_chip, NULL, NULL, NULL, + NULL); + +} + struct irq_domain_ops irq_generic_chip_ops = { .map = irq_map_generic_chip, + .unmap = irq_unmap_generic_chip, .xlate = irq_domain_xlate_onetwocell, }; EXPORT_SYMBOL_GPL(irq_generic_chip_ops); diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index a787aa942..77bdf9807 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -18,14 +18,14 @@ endif ifdef CONFIG_SCHED_MUQSS obj-y += MuQSS.o clock.o else -obj-y += core.o loadavg.o clock.o cputime.o +obj-y += core.o loadavg.o clock.o obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o obj-$(CONFIG_SMP) += cpudeadline.o obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o obj-$(CONFIG_SCHED_DEBUG) += debug.o obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o endif -obj-y += wait.o swait.o completion.o idle.o +obj-y += wait.o swait.o completion.o idle.o cputime.o obj-$(CONFIG_SMP) += cpupri.o obj-$(CONFIG_SCHEDSTATS) += stats.o obj-$(CONFIG_CPU_FREQ) += cpufreq.o diff --git a/kernel/sched/MuQSS.c b/kernel/sched/MuQSS.c index bd9b1f13f..cf7a95286 100644 --- a/kernel/sched/MuQSS.c +++ b/kernel/sched/MuQSS.c @@ -123,6 +123,7 @@ */ #define JIFFIES_TO_NS(TIME) ((TIME) * (1073741824 / HZ)) #define JIFFY_NS (1073741824 / HZ) +#define JIFFY_US (1048576 / HZ) #define NS_TO_JIFFIES(TIME) ((TIME) / JIFFY_NS) #define HALF_JIFFY_NS (1073741824 / HZ / 2) #define HALF_JIFFY_US (1048576 / HZ / 2) @@ -130,12 +131,13 @@ #define MS_TO_US(TIME) ((TIME) << 10) #define NS_TO_MS(TIME) ((TIME) >> 20) #define NS_TO_US(TIME) ((TIME) >> 10) +#define US_TO_NS(TIME) ((TIME) << 10) #define RESCHED_US (100) /* Reschedule if less than this many μs left */ void print_scheduler_version(void) { - printk(KERN_INFO "MuQSS CPU scheduler v0.115 by Con Kolivas.\n"); + printk(KERN_INFO "MuQSS CPU scheduler v0.120 by Con Kolivas.\n"); } /* @@ -179,9 +181,26 @@ 
static inline int timeslice(void) return MS_TO_US(rr_interval); } +static bool sched_smp_initialized __read_mostly; + +/* + * The global runqueue data that all CPUs work off. Contains either atomic + * variables and a cpu bitmap set atomically. + */ +struct global_rq { #ifdef CONFIG_SMP -static cpumask_t cpu_idle_map ____cacheline_aligned_in_smp; + atomic_t nr_running ____cacheline_aligned_in_smp; + atomic_t nr_uninterruptible ____cacheline_aligned_in_smp; + atomic64_t nr_switches ____cacheline_aligned_in_smp; + cpumask_t cpu_idle_map ____cacheline_aligned_in_smp; +#else + atomic_t nr_running ____cacheline_aligned; + atomic_t nr_uninterruptible ____cacheline_aligned; + atomic64_t nr_switches ____cacheline_aligned; +#endif +}; +#ifdef CONFIG_SMP /* * We add the notion of a root-domain which will be used to define per-domain * variables. Each exclusive cpuset essentially defines an island domain by @@ -213,6 +232,13 @@ static struct root_domain def_root_domain; #endif /* CONFIG_SMP */ +/* There can be only one */ +#ifdef CONFIG_SMP +static struct global_rq grq ____cacheline_aligned_in_smp; +#else +static struct global_rq grq ____cacheline_aligned; +#endif + static DEFINE_MUTEX(sched_hotcpu_mutex); /* cpus with isolated domains */ @@ -768,7 +794,6 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) */ if (unlikely(task_on_rq_migrating(prev))) { sched_info_dequeued(rq, prev); - rq->nr_running--; /* * We move the ownership of prev to the new cpu now. ttwu can't * activate prev to the wrong cpu since it has to grab this @@ -779,7 +804,6 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) raw_spin_lock(&prev->pi_lock); rq = __task_rq_lock(prev); - rq->nr_running++; /* Check that someone else hasn't already queued prev */ if (likely(!task_queued(prev))) { enqueue_task(rq, prev, 0); @@ -834,7 +858,7 @@ static inline int ms_longest_deadline_diff(void) static inline int rq_load(struct rq *rq) { - return rq->nr_running; + return rq->sl->entries + !rq_idle(rq); } static inline bool rq_local(struct rq *rq); @@ -848,20 +872,24 @@ static inline bool rq_local(struct rq *rq); */ static void update_load_avg(struct rq *rq) { - /* rq clock can go backwards so skip update if that happens */ - if (likely(rq->clock > rq->load_update)) { - unsigned long us_interval = (rq->clock - rq->load_update) >> 10; - long load, curload = rq_load(rq); + unsigned long us_interval; + long load, curload; - load = rq->load_avg - (rq->load_avg * us_interval * 5 / 262144); - if (unlikely(load < 0)) - load = 0; - load += curload * curload * SCHED_CAPACITY_SCALE * us_interval * 5 / 262144; - rq->load_avg = load; - } else + if (unlikely(rq->niffies <= rq->load_update)) return; - rq->load_update = rq->clock; + us_interval = NS_TO_US(rq->niffies - rq->load_update); + curload = rq_load(rq); + load = rq->load_avg - (rq->load_avg * us_interval * 5 / 262144); + if (unlikely(load < 0)) + load = 0; + load += curload * curload * SCHED_CAPACITY_SCALE * us_interval * 5 / 262144; + /* If this CPU has all the load, make it ramp up quickly */ + if (curload > load && curload >= atomic_read(&grq.nr_running)) + load = curload; + rq->load_avg = load; + + rq->load_update = rq->niffies; if (likely(rq_local(rq))) cpufreq_trigger(rq->niffies, rq->load_avg); } @@ -1085,7 +1113,7 @@ static inline void atomic_set_cpu(int cpu, cpumask_t *cpumask) static inline void set_cpuidle_map(int cpu) { if (likely(cpu_online(cpu))) - atomic_set_cpu(cpu, &cpu_idle_map); + atomic_set_cpu(cpu, &grq.cpu_idle_map); } 
static inline void atomic_clear_cpu(int cpu, cpumask_t *cpumask) @@ -1095,12 +1123,12 @@ static inline void atomic_clear_cpu(int cpu, cpumask_t *cpumask) static inline void clear_cpuidle_map(int cpu) { - atomic_clear_cpu(cpu, &cpu_idle_map); + atomic_clear_cpu(cpu, &grq.cpu_idle_map); } static bool suitable_idle_cpus(struct task_struct *p) { - return (cpumask_intersects(&p->cpus_allowed, &cpu_idle_map)); + return (cpumask_intersects(&p->cpus_allowed, &grq.cpu_idle_map)); } /* @@ -1231,7 +1259,7 @@ static struct rq *resched_best_idle(struct task_struct *p, int cpu) struct rq *rq; int best_cpu; - cpumask_and(&tmpmask, &p->cpus_allowed, &cpu_idle_map); + cpumask_and(&tmpmask, &p->cpus_allowed, &grq.cpu_idle_map); best_cpu = best_mask_cpu(cpu, task_rq(p), &tmpmask); rq = cpu_rq(best_cpu); if (!smt_schedule(p, rq)) @@ -1343,11 +1371,11 @@ static void activate_task(struct task_struct *p, struct rq *rq) p->prio = effective_prio(p); if (task_contributes_to_load(p)) - rq->nr_uninterruptible--; + atomic_dec(&grq.nr_uninterruptible); enqueue_task(rq, p, 0); p->on_rq = TASK_ON_RQ_QUEUED; - rq->nr_running++; + atomic_inc(&grq.nr_running); } /* @@ -1357,10 +1385,10 @@ static void activate_task(struct task_struct *p, struct rq *rq) static inline void deactivate_task(struct task_struct *p, struct rq *rq) { if (task_contributes_to_load(p)) - rq->nr_uninterruptible++; + atomic_inc(&grq.nr_uninterruptible); p->on_rq = 0; - rq->nr_running--; + atomic_dec(&grq.nr_running); sched_info_dequeued(rq, p); } @@ -1381,7 +1409,7 @@ void set_task_cpu(struct task_struct *p, unsigned int cpu) WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) || lockdep_is_held(&task_rq(p)->lock))); #endif - if (p->wake_cpu == cpu) + if (task_cpu(p) == cpu) return; trace_sched_migrate_task(p, cpu); perf_event_task_migrate(p); @@ -1428,9 +1456,7 @@ static inline void take_task(struct rq *rq, int cpu, struct task_struct *p) dequeue_task(p_rq, p, DEQUEUE_SAVE); if (p_rq != rq) { - p_rq->nr_running--; sched_info_dequeued(p_rq, p); - rq->nr_running++; sched_info_queued(rq, p); } set_task_cpu(p, cpu); @@ -1767,7 +1793,7 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags) #ifdef CONFIG_SMP if (p->sched_contributes_to_load) - rq->nr_uninterruptible--; + atomic_dec(&grq.nr_uninterruptible); #endif ttwu_activate(rq, p); @@ -1796,8 +1822,6 @@ static int ttwu_remote(struct task_struct *p, int wake_flags) } #ifdef CONFIG_SMP -static bool sched_smp_initialized __read_mostly; - void sched_ttwu_pending(void) { struct rq *rq = this_rq(); @@ -2312,6 +2336,16 @@ int sysctl_schedstats(struct ctl_table *table, int write, static inline void init_schedstats(void) {} #endif /* CONFIG_SCHEDSTATS */ +static void update_cpu_clock_switch(struct rq *rq, struct task_struct *p); + +static void account_task_cpu(struct rq *rq, struct task_struct *p) +{ + update_clocks(rq); + /* This isn't really a context switch but accounting is the same */ + update_cpu_clock_switch(rq, p); + p->last_ran = rq->niffies; +} + /* * wake_up_new_task - wake up a newly created task for the first time. * @@ -2337,7 +2371,6 @@ void wake_up_new_task(struct task_struct *p) } double_rq_lock(rq, new_rq); - update_clocks(rq); rq_curr = rq->curr; /* @@ -2345,7 +2378,6 @@ void wake_up_new_task(struct task_struct *p) */ p->prio = rq_curr->normal_prio; - activate_task(p, rq); trace_sched_wakeup_new(p); /* @@ -2356,17 +2388,17 @@ void wake_up_new_task(struct task_struct *p) * modified within schedule() so it is always equal to * current->deadline. 
*/ + account_task_cpu(rq, rq_curr); p->last_ran = rq_curr->last_ran; if (likely(rq_curr->policy != SCHED_FIFO)) { rq_curr->time_slice /= 2; - if (unlikely(rq_curr->time_slice < RESCHED_US)) { + if (rq_curr->time_slice < RESCHED_US) { /* * Forking task has run out of timeslice. Reschedule it and * start its child with a new time slice and deadline. The * child will end up running first because its deadline will * be slightly earlier. */ - rq_curr->time_slice = 0; __set_tsk_resched(rq_curr); time_slice_expired(p, new_rq); if (suitable_idle_cpus(p)) @@ -2389,6 +2421,7 @@ void wake_up_new_task(struct task_struct *p) time_slice_expired(p, new_rq); try_preempt(p, new_rq); } + activate_task(p, new_rq); double_rq_unlock(rq, new_rq); raw_spin_unlock_irqrestore(&p->pi_lock, flags); } @@ -2650,6 +2683,22 @@ context_switch(struct rq *rq, struct task_struct *prev, return finish_task_switch(prev); } +/* + * nr_running, nr_uninterruptible and nr_context_switches: + * + * externally visible scheduler statistics: current number of runnable + * threads, total number of context switches performed since bootup. + */ +unsigned long nr_running(void) +{ + return atomic_read(&grq.nr_running); +} + +static unsigned long nr_uninterruptible(void) +{ + return atomic_read(&grq.nr_uninterruptible); +} + /* * Check if only the current task is running on the cpu. * @@ -2674,31 +2723,9 @@ bool single_task_running(void) } EXPORT_SYMBOL(single_task_running); -/* - * nr_running, nr_uninterruptible and nr_context_switches: - * - * externally visible scheduler statistics: current number of runnable - * threads, total number of context switches performed since bootup. - */ unsigned long long nr_context_switches(void) { - long long sum = 0; - int i; - - for_each_possible_cpu(i) - sum += cpu_rq(i)->nr_switches; - - return sum; -} - -unsigned long nr_running(void) -{ - long i, sum = 0; - - for_each_online_cpu(i) - sum += cpu_rq(i)->nr_running; - - return sum; + return (unsigned long long)atomic64_read(&grq.nr_switches); } unsigned long nr_iowait(void) @@ -2719,14 +2746,7 @@ unsigned long nr_iowait_cpu(int cpu) unsigned long nr_active(void) { - long i, sum = 0; - - for_each_online_cpu(i) { - sum += cpu_rq(i)->nr_running; - sum += cpu_rq(i)->nr_uninterruptible; - } - - return sum; + return nr_running() + nr_uninterruptible(); } /* @@ -2797,116 +2817,6 @@ DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat); EXPORT_PER_CPU_SYMBOL(kstat); EXPORT_PER_CPU_SYMBOL(kernel_cpustat); -#ifdef CONFIG_IRQ_TIME_ACCOUNTING - -/* - * There are no locks covering percpu hardirq/softirq time. - * They are only modified in account_system_vtime, on corresponding CPU - * with interrupts disabled. So, writes are safe. - * They are read and saved off onto struct rq in update_rq_clock(). - * This may result in other CPU reading this CPU's irq time and can - * race with irq/account_system_vtime on this CPU. We would either get old - * or new value with a side effect of accounting a slice of irq time to wrong - * task when irq is in progress while we read rq->clock. That is a worthy - * compromise in place of having locks on each irq in account_system_time. 
- */ -static DEFINE_PER_CPU(u64, cpu_hardirq_time); -static DEFINE_PER_CPU(u64, cpu_softirq_time); - -static DEFINE_PER_CPU(u64, irq_start_time); -static int sched_clock_irqtime; - -void enable_sched_clock_irqtime(void) -{ - sched_clock_irqtime = 1; -} - -void disable_sched_clock_irqtime(void) -{ - sched_clock_irqtime = 0; -} - -#ifndef CONFIG_64BIT -static DEFINE_PER_CPU(seqcount_t, irq_time_seq); - -static inline void irq_time_write_begin(void) -{ - __this_cpu_inc(irq_time_seq.sequence); - smp_wmb(); -} - -static inline void irq_time_write_end(void) -{ - smp_wmb(); - __this_cpu_inc(irq_time_seq.sequence); -} - -static inline u64 irq_time_read(int cpu) -{ - u64 irq_time; - unsigned seq; - - do { - seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu)); - irq_time = per_cpu(cpu_softirq_time, cpu) + - per_cpu(cpu_hardirq_time, cpu); - } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq)); - - return irq_time; -} -#else /* CONFIG_64BIT */ -static inline void irq_time_write_begin(void) -{ -} - -static inline void irq_time_write_end(void) -{ -} - -static inline u64 irq_time_read(int cpu) -{ - return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu); -} -#endif /* CONFIG_64BIT */ - -/* - * Called before incrementing preempt_count on {soft,}irq_enter - * and before decrementing preempt_count on {soft,}irq_exit. - */ -void irqtime_account_irq(struct task_struct *curr) -{ - unsigned long flags; - s64 delta; - int cpu; - - if (!sched_clock_irqtime) - return; - - local_irq_save(flags); - - cpu = smp_processor_id(); - delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time); - __this_cpu_add(irq_start_time, delta); - - irq_time_write_begin(); - /* - * We do not account for softirq time from ksoftirqd here. - * We want to continue accounting softirq time to ksoftirqd thread - * in that case, so as not to confuse scheduler with a special task - * that do not consume any time, but still wants to run. - */ - if (hardirq_count()) - __this_cpu_add(cpu_hardirq_time, delta); - else if (in_serving_softirq() && curr != this_cpu_ksoftirqd()) - __this_cpu_add(cpu_softirq_time, delta); - - irq_time_write_end(); - local_irq_restore(flags); -} -EXPORT_SYMBOL_GPL(irqtime_account_irq); - -#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ - #ifdef CONFIG_PARAVIRT static inline u64 steal_ticks(u64 steal) { @@ -2968,89 +2878,6 @@ static void update_rq_clock_task(struct rq *rq, s64 delta) # define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs) #endif -#ifdef CONFIG_IRQ_TIME_ACCOUNTING -static void irqtime_account_hi_si(void) -{ - u64 *cpustat = kcpustat_this_cpu->cpustat; - u64 latest_ns; - - latest_ns = nsecs_to_cputime64(this_cpu_read(cpu_hardirq_time)); - if (latest_ns > cpustat[CPUTIME_IRQ]) - cpustat[CPUTIME_IRQ] += (__force u64)cputime_one_jiffy; - - latest_ns = nsecs_to_cputime64(this_cpu_read(cpu_softirq_time)); - if (latest_ns > cpustat[CPUTIME_SOFTIRQ]) - cpustat[CPUTIME_SOFTIRQ] += (__force u64)cputime_one_jiffy; -} -#else /* CONFIG_IRQ_TIME_ACCOUNTING */ - -#define sched_clock_irqtime (0) - -static inline void irqtime_account_hi_si(void) -{ -} -#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ - -static __always_inline bool steal_account_process_tick(void) -{ -#ifdef CONFIG_PARAVIRT - if (static_key_false(¶virt_steal_enabled)) { - u64 steal; - cputime_t steal_ct; - - steal = paravirt_steal_clock(smp_processor_id()); - steal -= this_rq()->prev_steal_time; - - /* - * cputime_t may be less precise than nsecs (eg: if it's - * based on jiffies). 
Lets cast the result to cputime - * granularity and account the rest on the next rounds. - */ - steal_ct = nsecs_to_cputime(steal); - this_rq()->prev_steal_time += cputime_to_nsecs(steal_ct); - - account_steal_time(steal_ct); - return steal_ct; - } -#endif - return false; -} - -/* - * Accumulate raw cputime values of dead tasks (sig->[us]time) and live - * tasks (sum on group iteration) belonging to @tsk's group. - */ -void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) -{ - struct signal_struct *sig = tsk->signal; - cputime_t utime, stime; - struct task_struct *t; - unsigned int seq, nextseq; - unsigned long flags; - - rcu_read_lock(); - /* Attempt a lockless read on the first round. */ - nextseq = 0; - do { - seq = nextseq; - flags = read_seqbegin_or_lock_irqsave(&sig->stats_lock, &seq); - times->utime = sig->utime; - times->stime = sig->stime; - times->sum_exec_runtime = sig->sum_sched_runtime; - - for_each_thread(tsk, t) { - task_cputime(t, &utime, &stime); - times->utime += utime; - times->stime += stime; - times->sum_exec_runtime += task_sched_runtime(t); - } - /* If lockless access failed, take the lock. */ - nextseq = 1; - } while (need_seqretry(&sig->stats_lock, seq)); - done_seqretry_irqrestore(&sig->stats_lock, seq, flags); - rcu_read_unlock(); -} - /* * On each tick, add the number of nanoseconds to the unbanked variables and * once one tick's worth has accumulated, account it allowing for accurate @@ -3175,15 +3002,11 @@ static void pc_user_time(struct rq *rq, struct task_struct *p, unsigned long ns) * Bank in p->sched_time the ns elapsed since the last tick or switch. * CPU scheduler quota accounting is also performed here in microseconds. */ -static void -update_cpu_clock_tick(struct rq *rq, struct task_struct *p) +static void update_cpu_clock_tick(struct rq *rq, struct task_struct *p) { s64 account_ns = rq->niffies - p->last_ran; struct task_struct *idle = rq->idle; - if (steal_account_process_tick()) - goto ts_account; - /* Accurate tick timekeeping */ if (user_mode(get_irq_regs())) pc_user_time(rq, p, account_ns); @@ -3192,10 +3015,6 @@ update_cpu_clock_tick(struct rq *rq, struct task_struct *p) } else pc_idle_time(rq, idle, account_ns); - if (sched_clock_irqtime) - irqtime_account_hi_si(); - -ts_account: /* time_slice accounting is done in usecs to avoid overflow on 32bit */ if (p->policy != SCHED_FIFO && p != idle) p->time_slice -= NS_TO_US(account_ns); @@ -3208,8 +3027,7 @@ ts_account: * Bank in p->sched_time the ns elapsed since the last tick or switch. * CPU scheduler quota accounting is also performed here in microseconds. */ -static void -update_cpu_clock_switch(struct rq *rq, struct task_struct *p) +static void update_cpu_clock_switch(struct rq *rq, struct task_struct *p) { s64 account_ns = rq->niffies - p->last_ran; struct task_struct *idle = rq->idle; @@ -3283,133 +3101,86 @@ unsigned long long task_sched_runtime(struct task_struct *p) return ns; } -/* Compatibility crap */ -void account_user_time(struct task_struct *p, cputime_t cputime, - cputime_t cputime_scaled) -{ -} - -void account_idle_time(cputime_t cputime) +#ifdef CONFIG_HIGH_RES_TIMERS +static inline int hrexpiry_enabled(struct rq *rq) { + if (unlikely(!cpu_active(cpu_of(rq)) || !sched_smp_initialized)) + return 0; + return hrtimer_is_hres_active(&rq->hrexpiry_timer); } /* - * Account guest cpu time to a process. 
- * @p: the process that the cpu time gets accounted to - * @cputime: the cpu time spent in virtual machine since the last update - * @cputime_scaled: cputime scaled by cpu frequency + * Use HR-timers to deliver accurate preemption points. */ -static void account_guest_time(struct task_struct *p, cputime_t cputime, - cputime_t cputime_scaled) +static void hrexpiry_clear(struct rq *rq) { - u64 *cpustat = kcpustat_this_cpu->cpustat; - - /* Add guest time to process. */ - p->utime += (__force u64)cputime; - p->utimescaled += (__force u64)cputime_scaled; - account_group_user_time(p, cputime); - p->gtime += (__force u64)cputime; - - /* Add guest time to cpustat. */ - if (task_nice(p) > 0) { - cpustat[CPUTIME_NICE] += (__force u64)cputime; - cpustat[CPUTIME_GUEST_NICE] += (__force u64)cputime; - } else { - cpustat[CPUTIME_USER] += (__force u64)cputime; - cpustat[CPUTIME_GUEST] += (__force u64)cputime; - } + if (!hrexpiry_enabled(rq)) + return; + if (hrtimer_active(&rq->hrexpiry_timer)) + hrtimer_cancel(&rq->hrexpiry_timer); } /* - * Account system cpu time to a process and desired cpustat field - * @p: the process that the cpu time gets accounted to - * @cputime: the cpu time spent in kernel space since the last update - * @cputime_scaled: cputime scaled by cpu frequency - * @target_cputime64: pointer to cpustat field that has to be updated + * High-resolution time_slice expiry. + * Runs from hardirq context with interrupts disabled. */ -static inline -void __account_system_time(struct task_struct *p, cputime_t cputime, - cputime_t cputime_scaled, cputime64_t *target_cputime64) +static enum hrtimer_restart hrexpiry(struct hrtimer *timer) { - /* Add system time to process. */ - p->stime += (__force u64)cputime; - p->stimescaled += (__force u64)cputime_scaled; - account_group_system_time(p, cputime); - - /* Add system time to cpustat. */ - *target_cputime64 += (__force u64)cputime; - - /* Account for system time used */ - acct_update_integrals(p); -} + struct rq *rq = container_of(timer, struct rq, hrexpiry_timer); + struct task_struct *p; -/* - * Account system cpu time to a process. - * @p: the process that the cpu time gets accounted to - * @hardirq_offset: the offset to subtract from hardirq_count() - * @cputime: the cpu time spent in kernel space since the last update - * @cputime_scaled: cputime scaled by cpu frequency - * This is for guest only now. - */ -void account_system_time(struct task_struct *p, int hardirq_offset, - cputime_t cputime, cputime_t cputime_scaled) -{ + /* This can happen during CPU hotplug / resume */ + if (unlikely(cpu_of(rq) != smp_processor_id())) + goto out; - if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) - account_guest_time(p, cputime, cputime_scaled); + /* + * We're doing this without the runqueue lock but this should always + * be run on the local CPU. Time slice should run out in __schedule + * but we set it to zero here in case niffies is slightly less. + */ + p = rq->curr; + p->time_slice = 0; + __set_tsk_resched(p); +out: + return HRTIMER_NORESTART; } /* - * Account for involuntary wait time. - * @steal: the cpu time spent in involuntary wait + * Called to set the hrexpiry timer state. 
+ * + * called with irqs disabled from the local CPU only */ -void account_steal_time(cputime_t cputime) +static void hrexpiry_start(struct rq *rq, u64 delay) { - u64 *cpustat = kcpustat_this_cpu->cpustat; + if (!hrexpiry_enabled(rq)) + return; - cpustat[CPUTIME_STEAL] += (__force u64)cputime; + hrtimer_start(&rq->hrexpiry_timer, ns_to_ktime(delay), + HRTIMER_MODE_REL_PINNED); } -/* - * Account for idle time. - * @cputime: the cpu time spent in idle wait - */ -static void account_idle_times(cputime_t cputime) +static void init_rq_hrexpiry(struct rq *rq) { - u64 *cpustat = kcpustat_this_cpu->cpustat; - struct rq *rq = this_rq(); - - if (atomic_read(&rq->nr_iowait) > 0) - cpustat[CPUTIME_IOWAIT] += (__force u64)cputime; - else - cpustat[CPUTIME_IDLE] += (__force u64)cputime; + hrtimer_init(&rq->hrexpiry_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + rq->hrexpiry_timer.function = hrexpiry; } -#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE - -void account_process_tick(struct task_struct *p, int user_tick) +static inline int rq_dither(struct rq *rq) { + if (!hrexpiry_enabled(rq)) + return HALF_JIFFY_US; + return 0; } - -/* - * Account multiple ticks of steal time. - * @p: the process from which the cpu time has been stolen - * @ticks: number of stolen ticks - */ -void account_steal_ticks(unsigned long ticks) +#else /* CONFIG_HIGH_RES_TIMERS */ +static inline void init_rq_hrexpiry(struct rq *rq) { - account_steal_time(jiffies_to_cputime(ticks)); } -/* - * Account multiple ticks of idle time. - * @ticks: number of stolen ticks - */ -void account_idle_ticks(unsigned long ticks) +static inline int rq_dither(struct rq *rq) { - account_idle_times(jiffies_to_cputime(ticks)); + return HALF_JIFFY_US; } -#endif +#endif /* CONFIG_HIGH_RES_TIMERS */ /* * Functions to test for when SCHED_ISO tasks have used their allocated @@ -3488,6 +3259,8 @@ static void task_running_tick(struct rq *rq) * allowed to run into the 2nd half of the next tick if they will * run out of time slice in the interim. Otherwise, if they have * less than RESCHED_US μs of time slice left they will be rescheduled. + * Dither is used as a backup for when hrexpiry is disabled or high res + * timers not configured in. */ if (p->time_slice - rq->dither >= RESCHED_US) return; @@ -3497,6 +3270,60 @@ out_resched: rq_unlock(rq); } +#ifdef CONFIG_NO_HZ_FULL +/* + * We can stop the timer tick any time highres timers are active since + * we rely entirely on highres timeouts for task expiry rescheduling. + */ +static void sched_stop_tick(struct rq *rq, int cpu) +{ + if (!hrexpiry_enabled(rq)) + return; + if (!tick_nohz_full_enabled()) + return; + if (!tick_nohz_full_cpu(cpu)) + return; + tick_nohz_dep_clear_cpu(cpu, TICK_DEP_BIT_SCHED); +} + +static inline void sched_start_tick(struct rq *rq, int cpu) +{ + tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED); +} + +/** + * scheduler_tick_max_deferment + * + * Keep at least one tick per second when a single + * active task is running. + * + * This makes sure that uptime continues to move forward, even + * with a very low granularity. + * + * Return: Maximum deferment in nanoseconds. 
+ */ +u64 scheduler_tick_max_deferment(void) +{ + struct rq *rq = this_rq(); + unsigned long next, now = READ_ONCE(jiffies); + + next = rq->last_jiffy + HZ; + + if (time_before_eq(next, now)) + return 0; + + return jiffies_to_nsecs(next - now); +} +#else +static inline void sched_stop_tick(struct rq *rq, int cpu) +{ +} + +static inline void sched_start_tick(struct rq *rq, int cpu) +{ +} +#endif + /* * This function gets called by the timer code, with HZ frequency. * We call it with interrupts disabled. @@ -3507,7 +3334,7 @@ void scheduler_tick(void) struct rq *rq = cpu_rq(cpu); sched_clock_tick(); - update_rq_clock(rq); + update_clocks(rq); update_load_avg(rq); update_cpu_clock_tick(rq, rq->curr); if (!rq_idle(rq)) @@ -3517,6 +3344,7 @@ void scheduler_tick(void) rq->last_scheduler_tick = rq->last_jiffy; rq->last_tick = rq->clock; perf_event_task_tick(); + sched_stop_tick(rq, cpu); } #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ @@ -3799,6 +3627,17 @@ static inline void schedule_debug(struct task_struct *prev) */ static inline void set_rq_task(struct rq *rq, struct task_struct *p) { +#ifdef CONFIG_HIGH_RES_TIMERS + if (p == rq->idle || p->policy == SCHED_FIFO) + hrexpiry_clear(rq); + else + hrexpiry_start(rq, US_TO_NS(p->time_slice)); +#endif /* CONFIG_HIGH_RES_TIMERS */ + if (rq->clock - rq->last_tick > HALF_JIFFY_NS) + rq->dither = 0; + else + rq->dither = rq_dither(rq); + rq->rq_deadline = p->deadline; rq->rq_prio = p->prio; #ifdef CONFIG_SMT_NICE @@ -3980,10 +3819,6 @@ static void __sched notrace __schedule(bool preempt) update_clocks(rq); niffies = rq->niffies; update_cpu_clock_switch(rq, prev); - if (rq->clock - rq->last_tick > HALF_JIFFY_NS) - rq->dither = 0; - else - rq->dither = HALF_JIFFY_US; clear_tsk_need_resched(prev); clear_preempt_need_resched(); @@ -3994,15 +3829,15 @@ static void __sched notrace __schedule(bool preempt) } next = earliest_deadline_task(rq, cpu, idle); - if (likely(next->prio != PRIO_LIMIT)) { + if (likely(next->prio != PRIO_LIMIT)) clear_cpuidle_map(cpu); - next->last_ran = niffies; - } else { + else { set_cpuidle_map(cpu); update_load_avg(rq); } set_rq_task(rq, next); + next->last_ran = niffies; if (likely(prev != next)) { /* @@ -4014,14 +3849,16 @@ static void __sched notrace __schedule(bool preempt) check_siblings(rq); else wake_siblings(rq); - rq->nr_switches++; + atomic64_inc(&grq.nr_switches); rq->curr = next; ++*switch_count; trace_sched_switch(preempt, prev, next); rq = context_switch(rq, prev, next); /* unlocks the rq */ - } else + } else { + check_siblings(rq); rq_unlock_irq(rq); + } } static inline void sched_submit_work(struct task_struct *tsk) @@ -5607,8 +5444,12 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) { __do_set_cpus_allowed(p, new_mask); if (needs_other_cpu(p, task_cpu(p))) { + struct rq *rq; + set_task_cpu(p, valid_task_cpu(p)); + rq = __task_rq_lock(p); resched_task(p); + __task_rq_unlock(rq); } } @@ -5641,6 +5482,7 @@ void init_idle(struct task_struct *idle, int cpu) raw_spin_lock_irqsave(&idle->pi_lock, flags); raw_spin_lock(&rq->lock); idle->last_ran = rq->niffies; + time_slice_expired(idle, rq); idle->state = TASK_RUNNING; /* Setting prio to illegal value shouldn't matter when never queued */ idle->prio = PRIO_LIMIT; @@ -7439,6 +7281,8 @@ int sched_cpu_dying(unsigned int cpu) } bind_zero(cpu); double_rq_unlock(rq, cpu_rq(0)); + sched_start_tick(rq, cpu); + hrexpiry_clear(rq); local_irq_restore(flags); return 0; @@ -7463,7 +7307,7 @@ static const cpumask_t *thread_cpumask(int 
cpu) /* All this CPU's SMT siblings are idle */ static bool siblings_cpu_idle(struct rq *rq) { - return cpumask_subset(&rq->thread_mask, &cpu_idle_map); + return cpumask_subset(&rq->thread_mask, &grq.cpu_idle_map); } #endif #ifdef CONFIG_SCHED_MC @@ -7474,7 +7318,7 @@ static const cpumask_t *core_cpumask(int cpu) /* All this CPU's shared cache siblings are idle */ static bool cache_cpu_idle(struct rq *rq) { - return cpumask_subset(&rq->core_mask, &cpu_idle_map); + return cpumask_subset(&rq->core_mask, &grq.cpu_idle_map); } #endif @@ -7609,6 +7453,7 @@ void __init sched_init_smp(void) #else void __init sched_init_smp(void) { + sched_smp_initialized = true; } #endif /* CONFIG_SMP */ @@ -7655,11 +7500,14 @@ void __init sched_init(void) for (i = 1 ; i < NICE_WIDTH ; i++) prio_ratios[i] = prio_ratios[i - 1] * 11 / 10; + atomic_set(&grq.nr_running, 0); + atomic_set(&grq.nr_uninterruptible, 0); + atomic64_set(&grq.nr_switches, 0); skiplist_node_init(&init_task.node); #ifdef CONFIG_SMP init_defrootdomain(); - cpumask_clear(&cpu_idle_map); + cpumask_clear(&grq.cpu_idle_map); #else uprq = &per_cpu(runqueues, 0); #endif @@ -7673,7 +7521,6 @@ void __init sched_init(void) #endif /* CONFIG_CGROUP_SCHED */ for_each_possible_cpu(i) { rq = cpu_rq(i); - rq->nr_running = rq->nr_uninterruptible = rq->nr_switches = 0; skiplist_init(&rq->node); rq->sl = new_skiplist(&rq->node); raw_spin_lock_init(&rq->lock); @@ -7692,6 +7539,7 @@ void __init sched_init(void) rq->cpu = i; rq_attach_root(rq, &def_root_domain); #endif + init_rq_hrexpiry(rq); atomic_set(&rq->nr_iowait, 0); } @@ -7899,199 +7747,6 @@ void set_curr_task(int cpu, struct task_struct *p) #endif -/* - * Use precise platform statistics if available: - */ -#ifdef CONFIG_VIRT_CPU_ACCOUNTING - -#ifndef __ARCH_HAS_VTIME_TASK_SWITCH -void vtime_common_task_switch(struct task_struct *prev) -{ - if (is_idle_task(prev)) - vtime_account_idle(prev); - else - vtime_account_system(prev); - -#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE - vtime_account_user(prev); -#endif - arch_vtime_task_switch(prev); -} -#endif - -#endif /* CONFIG_VIRT_CPU_ACCOUNTING */ - -#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE -void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) -{ - *ut = p->utime; - *st = p->stime; -} -EXPORT_SYMBOL_GPL(task_cputime_adjusted); - -void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) -{ - struct task_cputime cputime; - - thread_group_cputime(p, &cputime); - - *ut = cputime.utime; - *st = cputime.stime; -} - -void vtime_account_system_irqsafe(struct task_struct *tsk) -{ - unsigned long flags; - - local_irq_save(flags); - vtime_account_system(tsk); - local_irq_restore(flags); -} -EXPORT_SYMBOL_GPL(vtime_account_system_irqsafe); - -/* - * Archs that account the whole time spent in the idle task - * (outside irq) as idle time can rely on this and just implement - * vtime_account_system() and vtime_account_idle(). Archs that - * have other meaning of the idle time (s390 only includes the - * time spent by the CPU when it's in low power mode) must override - * vtime_account(). 
- */ -#ifndef __ARCH_HAS_VTIME_ACCOUNT -void vtime_account_irq_enter(struct task_struct *tsk) -{ - if (!in_interrupt() && is_idle_task(tsk)) - vtime_account_idle(tsk); - else - vtime_account_system(tsk); -} -EXPORT_SYMBOL_GPL(vtime_account_irq_enter); -#endif /* __ARCH_HAS_VTIME_ACCOUNT */ - -#else /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ -/* - * Perform (stime * rtime) / total, but avoid multiplication overflow by - * losing precision when the numbers are big. - */ -static cputime_t scale_stime(u64 stime, u64 rtime, u64 total) -{ - u64 scaled; - - for (;;) { - /* Make sure "rtime" is the bigger of stime/rtime */ - if (stime > rtime) { - u64 tmp = rtime; rtime = stime; stime = tmp; - } - - /* Make sure 'total' fits in 32 bits */ - if (total >> 32) - goto drop_precision; - - /* Does rtime (and thus stime) fit in 32 bits? */ - if (!(rtime >> 32)) - break; - - /* Can we just balance rtime/stime rather than dropping bits? */ - if (stime >> 31) - goto drop_precision; - - /* We can grow stime and shrink rtime and try to make them both fit */ - stime <<= 1; - rtime >>= 1; - continue; - -drop_precision: - /* We drop from rtime, it has more bits than stime */ - rtime >>= 1; - total >>= 1; - } - - /* - * Make sure gcc understands that this is a 32x32->64 multiply, - * followed by a 64/32->64 divide. - */ - scaled = div_u64((u64) (u32) stime * (u64) (u32) rtime, (u32)total); - return (__force cputime_t) scaled; -} - -/* - * Adjust tick based cputime random precision against scheduler - * runtime accounting. - */ -static void cputime_adjust(struct task_cputime *curr, - struct prev_cputime *prev, - cputime_t *ut, cputime_t *st) -{ - cputime_t rtime, stime, utime, total; - - stime = curr->stime; - total = stime + curr->utime; - - /* - * Tick based cputime accounting depend on random scheduling - * timeslices of a task to be interrupted or not by the timer. - * Depending on these circumstances, the number of these interrupts - * may be over or under-optimistic, matching the real user and system - * cputime with a variable precision. - * - * Fix this by scaling these tick based values against the total - * runtime accounted by the CFS scheduler. - */ - rtime = nsecs_to_cputime(curr->sum_exec_runtime); - - /* - * Update userspace visible utime/stime values only if actual execution - * time is bigger than already exported. Note that can happen, that we - * provided bigger values due to scaling inaccuracy on big numbers. - */ - if (prev->stime + prev->utime >= rtime) - goto out; - - if (total) { - stime = scale_stime((__force u64)stime, - (__force u64)rtime, (__force u64)total); - utime = rtime - stime; - } else { - stime = rtime; - utime = 0; - } - - /* - * If the tick based count grows faster than the scheduler one, - * the result of the scaling may go backward. - * Let's enforce monotonicity. - */ - prev->stime = max(prev->stime, stime); - prev->utime = max(prev->utime, utime); - -out: - *ut = prev->utime; - *st = prev->stime; -} - -void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) -{ - struct task_cputime cputime = { - .sum_exec_runtime = tsk_seruntime(p), - }; - - task_cputime(p, &cputime.utime, &cputime.stime); - cputime_adjust(&cputime, &p->prev_cputime, ut, st); -} -EXPORT_SYMBOL_GPL(task_cputime_adjusted); - -/* - * Must be called with siglock held. 
- */ -void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) -{ - struct task_cputime cputime; - - thread_group_cputime(p, &cputime); - cputime_adjust(&cputime, &p->signal->prev_cputime, ut, st); -} -#endif /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ - void init_idle_bootup_task(struct task_struct *idle) {} diff --git a/kernel/sched/MuQSS.h b/kernel/sched/MuQSS.h index 10a12b335..f9510d739 100644 --- a/kernel/sched/MuQSS.h +++ b/kernel/sched/MuQSS.h @@ -2,6 +2,7 @@ #include #include #include +#include "cpuacct.h" #ifndef MUQSS_SCHED_H #define MUQSS_SCHED_H @@ -17,9 +18,6 @@ struct rq { struct task_struct *curr, *idle, *stop; struct mm_struct *prev_mm; - long nr_uninterruptible; - s64 nr_switches; - int nr_running; raw_spinlock_t lock; @@ -88,6 +86,10 @@ struct rq { int iso_ticks; bool iso_refractory; +#ifdef CONFIG_HIGH_RES_TIMERS + struct hrtimer hrexpiry_timer; +#endif + #ifdef CONFIG_SCHEDSTATS /* latency stats */ @@ -247,6 +249,55 @@ static inline struct cpuidle_state *idle_get_state(struct rq *rq) } #endif +#ifdef CONFIG_IRQ_TIME_ACCOUNTING + +DECLARE_PER_CPU(u64, cpu_hardirq_time); +DECLARE_PER_CPU(u64, cpu_softirq_time); + +#ifndef CONFIG_64BIT +DECLARE_PER_CPU(seqcount_t, irq_time_seq); + +static inline void irq_time_write_begin(void) +{ + __this_cpu_inc(irq_time_seq.sequence); + smp_wmb(); +} + +static inline void irq_time_write_end(void) +{ + smp_wmb(); + __this_cpu_inc(irq_time_seq.sequence); +} + +static inline u64 irq_time_read(int cpu) +{ + u64 irq_time; + unsigned seq; + + do { + seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu)); + irq_time = per_cpu(cpu_softirq_time, cpu) + + per_cpu(cpu_hardirq_time, cpu); + } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq)); + + return irq_time; +} +#else /* CONFIG_64BIT */ +static inline void irq_time_write_begin(void) +{ +} + +static inline void irq_time_write_end(void) +{ +} + +static inline u64 irq_time_read(int cpu) +{ + return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu); +} +#endif /* CONFIG_64BIT */ +#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ + #ifdef CONFIG_CPU_FREQ DECLARE_PER_CPU(struct update_util_data *, cpufreq_update_util_data); diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index a846cf89e..f09077a45 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -4,7 +4,12 @@ #include #include #include +#ifdef CONFIG_SCHED_MUQSS +#include "MuQSS.h" +#include "stats.h" +#else #include "sched.h" +#endif #ifdef CONFIG_PARAVIRT #include #endif @@ -671,7 +676,7 @@ out: void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) { struct task_cputime cputime = { - .sum_exec_runtime = p->se.sum_exec_runtime, + .sum_exec_runtime = tsk_seruntime(p), }; task_cputime(p, &cputime.utime, &cputime.stime); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 4309c8e76..b728c4117 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -481,17 +481,23 @@ static inline int entity_before(struct sched_entity *a, static void update_min_vruntime(struct cfs_rq *cfs_rq) { + struct sched_entity *curr = cfs_rq->curr; + u64 vruntime = cfs_rq->min_vruntime; - if (cfs_rq->curr) - vruntime = cfs_rq->curr->vruntime; + if (curr) { + if (curr->on_rq) + vruntime = curr->vruntime; + else + curr = NULL; + } if (cfs_rq->rb_leftmost) { struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost, struct sched_entity, run_node); - if (!cfs_rq->curr) + if (!curr) vruntime = se->vruntime; else vruntime = min_vruntime(vruntime, se->vruntime); @@ -705,7 
+711,14 @@ void init_entity_runnable_average(struct sched_entity *se) * will definitely be update (after enqueue). */ sa->period_contrib = 1023; - sa->load_avg = scale_load_down(se->load.weight); + /* + * Tasks are intialized with full load to be seen as heavy tasks until + * they get a chance to stabilize to their real load level. + * Group entities are intialized with zero load to reflect the fact that + * nothing has been attached to the task group yet. + */ + if (entity_is_task(se)) + sa->load_avg = scale_load_down(se->load.weight); sa->load_sum = sa->load_avg * LOAD_AVG_MAX; /* * At this point, util_avg won't be used in select_task_rq_fair anyway @@ -3484,9 +3497,10 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) account_entity_dequeue(cfs_rq, se); /* - * Normalize the entity after updating the min_vruntime because the - * update can refer to the ->curr item and we need to reflect this - * movement in our normalized position. + * Normalize after update_curr(); which will also have moved + * min_vruntime if @se is the one holding it back. But before doing + * update_min_vruntime() again, which will discount @se's position and + * can move min_vruntime forward still more. */ if (!(flags & DEQUEUE_SLEEP)) se->vruntime -= cfs_rq->min_vruntime; @@ -3494,8 +3508,16 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) /* return excess runtime on last dequeue */ return_cfs_rq_runtime(cfs_rq); - update_min_vruntime(cfs_rq); update_cfs_shares(cfs_rq); + + /* + * Now advance min_vruntime if @se was the entity holding it back, + * except when: DEQUEUE_SAVE && !DEQUEUE_MOVE, in this case we'll be + * put back on, and if we advance min_vruntime, we'll be placed back + * further than we started -- ie. we'll be penalized. + */ + if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) == DEQUEUE_SAVE) + update_min_vruntime(cfs_rq); } /* diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index 10e18d267..4008d9f95 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -89,7 +89,7 @@ config NO_HZ_IDLE config NO_HZ_FULL bool "Full dynticks system (tickless)" # NO_HZ_COMMON dependency - depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS && !SCHED_MUQSS + depends on !ARCH_USES_GETTIMEOFFSET && GENERIC_CLOCKEVENTS # We need at least one periodic CPU for timekeeping depends on SMP depends on HAVE_CONTEXT_TRACKING diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 2c5bc77c0..b96deed54 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -198,8 +198,13 @@ int clockevents_tick_resume(struct clock_event_device *dev) #ifdef CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST +#ifdef CONFIG_SCHED_MUQSS +/* Limit min_delta to 100us */ +#define MIN_DELTA_LIMIT (NSEC_PER_SEC / 10000) +#else /* Limit min_delta to a jiffie */ #define MIN_DELTA_LIMIT (NSEC_PER_SEC / HZ) +#endif /** * clockevents_increase_min_delta - raise minimum delta of a clock event device -- cgit v1.2.3
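The load_avg arithmetic introduced in update_load_avg() (kernel/sched/MuQSS.c above) is compact, so here is a stand-alone sketch that replays it outside the kernel. The helper name load_avg_update, the plain-argument interface in place of the struct rq fields and the grq.nr_running atomic, and the hard-coded SCHED_CAPACITY_SCALE of 1024 are illustrative assumptions; the arithmetic itself is transcribed from the hunk above. The decay fraction is us_interval * 5 / 262144 per update, so an interval of roughly 52 ms wipes out the previous average entirely.

#include <stdio.h>

#define SCHED_CAPACITY_SCALE 1024L

/*
 * Decay the previous average over the elapsed microseconds, then add the
 * squared instantaneous load scaled by the same fraction.  A CPU whose
 * instantaneous load covers all runnable tasks ramps up immediately.
 */
static long load_avg_update(long load_avg, long curload,
			    unsigned long us_interval, long nr_running)
{
	long load;

	load = load_avg - (load_avg * (long)us_interval * 5 / 262144);
	if (load < 0)
		load = 0;
	load += curload * curload * SCHED_CAPACITY_SCALE *
		(long)us_interval * 5 / 262144;
	if (curload > load && curload >= nr_running)
		load = curload;
	return load;
}

int main(void)
{
	long avg = 0;
	int tick;

	/* Two runnable tasks on this CPU, sampled every 1000 us (HZ=1000). */
	for (tick = 0; tick < 10; tick++) {
		avg = load_avg_update(avg, 2, 1000, 4);
		printf("tick %d: load_avg=%ld\n", tick, avg);
	}
	return 0;
}

Running the sketch shows the average climbing toward a steady state rather than tracking the raw run-queue depth, which is what lets the cpufreq_trigger() call in the patch see a smoothed load signal.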