path: root/kernel/sched/MuQSS.c
Diffstat (limited to 'kernel/sched/MuQSS.c')
-rw-r--r--  kernel/sched/MuQSS.c | 799
1 file changed, 227 insertions(+), 572 deletions(-)
diff --git a/kernel/sched/MuQSS.c b/kernel/sched/MuQSS.c
index bd9b1f13f..cf7a95286 100644
--- a/kernel/sched/MuQSS.c
+++ b/kernel/sched/MuQSS.c
@@ -123,6 +123,7 @@
*/
#define JIFFIES_TO_NS(TIME) ((TIME) * (1073741824 / HZ))
#define JIFFY_NS (1073741824 / HZ)
+#define JIFFY_US (1048576 / HZ)
#define NS_TO_JIFFIES(TIME) ((TIME) / JIFFY_NS)
#define HALF_JIFFY_NS (1073741824 / HZ / 2)
#define HALF_JIFFY_US (1048576 / HZ / 2)
@@ -130,12 +131,13 @@
#define MS_TO_US(TIME) ((TIME) << 10)
#define NS_TO_MS(TIME) ((TIME) >> 20)
#define NS_TO_US(TIME) ((TIME) >> 10)
+#define US_TO_NS(TIME) ((TIME) << 10)
#define RESCHED_US (100) /* Reschedule if less than this many μs left */
void print_scheduler_version(void)
{
- printk(KERN_INFO "MuQSS CPU scheduler v0.115 by Con Kolivas.\n");
+ printk(KERN_INFO "MuQSS CPU scheduler v0.120 by Con Kolivas.\n");
}
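/*
 * Illustrative note (not part of the patch): the time-conversion macros above
 * trade a little accuracy for speed by using power-of-two scale factors, so
 * they compile down to shifts and divides by constants: 1073741824 is 2^30
 * (~1e9 ns per second), 1048576 is 2^20 (~1e6 us per second), and << 10 /
 * >> 10 stand in for *1000 and /1000. A minimal user-space sketch of the
 * arithmetic, assuming HZ=1000:
 */
#include <stdio.h>

#define HZ 1000
#define JIFFY_NS (1073741824 / HZ)      /* ~1073741 "ns", vs exactly 1000000 */
#define JIFFY_US (1048576 / HZ)         /* ~1048 "us", vs exactly 1000 */
#define US_TO_NS(t) ((t) << 10)         /* ~*1000: 100 us -> 102400 "ns" */

int main(void)
{
	printf("JIFFY_NS=%d JIFFY_US=%d US_TO_NS(100)=%d\n",
	       JIFFY_NS, JIFFY_US, US_TO_NS(100));
	return 0;
}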
/*
@@ -179,9 +181,26 @@ static inline int timeslice(void)
return MS_TO_US(rr_interval);
}
+static bool sched_smp_initialized __read_mostly;
+
+/*
+ * The global runqueue data that all CPUs work off. Contains atomic
+ * variables and a cpu bitmap set atomically.
+ */
+struct global_rq {
#ifdef CONFIG_SMP
-static cpumask_t cpu_idle_map ____cacheline_aligned_in_smp;
+ atomic_t nr_running ____cacheline_aligned_in_smp;
+ atomic_t nr_uninterruptible ____cacheline_aligned_in_smp;
+ atomic64_t nr_switches ____cacheline_aligned_in_smp;
+ cpumask_t cpu_idle_map ____cacheline_aligned_in_smp;
+#else
+ atomic_t nr_running ____cacheline_aligned;
+ atomic_t nr_uninterruptible ____cacheline_aligned;
+ atomic64_t nr_switches ____cacheline_aligned;
+#endif
+};
+#ifdef CONFIG_SMP
/*
* We add the notion of a root-domain which will be used to define per-domain
* variables. Each exclusive cpuset essentially defines an island domain by
@@ -213,6 +232,13 @@ static struct root_domain def_root_domain;
#endif /* CONFIG_SMP */
+/* There can be only one */
+#ifdef CONFIG_SMP
+static struct global_rq grq ____cacheline_aligned_in_smp;
+#else
+static struct global_rq grq ____cacheline_aligned;
+#endif
+
static DEFINE_MUTEX(sched_hotcpu_mutex);
/* cpus with isolated domains */
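/*
 * Illustrative sketch (not part of the patch): the per-runqueue
 * nr_running/nr_uninterruptible/nr_switches counters are folded into one
 * cache-aligned global struct that activate_task(), deactivate_task() and
 * __schedule() update atomically, so the externally visible statistics
 * become single atomic reads instead of per-CPU sums. A minimal user-space
 * model of that idea, using C11 atomics rather than the kernel's atomic_t:
 */
#include <stdatomic.h>
#include <stdio.h>

static struct {
	atomic_int nr_running;
	atomic_int nr_uninterruptible;
	atomic_llong nr_switches;
} grq_model;

static void model_activate(int contributes_to_load)
{
	if (contributes_to_load)
		atomic_fetch_sub(&grq_model.nr_uninterruptible, 1);
	atomic_fetch_add(&grq_model.nr_running, 1);
}

static int model_nr_active(void)
{
	return atomic_load(&grq_model.nr_running) +
	       atomic_load(&grq_model.nr_uninterruptible);
}

int main(void)
{
	atomic_fetch_add(&grq_model.nr_uninterruptible, 1);  /* task went to sleep */
	model_activate(1);                                   /* ...and is woken */
	atomic_fetch_add(&grq_model.nr_switches, 1);
	printf("nr_active=%d switches=%lld\n", model_nr_active(),
	       (long long)atomic_load(&grq_model.nr_switches));
	return 0;
}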
@@ -768,7 +794,6 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
*/
if (unlikely(task_on_rq_migrating(prev))) {
sched_info_dequeued(rq, prev);
- rq->nr_running--;
/*
* We move the ownership of prev to the new cpu now. ttwu can't
* activate prev to the wrong cpu since it has to grab this
@@ -779,7 +804,6 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
raw_spin_lock(&prev->pi_lock);
rq = __task_rq_lock(prev);
- rq->nr_running++;
/* Check that someone else hasn't already queued prev */
if (likely(!task_queued(prev))) {
enqueue_task(rq, prev, 0);
@@ -834,7 +858,7 @@ static inline int ms_longest_deadline_diff(void)
static inline int rq_load(struct rq *rq)
{
- return rq->nr_running;
+ return rq->sl->entries + !rq_idle(rq);
}
static inline bool rq_local(struct rq *rq);
@@ -848,20 +872,24 @@ static inline bool rq_local(struct rq *rq);
*/
static void update_load_avg(struct rq *rq)
{
- /* rq clock can go backwards so skip update if that happens */
- if (likely(rq->clock > rq->load_update)) {
- unsigned long us_interval = (rq->clock - rq->load_update) >> 10;
- long load, curload = rq_load(rq);
+ unsigned long us_interval;
+ long load, curload;
- load = rq->load_avg - (rq->load_avg * us_interval * 5 / 262144);
- if (unlikely(load < 0))
- load = 0;
- load += curload * curload * SCHED_CAPACITY_SCALE * us_interval * 5 / 262144;
- rq->load_avg = load;
- } else
+ if (unlikely(rq->niffies <= rq->load_update))
return;
- rq->load_update = rq->clock;
+ us_interval = NS_TO_US(rq->niffies - rq->load_update);
+ curload = rq_load(rq);
+ load = rq->load_avg - (rq->load_avg * us_interval * 5 / 262144);
+ if (unlikely(load < 0))
+ load = 0;
+ load += curload * curload * SCHED_CAPACITY_SCALE * us_interval * 5 / 262144;
+ /* If this CPU has all the load, make it ramp up quickly */
+ if (curload > load && curload >= atomic_read(&grq.nr_running))
+ load = curload;
+ rq->load_avg = load;
+
+ rq->load_update = rq->niffies;
if (likely(rq_local(rq)))
cpufreq_trigger(rq->niffies, rq->load_avg);
}
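/*
 * Illustrative arithmetic (not part of the patch): in the rewritten
 * update_load_avg() the decay factor us_interval * 5 / 262144 uses
 * 262144 = 2^18, i.e. roughly 1.9% of the old average per millisecond, so
 * the old value is fully replaced after about 52 ms; the new contribution
 * weights the squared runqueue load by SCHED_CAPACITY_SCALE (1024 in
 * mainline, assumed here) over the same window. A runnable check of one
 * 1 ms update with two runnable tasks:
 */
#include <stdio.h>

int main(void)
{
	long load_avg = 1024, curload = 2, load;
	unsigned long us_interval = 1000;   /* 1 ms since the last update */

	/* old average loses us_interval * 5 / 2^18, ~1.9% per millisecond */
	load = load_avg - (load_avg * us_interval * 5 / 262144);
	/* new contribution: curload^2 * SCHED_CAPACITY_SCALE over the window */
	load += curload * curload * 1024 * us_interval * 5 / 262144;
	printf("load_avg: %ld -> %ld\n", load_avg, load);
	return 0;
}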
@@ -1085,7 +1113,7 @@ static inline void atomic_set_cpu(int cpu, cpumask_t *cpumask)
static inline void set_cpuidle_map(int cpu)
{
if (likely(cpu_online(cpu)))
- atomic_set_cpu(cpu, &cpu_idle_map);
+ atomic_set_cpu(cpu, &grq.cpu_idle_map);
}
static inline void atomic_clear_cpu(int cpu, cpumask_t *cpumask)
@@ -1095,12 +1123,12 @@ static inline void atomic_clear_cpu(int cpu, cpumask_t *cpumask)
static inline void clear_cpuidle_map(int cpu)
{
- atomic_clear_cpu(cpu, &cpu_idle_map);
+ atomic_clear_cpu(cpu, &grq.cpu_idle_map);
}
static bool suitable_idle_cpus(struct task_struct *p)
{
- return (cpumask_intersects(&p->cpus_allowed, &cpu_idle_map));
+ return (cpumask_intersects(&p->cpus_allowed, &grq.cpu_idle_map));
}
/*
@@ -1231,7 +1259,7 @@ static struct rq *resched_best_idle(struct task_struct *p, int cpu)
struct rq *rq;
int best_cpu;
- cpumask_and(&tmpmask, &p->cpus_allowed, &cpu_idle_map);
+ cpumask_and(&tmpmask, &p->cpus_allowed, &grq.cpu_idle_map);
best_cpu = best_mask_cpu(cpu, task_rq(p), &tmpmask);
rq = cpu_rq(best_cpu);
if (!smt_schedule(p, rq))
@@ -1343,11 +1371,11 @@ static void activate_task(struct task_struct *p, struct rq *rq)
p->prio = effective_prio(p);
if (task_contributes_to_load(p))
- rq->nr_uninterruptible--;
+ atomic_dec(&grq.nr_uninterruptible);
enqueue_task(rq, p, 0);
p->on_rq = TASK_ON_RQ_QUEUED;
- rq->nr_running++;
+ atomic_inc(&grq.nr_running);
}
/*
@@ -1357,10 +1385,10 @@ static void activate_task(struct task_struct *p, struct rq *rq)
static inline void deactivate_task(struct task_struct *p, struct rq *rq)
{
if (task_contributes_to_load(p))
- rq->nr_uninterruptible++;
+ atomic_inc(&grq.nr_uninterruptible);
p->on_rq = 0;
- rq->nr_running--;
+ atomic_dec(&grq.nr_running);
sched_info_dequeued(rq, p);
}
@@ -1381,7 +1409,7 @@ void set_task_cpu(struct task_struct *p, unsigned int cpu)
WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
lockdep_is_held(&task_rq(p)->lock)));
#endif
- if (p->wake_cpu == cpu)
+ if (task_cpu(p) == cpu)
return;
trace_sched_migrate_task(p, cpu);
perf_event_task_migrate(p);
@@ -1428,9 +1456,7 @@ static inline void take_task(struct rq *rq, int cpu, struct task_struct *p)
dequeue_task(p_rq, p, DEQUEUE_SAVE);
if (p_rq != rq) {
- p_rq->nr_running--;
sched_info_dequeued(p_rq, p);
- rq->nr_running++;
sched_info_queued(rq, p);
}
set_task_cpu(p, cpu);
@@ -1767,7 +1793,7 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags)
#ifdef CONFIG_SMP
if (p->sched_contributes_to_load)
- rq->nr_uninterruptible--;
+ atomic_dec(&grq.nr_uninterruptible);
#endif
ttwu_activate(rq, p);
@@ -1796,8 +1822,6 @@ static int ttwu_remote(struct task_struct *p, int wake_flags)
}
#ifdef CONFIG_SMP
-static bool sched_smp_initialized __read_mostly;
-
void sched_ttwu_pending(void)
{
struct rq *rq = this_rq();
@@ -2312,6 +2336,16 @@ int sysctl_schedstats(struct ctl_table *table, int write,
static inline void init_schedstats(void) {}
#endif /* CONFIG_SCHEDSTATS */
+static void update_cpu_clock_switch(struct rq *rq, struct task_struct *p);
+
+static void account_task_cpu(struct rq *rq, struct task_struct *p)
+{
+ update_clocks(rq);
+ /* This isn't really a context switch but accounting is the same */
+ update_cpu_clock_switch(rq, p);
+ p->last_ran = rq->niffies;
+}
+
/*
* wake_up_new_task - wake up a newly created task for the first time.
*
@@ -2337,7 +2371,6 @@ void wake_up_new_task(struct task_struct *p)
}
double_rq_lock(rq, new_rq);
- update_clocks(rq);
rq_curr = rq->curr;
/*
@@ -2345,7 +2378,6 @@ void wake_up_new_task(struct task_struct *p)
*/
p->prio = rq_curr->normal_prio;
- activate_task(p, rq);
trace_sched_wakeup_new(p);
/*
@@ -2356,17 +2388,17 @@ void wake_up_new_task(struct task_struct *p)
* modified within schedule() so it is always equal to
* current->deadline.
*/
+ account_task_cpu(rq, rq_curr);
p->last_ran = rq_curr->last_ran;
if (likely(rq_curr->policy != SCHED_FIFO)) {
rq_curr->time_slice /= 2;
- if (unlikely(rq_curr->time_slice < RESCHED_US)) {
+ if (rq_curr->time_slice < RESCHED_US) {
/*
* Forking task has run out of timeslice. Reschedule it and
* start its child with a new time slice and deadline. The
* child will end up running first because its deadline will
* be slightly earlier.
*/
- rq_curr->time_slice = 0;
__set_tsk_resched(rq_curr);
time_slice_expired(p, new_rq);
if (suitable_idle_cpus(p))
@@ -2389,6 +2421,7 @@ void wake_up_new_task(struct task_struct *p)
time_slice_expired(p, new_rq);
try_preempt(p, new_rq);
}
+ activate_task(p, new_rq);
double_rq_unlock(rq, new_rq);
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
}
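/*
 * Illustrative sketch (not part of the patch): in wake_up_new_task() the
 * parent keeps half of its remaining timeslice while the child starts with a
 * full slice via time_slice_expired(); if the parent's half drops below
 * RESCHED_US (100 us) the parent is marked for reschedule so the child runs
 * first. A runnable user-space model of that split, assuming the usual MuQSS
 * default of rr_interval = 6 ms:
 */
#include <stdio.h>

#define RESCHED_US 100
#define MS_TO_US(t) ((t) << 10)

int main(void)
{
	int rr_interval = 6;                     /* assumed default, in ms */
	int child_slice = MS_TO_US(rr_interval); /* fresh slice: 6144 "us" */
	int parent_slice = 150;                  /* what the parent has left, in us */

	parent_slice /= 2;                       /* fork halves the remainder */
	if (parent_slice < RESCHED_US)
		printf("parent (%d us) resched, child (%d us) runs first\n",
		       parent_slice, child_slice);
	else
		printf("parent keeps %d us, child gets %d us\n",
		       parent_slice, child_slice);
	return 0;
}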
@@ -2651,6 +2684,22 @@ context_switch(struct rq *rq, struct task_struct *prev,
}
/*
+ * nr_running, nr_uninterruptible and nr_context_switches:
+ *
+ * externally visible scheduler statistics: current number of runnable
+ * threads, total number of context switches performed since bootup.
+ */
+unsigned long nr_running(void)
+{
+ return atomic_read(&grq.nr_running);
+}
+
+static unsigned long nr_uninterruptible(void)
+{
+ return atomic_read(&grq.nr_uninterruptible);
+}
+
+/*
* Check if only the current task is running on the cpu.
*
* Caution: this function does not check that the caller has disabled
@@ -2674,31 +2723,9 @@ bool single_task_running(void)
}
EXPORT_SYMBOL(single_task_running);
-/*
- * nr_running, nr_uninterruptible and nr_context_switches:
- *
- * externally visible scheduler statistics: current number of runnable
- * threads, total number of context switches performed since bootup.
- */
unsigned long long nr_context_switches(void)
{
- long long sum = 0;
- int i;
-
- for_each_possible_cpu(i)
- sum += cpu_rq(i)->nr_switches;
-
- return sum;
-}
-
-unsigned long nr_running(void)
-{
- long i, sum = 0;
-
- for_each_online_cpu(i)
- sum += cpu_rq(i)->nr_running;
-
- return sum;
+ return (unsigned long long)atomic64_read(&grq.nr_switches);
}
unsigned long nr_iowait(void)
@@ -2719,14 +2746,7 @@ unsigned long nr_iowait_cpu(int cpu)
unsigned long nr_active(void)
{
- long i, sum = 0;
-
- for_each_online_cpu(i) {
- sum += cpu_rq(i)->nr_running;
- sum += cpu_rq(i)->nr_uninterruptible;
- }
-
- return sum;
+ return nr_running() + nr_uninterruptible();
}
/*
@@ -2797,116 +2817,6 @@ DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat);
EXPORT_PER_CPU_SYMBOL(kstat);
EXPORT_PER_CPU_SYMBOL(kernel_cpustat);
-#ifdef CONFIG_IRQ_TIME_ACCOUNTING
-
-/*
- * There are no locks covering percpu hardirq/softirq time.
- * They are only modified in account_system_vtime, on corresponding CPU
- * with interrupts disabled. So, writes are safe.
- * They are read and saved off onto struct rq in update_rq_clock().
- * This may result in other CPU reading this CPU's irq time and can
- * race with irq/account_system_vtime on this CPU. We would either get old
- * or new value with a side effect of accounting a slice of irq time to wrong
- * task when irq is in progress while we read rq->clock. That is a worthy
- * compromise in place of having locks on each irq in account_system_time.
- */
-static DEFINE_PER_CPU(u64, cpu_hardirq_time);
-static DEFINE_PER_CPU(u64, cpu_softirq_time);
-
-static DEFINE_PER_CPU(u64, irq_start_time);
-static int sched_clock_irqtime;
-
-void enable_sched_clock_irqtime(void)
-{
- sched_clock_irqtime = 1;
-}
-
-void disable_sched_clock_irqtime(void)
-{
- sched_clock_irqtime = 0;
-}
-
-#ifndef CONFIG_64BIT
-static DEFINE_PER_CPU(seqcount_t, irq_time_seq);
-
-static inline void irq_time_write_begin(void)
-{
- __this_cpu_inc(irq_time_seq.sequence);
- smp_wmb();
-}
-
-static inline void irq_time_write_end(void)
-{
- smp_wmb();
- __this_cpu_inc(irq_time_seq.sequence);
-}
-
-static inline u64 irq_time_read(int cpu)
-{
- u64 irq_time;
- unsigned seq;
-
- do {
- seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu));
- irq_time = per_cpu(cpu_softirq_time, cpu) +
- per_cpu(cpu_hardirq_time, cpu);
- } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq));
-
- return irq_time;
-}
-#else /* CONFIG_64BIT */
-static inline void irq_time_write_begin(void)
-{
-}
-
-static inline void irq_time_write_end(void)
-{
-}
-
-static inline u64 irq_time_read(int cpu)
-{
- return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu);
-}
-#endif /* CONFIG_64BIT */
-
-/*
- * Called before incrementing preempt_count on {soft,}irq_enter
- * and before decrementing preempt_count on {soft,}irq_exit.
- */
-void irqtime_account_irq(struct task_struct *curr)
-{
- unsigned long flags;
- s64 delta;
- int cpu;
-
- if (!sched_clock_irqtime)
- return;
-
- local_irq_save(flags);
-
- cpu = smp_processor_id();
- delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time);
- __this_cpu_add(irq_start_time, delta);
-
- irq_time_write_begin();
- /*
- * We do not account for softirq time from ksoftirqd here.
- * We want to continue accounting softirq time to ksoftirqd thread
- * in that case, so as not to confuse scheduler with a special task
- * that do not consume any time, but still wants to run.
- */
- if (hardirq_count())
- __this_cpu_add(cpu_hardirq_time, delta);
- else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
- __this_cpu_add(cpu_softirq_time, delta);
-
- irq_time_write_end();
- local_irq_restore(flags);
-}
-EXPORT_SYMBOL_GPL(irqtime_account_irq);
-
-#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
-
#ifdef CONFIG_PARAVIRT
static inline u64 steal_ticks(u64 steal)
{
@@ -2968,89 +2878,6 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
# define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs)
#endif
-#ifdef CONFIG_IRQ_TIME_ACCOUNTING
-static void irqtime_account_hi_si(void)
-{
- u64 *cpustat = kcpustat_this_cpu->cpustat;
- u64 latest_ns;
-
- latest_ns = nsecs_to_cputime64(this_cpu_read(cpu_hardirq_time));
- if (latest_ns > cpustat[CPUTIME_IRQ])
- cpustat[CPUTIME_IRQ] += (__force u64)cputime_one_jiffy;
-
- latest_ns = nsecs_to_cputime64(this_cpu_read(cpu_softirq_time));
- if (latest_ns > cpustat[CPUTIME_SOFTIRQ])
- cpustat[CPUTIME_SOFTIRQ] += (__force u64)cputime_one_jiffy;
-}
-#else /* CONFIG_IRQ_TIME_ACCOUNTING */
-
-#define sched_clock_irqtime (0)
-
-static inline void irqtime_account_hi_si(void)
-{
-}
-#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
-
-static __always_inline bool steal_account_process_tick(void)
-{
-#ifdef CONFIG_PARAVIRT
- if (static_key_false(&paravirt_steal_enabled)) {
- u64 steal;
- cputime_t steal_ct;
-
- steal = paravirt_steal_clock(smp_processor_id());
- steal -= this_rq()->prev_steal_time;
-
- /*
- * cputime_t may be less precise than nsecs (eg: if it's
- * based on jiffies). Lets cast the result to cputime
- * granularity and account the rest on the next rounds.
- */
- steal_ct = nsecs_to_cputime(steal);
- this_rq()->prev_steal_time += cputime_to_nsecs(steal_ct);
-
- account_steal_time(steal_ct);
- return steal_ct;
- }
-#endif
- return false;
-}
-
-/*
- * Accumulate raw cputime values of dead tasks (sig->[us]time) and live
- * tasks (sum on group iteration) belonging to @tsk's group.
- */
-void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
-{
- struct signal_struct *sig = tsk->signal;
- cputime_t utime, stime;
- struct task_struct *t;
- unsigned int seq, nextseq;
- unsigned long flags;
-
- rcu_read_lock();
- /* Attempt a lockless read on the first round. */
- nextseq = 0;
- do {
- seq = nextseq;
- flags = read_seqbegin_or_lock_irqsave(&sig->stats_lock, &seq);
- times->utime = sig->utime;
- times->stime = sig->stime;
- times->sum_exec_runtime = sig->sum_sched_runtime;
-
- for_each_thread(tsk, t) {
- task_cputime(t, &utime, &stime);
- times->utime += utime;
- times->stime += stime;
- times->sum_exec_runtime += task_sched_runtime(t);
- }
- /* If lockless access failed, take the lock. */
- nextseq = 1;
- } while (need_seqretry(&sig->stats_lock, seq));
- done_seqretry_irqrestore(&sig->stats_lock, seq, flags);
- rcu_read_unlock();
-}
-
/*
* On each tick, add the number of nanoseconds to the unbanked variables and
* once one tick's worth has accumulated, account it allowing for accurate
@@ -3175,15 +3002,11 @@ static void pc_user_time(struct rq *rq, struct task_struct *p, unsigned long ns)
* Bank in p->sched_time the ns elapsed since the last tick or switch.
* CPU scheduler quota accounting is also performed here in microseconds.
*/
-static void
-update_cpu_clock_tick(struct rq *rq, struct task_struct *p)
+static void update_cpu_clock_tick(struct rq *rq, struct task_struct *p)
{
s64 account_ns = rq->niffies - p->last_ran;
struct task_struct *idle = rq->idle;
- if (steal_account_process_tick())
- goto ts_account;
-
/* Accurate tick timekeeping */
if (user_mode(get_irq_regs()))
pc_user_time(rq, p, account_ns);
@@ -3192,10 +3015,6 @@ update_cpu_clock_tick(struct rq *rq, struct task_struct *p)
} else
pc_idle_time(rq, idle, account_ns);
- if (sched_clock_irqtime)
- irqtime_account_hi_si();
-
-ts_account:
/* time_slice accounting is done in usecs to avoid overflow on 32bit */
if (p->policy != SCHED_FIFO && p != idle)
p->time_slice -= NS_TO_US(account_ns);
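/*
 * Illustrative arithmetic (not part of the patch): the comment above is why
 * time_slice stays in microseconds. A signed 32-bit value holds only ~2.1
 * seconds worth of nanoseconds, but ~35 minutes worth of microseconds, so
 * the per-task slice can remain a plain int while accounting deltas arrive
 * in nanoseconds. A runnable check of one ~1 ms charge:
 */
#include <limits.h>
#include <stdio.h>

#define NS_TO_US(t) ((t) >> 10)

int main(void)
{
	long long account_ns = 1048576;     /* ~1 ms charged since the last update */
	int time_slice = 6144;              /* remaining slice, in us */

	time_slice -= NS_TO_US(account_ns); /* 1048576 >> 10 == 1024 us */
	printf("INT_MAX as ns: %.2f s; slice left: %d us\n",
	       INT_MAX / 1e9, time_slice);
	return 0;
}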
@@ -3208,8 +3027,7 @@ ts_account:
* Bank in p->sched_time the ns elapsed since the last tick or switch.
* CPU scheduler quota accounting is also performed here in microseconds.
*/
-static void
-update_cpu_clock_switch(struct rq *rq, struct task_struct *p)
+static void update_cpu_clock_switch(struct rq *rq, struct task_struct *p)
{
s64 account_ns = rq->niffies - p->last_ran;
struct task_struct *idle = rq->idle;
@@ -3283,133 +3101,86 @@ unsigned long long task_sched_runtime(struct task_struct *p)
return ns;
}
-/* Compatibility crap */
-void account_user_time(struct task_struct *p, cputime_t cputime,
- cputime_t cputime_scaled)
-{
-}
-
-void account_idle_time(cputime_t cputime)
+#ifdef CONFIG_HIGH_RES_TIMERS
+static inline int hrexpiry_enabled(struct rq *rq)
{
+ if (unlikely(!cpu_active(cpu_of(rq)) || !sched_smp_initialized))
+ return 0;
+ return hrtimer_is_hres_active(&rq->hrexpiry_timer);
}
/*
- * Account guest cpu time to a process.
- * @p: the process that the cpu time gets accounted to
- * @cputime: the cpu time spent in virtual machine since the last update
- * @cputime_scaled: cputime scaled by cpu frequency
+ * Use HR-timers to deliver accurate preemption points.
*/
-static void account_guest_time(struct task_struct *p, cputime_t cputime,
- cputime_t cputime_scaled)
+static void hrexpiry_clear(struct rq *rq)
{
- u64 *cpustat = kcpustat_this_cpu->cpustat;
-
- /* Add guest time to process. */
- p->utime += (__force u64)cputime;
- p->utimescaled += (__force u64)cputime_scaled;
- account_group_user_time(p, cputime);
- p->gtime += (__force u64)cputime;
-
- /* Add guest time to cpustat. */
- if (task_nice(p) > 0) {
- cpustat[CPUTIME_NICE] += (__force u64)cputime;
- cpustat[CPUTIME_GUEST_NICE] += (__force u64)cputime;
- } else {
- cpustat[CPUTIME_USER] += (__force u64)cputime;
- cpustat[CPUTIME_GUEST] += (__force u64)cputime;
- }
+ if (!hrexpiry_enabled(rq))
+ return;
+ if (hrtimer_active(&rq->hrexpiry_timer))
+ hrtimer_cancel(&rq->hrexpiry_timer);
}
/*
- * Account system cpu time to a process and desired cpustat field
- * @p: the process that the cpu time gets accounted to
- * @cputime: the cpu time spent in kernel space since the last update
- * @cputime_scaled: cputime scaled by cpu frequency
- * @target_cputime64: pointer to cpustat field that has to be updated
+ * High-resolution time_slice expiry.
+ * Runs from hardirq context with interrupts disabled.
*/
-static inline
-void __account_system_time(struct task_struct *p, cputime_t cputime,
- cputime_t cputime_scaled, cputime64_t *target_cputime64)
+static enum hrtimer_restart hrexpiry(struct hrtimer *timer)
{
- /* Add system time to process. */
- p->stime += (__force u64)cputime;
- p->stimescaled += (__force u64)cputime_scaled;
- account_group_system_time(p, cputime);
-
- /* Add system time to cpustat. */
- *target_cputime64 += (__force u64)cputime;
-
- /* Account for system time used */
- acct_update_integrals(p);
-}
+ struct rq *rq = container_of(timer, struct rq, hrexpiry_timer);
+ struct task_struct *p;
-/*
- * Account system cpu time to a process.
- * @p: the process that the cpu time gets accounted to
- * @hardirq_offset: the offset to subtract from hardirq_count()
- * @cputime: the cpu time spent in kernel space since the last update
- * @cputime_scaled: cputime scaled by cpu frequency
- * This is for guest only now.
- */
-void account_system_time(struct task_struct *p, int hardirq_offset,
- cputime_t cputime, cputime_t cputime_scaled)
-{
+ /* This can happen during CPU hotplug / resume */
+ if (unlikely(cpu_of(rq) != smp_processor_id()))
+ goto out;
- if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0))
- account_guest_time(p, cputime, cputime_scaled);
+ /*
+ * We're doing this without the runqueue lock but this should always
+ * be run on the local CPU. Time slice should run out in __schedule
+ * but we set it to zero here in case niffies is slightly less.
+ */
+ p = rq->curr;
+ p->time_slice = 0;
+ __set_tsk_resched(p);
+out:
+ return HRTIMER_NORESTART;
}
/*
- * Account for involuntary wait time.
- * @steal: the cpu time spent in involuntary wait
+ * Called to set the hrexpiry timer state.
+ *
+ * Called with irqs disabled from the local CPU only.
*/
-void account_steal_time(cputime_t cputime)
+static void hrexpiry_start(struct rq *rq, u64 delay)
{
- u64 *cpustat = kcpustat_this_cpu->cpustat;
+ if (!hrexpiry_enabled(rq))
+ return;
- cpustat[CPUTIME_STEAL] += (__force u64)cputime;
+ hrtimer_start(&rq->hrexpiry_timer, ns_to_ktime(delay),
+ HRTIMER_MODE_REL_PINNED);
}
-/*
- * Account for idle time.
- * @cputime: the cpu time spent in idle wait
- */
-static void account_idle_times(cputime_t cputime)
+static void init_rq_hrexpiry(struct rq *rq)
{
- u64 *cpustat = kcpustat_this_cpu->cpustat;
- struct rq *rq = this_rq();
-
- if (atomic_read(&rq->nr_iowait) > 0)
- cpustat[CPUTIME_IOWAIT] += (__force u64)cputime;
- else
- cpustat[CPUTIME_IDLE] += (__force u64)cputime;
+ hrtimer_init(&rq->hrexpiry_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ rq->hrexpiry_timer.function = hrexpiry;
}
-#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
-
-void account_process_tick(struct task_struct *p, int user_tick)
+static inline int rq_dither(struct rq *rq)
{
+ if (!hrexpiry_enabled(rq))
+ return HALF_JIFFY_US;
+ return 0;
}
-
-/*
- * Account multiple ticks of steal time.
- * @p: the process from which the cpu time has been stolen
- * @ticks: number of stolen ticks
- */
-void account_steal_ticks(unsigned long ticks)
+#else /* CONFIG_HIGH_RES_TIMERS */
+static inline void init_rq_hrexpiry(struct rq *rq)
{
- account_steal_time(jiffies_to_cputime(ticks));
}
-/*
- * Account multiple ticks of idle time.
- * @ticks: number of stolen ticks
- */
-void account_idle_ticks(unsigned long ticks)
+static inline int rq_dither(struct rq *rq)
{
- account_idle_times(jiffies_to_cputime(ticks));
+ return HALF_JIFFY_US;
}
-#endif
+#endif /* CONFIG_HIGH_RES_TIMERS */
/*
* Functions to test for when SCHED_ISO tasks have used their allocated
@@ -3488,6 +3259,8 @@ static void task_running_tick(struct rq *rq)
* allowed to run into the 2nd half of the next tick if they will
* run out of time slice in the interim. Otherwise, if they have
* less than RESCHED_US μs of time slice left they will be rescheduled.
+ * Dither is used as a backup for when hrexpiry is disabled or high-res
+ * timers are not configured in.
*/
if (p->time_slice - rq->dither >= RESCHED_US)
return;
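/*
 * Illustrative sketch (not part of the patch): this check in
 * task_running_tick() is where the dither backup matters. With high-res
 * timers the hrexpiry timer delivers the exact expiry and rq->dither is 0;
 * without them, dither widens the window to HALF_JIFFY_US so a task that
 * cannot survive until the next tick is rescheduled now. A runnable
 * user-space model of the decision, assuming HZ=1000:
 */
#include <stdbool.h>
#include <stdio.h>

#define HZ 1000
#define HALF_JIFFY_US (1048576 / HZ / 2)    /* ~524 us at HZ=1000 */
#define RESCHED_US 100

static bool resched_at_tick(int time_slice_us, bool hres_active)
{
	int dither = hres_active ? 0 : HALF_JIFFY_US;

	return time_slice_us - dither < RESCHED_US;
}

int main(void)
{
	printf("300 us left, no hres: %d\n", resched_at_tick(300, false)); /* 1 */
	printf("300 us left, hres:    %d\n", resched_at_tick(300, true));  /* 0 */
	return 0;
}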
@@ -3497,6 +3270,60 @@ out_resched:
rq_unlock(rq);
}
+#ifdef CONFIG_NO_HZ_FULL
+/*
+ * We can stop the timer tick any time highres timers are active since
+ * we rely entirely on highres timeouts for task expiry rescheduling.
+ */
+static void sched_stop_tick(struct rq *rq, int cpu)
+{
+ if (!hrexpiry_enabled(rq))
+ return;
+ if (!tick_nohz_full_enabled())
+ return;
+ if (!tick_nohz_full_cpu(cpu))
+ return;
+ tick_nohz_dep_clear_cpu(cpu, TICK_DEP_BIT_SCHED);
+}
+
+static inline void sched_start_tick(struct rq *rq, int cpu)
+{
+ tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED);
+}
+
+/**
+ * scheduler_tick_max_deferment
+ *
+ * Keep at least one tick per second when a single
+ * active task is running.
+ *
+ * This makes sure that uptime continues to move forward, even
+ * with a very low granularity.
+ *
+ * Return: Maximum deferment in nanoseconds.
+ */
+u64 scheduler_tick_max_deferment(void)
+{
+ struct rq *rq = this_rq();
+ unsigned long next, now = READ_ONCE(jiffies);
+
+ next = rq->last_jiffy + HZ;
+
+ if (time_before_eq(next, now))
+ return 0;
+
+ return jiffies_to_nsecs(next - now);
+}
+#else
+static inline void sched_stop_tick(struct rq *rq, int cpu)
+{
+}
+
+static inline void sched_start_tick(struct rq *rq, int cpu)
+{
+}
+#endif
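/*
 * Illustrative arithmetic (not part of the patch): with NO_HZ_FULL the tick
 * may be deferred until one second (HZ jiffies) after the last scheduler
 * jiffy seen on this runqueue, which is what scheduler_tick_max_deferment()
 * above computes. A runnable user-space model, assuming HZ=250; the kernel
 * version uses time_before_eq() to stay correct across jiffies wraparound:
 */
#include <stdio.h>

#define HZ 250
#define NSEC_PER_JIFFY (1000000000ULL / HZ)

static unsigned long long max_deferment_ns(unsigned long last_jiffy,
					   unsigned long now)
{
	unsigned long next = last_jiffy + HZ;

	if (next <= now)
		return 0;            /* already a second behind: do not defer */
	return (next - now) * NSEC_PER_JIFFY;
}

int main(void)
{
	/* last scheduler jiffy was 100 jiffies (400 ms at HZ=250) ago */
	printf("defer up to %llu ms\n", max_deferment_ns(1000, 1100) / 1000000ULL);
	return 0;
}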
+
/*
* This function gets called by the timer code, with HZ frequency.
* We call it with interrupts disabled.
@@ -3507,7 +3334,7 @@ void scheduler_tick(void)
struct rq *rq = cpu_rq(cpu);
sched_clock_tick();
- update_rq_clock(rq);
+ update_clocks(rq);
update_load_avg(rq);
update_cpu_clock_tick(rq, rq->curr);
if (!rq_idle(rq))
@@ -3517,6 +3344,7 @@ void scheduler_tick(void)
rq->last_scheduler_tick = rq->last_jiffy;
rq->last_tick = rq->clock;
perf_event_task_tick();
+ sched_stop_tick(rq, cpu);
}
#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
@@ -3799,6 +3627,17 @@ static inline void schedule_debug(struct task_struct *prev)
*/
static inline void set_rq_task(struct rq *rq, struct task_struct *p)
{
+#ifdef CONFIG_HIGH_RES_TIMERS
+ if (p == rq->idle || p->policy == SCHED_FIFO)
+ hrexpiry_clear(rq);
+ else
+ hrexpiry_start(rq, US_TO_NS(p->time_slice));
+#endif /* CONFIG_HIGH_RES_TIMERS */
+ if (rq->clock - rq->last_tick > HALF_JIFFY_NS)
+ rq->dither = 0;
+ else
+ rq->dither = rq_dither(rq);
+
rq->rq_deadline = p->deadline;
rq->rq_prio = p->prio;
#ifdef CONFIG_SMT_NICE
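/*
 * Illustrative sketch (not part of the patch): set_rq_task() now decides, at
 * every context switch, where the next preemption point comes from: idle and
 * SCHED_FIFO tasks have no slice to enforce so the hrexpiry timer is cleared,
 * otherwise the remaining microsecond slice is converted with US_TO_NS() into
 * the delay for the pinned hrtimer, with the tick plus dither as fallback
 * when high-res timers are unavailable. A user-space model of that choice;
 * the enum and helper names here are hypothetical:
 */
#include <stdbool.h>
#include <stdio.h>

#define US_TO_NS(t) ((t) << 10)

enum expiry { EXPIRY_NONE, EXPIRY_HRTIMER, EXPIRY_TICK };

static enum expiry pick_expiry(bool is_idle, bool is_fifo, bool hres,
			       int slice_us, long long *delay_ns)
{
	if (is_idle || is_fifo)
		return EXPIRY_NONE;          /* hrexpiry_clear(): nothing to arm */
	if (!hres)
		return EXPIRY_TICK;          /* fall back to tick + dither */
	*delay_ns = US_TO_NS(slice_us);      /* hrexpiry_start(rq, US_TO_NS(slice)) */
	return EXPIRY_HRTIMER;
}

int main(void)
{
	long long delay = 0;

	if (pick_expiry(false, false, true, 6144, &delay) == EXPIRY_HRTIMER)
		printf("arm hrexpiry for %lld ns (~%.1f ms)\n", delay, delay / 1e6);
	return 0;
}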
@@ -3980,10 +3819,6 @@ static void __sched notrace __schedule(bool preempt)
update_clocks(rq);
niffies = rq->niffies;
update_cpu_clock_switch(rq, prev);
- if (rq->clock - rq->last_tick > HALF_JIFFY_NS)
- rq->dither = 0;
- else
- rq->dither = HALF_JIFFY_US;
clear_tsk_need_resched(prev);
clear_preempt_need_resched();
@@ -3994,15 +3829,15 @@ static void __sched notrace __schedule(bool preempt)
}
next = earliest_deadline_task(rq, cpu, idle);
- if (likely(next->prio != PRIO_LIMIT)) {
+ if (likely(next->prio != PRIO_LIMIT))
clear_cpuidle_map(cpu);
- next->last_ran = niffies;
- } else {
+ else {
set_cpuidle_map(cpu);
update_load_avg(rq);
}
set_rq_task(rq, next);
+ next->last_ran = niffies;
if (likely(prev != next)) {
/*
@@ -4014,14 +3849,16 @@ static void __sched notrace __schedule(bool preempt)
check_siblings(rq);
else
wake_siblings(rq);
- rq->nr_switches++;
+ atomic64_inc(&grq.nr_switches);
rq->curr = next;
++*switch_count;
trace_sched_switch(preempt, prev, next);
rq = context_switch(rq, prev, next); /* unlocks the rq */
- } else
+ } else {
+ check_siblings(rq);
rq_unlock_irq(rq);
+ }
}
static inline void sched_submit_work(struct task_struct *tsk)
@@ -5607,8 +5444,12 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
{
__do_set_cpus_allowed(p, new_mask);
if (needs_other_cpu(p, task_cpu(p))) {
+ struct rq *rq;
+
set_task_cpu(p, valid_task_cpu(p));
+ rq = __task_rq_lock(p);
resched_task(p);
+ __task_rq_unlock(rq);
}
}
@@ -5641,6 +5482,7 @@ void init_idle(struct task_struct *idle, int cpu)
raw_spin_lock_irqsave(&idle->pi_lock, flags);
raw_spin_lock(&rq->lock);
idle->last_ran = rq->niffies;
+ time_slice_expired(idle, rq);
idle->state = TASK_RUNNING;
/* Setting prio to illegal value shouldn't matter when never queued */
idle->prio = PRIO_LIMIT;
@@ -7439,6 +7281,8 @@ int sched_cpu_dying(unsigned int cpu)
}
bind_zero(cpu);
double_rq_unlock(rq, cpu_rq(0));
+ sched_start_tick(rq, cpu);
+ hrexpiry_clear(rq);
local_irq_restore(flags);
return 0;
@@ -7463,7 +7307,7 @@ static const cpumask_t *thread_cpumask(int cpu)
/* All this CPU's SMT siblings are idle */
static bool siblings_cpu_idle(struct rq *rq)
{
- return cpumask_subset(&rq->thread_mask, &cpu_idle_map);
+ return cpumask_subset(&rq->thread_mask, &grq.cpu_idle_map);
}
#endif
#ifdef CONFIG_SCHED_MC
@@ -7474,7 +7318,7 @@ static const cpumask_t *core_cpumask(int cpu)
/* All this CPU's shared cache siblings are idle */
static bool cache_cpu_idle(struct rq *rq)
{
- return cpumask_subset(&rq->core_mask, &cpu_idle_map);
+ return cpumask_subset(&rq->core_mask, &grq.cpu_idle_map);
}
#endif
@@ -7609,6 +7453,7 @@ void __init sched_init_smp(void)
#else
void __init sched_init_smp(void)
{
+ sched_smp_initialized = true;
}
#endif /* CONFIG_SMP */
@@ -7655,11 +7500,14 @@ void __init sched_init(void)
for (i = 1 ; i < NICE_WIDTH ; i++)
prio_ratios[i] = prio_ratios[i - 1] * 11 / 10;
+ atomic_set(&grq.nr_running, 0);
+ atomic_set(&grq.nr_uninterruptible, 0);
+ atomic64_set(&grq.nr_switches, 0);
skiplist_node_init(&init_task.node);
#ifdef CONFIG_SMP
init_defrootdomain();
- cpumask_clear(&cpu_idle_map);
+ cpumask_clear(&grq.cpu_idle_map);
#else
uprq = &per_cpu(runqueues, 0);
#endif
@@ -7673,7 +7521,6 @@ void __init sched_init(void)
#endif /* CONFIG_CGROUP_SCHED */
for_each_possible_cpu(i) {
rq = cpu_rq(i);
- rq->nr_running = rq->nr_uninterruptible = rq->nr_switches = 0;
skiplist_init(&rq->node);
rq->sl = new_skiplist(&rq->node);
raw_spin_lock_init(&rq->lock);
@@ -7692,6 +7539,7 @@ void __init sched_init(void)
rq->cpu = i;
rq_attach_root(rq, &def_root_domain);
#endif
+ init_rq_hrexpiry(rq);
atomic_set(&rq->nr_iowait, 0);
}
@@ -7899,199 +7747,6 @@ void set_curr_task(int cpu, struct task_struct *p)
#endif
-/*
- * Use precise platform statistics if available:
- */
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING
-
-#ifndef __ARCH_HAS_VTIME_TASK_SWITCH
-void vtime_common_task_switch(struct task_struct *prev)
-{
- if (is_idle_task(prev))
- vtime_account_idle(prev);
- else
- vtime_account_system(prev);
-
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
- vtime_account_user(prev);
-#endif
- arch_vtime_task_switch(prev);
-}
-#endif
-
-#endif /* CONFIG_VIRT_CPU_ACCOUNTING */
-
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
-void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
-{
- *ut = p->utime;
- *st = p->stime;
-}
-EXPORT_SYMBOL_GPL(task_cputime_adjusted);
-
-void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
-{
- struct task_cputime cputime;
-
- thread_group_cputime(p, &cputime);
-
- *ut = cputime.utime;
- *st = cputime.stime;
-}
-
-void vtime_account_system_irqsafe(struct task_struct *tsk)
-{
- unsigned long flags;
-
- local_irq_save(flags);
- vtime_account_system(tsk);
- local_irq_restore(flags);
-}
-EXPORT_SYMBOL_GPL(vtime_account_system_irqsafe);
-
-/*
- * Archs that account the whole time spent in the idle task
- * (outside irq) as idle time can rely on this and just implement
- * vtime_account_system() and vtime_account_idle(). Archs that
- * have other meaning of the idle time (s390 only includes the
- * time spent by the CPU when it's in low power mode) must override
- * vtime_account().
- */
-#ifndef __ARCH_HAS_VTIME_ACCOUNT
-void vtime_account_irq_enter(struct task_struct *tsk)
-{
- if (!in_interrupt() && is_idle_task(tsk))
- vtime_account_idle(tsk);
- else
- vtime_account_system(tsk);
-}
-EXPORT_SYMBOL_GPL(vtime_account_irq_enter);
-#endif /* __ARCH_HAS_VTIME_ACCOUNT */
-
-#else /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
-/*
- * Perform (stime * rtime) / total, but avoid multiplication overflow by
- * losing precision when the numbers are big.
- */
-static cputime_t scale_stime(u64 stime, u64 rtime, u64 total)
-{
- u64 scaled;
-
- for (;;) {
- /* Make sure "rtime" is the bigger of stime/rtime */
- if (stime > rtime) {
- u64 tmp = rtime; rtime = stime; stime = tmp;
- }
-
- /* Make sure 'total' fits in 32 bits */
- if (total >> 32)
- goto drop_precision;
-
- /* Does rtime (and thus stime) fit in 32 bits? */
- if (!(rtime >> 32))
- break;
-
- /* Can we just balance rtime/stime rather than dropping bits? */
- if (stime >> 31)
- goto drop_precision;
-
- /* We can grow stime and shrink rtime and try to make them both fit */
- stime <<= 1;
- rtime >>= 1;
- continue;
-
-drop_precision:
- /* We drop from rtime, it has more bits than stime */
- rtime >>= 1;
- total >>= 1;
- }
-
- /*
- * Make sure gcc understands that this is a 32x32->64 multiply,
- * followed by a 64/32->64 divide.
- */
- scaled = div_u64((u64) (u32) stime * (u64) (u32) rtime, (u32)total);
- return (__force cputime_t) scaled;
-}
-
-/*
- * Adjust tick based cputime random precision against scheduler
- * runtime accounting.
- */
-static void cputime_adjust(struct task_cputime *curr,
- struct prev_cputime *prev,
- cputime_t *ut, cputime_t *st)
-{
- cputime_t rtime, stime, utime, total;
-
- stime = curr->stime;
- total = stime + curr->utime;
-
- /*
- * Tick based cputime accounting depend on random scheduling
- * timeslices of a task to be interrupted or not by the timer.
- * Depending on these circumstances, the number of these interrupts
- * may be over or under-optimistic, matching the real user and system
- * cputime with a variable precision.
- *
- * Fix this by scaling these tick based values against the total
- * runtime accounted by the CFS scheduler.
- */
- rtime = nsecs_to_cputime(curr->sum_exec_runtime);
-
- /*
- * Update userspace visible utime/stime values only if actual execution
- * time is bigger than already exported. Note that can happen, that we
- * provided bigger values due to scaling inaccuracy on big numbers.
- */
- if (prev->stime + prev->utime >= rtime)
- goto out;
-
- if (total) {
- stime = scale_stime((__force u64)stime,
- (__force u64)rtime, (__force u64)total);
- utime = rtime - stime;
- } else {
- stime = rtime;
- utime = 0;
- }
-
- /*
- * If the tick based count grows faster than the scheduler one,
- * the result of the scaling may go backward.
- * Let's enforce monotonicity.
- */
- prev->stime = max(prev->stime, stime);
- prev->utime = max(prev->utime, utime);
-
-out:
- *ut = prev->utime;
- *st = prev->stime;
-}
-
-void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
-{
- struct task_cputime cputime = {
- .sum_exec_runtime = tsk_seruntime(p),
- };
-
- task_cputime(p, &cputime.utime, &cputime.stime);
- cputime_adjust(&cputime, &p->prev_cputime, ut, st);
-}
-EXPORT_SYMBOL_GPL(task_cputime_adjusted);
-
-/*
- * Must be called with siglock held.
- */
-void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
-{
- struct task_cputime cputime;
-
- thread_group_cputime(p, &cputime);
- cputime_adjust(&cputime, &p->signal->prev_cputime, ut, st);
-}
-#endif /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
-
void init_idle_bootup_task(struct task_struct *idle)
{}