3 files changed, 124 insertions, 163 deletions
diff --git a/kernel/sched/MuQSS.c b/kernel/sched/MuQSS.c
index 6a656ad4b..bd9b1f13f 100644
--- a/kernel/sched/MuQSS.c
+++ b/kernel/sched/MuQSS.c
@@ -135,7 +135,7 @@
 
 void print_scheduler_version(void)
 {
-	printk(KERN_INFO "MuQSS CPU scheduler v0.114 by Con Kolivas.\n");
+	printk(KERN_INFO "MuQSS CPU scheduler v0.115 by Con Kolivas.\n");
 }
 
 /*
@@ -179,28 +179,9 @@ static inline int timeslice(void)
 	return MS_TO_US(rr_interval);
 }
 
-/*
- * The global runqueue data that all CPUs work off. Contains either atomic
- * variables and a cpu bitmap set atomically.
- */
-struct global_rq {
 #ifdef CONFIG_SMP
-	atomic_t nr_running ____cacheline_aligned_in_smp;
-	atomic_t nr_uninterruptible ____cacheline_aligned_in_smp;
-	atomic64_t nr_switches ____cacheline_aligned_in_smp;
-	atomic_t qnr ____cacheline_aligned_in_smp; /* queued not running */
-#else
-	atomic_t nr_running ____cacheline_aligned;
-	atomic_t nr_uninterruptible ____cacheline_aligned;
-	atomic64_t nr_switches ____cacheline_aligned;
-	atomic_t qnr ____cacheline_aligned; /* queued not running */
-#endif
-#ifdef CONFIG_SMP
-	cpumask_t cpu_idle_map;
-#endif
-};
+static cpumask_t cpu_idle_map ____cacheline_aligned_in_smp;
 
-#ifdef CONFIG_SMP
 /*
  * We add the notion of a root-domain which will be used to define per-domain
  * variables. Each exclusive cpuset essentially defines an island domain by
@@ -232,13 +213,6 @@ static struct root_domain def_root_domain;
 
 #endif /* CONFIG_SMP */
 
-/* There can be only one */
-#ifdef CONFIG_SMP
-static struct global_rq grq ____cacheline_aligned_in_smp;
-#else
-static struct global_rq grq ____cacheline_aligned;
-#endif
-
 static DEFINE_MUTEX(sched_hotcpu_mutex);
 
 /* cpus with isolated domains */
@@ -710,6 +684,12 @@ static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
 	next->on_cpu = 1;
 }
 
+static inline void smp_sched_reschedule(int cpu)
+{
+	if (likely(cpu_online(cpu)))
+		smp_send_reschedule(cpu);
+}
+
 /*
  * resched_task - mark a task 'to be rescheduled now'.
  *
@@ -736,7 +716,7 @@ void resched_task(struct task_struct *p)
 	}
 
 	if (set_nr_and_not_polling(p))
-		smp_send_reschedule(cpu);
+		smp_sched_reschedule(cpu);
 	else
 		trace_sched_wake_idle_without_ipi(cpu);
 }
@@ -788,6 +768,7 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
 	 */
 	if (unlikely(task_on_rq_migrating(prev))) {
 		sched_info_dequeued(rq, prev);
+		rq->nr_running--;
 		/*
 		 * We move the ownership of prev to the new cpu now. ttwu can't
 		 * activate prev to the wrong cpu since it has to grab this
@@ -798,6 +779,7 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
 
 		raw_spin_lock(&prev->pi_lock);
 		rq = __task_rq_lock(prev);
+		rq->nr_running++;
 		/* Check that someone else hasn't already queued prev */
 		if (likely(!task_queued(prev))) {
 			enqueue_task(rq, prev, 0);
@@ -852,7 +834,7 @@ static inline int ms_longest_deadline_diff(void)
 
 static inline int rq_load(struct rq *rq)
 {
-	return rq->sl->entries + !rq_idle(rq);
+	return rq->nr_running;
 }
 
 static inline bool rq_local(struct rq *rq);
@@ -999,26 +981,6 @@ static inline int task_timeslice(struct task_struct *p)
 	return (rr_interval * task_prio_ratio(p) / 128);
 }
 
-/*
- * qnr is the "queued but not running" count which is the total number of
- * tasks on the global runqueue list waiting for cpu time but not actually
- * currently running on a cpu.
- */
-static inline void inc_qnr(void)
-{
-	atomic_inc(&grq.qnr);
-}
-
-static inline void dec_qnr(void)
-{
-	atomic_dec(&grq.qnr);
-}
-
-static inline int queued_notrunning(void)
-{
-	return atomic_read(&grq.qnr);
-}
-
 #ifdef CONFIG_SMP
 /* Entered with rq locked */
 static inline void resched_if_idle(struct rq *rq)
@@ -1123,7 +1085,7 @@ static inline void atomic_set_cpu(int cpu, cpumask_t *cpumask)
 static inline void set_cpuidle_map(int cpu)
 {
 	if (likely(cpu_online(cpu)))
-		atomic_set_cpu(cpu, &grq.cpu_idle_map);
+		atomic_set_cpu(cpu, &cpu_idle_map);
 }
 
 static inline void atomic_clear_cpu(int cpu, cpumask_t *cpumask)
@@ -1133,12 +1095,12 @@ static inline void atomic_clear_cpu(int cpu, cpumask_t *cpumask)
 
 static inline void clear_cpuidle_map(int cpu)
 {
-	atomic_clear_cpu(cpu, &grq.cpu_idle_map);
+	atomic_clear_cpu(cpu, &cpu_idle_map);
 }
 
 static bool suitable_idle_cpus(struct task_struct *p)
 {
-	return (cpumask_intersects(&p->cpus_allowed, &grq.cpu_idle_map));
+	return (cpumask_intersects(&p->cpus_allowed, &cpu_idle_map));
 }
 
 /*
@@ -1165,7 +1127,7 @@ static void resched_curr(struct rq *rq)
 	}
 
 	if (set_nr_and_not_polling(rq->curr))
-		smp_send_reschedule(cpu);
+		smp_sched_reschedule(cpu);
 	else
 		trace_sched_wake_idle_without_ipi(cpu);
 }
@@ -1260,7 +1222,7 @@ static inline void resched_idle(struct rq *rq)
 		return;
 	}
 
-	smp_send_reschedule(rq->cpu);
+	smp_sched_reschedule(rq->cpu);
 }
 
 static struct rq *resched_best_idle(struct task_struct *p, int cpu)
@@ -1269,7 +1231,7 @@ static struct rq *resched_best_idle(struct task_struct *p, int cpu)
 	struct rq *rq;
 	int best_cpu;
 
-	cpumask_and(&tmpmask, &p->cpus_allowed, &grq.cpu_idle_map);
+	cpumask_and(&tmpmask, &p->cpus_allowed, &cpu_idle_map);
 	best_cpu = best_mask_cpu(cpu, task_rq(p), &tmpmask);
 	rq = cpu_rq(best_cpu);
 	if (!smt_schedule(p, rq))
@@ -1381,12 +1343,11 @@ static void activate_task(struct task_struct *p, struct rq *rq)
 
 	p->prio = effective_prio(p);
 	if (task_contributes_to_load(p))
-		atomic_dec(&grq.nr_uninterruptible);
+		rq->nr_uninterruptible--;
 
 	enqueue_task(rq, p, 0);
 	p->on_rq = TASK_ON_RQ_QUEUED;
-	atomic_inc(&grq.nr_running);
-	inc_qnr();
+	rq->nr_running++;
 }
 
 /*
@@ -1396,10 +1357,10 @@ static void activate_task(struct task_struct *p, struct rq *rq)
 static inline void deactivate_task(struct task_struct *p, struct rq *rq)
 {
 	if (task_contributes_to_load(p))
-		atomic_inc(&grq.nr_uninterruptible);
+		rq->nr_uninterruptible++;
 
 	p->on_rq = 0;
-	atomic_dec(&grq.nr_running);
+	rq->nr_running--;
 	sched_info_dequeued(rq, p);
 }
 
@@ -1467,11 +1428,12 @@ static inline void take_task(struct rq *rq, int cpu, struct task_struct *p)
 
 	dequeue_task(p_rq, p, DEQUEUE_SAVE);
 	if (p_rq != rq) {
+		p_rq->nr_running--;
 		sched_info_dequeued(p_rq, p);
+		rq->nr_running++;
 		sched_info_queued(rq, p);
 	}
 	set_task_cpu(p, cpu);
-	dec_qnr();
 }
 
 /*
@@ -1484,7 +1446,6 @@ static inline void return_task(struct task_struct *p, struct rq *rq,
 	if (deactivate)
 		deactivate_task(p, rq);
 	else {
-		inc_qnr();
 #ifdef CONFIG_SMP
 		/*
 		 * set_task_cpu was called on the running task that doesn't
@@ -1641,7 +1602,7 @@ void kick_process(struct task_struct *p)
 	preempt_disable();
 	cpu = task_cpu(p);
 	if ((cpu != smp_processor_id()) && task_curr(p))
-		smp_send_reschedule(cpu);
+		smp_sched_reschedule(cpu);
 	preempt_enable();
 }
 EXPORT_SYMBOL_GPL(kick_process);
@@ -1806,7 +1767,7 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags)
 
 #ifdef CONFIG_SMP
 	if (p->sched_contributes_to_load)
-		atomic_dec(&grq.nr_uninterruptible);
+		rq->nr_uninterruptible--;
 #endif
 
 	ttwu_activate(rq, p);
@@ -1897,7 +1858,7 @@ static void ttwu_queue_remote(struct task_struct *p, int cpu, int wake_flags)
 
 	if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) {
 		if (!set_nr_if_polling(rq->idle))
-			smp_send_reschedule(cpu);
+			smp_sched_reschedule(cpu);
 		else
 			trace_sched_wake_idle_without_ipi(cpu);
 	}
@@ -1918,7 +1879,7 @@ void wake_up_if_idle(int cpu)
 	} else {
 		rq_lock_irqsave(rq, &flags);
 		if (likely(is_idle_task(rq->curr)))
-			smp_send_reschedule(cpu);
+			smp_sched_reschedule(cpu);
 		/* Else cpu is not in idle, do nothing here */
 		rq_unlock_irqrestore(rq, &flags);
 	}
@@ -1980,7 +1941,7 @@ static inline int select_best_cpu(struct task_struct *p)
 		rq = other_rq;
 	}
 	if (unlikely(!rq))
-		return smp_processor_id();
+		return task_cpu(p);
 	return rq->cpu;
 }
 #else /* CONFIG_SMP */
@@ -2690,22 +2651,6 @@ context_switch(struct rq *rq, struct task_struct *prev,
 }
 
 /*
- * nr_running, nr_uninterruptible and nr_context_switches:
- *
- * externally visible scheduler statistics: current number of runnable
- * threads, total number of context switches performed since bootup.
- */
-unsigned long nr_running(void)
-{
-	return atomic_read(&grq.nr_running);
-}
-
-static unsigned long nr_uninterruptible(void)
-{
-	return atomic_read(&grq.nr_uninterruptible);
-}
-
-/*
  * Check if only the current task is running on the cpu.
  *
  * Caution: this function does not check that the caller has disabled
@@ -2729,9 +2674,31 @@ bool single_task_running(void)
 }
 EXPORT_SYMBOL(single_task_running);
 
+/*
+ * nr_running, nr_uninterruptible and nr_context_switches:
+ *
+ * externally visible scheduler statistics: current number of runnable
+ * threads, total number of context switches performed since bootup.
+ */
 unsigned long long nr_context_switches(void)
 {
-	return (unsigned long long)atomic64_read(&grq.nr_switches);
+	long long sum = 0;
+	int i;
+
+	for_each_possible_cpu(i)
+		sum += cpu_rq(i)->nr_switches;
+
+	return sum;
+}
+
+unsigned long nr_running(void)
+{
+	long i, sum = 0;
+
+	for_each_online_cpu(i)
+		sum += cpu_rq(i)->nr_running;
+
+	return sum;
 }
 
 unsigned long nr_iowait(void)
@@ -2752,7 +2719,14 @@ unsigned long nr_iowait_cpu(int cpu)
 
 unsigned long nr_active(void)
 {
-	return nr_running() + nr_uninterruptible();
+	long i, sum = 0;
+
+	for_each_online_cpu(i) {
+		sum += cpu_rq(i)->nr_running;
+		sum += cpu_rq(i)->nr_uninterruptible;
+	}
+
+	return sum;
 }
 
 /*
@@ -3662,8 +3636,6 @@ static inline void check_deadline(struct task_struct *p, struct rq *rq)
 		time_slice_expired(p, rq);
 }
 
-#define BITOP_WORD(nr)		((nr) / BITS_PER_LONG)
-
 /*
  * Task selection with skiplists is a simple matter of picking off the first
  * task in the sorted list, an O(1) operation. The lookup is amortised O(1)
@@ -3680,8 +3652,9 @@ static inline void check_deadline(struct task_struct *p, struct rq *rq)
  * runqueue or a runqueue with more tasks than the current one with a better
  * key/deadline.
  */
-static inline struct
-task_struct *earliest_deadline_task(struct rq *rq, int cpu, struct task_struct *idle)
+#ifdef CONFIG_SMP
+static inline struct task_struct
+*earliest_deadline_task(struct rq *rq, int cpu, struct task_struct *idle)
 {
 	struct task_struct *edt = idle;
 	struct rq *locked = NULL;
@@ -3759,6 +3732,19 @@ task_struct *earliest_deadline_task(struct rq *rq, int cpu, struct task_struct *
 
 	return edt;
 }
+#else /* CONFIG_SMP */
+static inline struct task_struct
+*earliest_deadline_task(struct rq *rq, int cpu, struct task_struct *idle)
+{
+	struct task_struct *edt;
+
+	if (unlikely(!rq->sl->entries))
+		return idle;
+	edt = rq->node.next[0]->value;
+	take_task(rq, cpu, edt);
+	return edt;
+}
+#endif /* CONFIG_SMP */
 
 /*
  * Print scheduling while atomic bug:
@@ -3840,13 +3826,9 @@ static void check_smt_siblings(struct rq *this_rq)
 		rq = cpu_rq(other_cpu);
 		if (rq_idle(rq))
 			continue;
-		if (unlikely(!rq->online))
-			continue;
 		p = rq->curr;
-		if (!smt_schedule(p, this_rq)) {
-			set_tsk_need_resched(p);
-			smp_send_reschedule(other_cpu);
-		}
+		if (!smt_schedule(p, this_rq))
+			resched_curr(rq);
 	}
 }
 
@@ -3854,21 +3836,12 @@ static void wake_smt_siblings(struct rq *this_rq)
 {
 	int other_cpu;
 
-	if (!queued_notrunning())
-		return;
-
 	for_each_cpu(other_cpu, &this_rq->thread_mask) {
 		struct rq *rq;
 
 		rq = cpu_rq(other_cpu);
-		if (unlikely(!rq->online))
-			continue;
-		if (rq_idle(rq)) {
-			struct task_struct *p = rq->curr;
-
-			set_tsk_need_resched(p);
-			smp_send_reschedule(other_cpu);
-		}
+		if (rq_idle(rq))
+			resched_idle(rq);
 	}
 }
 #else
@@ -4020,23 +3993,16 @@ static void __sched notrace __schedule(bool preempt)
 		return_task(prev, rq, cpu, deactivate);
 	}
 
-	if (unlikely(!queued_notrunning())) {
-		next = idle;
-		schedstat_inc(rq, sched_goidle);
+	next = earliest_deadline_task(rq, cpu, idle);
+	if (likely(next->prio != PRIO_LIMIT)) {
+		clear_cpuidle_map(cpu);
+		next->last_ran = niffies;
+	} else {
 		set_cpuidle_map(cpu);
 		update_load_avg(rq);
-	} else {
-		next = earliest_deadline_task(rq, cpu, idle);
-		if (likely(next->prio != PRIO_LIMIT))
-			clear_cpuidle_map(cpu);
-		else {
-			set_cpuidle_map(cpu);
-			update_load_avg(rq);
-		}
 	}
 
 	set_rq_task(rq, next);
-	next->last_ran = niffies;
 
 	if (likely(prev != next)) {
 		/*
@@ -4048,16 +4014,14 @@ static void __sched notrace __schedule(bool preempt)
 			check_siblings(rq);
 		else
 			wake_siblings(rq);
-		atomic64_inc(&grq.nr_switches);
+		rq->nr_switches++;
 		rq->curr = next;
 		++*switch_count;
 
 		trace_sched_switch(preempt, prev, next);
 		rq = context_switch(rq, prev, next); /* unlocks the rq */
-	} else {
-		check_siblings(rq);
+	} else
 		rq_unlock_irq(rq);
-	}
 }
 
 static inline void sched_submit_work(struct task_struct *tsk)
@@ -5618,7 +5582,7 @@ void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_ma
 	p->nr_cpus_allowed = cpumask_weight(new_mask);
 }
 
-void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
+static void __do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
 {
 	struct rq *rq = task_rq(p);
 
@@ -5633,6 +5597,29 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
 		 */
 		lockdep_assert_held(&rq->lock);
 	}
+}
+
+/*
+ * Calling do_set_cpus_allowed from outside the scheduler code may make the
+ * task not be able to run on its current CPU so we resched it here.
+ */
+void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
+{
+	__do_set_cpus_allowed(p, new_mask);
+	if (needs_other_cpu(p, task_cpu(p))) {
+		set_task_cpu(p, valid_task_cpu(p));
+		resched_task(p);
+	}
+}
+
+/*
+ * For internal scheduler calls to do_set_cpus_allowed which will resched
+ * themselves if needed.
+ */
+static void _do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
+{
+	__do_set_cpus_allowed(p, new_mask);
+	/* __set_cpus_allowed_ptr will handle the reschedule in this variant */
 	if (needs_other_cpu(p, task_cpu(p)))
 		set_task_cpu(p, valid_task_cpu(p));
 }
@@ -5830,7 +5817,7 @@ void wake_up_idle_cpu(int cpu)
 		return;
 
 	if (set_nr_and_not_polling(cpu_rq(cpu)->idle))
-		smp_send_reschedule(cpu);
+		smp_sched_reschedule(cpu);
 	else
 		trace_sched_wake_idle_without_ipi(cpu);
 }
@@ -5890,7 +5877,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
 
 	queued = task_queued(p);
 
-	do_set_cpus_allowed(p, new_mask);
+	_do_set_cpus_allowed(p, new_mask);
 
 	if (p->flags & PF_KTHREAD) {
 		/*
@@ -7476,7 +7463,7 @@ static const cpumask_t *thread_cpumask(int cpu)
 /* All this CPU's SMT siblings are idle */
 static bool siblings_cpu_idle(struct rq *rq)
 {
-	return cpumask_subset(&rq->thread_mask, &grq.cpu_idle_map);
+	return cpumask_subset(&rq->thread_mask, &cpu_idle_map);
 }
 #endif
 #ifdef CONFIG_SCHED_MC
@@ -7487,7 +7474,7 @@ static const cpumask_t *core_cpumask(int cpu)
 /* All this CPU's shared cache siblings are idle */
 static bool cache_cpu_idle(struct rq *rq)
 {
-	return cpumask_subset(&rq->core_mask, &grq.cpu_idle_map);
+	return cpumask_subset(&rq->core_mask, &cpu_idle_map);
 }
 #endif
 
@@ -7668,15 +7655,11 @@ void __init sched_init(void)
 	for (i = 1 ; i < NICE_WIDTH ; i++)
 		prio_ratios[i] = prio_ratios[i - 1] * 11 / 10;
 
-	atomic_set(&grq.nr_running, 0);
-	atomic_set(&grq.nr_uninterruptible, 0);
-	atomic64_set(&grq.nr_switches, 0);
 	skiplist_node_init(&init_task.node);
 
 #ifdef CONFIG_SMP
 	init_defrootdomain();
-	atomic_set(&grq.qnr, 0);
-	cpumask_clear(&grq.cpu_idle_map);
+	cpumask_clear(&cpu_idle_map);
 #else
 	uprq = &per_cpu(runqueues, 0);
 #endif
@@ -7690,6 +7673,7 @@ void __init sched_init(void)
 #endif /* CONFIG_CGROUP_SCHED */
 	for_each_possible_cpu(i) {
 		rq = cpu_rq(i);
+		rq->nr_running = rq->nr_uninterruptible = rq->nr_switches = 0;
 		skiplist_init(&rq->node);
 		rq->sl = new_skiplist(&rq->node);
 		raw_spin_lock_init(&rq->lock);
diff --git a/kernel/sched/MuQSS.h b/kernel/sched/MuQSS.h
index 4e3115dac..10a12b335 100644
--- a/kernel/sched/MuQSS.h
+++ b/kernel/sched/MuQSS.h
@@ -17,6 +17,9 @@
 struct rq {
 	struct task_struct *curr, *idle, *stop;
 	struct mm_struct *prev_mm;
+	long nr_uninterruptible;
+	s64 nr_switches;
+	int nr_running;
 
 	raw_spinlock_t lock;
 
diff --git a/kernel/skip_list.c b/kernel/skip_list.c
index 5c66067f2..d52508056 100644
--- a/kernel/skip_list.c
+++ b/kernel/skip_list.c
@@ -93,34 +93,9 @@ void skiplist_node_init(skiplist_node *node)
 	memset(node, 0, sizeof(skiplist_node));
 }
 
-/*
- * Returns a pseudo-random number based on the randseed value by masking out
- * 0-15. As many levels are not required when only few values are on the list,
- * we limit the height of the levels according to how many list entries there
- * are in a cheap manner. The height of the levels may have been higher while
- * there were more entries queued previously but as this code is used only by
- * the scheduler, entries are short lived and will be torn down regularly.
- *
- * 00-03 entries - 1 level
- * 04-07 entries - 2 levels
- * 08-15 entries - 4 levels
- * 15-31 entries - 7 levels
- *  32+  entries - max(16) levels
- */
-static inline unsigned int randomLevel(int entries, unsigned int randseed)
+static inline unsigned int randomLevel(const long unsigned int randseed)
 {
-	unsigned int mask;
-
-	if (entries > 15)
-		mask = 0x7;
-	else if (entries > 7)
-		mask = 0x3;
-	else if (entries > 3)
-		mask = 0x1;
-	else
-		return 0;
-
-	return randseed & mask;
+	return find_first_bit(&randseed, MaxLevel);
 }
 
 void skiplist_insert(skiplist *l, skiplist_node *node, keyType key, valueType value, unsigned int randseed)
@@ -136,9 +111,8 @@ void skiplist_insert(skiplist *l, skiplist_node *node, keyType key, valueType va
 		update[k] = p;
 	} while (--k >= 0);
 
-	k = randomLevel(++l->entries, randseed);
-	if (k > MaxLevel)
-		k = MaxLevel;
+	++l->entries;
+	k = randomLevel(randseed);
 	if (k > l->level) {
 		k = ++l->level;
 		update[k] = l->header;