Diffstat (limited to 'kernel/sched/bfs.c')
-rw-r--r--  kernel/sched/bfs.c  | 500
1 file changed, 229 insertions(+), 271 deletions(-)
diff --git a/kernel/sched/bfs.c b/kernel/sched/bfs.c
index 67f93e752..bb5bac4b2 100644
--- a/kernel/sched/bfs.c
+++ b/kernel/sched/bfs.c
@@ -24,7 +24,7 @@
* 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri
* 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins,
* Thomas Gleixner, Mike Kravetz
- * now Brainfuck deadline scheduling policy by Con Kolivas deletes
+ * 2009-08-13 Brainfuck deadline scheduling policy by Con Kolivas deletes
* a whole lot of those previous things.
*/
@@ -137,7 +137,7 @@
void print_scheduler_version(void)
{
- printk(KERN_INFO "BFS CPU scheduler v0.502 by Con Kolivas.\n");
+ printk(KERN_INFO "BFS CPU scheduler v0.512 by Con Kolivas.\n");
}
/*
@@ -403,7 +403,6 @@ static inline void grq_lock_irq(void)
}
static inline void time_lock_grq(struct rq *rq)
- __acquires(grq.lock)
{
grq_lock();
update_clocks(rq);
@@ -429,86 +428,35 @@ static inline void grq_unlock_irqrestore(unsigned long *flags)
static inline struct rq
*task_grq_lock(struct task_struct *p, unsigned long *flags)
- __acquires(grq.lock)
+ __acquires(p->pi_lock)
{
- grq_lock_irqsave(flags);
+ raw_spin_lock_irqsave(&p->pi_lock, *flags);
+ grq_lock();
return task_rq(p);
}
static inline struct rq
*time_task_grq_lock(struct task_struct *p, unsigned long *flags)
- __acquires(grq.lock)
{
struct rq *rq = task_grq_lock(p, flags);
- update_clocks(rq);
- return rq;
-}
-static inline struct rq *task_grq_lock_irq(struct task_struct *p)
- __acquires(grq.lock)
-{
- grq_lock_irq();
- return task_rq(p);
-}
-
-static inline void time_task_grq_lock_irq(struct task_struct *p)
- __acquires(grq.lock)
-{
- struct rq *rq = task_grq_lock_irq(p);
update_clocks(rq);
+ return rq;
}
-static inline void task_grq_unlock_irq(void)
- __releases(grq.lock)
-{
- grq_unlock_irq();
-}
-
-static inline void task_grq_unlock(unsigned long *flags)
- __releases(grq.lock)
-{
- grq_unlock_irqrestore(flags);
-}
-
-/**
- * grunqueue_is_locked
- *
- * Returns true if the global runqueue is locked.
- * This interface allows printk to be called with the runqueue lock
- * held and know whether or not it is OK to wake up the klogd.
- */
-bool grunqueue_is_locked(void)
-{
- return raw_spin_is_locked(&grq.lock);
-}
-
-void grq_unlock_wait(void)
- __releases(grq.lock)
+static inline void task_grq_unlock(struct task_struct *p, unsigned long *flags)
+ __releases(p->pi_lock)
{
- smp_mb(); /* spin-unlock-wait is not a full memory barrier */
- raw_spin_unlock_wait(&grq.lock);
+ grq_unlock();
+ raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
}
static inline void time_grq_lock(struct rq *rq, unsigned long *flags)
- __acquires(grq.lock)
{
local_irq_save(*flags);
time_lock_grq(rq);
}
-static inline struct rq *__task_grq_lock(struct task_struct *p)
- __acquires(grq.lock)
-{
- grq_lock();
- return task_rq(p);
-}
-
-static inline void __task_grq_unlock(void)
- __releases(grq.lock)
-{
- grq_unlock();
-}
-
static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
{
}
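Editorial note, not part of the patch: the reworked helpers now nest the global runqueue lock inside the task's pi_lock, matching the lockdep assertion added to set_task_cpu() further down. A minimal sketch of the resulting caller pattern; example_adjust_task() is hypothetical, only the two helpers come from the patch:

/* Hypothetical caller showing the new lock nesting. */
static void example_adjust_task(struct task_struct *p)
{
	unsigned long flags;
	struct rq *rq;

	rq = task_grq_lock(p, &flags);	/* p->pi_lock first, then grq.lock, IRQs off */
	/* ... task_cpu(p) and p's scheduling state are stable while both are held ... */
	task_grq_unlock(p, &flags);	/* releases grq.lock, then p->pi_lock */
}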
@@ -540,6 +488,40 @@ static inline bool deadline_after(u64 deadline, u64 time)
}
/*
+ * Deadline is "now" in niffies + (offset by priority). Setting the deadline
+ * is the key to everything. It distributes cpu fairly amongst tasks of the
+ * same nice value, it proportions cpu according to nice level, it means the
+ * task that last woke up the longest ago has the earliest deadline, thus
+ * ensuring that interactive tasks get low latency on wake up. The CPU
+ * proportion works out to the square of the virtual deadline difference, so
+ * this equation will give nice 19 3% CPU compared to nice 0.
+ */
+static inline u64 prio_deadline_diff(int user_prio)
+{
+ return (prio_ratios[user_prio] * rr_interval * (MS_TO_NS(1) / 128));
+}
+
+static inline u64 task_deadline_diff(struct task_struct *p)
+{
+ return prio_deadline_diff(TASK_USER_PRIO(p));
+}
+
+static inline u64 static_deadline_diff(int static_prio)
+{
+ return prio_deadline_diff(USER_PRIO(static_prio));
+}
+
+static inline int longest_deadline_diff(void)
+{
+ return prio_deadline_diff(39);
+}
+
+static inline int ms_longest_deadline_diff(void)
+{
+ return NS_TO_MS(longest_deadline_diff());
+}
+
+/*
* A task that is not running or queued will not have a node set.
* A task that is queued but not running will have a node set.
* A task that is currently running will have ->on_cpu set but no node set.
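Editorial note: the "3%" figure in the relocated comment can be checked by hand. Assuming the usual BFS prio_ratios progression (roughly +10% per nice level), nice 19's deadline offset is about 1.1^19 ≈ 6.1 times nice 0's, and with the CPU proportion going as the square of the virtual deadline difference the split is roughly 1:37. A throwaway userspace check, not part of the patch:

#include <stdio.h>

int main(void)
{
	double ratio = 1.0;
	int nice;

	/* deadline offset grows ~10% per nice level (assumed prio_ratios step) */
	for (nice = 0; nice < 19; nice++)
		ratio *= 1.1;

	/* CPU share scales with the square of the virtual deadline difference */
	printf("deadline ratio %.1f, nice 19 gets ~%.1f%% of nice 0\n",
	       ratio, 100.0 / (ratio * ratio));
	return 0;
}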
@@ -561,14 +543,23 @@ static void dequeue_task(struct task_struct *p)
sched_info_dequeued(task_rq(p), p);
}
+#ifdef CONFIG_PREEMPT_RCU
+static bool rcu_read_critical(struct task_struct *p)
+{
+ return p->rcu_read_unlock_special.b.blocked;
+}
+#else /* CONFIG_PREEMPT_RCU */
+#define rcu_read_critical(p) (false)
+#endif /* CONFIG_PREEMPT_RCU */
+
/*
* To determine if it's safe for a task of SCHED_IDLEPRIO to actually run as
* an idle task, we ensure none of the following conditions are met.
*/
static bool idleprio_suitable(struct task_struct *p)
{
- return (!freezing(p) && !signal_pending(p) &&
- !(task_contributes_to_load(p)) && !(p->flags & (PF_EXITING)));
+ return (!(task_contributes_to_load(p)) && !(p->flags & (PF_EXITING)) &&
+ !signal_pending(p) && !rcu_read_critical(p) && !freezing(p));
}
/*
@@ -612,9 +603,13 @@ static void enqueue_task(struct task_struct *p, struct rq *rq)
sl_id = p->prio;
else {
sl_id = p->deadline;
- /* Set it to cope with 4 left shifts with locality_diff */
- if (p->prio == IDLE_PRIO)
- sl_id |= 0x0F00000000000000;
+ if (idleprio_task(p)) {
+ /* Set it to cope with 4 left shifts with locality_diff */
+ if (p->prio == IDLE_PRIO)
+ sl_id |= 0x00FF000000000000;
+ else
+ sl_id += longest_deadline_diff();
+ }
}
/*
* Some architectures don't have better than microsecond resolution
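Editorial note: the new branch can be read in isolation as follows. This is only an annotated restatement of the hunk above; the surrounding enqueue_task() context (RT/ISO tasks already keyed by priority) is assumed, and the helper name is hypothetical:

/* Illustrative restatement of the new SCHED_IDLEPRIO key handling. */
static u64 example_idleprio_key(struct task_struct *p, u64 sl_id)
{
	if (idleprio_task(p)) {
		if (p->prio == IDLE_PRIO)
			/* Truly idle: force the key above anything a deadline
			 * plus up to 4 locality_diff left shifts can reach. */
			sl_id |= 0x00FF000000000000;
		else
			/* Idleprio task currently not at IDLE_PRIO (e.g. because
			 * idleprio_suitable() failed): sort it after every
			 * normal task's deadline instead. */
			sl_id += longest_deadline_diff();
	}
	return sl_id;
}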
@@ -1008,15 +1003,18 @@ static inline void deactivate_task(struct task_struct *p, struct rq *rq)
#ifdef CONFIG_SMP
void set_task_cpu(struct task_struct *p, unsigned int cpu)
{
- unsigned int tcpu;
-
#ifdef CONFIG_LOCKDEP
/*
- * The caller should hold grq lock.
+ * The caller should hold either p->pi_lock or grq lock, when changing
+ * a task's CPU. ->pi_lock for waking tasks, grq lock for runnable tasks.
+ *
+ * Furthermore, all task_rq users should acquire both locks, see
+ * task_grq_lock().
*/
- WARN_ON_ONCE(debug_locks && !lockdep_is_held(&grq.lock));
+ WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
+ lockdep_is_held(&grq.lock)));
#endif
- if ((tcpu = task_cpu(p)) == cpu)
+ if (task_cpu(p) == cpu)
return;
trace_sched_migrate_task(p, cpu);
perf_event_task_migrate(p);
@@ -1027,6 +1025,7 @@ void set_task_cpu(struct task_struct *p, unsigned int cpu)
* per-task data have been completed by this moment.
*/
smp_wmb();
+
if (p->on_rq) {
struct rq *rq = task_rq(p);
@@ -1166,7 +1165,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
ncsw = 0;
if (!match_state || p->state == match_state)
ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
- task_grq_unlock(&flags);
+ task_grq_unlock(p, &flags);
/*
* If it changed from the expected state, bail out now.
@@ -1292,9 +1291,7 @@ static inline bool needs_other_cpu(struct task_struct *p, int cpu)
static void try_preempt(struct task_struct *p, struct rq *this_rq)
{
- int cpu, pcpu, highest_prio, highest_cpu;
- struct rq *highest_prio_rq;
- u64 latest_deadline;
+ int i, this_entries = this_rq->soft_affined;
cpumask_t tmp;
if (suitable_idle_cpus(p) && resched_best_idle(p))
@@ -1306,56 +1303,32 @@ static void try_preempt(struct task_struct *p, struct rq *this_rq)
cpumask_and(&tmp, &cpu_online_map, &p->cpus_allowed);
- /* See if this task can preempt the task on the current CPU first. */
- pcpu = cpu_of(this_rq);
- if (likely(cpumask_test_cpu(pcpu, &tmp))) {
- if (smt_schedule(p, this_rq) && can_preempt(p, this_rq->rq_prio, this_rq->rq_deadline)) {
- resched_curr(this_rq);
- return;
- }
- cpumask_clear_cpu(pcpu, &tmp);
- }
-
- highest_prio = latest_deadline = 0;
- highest_prio_rq = NULL;
-
- /* Now look for the CPU with the latest deadline */
- for_each_cpu(cpu, &tmp) {
- struct rq *rq;
- int rq_prio;
- u64 dl;
+ /*
+ * We iterate over CPUs in locality order using rq_order, finding the
+ * first one we can preempt if possible, thus staying closest in
+ * locality.
+ */
+ for (i = 0; i < num_possible_cpus(); i++) {
+ struct rq *rq = this_rq->rq_order[i];
- rq = cpu_rq(cpu);
- rq_prio = rq->rq_prio;
- if (rq_prio < highest_prio)
+ if (!cpumask_test_cpu(rq->cpu, &tmp))
continue;
- dl = rq->rq_deadline;
- if (!sched_interactive && pcpu != cpu)
- dl <<= locality_diff(pcpu, rq);
- if (rq_prio > highest_prio ||
- deadline_after(dl, latest_deadline)) {
- latest_deadline = dl;
- highest_prio = rq_prio;
- highest_cpu = cpu;
- highest_prio_rq = rq;
+ if (!sched_interactive && rq != this_rq && rq->soft_affined <= this_entries)
+ continue;
+ if (smt_schedule(p, rq) && can_preempt(p, rq->rq_prio, rq->rq_deadline)) {
+ /*
+ * If we have decided this task should preempt this CPU,
+ * set the task's CPU to match thereby speeding up matching
+ * this task in earliest_deadline_task.
+ */
+ set_task_cpu(p, rq->cpu);
+ resched_curr(rq);
+ return;
}
}
-
- if (unlikely(!highest_prio_rq))
- return;
- if (!smt_schedule(p, highest_prio_rq))
- return;
- if (can_preempt(p, highest_prio, latest_deadline)) {
- /*
- * If we have decided this task should preempt this CPU,
- * set the task's CPU to match thereby speeding up matching
- * this task in earliest_deadline_task.
- */
- set_task_cpu(p, highest_cpu);
- resched_curr(highest_prio_rq);
- }
}
+
static int __set_cpus_allowed_ptr(struct task_struct *p,
const struct cpumask *new_mask, bool check);
#else /* CONFIG_SMP */
@@ -1501,8 +1474,6 @@ static bool try_to_wake_up(struct task_struct *p, unsigned int state,
struct rq *rq;
int cpu;
- get_cpu();
-
/*
* If we are going to wake up a thread waiting for CONDITION we
* need to ensure that CONDITION=1 done by the caller can not be
@@ -1533,13 +1504,11 @@ static bool try_to_wake_up(struct task_struct *p, unsigned int state,
out_running:
ttwu_post_activation(p, rq, success);
out_unlock:
- task_grq_unlock(&flags);
+ task_grq_unlock(p, &flags);
if (schedstat_enabled())
ttwu_stat(p, cpu, wake_flags);
- put_cpu();
-
return success;
}
@@ -1629,6 +1598,13 @@ int sched_fork(unsigned long __maybe_unused clone_flags, struct task_struct *p)
skiplist_node_init(&p->node);
/*
+ * We mark the process as NEW here. This guarantees that
+ * nobody will actually run it, and a signal or other external
+ * event cannot wake it up and insert it on the runqueue either.
+ */
+ p->state = TASK_NEW;
+
+ /*
* Revert to default priority/policy on fork if requested.
*/
if (unlikely(p->sched_reset_on_fork)) {
@@ -1744,12 +1720,16 @@ static inline void init_schedstats(void) {}
*/
void wake_up_new_task(struct task_struct *p)
{
- struct task_struct *parent;
+ struct task_struct *parent, *rq_curr;
+ struct rq *rq, *new_rq;
unsigned long flags;
- struct rq *rq;
parent = p->parent;
rq = task_grq_lock(p, &flags);
+ if (unlikely(needs_other_cpu(p, task_cpu(p))))
+ set_task_cpu(p, cpumask_any(tsk_cpus_allowed(p)));
+ rq_curr = rq->curr;
+ p->state = TASK_RUNNING;
/*
* Reinit new task deadline as its creator deadline could have changed
@@ -1757,22 +1737,20 @@ void wake_up_new_task(struct task_struct *p)
*/
p->deadline = rq->rq_deadline;
- /*
- * If the task is a new process, current and parent are the same. If
- * the task is a new thread in the thread group, it will have much more
- * in common with current than with the parent.
- */
- set_task_cpu(p, task_cpu(rq->curr));
+ /* The new task might not be able to run on the same CPU as rq->curr */
+ if (unlikely(needs_other_cpu(p, task_cpu(p)))) {
+ set_task_cpu(p, cpumask_any(tsk_cpus_allowed(p)));
+ new_rq = task_rq(p);
+ } else
+ new_rq = rq;
/*
* Make sure we do not leak PI boosting priority to the child.
*/
- p->prio = rq->curr->normal_prio;
+ p->prio = rq_curr->normal_prio;
activate_task(p, rq);
trace_sched_wakeup_new(p);
- if (unlikely(p->policy == SCHED_FIFO))
- goto after_ts_init;
/*
* Share the timeslice between parent and child, thus the
@@ -1784,33 +1762,39 @@ void wake_up_new_task(struct task_struct *p)
* is always equal to current->deadline.
*/
p->last_ran = rq->rq_last_ran;
- if (likely(rq->rq_time_slice >= RESCHED_US * 2)) {
+ if (likely(rq_curr->policy != SCHED_FIFO)) {
rq->rq_time_slice /= 2;
- p->time_slice = rq->rq_time_slice;
-after_ts_init:
- if (rq->curr == parent && !suitable_idle_cpus(p)) {
+ if (unlikely(rq->rq_time_slice < RESCHED_US)) {
/*
- * The VM isn't cloned, so we're in a good position to
- * do child-runs-first in anticipation of an exec. This
- * usually avoids a lot of COW overhead.
+ * Forking task has run out of timeslice. Reschedule it and
+ * start its child with a new time slice and deadline. The
+ * child will end up running first because its deadline will
+ * be slightly earlier.
*/
- __set_tsk_resched(parent);
- } else
- try_preempt(p, rq);
- } else {
- if (rq->curr == parent) {
- /*
- * Forking task has run out of timeslice. Reschedule it and
- * start its child with a new time slice and deadline. The
- * child will end up running first because its deadline will
- * be slightly earlier.
- */
rq->rq_time_slice = 0;
- __set_tsk_resched(parent);
+ __set_tsk_resched(rq_curr);
+ time_slice_expired(p);
+ if (suitable_idle_cpus(p))
+ resched_best_idle(p);
+ else if (unlikely(rq != new_rq))
+ try_preempt(p, new_rq);
+ } else {
+ p->time_slice = rq->rq_time_slice;
+ if (rq_curr == parent && rq == new_rq && !suitable_idle_cpus(p)) {
+ /*
+ * The VM isn't cloned, so we're in a good position to
+ * do child-runs-first in anticipation of an exec. This
+ * usually avoids a lot of COW overhead.
+ */
+ __set_tsk_resched(rq_curr);
+ } else
+ try_preempt(p, new_rq);
}
+ } else {
time_slice_expired(p);
+ try_preempt(p, new_rq);
}
- task_grq_unlock(&flags);
+ task_grq_unlock(p, &flags);
}
#ifdef CONFIG_PREEMPT_NOTIFIERS
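Editorial note: the reworked fork path (for non-SCHED_FIFO parents) halves the current task's remaining slice and either hands the other half to the child or, if the remainder is nearly gone, reschedules the parent and gives the child a fresh slice and deadline. A standalone sketch of just that decision, with the RESCHED_US value assumed for illustration:

#include <stdio.h>

#define RESCHED_US 100	/* assumed threshold, for illustration only */

static void fork_split(int curr_slice_us)
{
	int parent, child;

	parent = curr_slice_us / 2;		/* current task keeps one half */
	if (parent < RESCHED_US) {
		parent = 0;			/* nearly expired: resched parent */
		child = -1;			/* child gets a fresh slice+deadline */
	} else
		child = parent;			/* child takes the other half */

	if (child < 0)
		printf("parent %dus (resched), child gets a new slice and deadline\n",
		       parent);
	else
		printf("parent %dus, child %dus (child may run first)\n",
		       parent, child);
}

int main(void)
{
	fork_split(6000);	/* plenty left: split 3000/3000 */
	fork_split(150);	/* nearly out: parent resched, child starts fresh */
	return 0;
}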
@@ -2724,7 +2708,7 @@ unsigned long long task_sched_runtime(struct task_struct *p)
rq = task_grq_lock(p, &flags);
ns = p->sched_time + do_task_delta_exec(p, rq);
- task_grq_unlock(&flags);
+ task_grq_unlock(p, &flags);
return ns;
}
@@ -2978,7 +2962,7 @@ static void task_running_tick(struct rq *rq)
grq_lock();
requeue_task(p);
- __set_tsk_resched(p);
+ resched_task(p);
grq_unlock();
}
@@ -3083,40 +3067,6 @@ static inline void preempt_latency_stop(int val) { }
#endif
/*
- * Deadline is "now" in niffies + (offset by priority). Setting the deadline
- * is the key to everything. It distributes cpu fairly amongst tasks of the
- * same nice value, it proportions cpu according to nice level, it means the
- * task that last woke up the longest ago has the earliest deadline, thus
- * ensuring that interactive tasks get low latency on wake up. The CPU
- * proportion works out to the square of the virtual deadline difference, so
- * this equation will give nice 19 3% CPU compared to nice 0.
- */
-static inline u64 prio_deadline_diff(int user_prio)
-{
- return (prio_ratios[user_prio] * rr_interval * (MS_TO_NS(1) / 128));
-}
-
-static inline u64 task_deadline_diff(struct task_struct *p)
-{
- return prio_deadline_diff(TASK_USER_PRIO(p));
-}
-
-static inline u64 static_deadline_diff(int static_prio)
-{
- return prio_deadline_diff(USER_PRIO(static_prio));
-}
-
-static inline int longest_deadline_diff(void)
-{
- return prio_deadline_diff(39);
-}
-
-static inline int ms_longest_deadline_diff(void)
-{
- return NS_TO_MS(longest_deadline_diff());
-}
-
-/*
* The time_slice is only refilled when it is empty and that is when we set a
* new deadline.
*/
@@ -3215,13 +3165,12 @@ found_middle:
static inline struct
task_struct *earliest_deadline_task(struct rq *rq, int cpu, struct task_struct *idle)
{
- struct task_struct *edt = idle;
skiplist_node *node = &grq.node;
+ struct task_struct *edt = idle;
u64 earliest_deadline = ~0ULL;
while ((node = node->next[0]) != &grq.node) {
struct task_struct *p = node->value;
- int tcpu;
/* Make sure affinity is ok */
if (needs_other_cpu(p, cpu))
@@ -3230,22 +3179,24 @@ task_struct *earliest_deadline_task(struct rq *rq, int cpu, struct task_struct *
if (!smt_schedule(p, rq))
continue;
- if (!sched_interactive && (tcpu = task_cpu(p)) != cpu) {
- u64 dl = p->deadline << locality_diff(tcpu, rq);
+ if (!sched_interactive) {
+ int tcpu;
+
+ if ((tcpu = task_cpu(p)) != cpu) {
+ u64 dl = p->deadline << locality_diff(tcpu, rq);
- if (unlikely(!deadline_before(dl, earliest_deadline)))
+ if (!deadline_before(dl, earliest_deadline))
+ continue;
+ earliest_deadline = dl;
+ edt = p;
+ /* We continue even though we've found the earliest
+ * deadline task as the locality offset means there
+ * may be a better candidate after it. */
continue;
- earliest_deadline = dl;
- edt = p;
- /* We continue even though we've found the earliest
- * deadline task as the locality offset means there
- * may be a better candidate after it. */
- continue;
+ }
}
- /* This wouldn't happen if we encountered a better deadline from
- * another CPU and have already set edt. */
- if (likely(p->deadline < earliest_deadline))
- edt = p;
+ /* We've encountered the best deadline local task */
+ edt = p;
break;
}
if (likely(edt != idle))
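Editorial note: the rewritten loop can stop at the first task whose CPU matches because the skiplist is already deadline-ordered, so the first local candidate is the best local one; remote candidates only beat it if their deadline survives the locality shift. A standalone illustration (not kernel code, locality value assumed):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t local_dl = 1000;	/* first task queued for this CPU */
	uint64_t remote_dl = 400;	/* earlier raw deadline, but on another CPU */
	int locality_diff = 2;		/* assumed: same node, different cache */

	/* with !sched_interactive, remote deadlines are left-shifted by locality */
	uint64_t remote_key = remote_dl << locality_diff;	/* 400 -> 1600 */

	printf("remote key %llu vs local %llu: %s wins\n",
	       (unsigned long long)remote_key, (unsigned long long)local_dl,
	       remote_key < local_dl ? "remote" : "local");
	return 0;
}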
@@ -3275,6 +3226,9 @@ static noinline void __schedule_bug(struct task_struct *prev)
pr_cont("\n");
}
#endif
+ if (panic_on_warn)
+ panic("scheduling while atomic\n");
+
dump_stack();
add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
}
@@ -3316,10 +3270,6 @@ static inline void set_rq_task(struct rq *rq, struct task_struct *p)
rq->rq_mm = p->mm;
rq->rq_smt_bias = p->smt_bias;
#endif
- if (p != rq->idle)
- rq->rq_running = true;
- else
- rq->rq_running = false;
}
static void reset_rq_task(struct rq *rq, struct task_struct *p)
@@ -3353,7 +3303,7 @@ static void check_smt_siblings(struct rq *this_rq)
if (unlikely(!rq->online))
continue;
p = rq->curr;
- if (!smt_should_schedule(p, this_rq)) {
+ if (!smt_schedule(p, this_rq)) {
set_tsk_need_resched(p);
smp_send_reschedule(other_cpu);
}
@@ -3546,8 +3496,6 @@ static void __sched notrace __schedule(bool preempt)
trace_sched_switch(preempt, prev, next);
rq = context_switch(rq, prev, next); /* unlocks the grq */
- cpu = cpu_of(rq);
- idle = rq->idle;
} else {
check_siblings(rq);
grq_unlock_irq();
@@ -3766,8 +3714,8 @@ EXPORT_SYMBOL(default_wake_function);
void rt_mutex_setprio(struct task_struct *p, int prio)
{
unsigned long flags;
- int queued, oldprio;
struct rq *rq;
+ int oldprio;
BUG_ON(prio < 0 || prio > MAX_PRIO);
@@ -3793,19 +3741,18 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
trace_sched_pi_setprio(p, prio);
oldprio = p->prio;
- queued = task_queued(p);
- if (queued)
- dequeue_task(p);
p->prio = prio;
- if (task_running(p) && prio > oldprio)
- resched_task(p);
- if (queued) {
+ if (task_running(p)){
+ if (prio > oldprio)
+ resched_task(p);
+ } else if (task_queued(p)) {
+ dequeue_task(p);
enqueue_task(p, rq);
- try_preempt(p, rq);
+ if (prio < oldprio)
+ try_preempt(p, rq);
}
-
out_unlock:
- task_grq_unlock(&flags);
+ task_grq_unlock(p, &flags);
}
#endif
@@ -3821,7 +3768,7 @@ static inline void adjust_deadline(struct task_struct *p, int new_prio)
void set_user_nice(struct task_struct *p, long nice)
{
- int queued, new_static, old_static;
+ int new_static, old_static;
unsigned long flags;
struct rq *rq;
@@ -3843,16 +3790,14 @@ void set_user_nice(struct task_struct *p, long nice)
p->static_prio = new_static;
goto out_unlock;
}
- queued = task_queued(p);
- if (queued)
- dequeue_task(p);
adjust_deadline(p, new_static);
old_static = p->static_prio;
p->static_prio = new_static;
p->prio = effective_prio(p);
- if (queued) {
+ if (task_queued(p)) {
+ dequeue_task(p);
enqueue_task(p, rq);
if (new_static < old_static)
try_preempt(p, rq);
@@ -3862,7 +3807,7 @@ void set_user_nice(struct task_struct *p, long nice)
resched_task(p);
}
out_unlock:
- task_grq_unlock(&flags);
+ task_grq_unlock(p, &flags);
}
EXPORT_SYMBOL(set_user_nice);
@@ -4002,11 +3947,15 @@ static void __setscheduler(struct task_struct *p, struct rq *rq, int policy,
p->prio = rt_mutex_get_effective_prio(p, p->normal_prio);
} else
p->prio = p->normal_prio;
+
if (task_running(p)) {
reset_rq_task(rq, p);
- /* Resched only if we might now be preempted */
- if (p->prio > oldprio || p->rt_priority > oldrtprio)
- resched_task(p);
+ resched_task(p);
+ } else if (task_queued(p)) {
+ dequeue_task(p);
+ enqueue_task(p, rq);
+ if (p->prio < oldprio || p->rt_priority > oldrtprio)
+ try_preempt(p, rq);
}
}
@@ -4031,8 +3980,8 @@ __sched_setscheduler(struct task_struct *p, int policy,
const struct sched_param *param, bool user, bool pi)
{
struct sched_param zero_param = { .sched_priority = 0 };
- int queued, retval, oldpolicy = -1;
unsigned long flags, rlim_rtprio = 0;
+ int retval, oldpolicy = -1;
int reset_on_fork;
struct rq *rq;
@@ -4142,20 +4091,17 @@ recheck:
/*
* make sure no PI-waiters arrive (or leave) while we are
* changing the priority of the task:
- */
- raw_spin_lock_irqsave(&p->pi_lock, flags);
- /*
+ *
* To be able to change p->policy safely, the grunqueue lock must be
* held.
*/
- rq = __task_grq_lock(p);
+ rq = task_grq_lock(p, &flags);
/*
* Changing the policy of the stop threads its a very bad idea
*/
if (p == rq->stop) {
- __task_grq_unlock();
- raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+ task_grq_unlock(p, &flags);
return -EINVAL;
}
@@ -4165,31 +4111,21 @@ recheck:
if (unlikely(policy == p->policy && (!is_rt_policy(policy) ||
param->sched_priority == p->rt_priority))) {
- __task_grq_unlock();
- raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+ task_grq_unlock(p, &flags);
return 0;
}
/* recheck policy now with rq lock held */
if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
policy = oldpolicy = -1;
- __task_grq_unlock();
- raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+ task_grq_unlock(p, &flags);
goto recheck;
}
update_clocks(rq);
p->sched_reset_on_fork = reset_on_fork;
- queued = task_queued(p);
- if (queued)
- dequeue_task(p);
__setscheduler(p, rq, policy, param->sched_priority, pi);
- if (queued) {
- enqueue_task(p, rq);
- try_preempt(p, rq);
- }
- __task_grq_unlock();
- raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+ task_grq_unlock(p, &flags);
if (pi)
rt_mutex_adjust_pi(p);
@@ -4706,7 +4642,8 @@ out_unlock:
* @len: length in bytes of the bitmask pointed to by user_mask_ptr
* @user_mask_ptr: user-space pointer to hold the current cpu mask
*
- * Return: 0 on success. An error code otherwise.
+ * Return: size of CPU mask copied to user_mask_ptr on success. An
+ * error code otherwise.
*/
SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
unsigned long __user *, user_mask_ptr)
@@ -5113,6 +5050,8 @@ void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_ma
void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
{
cpumask_copy(tsk_cpus_allowed(p), new_mask);
+ if (needs_other_cpu(p, task_cpu(p)))
+ set_task_cpu(p, cpumask_any(tsk_cpus_allowed(p)));
}
#endif
@@ -5376,6 +5315,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
{
const struct cpumask *cpu_valid_mask = cpu_active_mask;
bool running_wrong = false;
+ struct cpumask old_mask;
bool queued = false;
unsigned long flags;
struct rq *rq;
@@ -5399,7 +5339,8 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
goto out;
}
- if (cpumask_equal(tsk_cpus_allowed(p), new_mask))
+ cpumask_copy(&old_mask, tsk_cpus_allowed(p));
+ if (cpumask_equal(&old_mask, new_mask))
goto out;
if (!cpumask_intersects(new_mask, cpu_valid_mask)) {
@@ -5436,12 +5377,16 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
set_task_cpu(p, cpumask_any_and(cpu_valid_mask, new_mask));
out:
- if (queued)
+ if (queued && !cpumask_subset(new_mask, &old_mask))
try_preempt(p, rq);
- task_grq_unlock(&flags);
-
if (running_wrong)
- preempt_schedule_common();
+ preempt_disable();
+ task_grq_unlock(p, &flags);
+
+ if (running_wrong) {
+ __schedule(true);
+ preempt_enable();
+ }
return ret;
}
@@ -5471,6 +5416,11 @@ static void bind_zero(int src_cpu)
cpumask_set_cpu(0, tsk_cpus_allowed(p));
p->zerobound = true;
bound++;
+ if (task_cpu(p) == src_cpu) {
+ set_task_cpu(p, 0);
+ if (task_running(p))
+ resched_task(p);
+ }
}
} while_each_thread(t, p);
@@ -7008,6 +6958,7 @@ void __init sched_init_smp(void)
#ifdef CONFIG_SCHED_SMT
bool smt_threads = false;
#endif
+ struct rq *rq;
cpumask_var_t non_isolated_cpus;
@@ -7045,7 +6996,7 @@ void __init sched_init_smp(void)
* nodes) are treated as very distant.
*/
for_each_online_cpu(cpu) {
- struct rq *rq = cpu_rq(cpu);
+ rq = cpu_rq(cpu);
/* First check if this cpu is in the same node */
for_each_domain(cpu, sd) {
@@ -7084,6 +7035,17 @@ void __init sched_init_smp(void)
}
#endif
}
+ for_each_possible_cpu(cpu) {
+ int total_cpus = 0, locality;
+
+ rq = cpu_rq(cpu);
+ for (locality = 0; locality <= 4; locality++) {
+ for_each_possible_cpu(other_cpu) {
+ if (rq->cpu_locality[other_cpu] == locality)
+ rq->rq_order[total_cpus++] = cpu_rq(other_cpu);
+ }
+ }
+ }
#ifdef CONFIG_SMT_NICE
if (smt_threads) {
check_siblings = &check_smt_siblings;
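Editorial note: the rq_order[] array built above is a stable bucket sort of the runqueues by cpu_locality (roughly: 0 = the CPU itself, 1 = SMT sibling, 2 = shared cache, 3 = same node, 4 = other nodes, per the initialisation in sched_init()), so try_preempt() walks CPUs from nearest to farthest. A standalone illustration with a made-up 4-CPU topology:

#include <stdio.h>

#define NR_TEST_CPUS 4

int main(void)
{
	/* hypothetical cpu_locality[] row for CPU 0: CPU 2 is its SMT
	 * sibling, CPUs 1 and 3 only share the package cache */
	int cpu_locality[NR_TEST_CPUS] = { 0, 2, 1, 2 };
	int rq_order[NR_TEST_CPUS];
	int total = 0, locality, cpu;

	for (locality = 0; locality <= 4; locality++)
		for (cpu = 0; cpu < NR_TEST_CPUS; cpu++)
			if (cpu_locality[cpu] == locality)
				rq_order[total++] = cpu;	/* same nested loops as the patch */

	for (cpu = 0; cpu < NR_TEST_CPUS; cpu++)
		printf("rq_order[%d] -> cpu%d\n", cpu, rq_order[cpu]);
	return 0;
}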
@@ -7095,7 +7057,8 @@ void __init sched_init_smp(void)
mutex_unlock(&sched_domains_mutex);
for_each_online_cpu(cpu) {
- struct rq *rq = cpu_rq(cpu);
+ rq = cpu_rq(cpu);
+
for_each_online_cpu(other_cpu) {
if (other_cpu <= cpu)
continue;
@@ -7220,6 +7183,10 @@ void __init sched_init(void)
else
rq->cpu_locality[j] = 4;
}
+ rq->rq_order = kmalloc(cpu_ids * sizeof(struct rq *), GFP_ATOMIC);
+ rq->rq_order[0] = rq;
+ for (j = 1; j < cpu_ids; j++)
+ rq->rq_order[j] = cpu_rq(j);
}
#endif
@@ -7323,7 +7290,6 @@ static inline void normalise_rt_tasks(void)
struct task_struct *g, *p;
unsigned long flags;
struct rq *rq;
- int queued;
read_lock(&tasklist_lock);
for_each_process_thread(g, p) {
@@ -7337,16 +7303,8 @@ static inline void normalise_rt_tasks(void)
continue;
rq = task_grq_lock(p, &flags);
- queued = task_queued(p);
- if (queued)
- dequeue_task(p);
__setscheduler(p, rq, SCHED_NORMAL, 0, false);
- if (queued) {
- enqueue_task(p, rq);
- try_preempt(p, rq);
- }
-
- task_grq_unlock(&flags);
+ task_grq_unlock(p, &flags);
}
read_unlock(&tasklist_lock);
}