Diffstat (limited to 'kernel/sched/bfs.c')
-rw-r--r--  kernel/sched/bfs.c  704
1 file changed, 470 insertions, 234 deletions
diff --git a/kernel/sched/bfs.c b/kernel/sched/bfs.c
index 6fd00c5ae..4168a5527 100644
--- a/kernel/sched/bfs.c
+++ b/kernel/sched/bfs.c
@@ -74,6 +74,7 @@
#include <linux/context_tracking.h>
#include <linux/sched/prio.h>
#include <linux/tick.h>
+#include <linux/skip_lists.h>
#include <asm/irq_regs.h>
#include <asm/switch_to.h>
@@ -136,7 +137,7 @@
void print_scheduler_version(void)
{
- printk(KERN_INFO "BFS CPU scheduler v0.472 by Con Kolivas.\n");
+ printk(KERN_INFO "BFS CPU scheduler v0.490 by Con Kolivas.\n");
}
/*
@@ -190,8 +191,6 @@ struct global_rq {
unsigned long nr_running;
unsigned long nr_uninterruptible;
unsigned long long nr_switches;
- struct list_head queue[PRIO_LIMIT];
- DECLARE_BITMAP(prio_bitmap, PRIO_LIMIT + 1);
unsigned long qnr; /* queued not running */
#ifdef CONFIG_SMP
cpumask_t cpu_idle_map;
@@ -204,6 +203,9 @@ struct global_rq {
raw_spinlock_t iso_lock;
int iso_ticks;
bool iso_refractory;
+
+ skiplist_node *node;
+ skiplist *sl;
};
#ifdef CONFIG_SMP
@@ -538,24 +540,25 @@ static inline bool deadline_after(u64 deadline, u64 time)
}
/*
- * A task that is queued but not running will be on the grq run list.
- * A task that is not running or queued will not be on the grq run list.
- * A task that is currently running will have ->on_cpu set but not on the
- * grq run list.
+ * A task that is not running or queued will not have a node set.
+ * A task that is queued but not running will have a node set.
+ * A task that is currently running will have ->on_cpu set but no node set.
*/
static inline bool task_queued(struct task_struct *p)
{
- return (!list_empty(&p->run_list));
+ return p->node;
}
/*
- * Removing from the global runqueue. Enter with grq locked.
+ * Removing from the global runqueue. Enter with grq locked. Deleting a task
+ * from the skip list is done via the stored node reference in the task struct
+ * and does not require a full look up. Thus it occurs in O(k) time where k
+ * is the "level" of the list the task was stored at - usually < 4, max 16.
*/
static void dequeue_task(struct task_struct *p)
{
- list_del_init(&p->run_list);
- if (list_empty(grq.queue + p->prio))
- __clear_bit(p->prio, grq.prio_bitmap);
+ skiplist_delnode(grq.node, grq.sl, p->node);
+ p->node = NULL;
sched_info_dequeued(task_rq(p), p);
}
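A minimal sketch of why delete-by-reference is O(k), assuming each node keeps per-level prev/next links (the real <linux/skip_lists.h> layout may differ; names below are illustrative):

struct example_slnode {
	int level;				/* highest level this node occupies */
	struct example_slnode *next[16];
	struct example_slnode *prev[16];
	void *value;
};

static void example_sl_delnode(struct example_slnode *node)
{
	int i;

	/* Only the levels this node participates in are unlinked: O(level). */
	for (i = 0; i <= node->level; i++) {
		node->prev[i]->next[i] = node->next[i];
		node->next[i]->prev[i] = node->prev[i];
	}
}

No search from the list head is needed, which is what keeps dequeue_task() cheap.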
@@ -583,6 +586,9 @@ static bool isoprio_suitable(void)
*/
static void enqueue_task(struct task_struct *p, struct rq *rq)
{
+ unsigned int randseed;
+ u64 sl_id;
+
if (!rt_task(p)) {
/* Check it hasn't gotten rt from PI */
if ((idleprio_task(p) && idleprio_suitable(p)) ||
@@ -591,8 +597,32 @@ static void enqueue_task(struct task_struct *p, struct rq *rq)
else
p->prio = NORMAL_PRIO;
}
- __set_bit(p->prio, grq.prio_bitmap);
- list_add_tail(&p->run_list, grq.queue + p->prio);
+ /*
+ * The sl_id key passed to the skiplist generates a sorted list.
+ * Realtime and sched iso tasks run FIFO so they only need be sorted
+ * according to priority. The skiplist will put tasks of the same
+ * key inserted later in FIFO order. Tasks of sched normal, batch
+ * and idleprio are sorted according to their deadlines. Idleprio
+ * tasks are offset by an impossibly large deadline value ensuring
+ * they get sorted into last positions, but still according to their
+ * own deadlines. This creates a "landscape" of skiplists running
+ * from priority 0 realtime in first place to the lowest priority
+ * idleprio tasks last. Skiplist insertion is an O(log n) process.
+ */
+ if (p->prio <= ISO_PRIO)
+ sl_id = p->prio;
+ else {
+ sl_id = p->deadline;
+ /* Set it to cope with 4 left shifts with locality_diff */
+ if (p->prio == IDLE_PRIO)
+ sl_id |= 0x0F00000000000000;
+ }
+ /*
+ * Some architectures don't have better than microsecond resolution
+ * so mask out ~microseconds as the random seed for skiplist insertion.
+ */
+ randseed = (grq.niffies >> 10) & 0xFFFFFFFF;
+ p->node = skiplist_insert(grq.node, grq.sl, sl_id, p, randseed);
sched_info_queued(rq, p);
}
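The key computation above, extracted into a standalone sketch (same constants as the hunk; the helper name is made up):

static inline u64 bfs_sl_key(int prio, u64 deadline)
{
	if (prio <= ISO_PRIO)
		return prio;		/* RT and ISO run FIFO within each priority */
	if (prio == IDLE_PRIO)		/* idleprio sorts last, still by deadline */
		return deadline | 0x0F00000000000000ULL;
	return deadline;		/* normal/batch: earliest deadline first */
}

Equal keys queue FIFO behind earlier insertions, so realtime and ISO tasks keep their FIFO semantics while everything else behaves as one earliest-deadline-first list.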
@@ -647,6 +677,113 @@ static inline int queued_notrunning(void)
return grq.qnr;
}
+#ifdef CONFIG_SMT_NICE
+static const cpumask_t *thread_cpumask(int cpu);
+
+/* Find the best real time priority running on any SMT siblings of cpu and if
+ * none are running, the static priority of the best deadline task running.
+ * The lookups to the other runqueues are done locklessly as the occasional wrong
+ * value would be harmless. */
+static int best_smt_bias(struct rq *this_rq)
+{
+ int other_cpu, best_bias = 0;
+
+ for_each_cpu(other_cpu, &this_rq->thread_mask) {
+ struct rq *rq = cpu_rq(other_cpu);
+
+ if (rq_idle(rq))
+ continue;
+ if (!rq->online)
+ continue;
+ if (!rq->rq_mm)
+ continue;
+ if (likely(rq->rq_smt_bias > best_bias))
+ best_bias = rq->rq_smt_bias;
+ }
+ return best_bias;
+}
+
+static int task_prio_bias(struct task_struct *p)
+{
+ if (rt_task(p))
+ return 1 << 30;
+ else if (task_running_iso(p))
+ return 1 << 29;
+ else if (task_running_idle(p))
+ return 0;
+ return MAX_PRIO - p->static_prio;
+}
+
+static bool smt_always_schedule(struct task_struct __maybe_unused *p, struct rq __maybe_unused *this_rq)
+{
+ return true;
+}
+
+static bool (*smt_schedule)(struct task_struct *p, struct rq *this_rq) = &smt_always_schedule;
+
+/* We've already decided p can run on CPU, now test if it shouldn't for SMT
+ * nice reasons. */
+static bool smt_should_schedule(struct task_struct *p, struct rq *this_rq)
+{
+ int best_bias, task_bias;
+
+ /* Kernel threads always run */
+ if (unlikely(!p->mm))
+ return true;
+ if (rt_task(p))
+ return true;
+ if (!idleprio_suitable(p))
+ return true;
+ best_bias = best_smt_bias(this_rq);
+ /* The smt siblings are all idle or running IDLEPRIO */
+ if (best_bias < 1)
+ return true;
+ task_bias = task_prio_bias(p);
+ if (task_bias < 1)
+ return false;
+ if (task_bias >= best_bias)
+ return true;
+ /* Dither 25% cpu of normal tasks regardless of nice difference */
+ if (best_bias % 4 == 1)
+ return true;
+ /* Sorry, you lose */
+ return false;
+}
+
+static unsigned long cpu_load_avg(struct rq *rq)
+{
+ return rq->soft_affined * SCHED_CAPACITY_SCALE;
+}
+
+/*
+ * This is the proportion of SCHED_CAPACITY_SCALE (1024) used when each thread
+ * of a CPU with SMT siblings is in use.
+ */
+#define SCHED_SMT_LOAD (890)
+
+/*
+ * Load of a CPU with smt siblings should be considered to be the load from all
+ * the SMT siblings, thus will be >1 if both threads are in use since they are
+ * not full cores.
+ */
+static unsigned long smt_load_avg(struct rq *rq)
+{
+ unsigned long load = rq->soft_affined * SCHED_SMT_LOAD;
+ int cpu;
+
+ for_each_cpu(cpu, thread_cpumask(rq->cpu))
+ load += cpu_rq(cpu)->soft_affined * SCHED_SMT_LOAD;
+ return load;
+}
+
+static unsigned long (*rq_load_avg)(struct rq *rq) = &cpu_load_avg;
+#else
+#define smt_schedule(p, this_rq) (true)
+static inline unsigned long rq_load_avg(struct rq *rq)
+{
+ return rq->soft_affined * SCHED_CAPACITY_SCALE;
+}
+#endif
#ifdef CONFIG_SMP
/*
* The cpu_idle_map stores a bitmap of all the CPUs currently idle to
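To make the bias scale in task_prio_bias() concrete, a worked example assuming the usual kernel mapping of static_prio = 120 + nice and MAX_PRIO = 140:

/*
 *   rt task                   -> 1 << 30
 *   running SCHED_ISO task    -> 1 << 29
 *   running SCHED_IDLEPRIO    -> 0
 *   nice -20                  -> 140 - 100 = 40
 *   nice   0                  -> 140 - 120 = 20
 *   nice  19                  -> 140 - 139 = 1
 *
 * An ordinary task is only scheduled alongside a busy sibling when its own
 * bias is at least the sibling's best_bias, or when best_bias % 4 == 1 (the
 * "dither 25%" escape). SCHED_SMT_LOAD (890) follows the same idea for load:
 * two busy siblings report 2 * 890 = 1780, more than one full core (1024)
 * but less than two.
 */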
@@ -691,7 +828,7 @@ static inline bool scaling_rq(struct rq *rq);
* lowest value would give the most suitable CPU to schedule p onto next. The
* order works out to be the following:
*
- * Same core, idle or busy cache, idle or busy threads
+ * Same thread, idle or busy cache, idle or busy threads
* Other core, same cache, idle or busy cache, idle threads.
* Same node, other CPU, idle cache, idle threads.
* Same node, other CPU, busy cache, idle threads.
@@ -729,13 +866,13 @@ static int best_mask_cpu(int best_cpu, struct rq *rq, cpumask_t *tmpmask)
#ifdef CONFIG_SCHED_MC
else if (locality == 2)
ranking |= CPUIDLE_DIFF_CORE;
- if (!(tmp_rq->cache_idle(cpu_tmp)))
+ else if (!(tmp_rq->cache_idle(tmp_rq)))
ranking |= CPUIDLE_CACHE_BUSY;
#endif
#ifdef CONFIG_SCHED_SMT
if (locality == 1)
ranking |= CPUIDLE_DIFF_THREAD;
- if (!(tmp_rq->siblings_idle(cpu_tmp)))
+ if (!(tmp_rq->siblings_idle(tmp_rq)))
ranking |= CPUIDLE_THREAD_BUSY;
#endif
if (scaling_rq(tmp_rq))
@@ -763,90 +900,18 @@ bool cpus_share_cache(int this_cpu, int that_cpu)
return (this_rq->cpu_locality[that_cpu] < 3);
}
-#ifdef CONFIG_SMT_NICE
-static const cpumask_t *thread_cpumask(int cpu);
-
-/* Find the best real time priority running on any SMT siblings of cpu and if
- * none are running, the static priority of the best deadline task running.
- * The lookups to the other runqueues is done lockless as the occasional wrong
- * value would be harmless. */
-static int best_smt_bias(int cpu)
-{
- int other_cpu, best_bias = 0;
-
- for_each_cpu(other_cpu, thread_cpumask(cpu)) {
- struct rq *rq;
-
- if (other_cpu == cpu)
- continue;
- rq = cpu_rq(other_cpu);
- if (rq_idle(rq))
- continue;
- if (!rq->online)
- continue;
- if (!rq->rq_mm)
- continue;
- if (likely(rq->rq_smt_bias > best_bias))
- best_bias = rq->rq_smt_bias;
- }
- return best_bias;
-}
-
-static int task_prio_bias(struct task_struct *p)
-{
- if (rt_task(p))
- return 1 << 30;
- else if (task_running_iso(p))
- return 1 << 29;
- else if (task_running_idle(p))
- return 0;
- return MAX_PRIO - p->static_prio;
-}
-
-/* We've already decided p can run on CPU, now test if it shouldn't for SMT
- * nice reasons. */
-static bool smt_should_schedule(struct task_struct *p, int cpu)
-{
- int best_bias, task_bias;
-
- /* Kernel threads always run */
- if (unlikely(!p->mm))
- return true;
- if (rt_task(p))
- return true;
- if (!idleprio_suitable(p))
- return true;
- best_bias = best_smt_bias(cpu);
- /* The smt siblings are all idle or running IDLEPRIO */
- if (best_bias < 1)
- return true;
- task_bias = task_prio_bias(p);
- if (task_bias < 1)
- return false;
- if (task_bias >= best_bias)
- return true;
- /* Dither 25% cpu of normal tasks regardless of nice difference */
- if (best_bias % 4 == 1)
- return true;
- /* Sorry, you lose */
- return false;
-}
-#else
-#define smt_should_schedule(p, cpu) (1)
-#endif
-
static bool resched_best_idle(struct task_struct *p)
{
cpumask_t tmpmask;
+ struct rq *rq;
int best_cpu;
cpumask_and(&tmpmask, &p->cpus_allowed, &grq.cpu_idle_map);
best_cpu = best_mask_cpu(task_cpu(p), task_rq(p), &tmpmask);
-#ifdef CONFIG_SMT_NICE
- if (!smt_should_schedule(p, best_cpu))
+ rq = cpu_rq(best_cpu);
+ if (!smt_schedule(p, rq))
return false;
-#endif
- resched_curr(cpu_rq(best_cpu));
+ resched_curr(rq);
return true;
}
@@ -953,6 +1018,26 @@ static int effective_prio(struct task_struct *p)
}
/*
+ * Update the load average for feeding into cpu frequency governors. Use a rolling
+ * average with a time constant of ~32ms
+ */
+static void update_load_avg(struct rq *rq)
+{
+ /* rq clock can go backwards so skip update if that happens */
+ if (likely(rq->clock > rq->load_update)) {
+ unsigned long us_interval = (rq->clock - rq->load_update) >> 10;
+ long load;
+
+ load = rq->load_avg - (rq->load_avg * us_interval * 80 / 32768 / 128);
+ if (unlikely(load < 0))
+ load = 0;
+ load += rq->soft_affined * rq_load_avg(rq) * us_interval * 80 / 32768 / 128;
+ rq->load_avg = load;
+ }
+ rq->load_update = rq->clock;
+}
+
+/*
* activate_task - move a task to the runqueue. Enter with grq locked.
*/
static void activate_task(struct task_struct *p, struct rq *rq)
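The fixed-point arithmetic in update_load_avg() above has the shape of a first-order low-pass filter; roughly, keeping the hunk's own constants:

/*
 *   f        = us_interval * 80 / (32768 * 128)
 *   load_avg = load_avg * (1 - f) + soft_affined * rq_load_avg(rq) * f
 *
 * Old load decays away over a few tens of milliseconds while current load is
 * blended in, so the cpufreq triggers see a smoothed per-rq figure rather
 * than the raw soft_affined count used previously.
 */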
@@ -978,7 +1063,8 @@ static void activate_task(struct task_struct *p, struct rq *rq)
p->on_rq = 1;
grq.nr_running++;
inc_qnr();
- cpufreq_trigger(grq.niffies, rq->soft_affined);
+ update_load_avg(rq);
+ cpufreq_trigger(grq.niffies, rq->load_avg);
}
static inline void clear_sticky(struct task_struct *p);
@@ -995,19 +1081,22 @@ static inline void deactivate_task(struct task_struct *p, struct rq *rq)
p->on_rq = 0;
grq.nr_running--;
clear_sticky(p);
- cpufreq_trigger(grq.niffies, rq->soft_affined);
+ update_load_avg(rq);
+ cpufreq_trigger(grq.niffies, rq->load_avg);
}
#ifdef CONFIG_SMP
void set_task_cpu(struct task_struct *p, unsigned int cpu)
{
+ unsigned int tcpu;
+
#ifdef CONFIG_LOCKDEP
/*
* The caller should hold grq lock.
*/
WARN_ON_ONCE(debug_locks && !lockdep_is_held(&grq.lock));
#endif
- if (task_cpu(p) == cpu)
+ if ((tcpu = task_cpu(p)) == cpu)
return;
trace_sched_migrate_task(p, cpu);
perf_event_task_migrate(p);
@@ -1019,8 +1108,21 @@ void set_task_cpu(struct task_struct *p, unsigned int cpu)
*/
smp_wmb();
if (p->on_rq) {
- task_rq(p)->soft_affined--;
- cpu_rq(cpu)->soft_affined++;
+ struct rq *rq;
+
+ /*
+ * set_task_cpu can be called from other CPUs, so call cpufreq_trigger
+ * explicitly telling it what CPU is being updated as the value
+ * of soft_affined has changed.
+ */
+ rq = task_rq(p);
+ rq->soft_affined--;
+ update_load_avg(rq);
+ other_cpufreq_trigger(tcpu, grq.niffies, rq->load_avg);
+ rq = cpu_rq(cpu);
+ rq->soft_affined++;
+ update_load_avg(rq);
+ other_cpufreq_trigger(cpu, grq.niffies, rq->load_avg);
}
task_thread_info(p)->cpu = cpu;
}
@@ -1353,13 +1455,10 @@ static inline bool needs_other_cpu(struct task_struct *p, int cpu)
return false;
}
-/*
- * When all else is equal, still prefer this_rq.
- */
static void try_preempt(struct task_struct *p, struct rq *this_rq)
{
+ int cpu, pcpu, highest_prio, highest_cpu;
struct rq *highest_prio_rq = NULL;
- int cpu, highest_prio;
u64 latest_deadline;
cpumask_t tmp;
@@ -1383,13 +1482,13 @@ static void try_preempt(struct task_struct *p, struct rq *this_rq)
return;
/* See if this task can preempt the task on the current CPU first. */
- cpu = cpu_of(this_rq);
- if (cpumask_test_cpu(cpu, &tmp)) {
- if (smt_should_schedule(p, cpu) && can_preempt(p, this_rq->rq_prio, this_rq->rq_deadline)) {
+ pcpu = cpu_of(this_rq);
+ if (likely(cpumask_test_cpu(pcpu, &tmp))) {
+ if (smt_schedule(p, this_rq) && can_preempt(p, this_rq->rq_prio, this_rq->rq_deadline)) {
resched_curr(this_rq);
return;
}
- cpumask_clear_cpu(cpu, &tmp);
+ cpumask_clear_cpu(pcpu, &tmp);
}
highest_prio = latest_deadline = 0;
@@ -1398,37 +1497,40 @@ static void try_preempt(struct task_struct *p, struct rq *this_rq)
for_each_cpu(cpu, &tmp) {
struct rq *rq;
int rq_prio;
+ u64 dl;
rq = cpu_rq(cpu);
rq_prio = rq->rq_prio;
if (rq_prio < highest_prio)
continue;
+ dl = rq->rq_deadline;
+ if (!sched_interactive && pcpu != cpu)
+ dl <<= locality_diff(pcpu, rq);
if (rq_prio > highest_prio ||
- deadline_after(rq->rq_deadline, latest_deadline)) {
- latest_deadline = rq->rq_deadline;
+ deadline_after(dl, latest_deadline)) {
+ latest_deadline = dl;
highest_prio = rq_prio;
+ highest_cpu = cpu;
highest_prio_rq = rq;
}
}
- if (likely(highest_prio_rq)) {
-#ifdef CONFIG_SMT_NICE
- cpu = cpu_of(highest_prio_rq);
- if (!smt_should_schedule(p, cpu))
- return;
-#endif
- if (can_preempt(p, highest_prio, latest_deadline)) {
- /*
- * If we have decided this task should preempt this CPU,
- * set the task's CPU to match so there is no discrepancy
- * in earliest_deadline_task which biases away tasks with
- * a different CPU set. This means waking tasks are
- * treated differently to rescheduling tasks.
- */
- set_task_cpu(p, cpu);
- resched_curr(highest_prio_rq);
- }
+ if (unlikely(!highest_prio_rq))
+ return;
+ if (!smt_schedule(p, highest_prio_rq))
+ return;
+ if (can_preempt(p, highest_prio, latest_deadline)) {
+ /*
+ * If we have decided this task should preempt this CPU,
+ * set the task's CPU to match so there is no discrepancy
+ * in earliest_deadline_task which biases away tasks with
+ * a different CPU set. This means waking tasks are
+ * treated differently to rescheduling tasks in
+ * interactive mode.
+ */
+ set_task_cpu(p, highest_cpu);
+ resched_curr(highest_prio_rq);
}
}
static int __set_cpus_allowed_ptr(struct task_struct *p,
@@ -1723,7 +1825,7 @@ int sched_fork(unsigned long __maybe_unused clone_flags, struct task_struct *p)
p->sched_reset_on_fork = 0;
}
- INIT_LIST_HEAD(&p->run_list);
+ p->node = NULL;
#ifdef CONFIG_SCHED_INFO
if (unlikely(sched_info_on()))
memset(&p->sched_info, 0, sizeof(p->sched_info));
@@ -3072,6 +3174,8 @@ void scheduler_tick(void)
/* grq lock not grabbed, so only update rq clock */
update_rq_clock(rq);
update_cpu_clock_tick(rq, rq->curr);
+ update_load_avg(rq);
+ cpufreq_trigger(grq.niffies, rq->load_avg);
if (!rq_idle(rq))
task_running_tick(rq);
else
@@ -3280,101 +3384,56 @@ found_middle:
}
/*
- * O(n) lookup of all tasks in the global runqueue. The real brainfuck
- * of lock contention and O(n). It's not really O(n) as only the queued,
- * but not running tasks are scanned, and is O(n) queued in the worst case
- * scenario only because the right task can be found before scanning all of
- * them.
- * Tasks are selected in this order:
- * Real time tasks are selected purely by their static priority and in the
- * order they were queued, so the lowest value idx, and the first queued task
- * of that priority value is chosen.
- * If no real time tasks are found, the SCHED_ISO priority is checked, and
- * all SCHED_ISO tasks have the same priority value, so they're selected by
- * the earliest deadline value.
- * If no SCHED_ISO tasks are found, SCHED_NORMAL tasks are selected by the
- * earliest deadline.
- * Finally if no SCHED_NORMAL tasks are found, SCHED_IDLEPRIO tasks are
- * selected by the earliest deadline.
+ * Task selection with skiplists is a simple matter of picking off the first
+ * task in the sorted list, an O(1) operation. The only time it takes longer
+ * is when tasks do not have suitable affinity, in which case we iterate over
+ * entries until we find the first that does. The worst case, with no task of
+ * suitable affinity, is O(n).
*/
static inline struct
task_struct *earliest_deadline_task(struct rq *rq, int cpu, struct task_struct *idle)
{
- struct task_struct *edt = NULL;
- unsigned long idx = -1;
+ struct task_struct *edt = idle;
+ skiplist_node *node = grq.node;
+ u64 earliest_deadline = ~0ULL;
- do {
- struct list_head *queue;
- struct task_struct *p;
- u64 earliest_deadline;
-
- idx = next_sched_bit(grq.prio_bitmap, ++idx);
- if (idx >= PRIO_LIMIT)
- return idle;
- queue = grq.queue + idx;
-
- if (idx < MAX_RT_PRIO) {
- /* We found an rt task */
- list_for_each_entry(p, queue, run_list) {
- /* Make sure cpu affinity is ok */
- if (needs_other_cpu(p, cpu))
- continue;
- edt = p;
- goto out_take;
- }
- /*
- * None of the RT tasks at this priority can run on
- * this cpu
- */
+ while ((node = node->next[0]) != grq.node) {
+ struct task_struct *p = node->value;
+ int tcpu;
+
+ /* Make sure affinity is ok */
+ if (needs_other_cpu(p, cpu))
continue;
- }
- /*
- * No rt tasks. Find the earliest deadline task. Now we're in
- * O(n) territory.
- */
- earliest_deadline = ~0ULL;
- list_for_each_entry(p, queue, run_list) {
+ if (!smt_schedule(p, rq))
+ continue;
+
+ if (!sched_interactive && (tcpu = task_cpu(p)) != cpu) {
u64 dl;
- /* Make sure cpu affinity is ok */
- if (needs_other_cpu(p, cpu))
+ if (task_sticky(p) && scaling_rq(rq))
continue;
-
-#ifdef CONFIG_SMT_NICE
- if (!smt_should_schedule(p, cpu))
+ dl = p->deadline << locality_diff(tcpu, rq);
+ if (unlikely(!deadline_before(dl, earliest_deadline)))
continue;
-#endif
- /*
- * Soft affinity happens here by not scheduling a task
- * with its sticky flag set that ran on a different CPU
- * last when the CPU is scaling, or by greatly biasing
- * against its deadline when not, based on cpu cache
- * locality.
- */
- if (sched_interactive)
- dl = p->deadline;
- else {
- int tcpu = task_cpu(p);
-
- if (tcpu != cpu && task_sticky(p) && scaling_rq(rq))
- continue;
- dl = p->deadline << locality_diff(tcpu, rq);
- }
-
- if (deadline_before(dl, earliest_deadline)) {
- earliest_deadline = dl;
- edt = p;
- }
+ earliest_deadline = dl;
+ edt = p;
+ /* We continue even though we've found the earliest
+ * deadline task as the locality offset means there
+ * may be a better candidate after it. */
+ continue;
}
- } while (!edt);
-
-out_take:
- take_task(cpu, edt);
+ /* This wouldn't happen if we encountered a better deadline from
+ * another CPU and have already set edt. */
+ if (likely(p->deadline < earliest_deadline))
+ edt = p;
+ break;
+ }
+ if (likely(edt != idle))
+ take_task(cpu, edt);
return edt;
}
-
/*
* Print scheduling while atomic bug:
*/
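A worked example of the locality bias in the selection loop above, with made-up deadlines and assuming locality_diff() of 1 for an SMT sibling and 3 for a CPU on another node (sched_interactive disabled; in interactive mode the first task with suitable affinity is taken as-is):

/*
 *   task A, deadline 1000, last ran on this CPU       -> compares as 1000
 *   task B, deadline  600, last ran on an SMT sibling -> 600 << 1 = 1200
 *   task C, deadline  300, last ran on another node   -> 300 << 3 = 2400
 *
 * The skiplist hands tasks over in raw deadline order (C, B, A), yet A wins
 * the comparison, so work tends to stay near its cached state unless its
 * deadline has fallen far enough behind.
 */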
@@ -3454,44 +3513,47 @@ static void reset_rq_task(struct rq *rq, struct task_struct *p)
}
#ifdef CONFIG_SMT_NICE
+static void check_no_siblings(struct rq __maybe_unused *this_rq) {}
+static void wake_no_siblings(struct rq __maybe_unused *this_rq) {}
+static void (*check_siblings)(struct rq *this_rq) = &check_no_siblings;
+static void (*wake_siblings)(struct rq *this_rq) = &wake_no_siblings;
+
/* Iterate over smt siblings when we've scheduled a process on cpu and decide
* whether they should continue running or be descheduled. */
-static void check_smt_siblings(int cpu)
+static void check_smt_siblings(struct rq *this_rq)
{
int other_cpu;
- for_each_cpu(other_cpu, thread_cpumask(cpu)) {
+ for_each_cpu(other_cpu, &this_rq->thread_mask) {
struct task_struct *p;
struct rq *rq;
- if (other_cpu == cpu)
- continue;
rq = cpu_rq(other_cpu);
if (rq_idle(rq))
continue;
if (!rq->online)
continue;
p = rq->curr;
- if (!smt_should_schedule(p, cpu)) {
+ if (!smt_should_schedule(p, this_rq)) {
set_tsk_need_resched(p);
smp_send_reschedule(other_cpu);
}
}
}
-static void wake_smt_siblings(int cpu)
+static void wake_smt_siblings(struct rq *this_rq)
{
int other_cpu;
if (!queued_notrunning())
return;
- for_each_cpu(other_cpu, thread_cpumask(cpu)) {
+ for_each_cpu(other_cpu, &this_rq->thread_mask) {
struct rq *rq;
- if (other_cpu == cpu)
- continue;
rq = cpu_rq(other_cpu);
+ if (!rq->online)
+ continue;
if (rq_idle(rq)) {
struct task_struct *p = rq->curr;
@@ -3501,8 +3563,8 @@ static void wake_smt_siblings(int cpu)
}
}
#else
-static void check_smt_siblings(int __maybe_unused cpu) {}
-static void wake_smt_siblings(int __maybe_unused cpu) {}
+static void check_siblings(struct rq __maybe_unused *this_rq) {}
+static void wake_siblings(struct rq __maybe_unused *this_rq) {}
#endif
/*
@@ -3639,7 +3701,7 @@ static void __sched notrace __schedule(bool preempt)
* again.
*/
set_rq_task(rq, prev);
- check_smt_siblings(cpu);
+ check_siblings(rq);
grq_unlock_irq();
goto rerun_prev_unlocked;
} else
@@ -3679,9 +3741,9 @@ static void __sched notrace __schedule(bool preempt)
unstick_task(rq, prev);
set_rq_task(rq, next);
if (next != idle)
- check_smt_siblings(cpu);
+ check_siblings(rq);
else
- wake_smt_siblings(cpu);
+ wake_siblings(rq);
grq.nr_switches++;
prev->on_cpu = false;
next->on_cpu = true;
@@ -3693,7 +3755,7 @@ static void __sched notrace __schedule(bool preempt)
cpu = cpu_of(rq);
idle = rq->idle;
} else {
- check_smt_siblings(cpu);
+ check_siblings(rq);
grq_unlock_irq();
}
@@ -7107,9 +7169,9 @@ int sched_cpu_dying(unsigned int cpu)
* Cheaper version of the below functions in case support for SMT and MC is
* compiled in but CPUs have no siblings.
*/
-static bool sole_cpu_idle(int cpu)
+static bool sole_cpu_idle(struct rq *rq)
{
- return rq_idle(cpu_rq(cpu));
+ return rq_idle(rq);
}
#endif
#ifdef CONFIG_SCHED_SMT
@@ -7118,9 +7180,9 @@ static const cpumask_t *thread_cpumask(int cpu)
return topology_sibling_cpumask(cpu);
}
/* All this CPU's SMT siblings are idle */
-static bool siblings_cpu_idle(int cpu)
+static bool siblings_cpu_idle(struct rq *rq)
{
- return cpumask_subset(thread_cpumask(cpu), &grq.cpu_idle_map);
+ return cpumask_subset(&rq->thread_mask, &grq.cpu_idle_map);
}
#endif
#ifdef CONFIG_SCHED_MC
@@ -7129,9 +7191,9 @@ static const cpumask_t *core_cpumask(int cpu)
return topology_core_cpumask(cpu);
}
/* All this CPU's shared cache siblings are idle */
-static bool cache_cpu_idle(int cpu)
+static bool cache_cpu_idle(struct rq *rq)
{
- return cpumask_subset(core_cpumask(cpu), &grq.cpu_idle_map);
+ return cpumask_subset(&rq->core_mask, &grq.cpu_idle_map);
}
#endif
@@ -7150,6 +7212,9 @@ void __init sched_init_smp(void)
{
struct sched_domain *sd;
int cpu, other_cpu;
+#ifdef CONFIG_SCHED_SMT
+ bool smt_threads = false;
+#endif
cpumask_var_t non_isolated_cpus;
@@ -7209,16 +7274,31 @@ void __init sched_init_smp(void)
if (rq->cpu_locality[other_cpu] > 2)
rq->cpu_locality[other_cpu] = 2;
}
- if (cpumask_weight(core_cpumask(cpu)) > 1)
+ if (cpumask_weight(core_cpumask(cpu)) > 1) {
+ cpumask_copy(&rq->core_mask, core_cpumask(cpu));
+ cpumask_clear_cpu(cpu, &rq->core_mask);
rq->cache_idle = cache_cpu_idle;
+ }
#endif
#ifdef CONFIG_SCHED_SMT
for_each_cpu(other_cpu, thread_cpumask(cpu))
rq->cpu_locality[other_cpu] = 1;
- if (cpumask_weight(thread_cpumask(cpu)) > 1)
+ if (cpumask_weight(thread_cpumask(cpu)) > 1) {
+ cpumask_copy(&rq->thread_mask, thread_cpumask(cpu));
+ cpumask_clear_cpu(cpu, &rq->thread_mask);
rq->siblings_idle = siblings_cpu_idle;
+ smt_threads = true;
+ }
#endif
}
+#ifdef CONFIG_SMT_NICE
+ if (smt_threads) {
+ check_siblings = &check_smt_siblings;
+ wake_siblings = &wake_smt_siblings;
+ smt_schedule = &smt_should_schedule;
+ rq_load_avg = &smt_load_avg;
+ }
+#endif
grq_unlock_irq();
mutex_unlock(&sched_domains_mutex);
@@ -7245,6 +7325,32 @@ int in_sched_functions(unsigned long addr)
&& addr < (unsigned long)__sched_text_end);
}
+#ifdef CONFIG_CGROUP_SCHED
+/* task group related information */
+struct task_group {
+ struct cgroup_subsys_state css;
+
+ struct rcu_head rcu;
+ struct list_head list;
+
+ struct task_group *parent;
+ struct list_head siblings;
+ struct list_head children;
+};
+
+/*
+ * Default task group.
+ * Every task in system belongs to this group at bootup.
+ */
+struct task_group root_task_group;
+LIST_HEAD(task_groups);
+
+/* Cacheline aligned slab cache for task_group */
+static struct kmem_cache *task_group_cache __read_mostly;
+/* task_group_lock serializes the addition/removal of task groups */
+static DEFINE_SPINLOCK(task_group_lock);
+#endif /* CONFIG_CGROUP_SCHED */
+
void __init sched_init(void)
{
#ifdef CONFIG_SMP
@@ -7265,6 +7371,9 @@ void __init sched_init(void)
grq.iso_ticks = 0;
grq.iso_refractory = false;
grq.noc = 1;
+ grq.node = skiplist_init();
+ grq.sl = new_skiplist(grq.node);
+
#ifdef CONFIG_SMP
init_defrootdomain();
grq.qnr = grq.idle_cpus = 0;
@@ -7272,6 +7381,14 @@ void __init sched_init(void)
#else
uprq = &per_cpu(runqueues, 0);
#endif
+
+#ifdef CONFIG_CGROUP_SCHED
+ task_group_cache = KMEM_CACHE(task_group, 0);
+
+ list_add(&root_task_group.list, &task_groups);
+ INIT_LIST_HEAD(&root_task_group.children);
+ INIT_LIST_HEAD(&root_task_group.siblings);
+#endif /* CONFIG_CGROUP_SCHED */
for_each_possible_cpu(i) {
rq = cpu_rq(i);
rq->grq_lock = &grq.lock;
@@ -7316,11 +7433,6 @@ void __init sched_init(void)
}
#endif
- for (i = 0; i < PRIO_LIMIT; i++)
- INIT_LIST_HEAD(grq.queue + i);
- /* delimiter for bitsearch */
- __set_bit(PRIO_LIMIT, grq.prio_bitmap);
-
#ifdef CONFIG_PREEMPT_NOTIFIERS
INIT_HLIST_HEAD(&init_task.preempt_notifiers);
#endif
@@ -7702,3 +7814,127 @@ unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
return smt_gain;
}
#endif
+
+#ifdef CONFIG_CGROUP_SCHED
+static void sched_free_group(struct task_group *tg)
+{
+ kmem_cache_free(task_group_cache, tg);
+}
+
+/* allocate runqueue etc for a new task group */
+struct task_group *sched_create_group(struct task_group *parent)
+{
+ struct task_group *tg;
+
+ tg = kmem_cache_alloc(task_group_cache, GFP_KERNEL | __GFP_ZERO);
+ if (!tg)
+ return ERR_PTR(-ENOMEM);
+
+ return tg;
+}
+
+void sched_online_group(struct task_group *tg, struct task_group *parent)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&task_group_lock, flags);
+ list_add_rcu(&tg->list, &task_groups);
+
+ WARN_ON(!parent); /* root should already exist */
+
+ tg->parent = parent;
+ INIT_LIST_HEAD(&tg->children);
+ list_add_rcu(&tg->siblings, &parent->children);
+ spin_unlock_irqrestore(&task_group_lock, flags);
+}
+
+/* rcu callback to free various structures associated with a task group */
+static void sched_free_group_rcu(struct rcu_head *rhp)
+{
+ /* now it should be safe to free those cfs_rqs */
+ sched_free_group(container_of(rhp, struct task_group, rcu));
+}
+
+void sched_destroy_group(struct task_group *tg)
+{
+ /* wait for possible concurrent references to cfs_rqs complete */
+ call_rcu(&tg->rcu, sched_free_group_rcu);
+}
+
+void sched_offline_group(struct task_group *tg)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&task_group_lock, flags);
+ list_del_rcu(&tg->list);
+ list_del_rcu(&tg->siblings);
+ spin_unlock_irqrestore(&task_group_lock, flags);
+}
+
+static inline struct task_group *css_tg(struct cgroup_subsys_state *css)
+{
+ return css ? container_of(css, struct task_group, css) : NULL;
+}
+
+static struct cgroup_subsys_state *
+cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
+{
+ struct task_group *parent = css_tg(parent_css);
+ struct task_group *tg;
+
+ if (!parent) {
+ /* This is early initialization for the top cgroup */
+ return &root_task_group.css;
+ }
+
+ tg = sched_create_group(parent);
+ if (IS_ERR(tg))
+ return ERR_PTR(-ENOMEM);
+ return &tg->css;
+}
+
+static void cpu_cgroup_css_released(struct cgroup_subsys_state *css)
+{
+ struct task_group *tg = css_tg(css);
+
+ sched_offline_group(tg);
+}
+
+static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
+{
+ struct task_group *tg = css_tg(css);
+
+ /*
+ * Relies on the RCU grace period between css_released() and this.
+ */
+ sched_free_group(tg);
+}
+
+static void cpu_cgroup_fork(struct task_struct *task)
+{
+}
+
+static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)
+{
+ return 0;
+}
+
+static void cpu_cgroup_attach(struct cgroup_taskset *tset)
+{
+}
+
+static struct cftype cpu_files[] = {
+ { } /* terminate */
+};
+
+struct cgroup_subsys cpu_cgrp_subsys = {
+ .css_alloc = cpu_cgroup_css_alloc,
+ .css_released = cpu_cgroup_css_released,
+ .css_free = cpu_cgroup_css_free,
+ .fork = cpu_cgroup_fork,
+ .can_attach = cpu_cgroup_can_attach,
+ .attach = cpu_cgroup_attach,
+ .legacy_cftypes = cpu_files,
+ .early_init = true,
+};
+#endif /* CONFIG_CGROUP_SCHED */