Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile           |   2
-rw-r--r--  kernel/events/core.c      |  60
-rw-r--r--  kernel/events/uprobes.c   |   5
-rw-r--r--  kernel/irq/msi.c          |  19
-rw-r--r--  kernel/sched/bfs.c        | 704
-rw-r--r--  kernel/sched/bfs_sched.h  |  25
-rw-r--r--  kernel/sched/cputime.c    |  15
-rw-r--r--  kernel/skip_lists.c       | 177
8 files changed, 744 insertions, 263 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index e2ec54e2b..dd84575cb 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -9,7 +9,7 @@ obj-y = fork.o exec_domain.o panic.o \
extable.o params.o \
kthread.o sys_ni.o nsproxy.o \
notifier.o ksysfs.o cred.o reboot.o \
- async.o range.o smpboot.o
+ async.o range.o smpboot.o skip_lists.o
obj-$(CONFIG_MULTIUSER) += groups.o
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 43d43a2d5..e68c0a735 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -242,18 +242,6 @@ unlock:
return ret;
}
-static void event_function_local(struct perf_event *event, event_f func, void *data)
-{
- struct event_function_struct efs = {
- .event = event,
- .func = func,
- .data = data,
- };
-
- int ret = event_function(&efs);
- WARN_ON_ONCE(ret);
-}
-
static void event_function_call(struct perf_event *event, event_f func, void *data)
{
struct perf_event_context *ctx = event->ctx;
@@ -303,6 +291,54 @@ again:
raw_spin_unlock_irq(&ctx->lock);
}
+/*
+ * Similar to event_function_call() + event_function(), but hard assumes IRQs
+ * are already disabled and we're on the right CPU.
+ */
+static void event_function_local(struct perf_event *event, event_f func, void *data)
+{
+ struct perf_event_context *ctx = event->ctx;
+ struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
+ struct task_struct *task = READ_ONCE(ctx->task);
+ struct perf_event_context *task_ctx = NULL;
+
+ WARN_ON_ONCE(!irqs_disabled());
+
+ if (task) {
+ if (task == TASK_TOMBSTONE)
+ return;
+
+ task_ctx = ctx;
+ }
+
+ perf_ctx_lock(cpuctx, task_ctx);
+
+ task = ctx->task;
+ if (task == TASK_TOMBSTONE)
+ goto unlock;
+
+ if (task) {
+ /*
+ * We must be either inactive or active and the right task,
+ * otherwise we're screwed, since we cannot IPI to somewhere
+ * else.
+ */
+ if (ctx->is_active) {
+ if (WARN_ON_ONCE(task != current))
+ goto unlock;
+
+ if (WARN_ON_ONCE(cpuctx->task_ctx != ctx))
+ goto unlock;
+ }
+ } else {
+ WARN_ON_ONCE(&cpuctx->ctx != ctx);
+ }
+
+ func(event, cpuctx, ctx, data);
+unlock:
+ perf_ctx_unlock(cpuctx, task_ctx);
+}
+
#define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\
PERF_FLAG_FD_OUTPUT |\
PERF_FLAG_PID_CGROUP |\
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index b7a525ab2..8c50276b6 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -172,8 +172,10 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
err = -EAGAIN;
ptep = page_check_address(page, mm, addr, &ptl, 0);
- if (!ptep)
+ if (!ptep) {
+ mem_cgroup_cancel_charge(kpage, memcg, false);
goto unlock;
+ }
get_page(kpage);
page_add_new_anon_rmap(kpage, vma, addr, false);
@@ -200,7 +202,6 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
err = 0;
unlock:
- mem_cgroup_cancel_charge(kpage, memcg, false);
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
unlock_page(page);
return err;
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
index 38e89ce7b..0afe671f1 100644
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -324,7 +324,7 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
struct msi_domain_ops *ops = info->ops;
msi_alloc_info_t arg;
struct msi_desc *desc;
- int i, ret, virq = -1;
+ int i, ret, virq;
ret = msi_domain_prepare_irqs(domain, dev, nvec, &arg);
if (ret)
@@ -332,12 +332,8 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
for_each_msi_entry(desc, dev) {
ops->set_desc(&arg, desc);
- if (info->flags & MSI_FLAG_IDENTITY_MAP)
- virq = (int)ops->get_hwirq(info, &arg);
- else
- virq = -1;
- virq = __irq_domain_alloc_irqs(domain, virq, desc->nvec_used,
+ virq = __irq_domain_alloc_irqs(domain, -1, desc->nvec_used,
dev_to_node(dev), &arg, false);
if (virq < 0) {
ret = -ENOSPC;
@@ -361,6 +357,17 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
else
dev_dbg(dev, "irq [%d-%d] for MSI\n",
virq, virq + desc->nvec_used - 1);
+ /*
+ * This flag is set by the PCI layer as we need to activate
+ * the MSI entries before the PCI layer enables MSI in the
+ * card. Otherwise the card latches a random msi message.
+ */
+ if (info->flags & MSI_FLAG_ACTIVATE_EARLY) {
+ struct irq_data *irq_data;
+
+ irq_data = irq_domain_get_irq_data(domain, desc->irq);
+ irq_domain_activate_irq(irq_data);
+ }
}
return 0;
diff --git a/kernel/sched/bfs.c b/kernel/sched/bfs.c
index 6fd00c5ae..4168a5527 100644
--- a/kernel/sched/bfs.c
+++ b/kernel/sched/bfs.c
@@ -74,6 +74,7 @@
#include <linux/context_tracking.h>
#include <linux/sched/prio.h>
#include <linux/tick.h>
+#include <linux/skip_lists.h>
#include <asm/irq_regs.h>
#include <asm/switch_to.h>
@@ -136,7 +137,7 @@
void print_scheduler_version(void)
{
- printk(KERN_INFO "BFS CPU scheduler v0.472 by Con Kolivas.\n");
+ printk(KERN_INFO "BFS CPU scheduler v0.490 by Con Kolivas.\n");
}
/*
@@ -190,8 +191,6 @@ struct global_rq {
unsigned long nr_running;
unsigned long nr_uninterruptible;
unsigned long long nr_switches;
- struct list_head queue[PRIO_LIMIT];
- DECLARE_BITMAP(prio_bitmap, PRIO_LIMIT + 1);
unsigned long qnr; /* queued not running */
#ifdef CONFIG_SMP
cpumask_t cpu_idle_map;
@@ -204,6 +203,9 @@ struct global_rq {
raw_spinlock_t iso_lock;
int iso_ticks;
bool iso_refractory;
+
+ skiplist_node *node;
+ skiplist *sl;
};
#ifdef CONFIG_SMP
@@ -538,24 +540,25 @@ static inline bool deadline_after(u64 deadline, u64 time)
}
/*
- * A task that is queued but not running will be on the grq run list.
- * A task that is not running or queued will not be on the grq run list.
- * A task that is currently running will have ->on_cpu set but not on the
- * grq run list.
+ * A task that is not running or queued will not have a node set.
+ * A task that is queued but not running will have a node set.
+ * A task that is currently running will have ->on_cpu set but no node set.
*/
static inline bool task_queued(struct task_struct *p)
{
- return (!list_empty(&p->run_list));
+ return p->node;
}
/*
- * Removing from the global runqueue. Enter with grq locked.
+ * Removing from the global runqueue. Enter with grq locked. Deleting a task
+ * from the skip list is done via the stored node reference in the task struct
+ * and does not require a full look up. Thus it occurs in O(k) time where k
+ * is the "level" of the list the task was stored at - usually < 4, max 16.
*/
static void dequeue_task(struct task_struct *p)
{
- list_del_init(&p->run_list);
- if (list_empty(grq.queue + p->prio))
- __clear_bit(p->prio, grq.prio_bitmap);
+ skiplist_delnode(grq.node, grq.sl, p->node);
+ p->node = NULL;
sched_info_dequeued(task_rq(p), p);
}
@@ -583,6 +586,9 @@ static bool isoprio_suitable(void)
*/
static void enqueue_task(struct task_struct *p, struct rq *rq)
{
+ unsigned int randseed;
+ u64 sl_id;
+
if (!rt_task(p)) {
/* Check it hasn't gotten rt from PI */
if ((idleprio_task(p) && idleprio_suitable(p)) ||
@@ -591,8 +597,32 @@ static void enqueue_task(struct task_struct *p, struct rq *rq)
else
p->prio = NORMAL_PRIO;
}
- __set_bit(p->prio, grq.prio_bitmap);
- list_add_tail(&p->run_list, grq.queue + p->prio);
+ /*
+ * The sl_id key passed to the skiplist generates a sorted list.
+ * Realtime and sched iso tasks run FIFO so they only need be sorted
+ * according to priority. The skiplist will put tasks of the same
+ * key inserted later in FIFO order. Tasks of sched normal, batch
+ * and idleprio are sorted according to their deadlines. Idleprio
+ * tasks are offset by an impossibly large deadline value ensuring
+ * they get sorted into last positions, but still according to their
+ * own deadlines. This creates a "landscape" of skiplists running
+ * from priority 0 realtime in first place to the lowest priority
+ * idleprio tasks last. Skiplist insertion is an O(log n) process.
+ */
+ if (p->prio <= ISO_PRIO)
+ sl_id = p->prio;
+ else {
+ sl_id = p->deadline;
+ /* Set it to cope with 4 left shifts with locality_diff */
+ if (p->prio == IDLE_PRIO)
+ sl_id |= 0x0F00000000000000;
+ }
+ /*
+ * Some architectures don't have better than microsecond resolution
+ * so mask out ~microseconds as the random seed for skiplist insertion.
+ */
+ randseed = (grq.niffies >> 10) & 0xFFFFFFFF;
+ p->node = skiplist_insert(grq.node, grq.sl, sl_id, p, randseed);
sched_info_queued(rq, p);
}
@@ -647,6 +677,113 @@ static inline int queued_notrunning(void)
return grq.qnr;
}
+#ifdef CONFIG_SMT_NICE
+static const cpumask_t *thread_cpumask(int cpu);
+
+/* Find the best real time priority running on any SMT siblings of cpu and if
+ * none are running, the static priority of the best deadline task running.
+ * The lookups to the other runqueues is done lockless as the occasional wrong
+ * value would be harmless. */
+static int best_smt_bias(struct rq *this_rq)
+{
+ int other_cpu, best_bias = 0;
+
+ for_each_cpu(other_cpu, &this_rq->thread_mask) {
+ struct rq *rq = cpu_rq(other_cpu);
+
+ if (rq_idle(rq))
+ continue;
+ if (!rq->online)
+ continue;
+ if (!rq->rq_mm)
+ continue;
+ if (likely(rq->rq_smt_bias > best_bias))
+ best_bias = rq->rq_smt_bias;
+ }
+ return best_bias;
+}
+
+static int task_prio_bias(struct task_struct *p)
+{
+ if (rt_task(p))
+ return 1 << 30;
+ else if (task_running_iso(p))
+ return 1 << 29;
+ else if (task_running_idle(p))
+ return 0;
+ return MAX_PRIO - p->static_prio;
+}
+
+static bool smt_always_schedule(struct task_struct __maybe_unused *p, struct rq __maybe_unused *this_rq)
+{
+ return true;
+}
+
+static bool (*smt_schedule)(struct task_struct *p, struct rq *this_rq) = &smt_always_schedule;
+
+/* We've already decided p can run on CPU, now test if it shouldn't for SMT
+ * nice reasons. */
+static bool smt_should_schedule(struct task_struct *p, struct rq *this_rq)
+{
+ int best_bias, task_bias;
+
+ /* Kernel threads always run */
+ if (unlikely(!p->mm))
+ return true;
+ if (rt_task(p))
+ return true;
+ if (!idleprio_suitable(p))
+ return true;
+ best_bias = best_smt_bias(this_rq);
+ /* The smt siblings are all idle or running IDLEPRIO */
+ if (best_bias < 1)
+ return true;
+ task_bias = task_prio_bias(p);
+ if (task_bias < 1)
+ return false;
+ if (task_bias >= best_bias)
+ return true;
+ /* Dither 25% cpu of normal tasks regardless of nice difference */
+ if (best_bias % 4 == 1)
+ return true;
+ /* Sorry, you lose */
+ return false;
+}
+
+static unsigned long cpu_load_avg(struct rq *rq)
+{
+ return rq->soft_affined * SCHED_CAPACITY_SCALE;
+}
+
+/*
+ * This is the proportion of SCHED_CAPACITY_SCALE (1024) used when each thread
+ * of a CPU with SMT siblings is in use.
+ */
+#define SCHED_SMT_LOAD (890)
+
+/*
+ * Load of a CPU with smt siblings should be considered to be the load from all
+ * the SMT siblings, thus will be >1 if both threads are in use since they are
+ * not full cores.
+ */
+static unsigned long smt_load_avg(struct rq *rq)
+{
+ unsigned long load = rq->soft_affined * SCHED_SMT_LOAD;
+ int cpu;
+
+ for_each_cpu(cpu, thread_cpumask(rq->cpu))
+ load += cpu_rq(cpu)->soft_affined * SCHED_SMT_LOAD;
+ return load;
+}
+
+static unsigned long (*rq_load_avg)(struct rq *rq) = &cpu_load_avg;
+#else
+#define smt_schedule(p, this_rq) (true)
+static inline unsigned long rq_load_avg(struct rq *rq)
+{
+ return rq->soft_affined * SCHED_CAPACITY_SCALE;
+}
+#endif
#ifdef CONFIG_SMP
/*
* The cpu_idle_map stores a bitmap of all the CPUs currently idle to
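Beyond the SMT-nice heuristics themselves, the block above introduces a boot-time dispatch pattern: smt_schedule and rq_load_avg default to the cheap non-SMT variants, and sched_init_smp() later in this patch repoints them (together with check_siblings/wake_siblings) at the SMT-aware versions only when sibling threads are actually detected. A minimal sketch of that pattern, with made-up names and outside any kernel context:

#include <stdio.h>
#include <stdbool.h>

/* Cheap default used when no SMT siblings exist. */
static bool always_schedule(int bias) { (void)bias; return true; }

/* SMT-aware variant, only worth calling when siblings are present. */
static bool smt_nice_schedule(int bias) { return bias >= 0; }

/* Single indirect call site; no per-invocation topology check needed. */
static bool (*schedule_policy)(int bias) = always_schedule;

static void topology_init(bool have_smt_threads)
{
	if (have_smt_threads)
		schedule_policy = smt_nice_schedule;	/* one-time switch at boot */
}

int main(void)
{
	printf("before: %d\n", schedule_policy(-5));	/* 1: default always allows */
	topology_init(true);
	printf("after:  %d\n", schedule_policy(-5));	/* 0: SMT-aware policy refuses */
	return 0;
}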
@@ -691,7 +828,7 @@ static inline bool scaling_rq(struct rq *rq);
* lowest value would give the most suitable CPU to schedule p onto next. The
* order works out to be the following:
*
- * Same core, idle or busy cache, idle or busy threads
+ * Same thread, idle or busy cache, idle or busy threads
* Other core, same cache, idle or busy cache, idle threads.
* Same node, other CPU, idle cache, idle threads.
* Same node, other CPU, busy cache, idle threads.
@@ -729,13 +866,13 @@ static int best_mask_cpu(int best_cpu, struct rq *rq, cpumask_t *tmpmask)
#ifdef CONFIG_SCHED_MC
else if (locality == 2)
ranking |= CPUIDLE_DIFF_CORE;
- if (!(tmp_rq->cache_idle(cpu_tmp)))
+ else if (!(tmp_rq->cache_idle(tmp_rq)))
ranking |= CPUIDLE_CACHE_BUSY;
#endif
#ifdef CONFIG_SCHED_SMT
if (locality == 1)
ranking |= CPUIDLE_DIFF_THREAD;
- if (!(tmp_rq->siblings_idle(cpu_tmp)))
+ if (!(tmp_rq->siblings_idle(tmp_rq)))
ranking |= CPUIDLE_THREAD_BUSY;
#endif
if (scaling_rq(tmp_rq))
@@ -763,90 +900,18 @@ bool cpus_share_cache(int this_cpu, int that_cpu)
return (this_rq->cpu_locality[that_cpu] < 3);
}
-#ifdef CONFIG_SMT_NICE
-static const cpumask_t *thread_cpumask(int cpu);
-
-/* Find the best real time priority running on any SMT siblings of cpu and if
- * none are running, the static priority of the best deadline task running.
- * The lookups to the other runqueues is done lockless as the occasional wrong
- * value would be harmless. */
-static int best_smt_bias(int cpu)
-{
- int other_cpu, best_bias = 0;
-
- for_each_cpu(other_cpu, thread_cpumask(cpu)) {
- struct rq *rq;
-
- if (other_cpu == cpu)
- continue;
- rq = cpu_rq(other_cpu);
- if (rq_idle(rq))
- continue;
- if (!rq->online)
- continue;
- if (!rq->rq_mm)
- continue;
- if (likely(rq->rq_smt_bias > best_bias))
- best_bias = rq->rq_smt_bias;
- }
- return best_bias;
-}
-
-static int task_prio_bias(struct task_struct *p)
-{
- if (rt_task(p))
- return 1 << 30;
- else if (task_running_iso(p))
- return 1 << 29;
- else if (task_running_idle(p))
- return 0;
- return MAX_PRIO - p->static_prio;
-}
-
-/* We've already decided p can run on CPU, now test if it shouldn't for SMT
- * nice reasons. */
-static bool smt_should_schedule(struct task_struct *p, int cpu)
-{
- int best_bias, task_bias;
-
- /* Kernel threads always run */
- if (unlikely(!p->mm))
- return true;
- if (rt_task(p))
- return true;
- if (!idleprio_suitable(p))
- return true;
- best_bias = best_smt_bias(cpu);
- /* The smt siblings are all idle or running IDLEPRIO */
- if (best_bias < 1)
- return true;
- task_bias = task_prio_bias(p);
- if (task_bias < 1)
- return false;
- if (task_bias >= best_bias)
- return true;
- /* Dither 25% cpu of normal tasks regardless of nice difference */
- if (best_bias % 4 == 1)
- return true;
- /* Sorry, you lose */
- return false;
-}
-#else
-#define smt_should_schedule(p, cpu) (1)
-#endif
-
static bool resched_best_idle(struct task_struct *p)
{
cpumask_t tmpmask;
+ struct rq *rq;
int best_cpu;
cpumask_and(&tmpmask, &p->cpus_allowed, &grq.cpu_idle_map);
best_cpu = best_mask_cpu(task_cpu(p), task_rq(p), &tmpmask);
-#ifdef CONFIG_SMT_NICE
- if (!smt_should_schedule(p, best_cpu))
+ rq = cpu_rq(best_cpu);
+ if (!smt_schedule(p, rq))
return false;
-#endif
- resched_curr(cpu_rq(best_cpu));
+ resched_curr(rq);
return true;
}
@@ -953,6 +1018,26 @@ static int effective_prio(struct task_struct *p)
}
/*
+ * Update the load average for feeding into cpu frequency governors. Use a rolling
+ * average with ~ time constant of 32ms
+ */
+static void update_load_avg(struct rq *rq)
+{
+ /* rq clock can go backwards so skip update if that happens */
+ if (likely(rq->clock > rq->load_update)) {
+ unsigned long us_interval = (rq->clock - rq->load_update) >> 10;
+ long load;
+
+ load = rq->load_avg - (rq->load_avg * us_interval * 80 / 32768 / 128);
+ if (unlikely(load < 0))
+ load = 0;
+ load += rq->soft_affined * rq_load_avg(rq) * us_interval * 80 / 32768 / 128;
+ rq->load_avg = load;
+ }
+ rq->load_update = rq->clock;
+}
+
+/*
* activate_task - move a task to the runqueue. Enter with grq locked.
*/
static void activate_task(struct task_struct *p, struct rq *rq)
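For the rolling average in update_load_avg() above, the constant factor 80 / 32768 / 128 is 80/4194304 per microsecond of elapsed time, i.e. roughly 1.9% of the current average per millisecond. A rough standalone illustration of that decay step, with an invented starting load and interval:

#include <stdio.h>

int main(void)
{
	long load = 1024;			/* illustrative starting load average */
	unsigned long us_interval = 1000;	/* pretend 1 ms passed since the last update */

	/* Same integer arithmetic as the decay term in update_load_avg(). */
	load -= load * us_interval * 80 / 32768 / 128;
	if (load < 0)
		load = 0;
	printf("load after 1 ms of decay: %ld\n", load);	/* 1024 -> 1005 */
	return 0;
}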
@@ -978,7 +1063,8 @@ static void activate_task(struct task_struct *p, struct rq *rq)
p->on_rq = 1;
grq.nr_running++;
inc_qnr();
- cpufreq_trigger(grq.niffies, rq->soft_affined);
+ update_load_avg(rq);
+ cpufreq_trigger(grq.niffies, rq->load_avg);
}
static inline void clear_sticky(struct task_struct *p);
@@ -995,19 +1081,22 @@ static inline void deactivate_task(struct task_struct *p, struct rq *rq)
p->on_rq = 0;
grq.nr_running--;
clear_sticky(p);
- cpufreq_trigger(grq.niffies, rq->soft_affined);
+ update_load_avg(rq);
+ cpufreq_trigger(grq.niffies, rq->load_avg);
}
#ifdef CONFIG_SMP
void set_task_cpu(struct task_struct *p, unsigned int cpu)
{
+ unsigned int tcpu;
+
#ifdef CONFIG_LOCKDEP
/*
* The caller should hold grq lock.
*/
WARN_ON_ONCE(debug_locks && !lockdep_is_held(&grq.lock));
#endif
- if (task_cpu(p) == cpu)
+ if ((tcpu = task_cpu(p)) == cpu)
return;
trace_sched_migrate_task(p, cpu);
perf_event_task_migrate(p);
@@ -1019,8 +1108,21 @@ void set_task_cpu(struct task_struct *p, unsigned int cpu)
*/
smp_wmb();
if (p->on_rq) {
- task_rq(p)->soft_affined--;
- cpu_rq(cpu)->soft_affined++;
+ struct rq *rq;
+
+ /*
+ * set_task_cpu can be set on other CPUs so call cpufreq_trigger
+ * explicitly telling it what CPU is being updated as the value
+ * of soft_affined has changed.
+ */
+ rq = task_rq(p);
+ rq->soft_affined--;
+ update_load_avg(rq);
+ other_cpufreq_trigger(tcpu, grq.niffies, rq->load_avg);
+ rq = cpu_rq(cpu);
+ rq->soft_affined++;
+ update_load_avg(rq);
+ other_cpufreq_trigger(cpu, grq.niffies, rq->load_avg);
}
task_thread_info(p)->cpu = cpu;
}
@@ -1353,13 +1455,10 @@ static inline bool needs_other_cpu(struct task_struct *p, int cpu)
return false;
}
-/*
- * When all else is equal, still prefer this_rq.
- */
static void try_preempt(struct task_struct *p, struct rq *this_rq)
{
+ int cpu, pcpu, highest_prio, highest_cpu;
struct rq *highest_prio_rq = NULL;
- int cpu, highest_prio;
u64 latest_deadline;
cpumask_t tmp;
@@ -1383,13 +1482,13 @@ static void try_preempt(struct task_struct *p, struct rq *this_rq)
return;
/* See if this task can preempt the task on the current CPU first. */
- cpu = cpu_of(this_rq);
- if (cpumask_test_cpu(cpu, &tmp)) {
- if (smt_should_schedule(p, cpu) && can_preempt(p, this_rq->rq_prio, this_rq->rq_deadline)) {
+ pcpu = cpu_of(this_rq);
+ if (likely(cpumask_test_cpu(pcpu, &tmp))) {
+ if (smt_schedule(p, this_rq) && can_preempt(p, this_rq->rq_prio, this_rq->rq_deadline)) {
resched_curr(this_rq);
return;
}
- cpumask_clear_cpu(cpu, &tmp);
+ cpumask_clear_cpu(pcpu, &tmp);
}
highest_prio = latest_deadline = 0;
@@ -1398,37 +1497,40 @@ static void try_preempt(struct task_struct *p, struct rq *this_rq)
for_each_cpu(cpu, &tmp) {
struct rq *rq;
int rq_prio;
+ u64 dl;
rq = cpu_rq(cpu);
rq_prio = rq->rq_prio;
if (rq_prio < highest_prio)
continue;
+ dl = rq->rq_deadline;
+ if (!sched_interactive && pcpu != cpu)
+ dl <<= locality_diff(pcpu, rq);
if (rq_prio > highest_prio ||
- deadline_after(rq->rq_deadline, latest_deadline)) {
- latest_deadline = rq->rq_deadline;
+ deadline_after(dl, latest_deadline)) {
+ latest_deadline = dl;
highest_prio = rq_prio;
+ highest_cpu = cpu;
highest_prio_rq = rq;
}
}
- if (likely(highest_prio_rq)) {
-#ifdef CONFIG_SMT_NICE
- cpu = cpu_of(highest_prio_rq);
- if (!smt_should_schedule(p, cpu))
- return;
-#endif
- if (can_preempt(p, highest_prio, latest_deadline)) {
- /*
- * If we have decided this task should preempt this CPU,
- * set the task's CPU to match so there is no discrepancy
- * in earliest_deadline_task which biases away tasks with
- * a different CPU set. This means waking tasks are
- * treated differently to rescheduling tasks.
- */
- set_task_cpu(p, cpu);
- resched_curr(highest_prio_rq);
- }
+ if (unlikely(!highest_prio_rq))
+ return;
+ if (!smt_schedule(p, highest_prio_rq))
+ return;
+ if (can_preempt(p, highest_prio, latest_deadline)) {
+ /*
+ * If we have decided this task should preempt this CPU,
+ * set the task's CPU to match so there is no discrepancy
+ * in earliest_deadline_task which biases away tasks with
+ * a different CPU set. This means waking tasks are
+ * treated differently to rescheduling tasks in
+ * interactive mode.
+ */
+ set_task_cpu(p, highest_cpu);
+ resched_curr(highest_prio_rq);
}
}
static int __set_cpus_allowed_ptr(struct task_struct *p,
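One point worth spelling out in the reworked try_preempt() above: when sched_interactive is off, a remote runqueue's deadline is left-shifted by its locality distance before the comparison, so more distant CPUs must be running much later deadlines before they are picked as preemption targets. Assuming locality_diff() returns the cpu_locality distance set up later in this patch (1 for an SMT sibling, rising to the 4 that the "cope with 4 left shifts" comment in enqueue_task() allows for), the bias scales like this:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t deadline = 4000;	/* illustrative deadline value in niffies */
	int distance;

	/* Each extra level of cache distance doubles the effective deadline. */
	for (distance = 0; distance <= 4; distance++)
		printf("locality %d: compares as %llu\n",
		       distance, (unsigned long long)(deadline << distance));
	return 0;
}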
@@ -1723,7 +1825,7 @@ int sched_fork(unsigned long __maybe_unused clone_flags, struct task_struct *p)
p->sched_reset_on_fork = 0;
}
- INIT_LIST_HEAD(&p->run_list);
+ p->node = NULL;
#ifdef CONFIG_SCHED_INFO
if (unlikely(sched_info_on()))
memset(&p->sched_info, 0, sizeof(p->sched_info));
@@ -3072,6 +3174,8 @@ void scheduler_tick(void)
/* grq lock not grabbed, so only update rq clock */
update_rq_clock(rq);
update_cpu_clock_tick(rq, rq->curr);
+ update_load_avg(rq);
+ cpufreq_trigger(grq.niffies, rq->load_avg);
if (!rq_idle(rq))
task_running_tick(rq);
else
@@ -3280,101 +3384,56 @@ found_middle:
}
/*
- * O(n) lookup of all tasks in the global runqueue. The real brainfuck
- * of lock contention and O(n). It's not really O(n) as only the queued,
- * but not running tasks are scanned, and is O(n) queued in the worst case
- * scenario only because the right task can be found before scanning all of
- * them.
- * Tasks are selected in this order:
- * Real time tasks are selected purely by their static priority and in the
- * order they were queued, so the lowest value idx, and the first queued task
- * of that priority value is chosen.
- * If no real time tasks are found, the SCHED_ISO priority is checked, and
- * all SCHED_ISO tasks have the same priority value, so they're selected by
- * the earliest deadline value.
- * If no SCHED_ISO tasks are found, SCHED_NORMAL tasks are selected by the
- * earliest deadline.
- * Finally if no SCHED_NORMAL tasks are found, SCHED_IDLEPRIO tasks are
- * selected by the earliest deadline.
+ * Task selection with skiplists is a simple matter of picking off the first
+ * task in the sorted list, an O(1) operation. The only time it takes longer
+ * is if tasks do not have suitable affinity and then we iterate over entries
+ * till we find the first that does. Worst case here is no tasks with suitable
+ * affinity and taking O(n).
*/
static inline struct
task_struct *earliest_deadline_task(struct rq *rq, int cpu, struct task_struct *idle)
{
- struct task_struct *edt = NULL;
- unsigned long idx = -1;
+ struct task_struct *edt = idle;
+ skiplist_node *node = grq.node;
+ u64 earliest_deadline = ~0ULL;
- do {
- struct list_head *queue;
- struct task_struct *p;
- u64 earliest_deadline;
-
- idx = next_sched_bit(grq.prio_bitmap, ++idx);
- if (idx >= PRIO_LIMIT)
- return idle;
- queue = grq.queue + idx;
-
- if (idx < MAX_RT_PRIO) {
- /* We found an rt task */
- list_for_each_entry(p, queue, run_list) {
- /* Make sure cpu affinity is ok */
- if (needs_other_cpu(p, cpu))
- continue;
- edt = p;
- goto out_take;
- }
- /*
- * None of the RT tasks at this priority can run on
- * this cpu
- */
+ while ((node = node->next[0]) != grq.node) {
+ struct task_struct *p = node->value;
+ int tcpu;
+
+ /* Make sure affinity is ok */
+ if (needs_other_cpu(p, cpu))
continue;
- }
- /*
- * No rt tasks. Find the earliest deadline task. Now we're in
- * O(n) territory.
- */
- earliest_deadline = ~0ULL;
- list_for_each_entry(p, queue, run_list) {
+ if (!smt_schedule(p, rq))
+ continue;
+
+ if (!sched_interactive && (tcpu = task_cpu(p)) != cpu) {
u64 dl;
- /* Make sure cpu affinity is ok */
- if (needs_other_cpu(p, cpu))
+ if (task_sticky(p) && scaling_rq(rq))
continue;
-
-#ifdef CONFIG_SMT_NICE
- if (!smt_should_schedule(p, cpu))
+ dl = p->deadline << locality_diff(tcpu, rq);
+ if (unlikely(!deadline_before(dl, earliest_deadline)))
continue;
-#endif
- /*
- * Soft affinity happens here by not scheduling a task
- * with its sticky flag set that ran on a different CPU
- * last when the CPU is scaling, or by greatly biasing
- * against its deadline when not, based on cpu cache
- * locality.
- */
- if (sched_interactive)
- dl = p->deadline;
- else {
- int tcpu = task_cpu(p);
-
- if (tcpu != cpu && task_sticky(p) && scaling_rq(rq))
- continue;
- dl = p->deadline << locality_diff(tcpu, rq);
- }
-
- if (deadline_before(dl, earliest_deadline)) {
- earliest_deadline = dl;
- edt = p;
- }
+ earliest_deadline = dl;
+ edt = p;
+ /* We continue even though we've found the earliest
+ * deadline task as the locality offset means there
+ * may be a better candidate after it. */
+ continue;
}
- } while (!edt);
-
-out_take:
- take_task(cpu, edt);
+ /* This wouldn't happen if we encountered a better deadline from
+ * another CPU and have already set edt. */
+ if (likely(p->deadline < earliest_deadline))
+ edt = p;
+ break;
+ }
+ if (likely(edt != idle))
+ take_task(cpu, edt);
return edt;
}
-
/*
* Print scheduling while atomic bug:
*/
@@ -3454,44 +3513,47 @@ static void reset_rq_task(struct rq *rq, struct task_struct *p)
}
#ifdef CONFIG_SMT_NICE
+static void check_no_siblings(struct rq __maybe_unused *this_rq) {}
+static void wake_no_siblings(struct rq __maybe_unused *this_rq) {}
+static void (*check_siblings)(struct rq *this_rq) = &check_no_siblings;
+static void (*wake_siblings)(struct rq *this_rq) = &wake_no_siblings;
+
/* Iterate over smt siblings when we've scheduled a process on cpu and decide
* whether they should continue running or be descheduled. */
-static void check_smt_siblings(int cpu)
+static void check_smt_siblings(struct rq *this_rq)
{
int other_cpu;
- for_each_cpu(other_cpu, thread_cpumask(cpu)) {
+ for_each_cpu(other_cpu, &this_rq->thread_mask) {
struct task_struct *p;
struct rq *rq;
- if (other_cpu == cpu)
- continue;
rq = cpu_rq(other_cpu);
if (rq_idle(rq))
continue;
if (!rq->online)
continue;
p = rq->curr;
- if (!smt_should_schedule(p, cpu)) {
+ if (!smt_should_schedule(p, this_rq)) {
set_tsk_need_resched(p);
smp_send_reschedule(other_cpu);
}
}
}
-static void wake_smt_siblings(int cpu)
+static void wake_smt_siblings(struct rq *this_rq)
{
int other_cpu;
if (!queued_notrunning())
return;
- for_each_cpu(other_cpu, thread_cpumask(cpu)) {
+ for_each_cpu(other_cpu, &this_rq->thread_mask) {
struct rq *rq;
- if (other_cpu == cpu)
- continue;
rq = cpu_rq(other_cpu);
+ if (!rq->online)
+ continue;
if (rq_idle(rq)) {
struct task_struct *p = rq->curr;
@@ -3501,8 +3563,8 @@ static void wake_smt_siblings(int cpu)
}
}
#else
-static void check_smt_siblings(int __maybe_unused cpu) {}
-static void wake_smt_siblings(int __maybe_unused cpu) {}
+static void check_siblings(struct rq __maybe_unused *this_rq) {}
+static void wake_siblings(struct rq __maybe_unused *this_rq) {}
#endif
/*
@@ -3639,7 +3701,7 @@ static void __sched notrace __schedule(bool preempt)
* again.
*/
set_rq_task(rq, prev);
- check_smt_siblings(cpu);
+ check_siblings(rq);
grq_unlock_irq();
goto rerun_prev_unlocked;
} else
@@ -3679,9 +3741,9 @@ static void __sched notrace __schedule(bool preempt)
unstick_task(rq, prev);
set_rq_task(rq, next);
if (next != idle)
- check_smt_siblings(cpu);
+ check_siblings(rq);
else
- wake_smt_siblings(cpu);
+ wake_siblings(rq);
grq.nr_switches++;
prev->on_cpu = false;
next->on_cpu = true;
@@ -3693,7 +3755,7 @@ static void __sched notrace __schedule(bool preempt)
cpu = cpu_of(rq);
idle = rq->idle;
} else {
- check_smt_siblings(cpu);
+ check_siblings(rq);
grq_unlock_irq();
}
@@ -7107,9 +7169,9 @@ int sched_cpu_dying(unsigned int cpu)
* Cheaper version of the below functions in case support for SMT and MC is
* compiled in but CPUs have no siblings.
*/
-static bool sole_cpu_idle(int cpu)
+static bool sole_cpu_idle(struct rq *rq)
{
- return rq_idle(cpu_rq(cpu));
+ return rq_idle(rq);
}
#endif
#ifdef CONFIG_SCHED_SMT
@@ -7118,9 +7180,9 @@ static const cpumask_t *thread_cpumask(int cpu)
return topology_sibling_cpumask(cpu);
}
/* All this CPU's SMT siblings are idle */
-static bool siblings_cpu_idle(int cpu)
+static bool siblings_cpu_idle(struct rq *rq)
{
- return cpumask_subset(thread_cpumask(cpu), &grq.cpu_idle_map);
+ return cpumask_subset(&rq->thread_mask, &grq.cpu_idle_map);
}
#endif
#ifdef CONFIG_SCHED_MC
@@ -7129,9 +7191,9 @@ static const cpumask_t *core_cpumask(int cpu)
return topology_core_cpumask(cpu);
}
/* All this CPU's shared cache siblings are idle */
-static bool cache_cpu_idle(int cpu)
+static bool cache_cpu_idle(struct rq *rq)
{
- return cpumask_subset(core_cpumask(cpu), &grq.cpu_idle_map);
+ return cpumask_subset(&rq->core_mask, &grq.cpu_idle_map);
}
#endif
@@ -7150,6 +7212,9 @@ void __init sched_init_smp(void)
{
struct sched_domain *sd;
int cpu, other_cpu;
+#ifdef CONFIG_SCHED_SMT
+ bool smt_threads = false;
+#endif
cpumask_var_t non_isolated_cpus;
@@ -7209,16 +7274,31 @@ void __init sched_init_smp(void)
if (rq->cpu_locality[other_cpu] > 2)
rq->cpu_locality[other_cpu] = 2;
}
- if (cpumask_weight(core_cpumask(cpu)) > 1)
+ if (cpumask_weight(core_cpumask(cpu)) > 1) {
+ cpumask_copy(&rq->core_mask, core_cpumask(cpu));
+ cpumask_clear_cpu(cpu, &rq->core_mask);
rq->cache_idle = cache_cpu_idle;
+ }
#endif
#ifdef CONFIG_SCHED_SMT
for_each_cpu(other_cpu, thread_cpumask(cpu))
rq->cpu_locality[other_cpu] = 1;
- if (cpumask_weight(thread_cpumask(cpu)) > 1)
+ if (cpumask_weight(thread_cpumask(cpu)) > 1) {
+ cpumask_copy(&rq->thread_mask, thread_cpumask(cpu));
+ cpumask_clear_cpu(cpu, &rq->thread_mask);
rq->siblings_idle = siblings_cpu_idle;
+ smt_threads = true;
+ }
#endif
}
+#ifdef CONFIG_SMT_NICE
+ if (smt_threads) {
+ check_siblings = &check_smt_siblings;
+ wake_siblings = &wake_smt_siblings;
+ smt_schedule = &smt_should_schedule;
+ rq_load_avg = &smt_load_avg;
+ }
+#endif
grq_unlock_irq();
mutex_unlock(&sched_domains_mutex);
@@ -7245,6 +7325,32 @@ int in_sched_functions(unsigned long addr)
&& addr < (unsigned long)__sched_text_end);
}
+#ifdef CONFIG_CGROUP_SCHED
+/* task group related information */
+struct task_group {
+ struct cgroup_subsys_state css;
+
+ struct rcu_head rcu;
+ struct list_head list;
+
+ struct task_group *parent;
+ struct list_head siblings;
+ struct list_head children;
+};
+
+/*
+ * Default task group.
+ * Every task in system belongs to this group at bootup.
+ */
+struct task_group root_task_group;
+LIST_HEAD(task_groups);
+
+/* Cacheline aligned slab cache for task_group */
+static struct kmem_cache *task_group_cache __read_mostly;
+/* task_group_lock serializes the addition/removal of task groups */
+static DEFINE_SPINLOCK(task_group_lock);
+#endif /* CONFIG_CGROUP_SCHED */
+
void __init sched_init(void)
{
#ifdef CONFIG_SMP
@@ -7265,6 +7371,9 @@ void __init sched_init(void)
grq.iso_ticks = 0;
grq.iso_refractory = false;
grq.noc = 1;
+ grq.node = skiplist_init();
+ grq.sl = new_skiplist(grq.node);
+
#ifdef CONFIG_SMP
init_defrootdomain();
grq.qnr = grq.idle_cpus = 0;
@@ -7272,6 +7381,14 @@ void __init sched_init(void)
#else
uprq = &per_cpu(runqueues, 0);
#endif
+
+#ifdef CONFIG_CGROUP_SCHED
+ task_group_cache = KMEM_CACHE(task_group, 0);
+
+ list_add(&root_task_group.list, &task_groups);
+ INIT_LIST_HEAD(&root_task_group.children);
+ INIT_LIST_HEAD(&root_task_group.siblings);
+#endif /* CONFIG_CGROUP_SCHED */
for_each_possible_cpu(i) {
rq = cpu_rq(i);
rq->grq_lock = &grq.lock;
@@ -7316,11 +7433,6 @@ void __init sched_init(void)
}
#endif
- for (i = 0; i < PRIO_LIMIT; i++)
- INIT_LIST_HEAD(grq.queue + i);
- /* delimiter for bitsearch */
- __set_bit(PRIO_LIMIT, grq.prio_bitmap);
-
#ifdef CONFIG_PREEMPT_NOTIFIERS
INIT_HLIST_HEAD(&init_task.preempt_notifiers);
#endif
@@ -7702,3 +7814,127 @@ unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
return smt_gain;
}
#endif
+
+#ifdef CONFIG_CGROUP_SCHED
+static void sched_free_group(struct task_group *tg)
+{
+ kmem_cache_free(task_group_cache, tg);
+}
+
+/* allocate runqueue etc for a new task group */
+struct task_group *sched_create_group(struct task_group *parent)
+{
+ struct task_group *tg;
+
+ tg = kmem_cache_alloc(task_group_cache, GFP_KERNEL | __GFP_ZERO);
+ if (!tg)
+ return ERR_PTR(-ENOMEM);
+
+ return tg;
+}
+
+void sched_online_group(struct task_group *tg, struct task_group *parent)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&task_group_lock, flags);
+ list_add_rcu(&tg->list, &task_groups);
+
+ WARN_ON(!parent); /* root should already exist */
+
+ tg->parent = parent;
+ INIT_LIST_HEAD(&tg->children);
+ list_add_rcu(&tg->siblings, &parent->children);
+ spin_unlock_irqrestore(&task_group_lock, flags);
+}
+
+/* rcu callback to free various structures associated with a task group */
+static void sched_free_group_rcu(struct rcu_head *rhp)
+{
+ /* now it should be safe to free those cfs_rqs */
+ sched_free_group(container_of(rhp, struct task_group, rcu));
+}
+
+void sched_destroy_group(struct task_group *tg)
+{
+ /* wait for possible concurrent references to cfs_rqs complete */
+ call_rcu(&tg->rcu, sched_free_group_rcu);
+}
+
+void sched_offline_group(struct task_group *tg)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&task_group_lock, flags);
+ list_del_rcu(&tg->list);
+ list_del_rcu(&tg->siblings);
+ spin_unlock_irqrestore(&task_group_lock, flags);
+}
+
+static inline struct task_group *css_tg(struct cgroup_subsys_state *css)
+{
+ return css ? container_of(css, struct task_group, css) : NULL;
+}
+
+static struct cgroup_subsys_state *
+cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
+{
+ struct task_group *parent = css_tg(parent_css);
+ struct task_group *tg;
+
+ if (!parent) {
+ /* This is early initialization for the top cgroup */
+ return &root_task_group.css;
+ }
+
+ tg = sched_create_group(parent);
+ if (IS_ERR(tg))
+ return ERR_PTR(-ENOMEM);
+ return &tg->css;
+}
+
+static void cpu_cgroup_css_released(struct cgroup_subsys_state *css)
+{
+ struct task_group *tg = css_tg(css);
+
+ sched_offline_group(tg);
+}
+
+static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
+{
+ struct task_group *tg = css_tg(css);
+
+ /*
+ * Relies on the RCU grace period between css_released() and this.
+ */
+ sched_free_group(tg);
+}
+
+static void cpu_cgroup_fork(struct task_struct *task)
+{
+}
+
+static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)
+{
+ return 0;
+}
+
+static void cpu_cgroup_attach(struct cgroup_taskset *tset)
+{
+}
+
+static struct cftype cpu_files[] = {
+ { } /* terminate */
+};
+
+struct cgroup_subsys cpu_cgrp_subsys = {
+ .css_alloc = cpu_cgroup_css_alloc,
+ .css_released = cpu_cgroup_css_released,
+ .css_free = cpu_cgroup_css_free,
+ .fork = cpu_cgroup_fork,
+ .can_attach = cpu_cgroup_can_attach,
+ .attach = cpu_cgroup_attach,
+ .legacy_cftypes = cpu_files,
+ .early_init = true,
+};
+#endif /* CONFIG_CGROUP_SCHED */
diff --git a/kernel/sched/bfs_sched.h b/kernel/sched/bfs_sched.h
index 9ab0ec66a..06b2cdca7 100644
--- a/kernel/sched/bfs_sched.h
+++ b/kernel/sched/bfs_sched.h
@@ -24,6 +24,8 @@ struct rq {
int rq_prio;
bool rq_running; /* There is a task running */
int soft_affined; /* Running or queued tasks with this set as their rq */
+ u64 load_update; /* When we last updated load */
+ unsigned long load_avg; /* Rolling load average */
#ifdef CONFIG_SMT_NICE
struct mm_struct *rq_mm;
int rq_smt_bias; /* Policy/nice level bias across smt siblings */
@@ -44,11 +46,13 @@ struct rq {
struct sched_domain *sd;
int *cpu_locality; /* CPU relative cache distance */
#ifdef CONFIG_SCHED_SMT
- bool (*siblings_idle)(int cpu);
+ cpumask_t thread_mask;
+ bool (*siblings_idle)(struct rq *rq);
/* See if all smt siblings are idle */
#endif /* CONFIG_SCHED_SMT */
#ifdef CONFIG_SCHED_MC
- bool (*cache_idle)(int cpu);
+ cpumask_t core_mask;
+ bool (*cache_idle)(struct rq *rq);
/* See if all cache siblings are idle */
#endif /* CONFIG_SCHED_MC */
u64 last_niffy; /* Last time this RQ updated grq.niffies */
@@ -199,14 +203,29 @@ static inline void cpufreq_trigger(u64 time, unsigned long util)
{
struct update_util_data *data;
+ if (util > SCHED_CAPACITY_SCALE)
+ util = SCHED_CAPACITY_SCALE;
data = rcu_dereference_sched(*this_cpu_ptr(&cpufreq_update_util_data));
if (data)
- data->func(data, time, util, 0);
+ data->func(data, time, util, SCHED_CAPACITY_SCALE);
+}
+
+static inline void other_cpufreq_trigger(int cpu, u64 time, unsigned long util)
+{
+ struct update_util_data *data;
+
+ data = rcu_dereference_sched(*per_cpu_ptr(&cpufreq_update_util_data, cpu));
+ if (data)
+ data->func(data, time, util, SCHED_CAPACITY_SCALE);
}
#else
static inline void cpufreq_trigger(u64 time, unsigned long util)
{
}
+
+static inline void other_cpufreq_trigger(int cpu, u64 time, unsigned long util)
+{
+}
#endif /* CONFIG_CPU_FREQ */
#ifdef arch_scale_freq_capacity
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 75f98c549..a24cfb41d 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -603,19 +603,25 @@ static void cputime_adjust(struct task_cputime *curr,
stime = curr->stime;
utime = curr->utime;
- if (utime == 0) {
- stime = rtime;
+ /*
+ * If either stime or both stime and utime are 0, assume all runtime is
+ * userspace. Once a task gets some ticks, the monotonicity code at
+ * 'update' will ensure things converge to the observed ratio.
+ */
+ if (stime == 0) {
+ utime = rtime;
goto update;
}
- if (stime == 0) {
- utime = rtime;
+ if (utime == 0) {
+ stime = rtime;
goto update;
}
stime = scale_stime((__force u64)stime, (__force u64)rtime,
(__force u64)(stime + utime));
+update:
/*
* Make sure stime doesn't go backwards; this preserves monotonicity
* for utime because rtime is monotonic.
@@ -638,7 +644,6 @@ static void cputime_adjust(struct task_cputime *curr,
stime = rtime - utime;
}
-update:
prev->stime = stime;
prev->utime = utime;
out:
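The reordering above means that when both counters are zero the stime == 0 branch now wins, so a never-sampled task is attributed pure user time; otherwise scale_stime() splits the precise rtime in the observed stime:utime ratio, and the relocated 'update' label only enforces that stime never moves backwards. For example, stime = 25 and utime = 75 ticks against rtime = 200 would yield an adjusted stime of 200 * 25 / 100 = 50 and utime of 150. A simplified stand-in for that split (the kernel's scale_stime() additionally guards against 64-bit overflow, which this sketch does not):

#include <stdio.h>
#include <stdint.h>

/* Proportional split of rtime in the stime:(stime+utime) ratio. */
static uint64_t scale_stime_sketch(uint64_t stime, uint64_t rtime, uint64_t total)
{
	return total ? rtime * stime / total : rtime;
}

int main(void)
{
	uint64_t stime = 25, utime = 75, rtime = 200;	/* illustrative tick/time values */

	stime = scale_stime_sketch(stime, rtime, stime + utime);
	utime = rtime - stime;
	printf("stime=%llu utime=%llu\n",
	       (unsigned long long)stime, (unsigned long long)utime);	/* 50 and 150 */
	return 0;
}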
diff --git a/kernel/skip_lists.c b/kernel/skip_lists.c
new file mode 100644
index 000000000..4d82ee18b
--- /dev/null
+++ b/kernel/skip_lists.c
@@ -0,0 +1,177 @@
+/*
+ Copyright (C) 2011,2016 Con Kolivas.
+
+ Code based on example originally by William Pugh.
+
+Skip Lists are a probabilistic alternative to balanced trees, as
+described in the June 1990 issue of CACM and were invented by
+William Pugh in 1987.
+
+A couple of comments about this implementation:
+The routine randomLevel has been hard-coded to generate random
+levels using p=0.25. It can be easily changed.
+
+The insertion routine has been implemented so as to use the
+dirty hack described in the CACM paper: if a random level is
+generated that is more than the current maximum level, the
+current maximum level plus one is used instead.
+
+Levels start at zero and go up to MaxLevel (which is equal to
+MaxNumberOfLevels-1).
+
+The routines defined in this file are:
+
+init: defines slnode
+
+new_skiplist: returns a new, empty list
+
+randomLevel: Returns a random level based on a u64 random seed passed to it.
+In BFS, the "niffy" time is used for this purpose.
+
+insert(l,key, value): inserts the binding (key, value) into l. This operation
+occurs in O(log n) time.
+
+delnode(slnode, l, node): deletes any binding of key from the l based on the
+actual node value. This operation occurs in O(k) time where k is the
+number of levels of the node in question (max 16). The original delete
+function occurred in O(log n) time and involved a search.
+
+BFS Notes: In this implementation of skiplists, there are bidirectional
+next/prev pointers and the insert function returns a pointer to the actual
+node in which the value is stored. The key here is chosen by the scheduler so as to
+sort tasks according to the priority list requirements and is no longer used
+by the scheduler after insertion. The scheduler lookup, however, occurs in
+O(1) time because it is always the first item in the level 0 linked list.
+Since the task struct stores a copy of the node pointer upon skiplist_insert,
+it can also remove it much faster than the original implementation with the
+aid of prev<->next pointer manipulation and no searching.
+
+*/
+
+#include <linux/slab.h>
+#include <linux/skip_lists.h>
+
+#define MaxNumberOfLevels 16
+#define MaxLevel (MaxNumberOfLevels - 1)
+#define newNode kmalloc(sizeof(skiplist_node), GFP_ATOMIC)
+
+skiplist_node *skiplist_init(void)
+{
+ skiplist_node *slnode = newNode;
+ int i;
+
+ BUG_ON(!slnode);
+ slnode->key = 0xFFFFFFFFFFFFFFFF;
+ slnode->level = 0;
+ slnode->value = NULL;
+ for (i = 0; i < MaxNumberOfLevels; i++)
+ slnode->next[i] = slnode->prev[i] = slnode;
+ return slnode;
+}
+
+skiplist *new_skiplist(skiplist_node *slnode)
+{
+ skiplist *l = kmalloc(sizeof(skiplist), GFP_ATOMIC);
+
+ BUG_ON(!l);
+ l->entries = 0;
+ l->level = 0;
+ l->header = slnode;
+ return l;
+}
+
+void free_skiplist(skiplist_node *slnode, skiplist *l)
+{
+ skiplist_node *p, *q;
+
+ p = l->header;
+ do {
+ q = p->next[0];
+ p->next[0]->prev[0] = q->prev[0];
+ kfree(p);
+ p = q;
+ } while (p != slnode);
+ kfree(l);
+}
+
+/*
+ * Returns a pseudo-random number based on the randseed value by masking out
+ * 0-15. As many levels are not required when only few values are on the list,
+ * we limit the height of the levels according to how many list entries there
+ * are in a cheap manner. The height of the levels may have been higher while
+ * there were more entries queued previously but as this code is used only by
+ * the scheduler, entries are short lived and will be torn down regularly.
+ *
+ * 00-03 entries - 1 level
+ * 04-07 entries - 2 levels
+ * 08-15 entries - 4 levels
+ * 15-31 entries - 7 levels
+ * 32+ entries - max(16) levels
+ */
+static inline unsigned int randomLevel(int entries, unsigned int randseed)
+{
+ unsigned int mask;
+
+ if (entries > 31)
+ mask = 0xF;
+ else if (entries > 15)
+ mask = 0x7;
+ else if (entries > 7)
+ mask = 0x3;
+ else if (entries > 3)
+ mask = 0x1;
+ else
+ return 0;
+
+ return randseed & mask;
+}
+
+skiplist_node *skiplist_insert(skiplist_node *slnode, skiplist *l, keyType key, valueType value, unsigned int randseed)
+{
+ skiplist_node *update[MaxNumberOfLevels];
+ skiplist_node *p, *q;
+ int k = l->level;
+
+ p = slnode;
+ do {
+ while (q = p->next[k], q->key <= key)
+ p = q;
+ update[k] = p;
+ } while (--k >= 0);
+
+ k = randomLevel(++l->entries, randseed);
+ if (k > l->level) {
+ k = ++l->level;
+ update[k] = slnode;
+ }
+
+ q = newNode;
+ q->level = k;
+ q->key = key;
+ q->value = value;
+ do {
+ p = update[k];
+ q->next[k] = p->next[k];
+ p->next[k] = q;
+ q->prev[k] = p;
+ q->next[k]->prev[k] = q;
+ } while (--k >= 0);
+ return q;
+}
+
+void skiplist_delnode(skiplist_node *slnode, skiplist *l, skiplist_node *node)
+{
+ int k, m = node->level;
+
+ for (k = 0; k <= m; k++) {
+ node->prev[k]->next[k] = node->next[k];
+ node->next[k]->prev[k] = node->prev[k];
+ }
+ kfree(node);
+ if (m == l->level) {
+ while (l->header->next[m] == slnode && l->header->prev[m] == slnode && m > 0)
+ m--;
+ l->level = m;
+ }
+ l->entries--;
+}
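Taken together with the bfs.c changes earlier in this patch, the API is driven roughly as below. This is only a condensed sketch of calls that already appear above (sched_init() creates the header and list, enqueue_task() inserts, earliest_deadline_task() reads the first level-0 entry, dequeue_task() deletes via the stored node); the wrapper function, key, randseed and NULL value are illustrative:

#include <linux/skip_lists.h>

static void skiplist_usage_sketch(void)
{
	skiplist_node *head = skiplist_init();		/* sentinel header node */
	skiplist *sl = new_skiplist(head);
	skiplist_node *node;

	/* Enqueue: key chosen by the caller (priority or offset deadline in BFS);
	 * the scheduler stores a task_struct pointer as the value, NULL here. */
	node = skiplist_insert(head, sl, 42ULL, NULL, 7);

	/* O(1) "best entry" lookup: the first node on level 0 after the header. */
	if (head->next[0] != head)
		(void)head->next[0]->value;

	/* Dequeue: O(k) removal through the stored node pointer, no search. */
	skiplist_delnode(head, sl, node);

	free_skiplist(head, sl);
}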