Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile           |   2
-rw-r--r--  kernel/events/core.c      |  60
-rw-r--r--  kernel/events/uprobes.c   |   5
-rw-r--r--  kernel/irq/msi.c          |  19
-rw-r--r--  kernel/sched/bfs.c        | 704
-rw-r--r--  kernel/sched/bfs_sched.h  |  25
-rw-r--r--  kernel/sched/cputime.c    |  15
-rw-r--r--  kernel/skip_lists.c       | 177
8 files changed, 744 insertions, 263 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index e2ec54e2b..dd84575cb 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -9,7 +9,7 @@ obj-y = fork.o exec_domain.o panic.o \ extable.o params.o \ kthread.o sys_ni.o nsproxy.o \ notifier.o ksysfs.o cred.o reboot.o \ - async.o range.o smpboot.o + async.o range.o smpboot.o skip_lists.o obj-$(CONFIG_MULTIUSER) += groups.o diff --git a/kernel/events/core.c b/kernel/events/core.c index 43d43a2d5..e68c0a735 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -242,18 +242,6 @@ unlock: return ret; } -static void event_function_local(struct perf_event *event, event_f func, void *data) -{ - struct event_function_struct efs = { - .event = event, - .func = func, - .data = data, - }; - - int ret = event_function(&efs); - WARN_ON_ONCE(ret); -} - static void event_function_call(struct perf_event *event, event_f func, void *data) { struct perf_event_context *ctx = event->ctx; @@ -303,6 +291,54 @@ again: raw_spin_unlock_irq(&ctx->lock); } +/* + * Similar to event_function_call() + event_function(), but hard assumes IRQs + * are already disabled and we're on the right CPU. + */ +static void event_function_local(struct perf_event *event, event_f func, void *data) +{ + struct perf_event_context *ctx = event->ctx; + struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); + struct task_struct *task = READ_ONCE(ctx->task); + struct perf_event_context *task_ctx = NULL; + + WARN_ON_ONCE(!irqs_disabled()); + + if (task) { + if (task == TASK_TOMBSTONE) + return; + + task_ctx = ctx; + } + + perf_ctx_lock(cpuctx, task_ctx); + + task = ctx->task; + if (task == TASK_TOMBSTONE) + goto unlock; + + if (task) { + /* + * We must be either inactive or active and the right task, + * otherwise we're screwed, since we cannot IPI to somewhere + * else. 
+ */ + if (ctx->is_active) { + if (WARN_ON_ONCE(task != current)) + goto unlock; + + if (WARN_ON_ONCE(cpuctx->task_ctx != ctx)) + goto unlock; + } + } else { + WARN_ON_ONCE(&cpuctx->ctx != ctx); + } + + func(event, cpuctx, ctx, data); +unlock: + perf_ctx_unlock(cpuctx, task_ctx); +} + #define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\ PERF_FLAG_FD_OUTPUT |\ PERF_FLAG_PID_CGROUP |\ diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index b7a525ab2..8c50276b6 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -172,8 +172,10 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); err = -EAGAIN; ptep = page_check_address(page, mm, addr, &ptl, 0); - if (!ptep) + if (!ptep) { + mem_cgroup_cancel_charge(kpage, memcg, false); goto unlock; + } get_page(kpage); page_add_new_anon_rmap(kpage, vma, addr, false); @@ -200,7 +202,6 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, err = 0; unlock: - mem_cgroup_cancel_charge(kpage, memcg, false); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); unlock_page(page); return err; diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 38e89ce7b..0afe671f1 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -324,7 +324,7 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, struct msi_domain_ops *ops = info->ops; msi_alloc_info_t arg; struct msi_desc *desc; - int i, ret, virq = -1; + int i, ret, virq; ret = msi_domain_prepare_irqs(domain, dev, nvec, &arg); if (ret) @@ -332,12 +332,8 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, for_each_msi_entry(desc, dev) { ops->set_desc(&arg, desc); - if (info->flags & MSI_FLAG_IDENTITY_MAP) - virq = (int)ops->get_hwirq(info, &arg); - else - virq = -1; - virq = __irq_domain_alloc_irqs(domain, virq, desc->nvec_used, + virq = __irq_domain_alloc_irqs(domain, -1, desc->nvec_used, dev_to_node(dev), &arg, false); if (virq < 0) { ret = -ENOSPC; @@ -361,6 +357,17 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, else dev_dbg(dev, "irq [%d-%d] for MSI\n", virq, virq + desc->nvec_used - 1); + /* + * This flag is set by the PCI layer as we need to activate + * the MSI entries before the PCI layer enables MSI in the + * card. Otherwise the card latches a random msi message. 
+ */ + if (info->flags & MSI_FLAG_ACTIVATE_EARLY) { + struct irq_data *irq_data; + + irq_data = irq_domain_get_irq_data(domain, desc->irq); + irq_domain_activate_irq(irq_data); + } } return 0; diff --git a/kernel/sched/bfs.c b/kernel/sched/bfs.c index 6fd00c5ae..4168a5527 100644 --- a/kernel/sched/bfs.c +++ b/kernel/sched/bfs.c @@ -74,6 +74,7 @@ #include <linux/context_tracking.h> #include <linux/sched/prio.h> #include <linux/tick.h> +#include <linux/skip_lists.h> #include <asm/irq_regs.h> #include <asm/switch_to.h> @@ -136,7 +137,7 @@ void print_scheduler_version(void) { - printk(KERN_INFO "BFS CPU scheduler v0.472 by Con Kolivas.\n"); + printk(KERN_INFO "BFS CPU scheduler v0.490 by Con Kolivas.\n"); } /* @@ -190,8 +191,6 @@ struct global_rq { unsigned long nr_running; unsigned long nr_uninterruptible; unsigned long long nr_switches; - struct list_head queue[PRIO_LIMIT]; - DECLARE_BITMAP(prio_bitmap, PRIO_LIMIT + 1); unsigned long qnr; /* queued not running */ #ifdef CONFIG_SMP cpumask_t cpu_idle_map; @@ -204,6 +203,9 @@ struct global_rq { raw_spinlock_t iso_lock; int iso_ticks; bool iso_refractory; + + skiplist_node *node; + skiplist *sl; }; #ifdef CONFIG_SMP @@ -538,24 +540,25 @@ static inline bool deadline_after(u64 deadline, u64 time) } /* - * A task that is queued but not running will be on the grq run list. - * A task that is not running or queued will not be on the grq run list. - * A task that is currently running will have ->on_cpu set but not on the - * grq run list. + * A task that is not running or queued will not have a node set. + * A task that is queued but not running will have a node set. + * A task that is currently running will have ->on_cpu set but no node set. */ static inline bool task_queued(struct task_struct *p) { - return (!list_empty(&p->run_list)); + return p->node; } /* - * Removing from the global runqueue. Enter with grq locked. + * Removing from the global runqueue. Enter with grq locked. Deleting a task + * from the skip list is done via the stored node reference in the task struct + * and does not require a full look up. Thus it occurs in O(k) time where k + * is the "level" of the list the task was stored at - usually < 4, max 16. */ static void dequeue_task(struct task_struct *p) { - list_del_init(&p->run_list); - if (list_empty(grq.queue + p->prio)) - __clear_bit(p->prio, grq.prio_bitmap); + skiplist_delnode(grq.node, grq.sl, p->node); + p->node = NULL; sched_info_dequeued(task_rq(p), p); } @@ -583,6 +586,9 @@ static bool isoprio_suitable(void) */ static void enqueue_task(struct task_struct *p, struct rq *rq) { + unsigned int randseed; + u64 sl_id; + if (!rt_task(p)) { /* Check it hasn't gotten rt from PI */ if ((idleprio_task(p) && idleprio_suitable(p)) || @@ -591,8 +597,32 @@ static void enqueue_task(struct task_struct *p, struct rq *rq) else p->prio = NORMAL_PRIO; } - __set_bit(p->prio, grq.prio_bitmap); - list_add_tail(&p->run_list, grq.queue + p->prio); + /* + * The sl_id key passed to the skiplist generates a sorted list. + * Realtime and sched iso tasks run FIFO so they only need be sorted + * according to priority. The skiplist will put tasks of the same + * key inserted later in FIFO order. Tasks of sched normal, batch + * and idleprio are sorted according to their deadlines. Idleprio + * tasks are offset by an impossibly large deadline value ensuring + * they get sorted into last positions, but still according to their + * own deadlines. 
This creates a "landscape" of skiplists running + * from priority 0 realtime in first place to the lowest priority + * idleprio tasks last. Skiplist insertion is an O(log n) process. + */ + if (p->prio <= ISO_PRIO) + sl_id = p->prio; + else { + sl_id = p->deadline; + /* Set it to cope with 4 left shifts with locality_diff */ + if (p->prio == IDLE_PRIO) + sl_id |= 0x0F00000000000000; + } + /* + * Some architectures don't have better than microsecond resolution + * so mask out ~microseconds as the random seed for skiplist insertion. + */ + randseed = (grq.niffies >> 10) & 0xFFFFFFFF; + p->node = skiplist_insert(grq.node, grq.sl, sl_id, p, randseed); sched_info_queued(rq, p); } @@ -647,6 +677,113 @@ static inline int queued_notrunning(void) return grq.qnr; } +#ifdef CONFIG_SMT_NICE +static const cpumask_t *thread_cpumask(int cpu); + +/* Find the best real time priority running on any SMT siblings of cpu and if + * none are running, the static priority of the best deadline task running. + * The lookups to the other runqueues is done lockless as the occasional wrong + * value would be harmless. */ +static int best_smt_bias(struct rq *this_rq) +{ + int other_cpu, best_bias = 0; + + for_each_cpu(other_cpu, &this_rq->thread_mask) { + struct rq *rq = cpu_rq(other_cpu); + + if (rq_idle(rq)) + continue; + if (!rq->online) + continue; + if (!rq->rq_mm) + continue; + if (likely(rq->rq_smt_bias > best_bias)) + best_bias = rq->rq_smt_bias; + } + return best_bias; +} + +static int task_prio_bias(struct task_struct *p) +{ + if (rt_task(p)) + return 1 << 30; + else if (task_running_iso(p)) + return 1 << 29; + else if (task_running_idle(p)) + return 0; + return MAX_PRIO - p->static_prio; +} + +static bool smt_always_schedule(struct task_struct __maybe_unused *p, struct rq __maybe_unused *this_rq) +{ + return true; +} + +static bool (*smt_schedule)(struct task_struct *p, struct rq *this_rq) = &smt_always_schedule; + +/* We've already decided p can run on CPU, now test if it shouldn't for SMT + * nice reasons. */ +static bool smt_should_schedule(struct task_struct *p, struct rq *this_rq) +{ + int best_bias, task_bias; + + /* Kernel threads always run */ + if (unlikely(!p->mm)) + return true; + if (rt_task(p)) + return true; + if (!idleprio_suitable(p)) + return true; + best_bias = best_smt_bias(this_rq); + /* The smt siblings are all idle or running IDLEPRIO */ + if (best_bias < 1) + return true; + task_bias = task_prio_bias(p); + if (task_bias < 1) + return false; + if (task_bias >= best_bias) + return true; + /* Dither 25% cpu of normal tasks regardless of nice difference */ + if (best_bias % 4 == 1) + return true; + /* Sorry, you lose */ + return false; +} + +static unsigned long cpu_load_avg(struct rq *rq) +{ + return rq->soft_affined * SCHED_CAPACITY_SCALE; +} + +/* + * This is the proportion of SCHED_CAPACITY_SCALE (1024) used when each thread + * of a CPU with SMT siblings is in use. + */ +#define SCHED_SMT_LOAD (890) + +/* + * Load of a CPU with smt siblings should be considered to be the load from all + * the SMT siblings, thus will be >1 if both threads are in use since they are + * not full cores. 
+ */ +static unsigned long smt_load_avg(struct rq *rq) +{ + unsigned long load = rq->soft_affined * SCHED_SMT_LOAD; + int cpu; + + for_each_cpu(cpu, thread_cpumask(rq->cpu)) + load += cpu_rq(cpu)->soft_affined * SCHED_SMT_LOAD; + return load; +} + +static unsigned long (*rq_load_avg)(struct rq *rq) = &cpu_load_avg; +#else +#define smt_schedule(p, this_rq) (true) +static inline unsigned long rq_load_avg(struct rq *rq) +{ + return rq->soft_affined * SCHED_CAPACITY_SCALE; +} +#endif #ifdef CONFIG_SMP /* * The cpu_idle_map stores a bitmap of all the CPUs currently idle to @@ -691,7 +828,7 @@ static inline bool scaling_rq(struct rq *rq); * lowest value would give the most suitable CPU to schedule p onto next. The * order works out to be the following: * - * Same core, idle or busy cache, idle or busy threads + * Same thread, idle or busy cache, idle or busy threads * Other core, same cache, idle or busy cache, idle threads. * Same node, other CPU, idle cache, idle threads. * Same node, other CPU, busy cache, idle threads. @@ -729,13 +866,13 @@ static int best_mask_cpu(int best_cpu, struct rq *rq, cpumask_t *tmpmask) #ifdef CONFIG_SCHED_MC else if (locality == 2) ranking |= CPUIDLE_DIFF_CORE; - if (!(tmp_rq->cache_idle(cpu_tmp))) + else if (!(tmp_rq->cache_idle(tmp_rq))) ranking |= CPUIDLE_CACHE_BUSY; #endif #ifdef CONFIG_SCHED_SMT if (locality == 1) ranking |= CPUIDLE_DIFF_THREAD; - if (!(tmp_rq->siblings_idle(cpu_tmp))) + if (!(tmp_rq->siblings_idle(tmp_rq))) ranking |= CPUIDLE_THREAD_BUSY; #endif if (scaling_rq(tmp_rq)) @@ -763,90 +900,18 @@ bool cpus_share_cache(int this_cpu, int that_cpu) return (this_rq->cpu_locality[that_cpu] < 3); } -#ifdef CONFIG_SMT_NICE -static const cpumask_t *thread_cpumask(int cpu); - -/* Find the best real time priority running on any SMT siblings of cpu and if - * none are running, the static priority of the best deadline task running. - * The lookups to the other runqueues is done lockless as the occasional wrong - * value would be harmless. */ -static int best_smt_bias(int cpu) -{ - int other_cpu, best_bias = 0; - - for_each_cpu(other_cpu, thread_cpumask(cpu)) { - struct rq *rq; - - if (other_cpu == cpu) - continue; - rq = cpu_rq(other_cpu); - if (rq_idle(rq)) - continue; - if (!rq->online) - continue; - if (!rq->rq_mm) - continue; - if (likely(rq->rq_smt_bias > best_bias)) - best_bias = rq->rq_smt_bias; - } - return best_bias; -} - -static int task_prio_bias(struct task_struct *p) -{ - if (rt_task(p)) - return 1 << 30; - else if (task_running_iso(p)) - return 1 << 29; - else if (task_running_idle(p)) - return 0; - return MAX_PRIO - p->static_prio; -} - -/* We've already decided p can run on CPU, now test if it shouldn't for SMT - * nice reasons. 
*/ -static bool smt_should_schedule(struct task_struct *p, int cpu) -{ - int best_bias, task_bias; - - /* Kernel threads always run */ - if (unlikely(!p->mm)) - return true; - if (rt_task(p)) - return true; - if (!idleprio_suitable(p)) - return true; - best_bias = best_smt_bias(cpu); - /* The smt siblings are all idle or running IDLEPRIO */ - if (best_bias < 1) - return true; - task_bias = task_prio_bias(p); - if (task_bias < 1) - return false; - if (task_bias >= best_bias) - return true; - /* Dither 25% cpu of normal tasks regardless of nice difference */ - if (best_bias % 4 == 1) - return true; - /* Sorry, you lose */ - return false; -} -#else -#define smt_should_schedule(p, cpu) (1) -#endif - static bool resched_best_idle(struct task_struct *p) { cpumask_t tmpmask; + struct rq *rq; int best_cpu; cpumask_and(&tmpmask, &p->cpus_allowed, &grq.cpu_idle_map); best_cpu = best_mask_cpu(task_cpu(p), task_rq(p), &tmpmask); -#ifdef CONFIG_SMT_NICE - if (!smt_should_schedule(p, best_cpu)) + rq = cpu_rq(best_cpu); + if (!smt_schedule(p, rq)) return false; -#endif - resched_curr(cpu_rq(best_cpu)); + resched_curr(rq); return true; } @@ -953,6 +1018,26 @@ static int effective_prio(struct task_struct *p) } /* + * Update the load average for feeding into cpu frequency governors. Use a rolling + * average with ~ time constant of 32ms + */ +static void update_load_avg(struct rq *rq) +{ + /* rq clock can go backwards so skip update if that happens */ + if (likely(rq->clock > rq->load_update)) { + unsigned long us_interval = (rq->clock - rq->load_update) >> 10; + long load; + + load = rq->load_avg - (rq->load_avg * us_interval * 80 / 32768 / 128); + if (unlikely(load < 0)) + load = 0; + load += rq->soft_affined * rq_load_avg(rq) * us_interval * 80 / 32768 / 128; + rq->load_avg = load; + } + rq->load_update = rq->clock; +} + +/* * activate_task - move a task to the runqueue. Enter with grq locked. */ static void activate_task(struct task_struct *p, struct rq *rq) @@ -978,7 +1063,8 @@ static void activate_task(struct task_struct *p, struct rq *rq) p->on_rq = 1; grq.nr_running++; inc_qnr(); - cpufreq_trigger(grq.niffies, rq->soft_affined); + update_load_avg(rq); + cpufreq_trigger(grq.niffies, rq->load_avg); } static inline void clear_sticky(struct task_struct *p); @@ -995,19 +1081,22 @@ static inline void deactivate_task(struct task_struct *p, struct rq *rq) p->on_rq = 0; grq.nr_running--; clear_sticky(p); - cpufreq_trigger(grq.niffies, rq->soft_affined); + update_load_avg(rq); + cpufreq_trigger(grq.niffies, rq->load_avg); } #ifdef CONFIG_SMP void set_task_cpu(struct task_struct *p, unsigned int cpu) { + unsigned int tcpu; + #ifdef CONFIG_LOCKDEP /* * The caller should hold grq lock. */ WARN_ON_ONCE(debug_locks && !lockdep_is_held(&grq.lock)); #endif - if (task_cpu(p) == cpu) + if ((tcpu = task_cpu(p)) == cpu) return; trace_sched_migrate_task(p, cpu); perf_event_task_migrate(p); @@ -1019,8 +1108,21 @@ void set_task_cpu(struct task_struct *p, unsigned int cpu) */ smp_wmb(); if (p->on_rq) { - task_rq(p)->soft_affined--; - cpu_rq(cpu)->soft_affined++; + struct rq *rq; + + /* + * set_task_cpu can be set on other CPUs so call cpufreq_trigger + * explicitly telling it what CPU is being updated as the value + * of soft_affined has changed. 
+ */ + rq = task_rq(p); + rq->soft_affined--; + update_load_avg(rq); + other_cpufreq_trigger(tcpu, grq.niffies, rq->load_avg); + rq = cpu_rq(cpu); + rq->soft_affined++; + update_load_avg(rq); + other_cpufreq_trigger(cpu, grq.niffies, rq->load_avg); } task_thread_info(p)->cpu = cpu; } @@ -1353,13 +1455,10 @@ static inline bool needs_other_cpu(struct task_struct *p, int cpu) return false; } -/* - * When all else is equal, still prefer this_rq. - */ static void try_preempt(struct task_struct *p, struct rq *this_rq) { + int cpu, pcpu, highest_prio, highest_cpu; struct rq *highest_prio_rq = NULL; - int cpu, highest_prio; u64 latest_deadline; cpumask_t tmp; @@ -1383,13 +1482,13 @@ static void try_preempt(struct task_struct *p, struct rq *this_rq) return; /* See if this task can preempt the task on the current CPU first. */ - cpu = cpu_of(this_rq); - if (cpumask_test_cpu(cpu, &tmp)) { - if (smt_should_schedule(p, cpu) && can_preempt(p, this_rq->rq_prio, this_rq->rq_deadline)) { + pcpu = cpu_of(this_rq); + if (likely(cpumask_test_cpu(pcpu, &tmp))) { + if (smt_schedule(p, this_rq) && can_preempt(p, this_rq->rq_prio, this_rq->rq_deadline)) { resched_curr(this_rq); return; } - cpumask_clear_cpu(cpu, &tmp); + cpumask_clear_cpu(pcpu, &tmp); } highest_prio = latest_deadline = 0; @@ -1398,37 +1497,40 @@ static void try_preempt(struct task_struct *p, struct rq *this_rq) for_each_cpu(cpu, &tmp) { struct rq *rq; int rq_prio; + u64 dl; rq = cpu_rq(cpu); rq_prio = rq->rq_prio; if (rq_prio < highest_prio) continue; + dl = rq->rq_deadline; + if (!sched_interactive && pcpu != cpu) + dl <<= locality_diff(pcpu, rq); if (rq_prio > highest_prio || - deadline_after(rq->rq_deadline, latest_deadline)) { - latest_deadline = rq->rq_deadline; + deadline_after(dl, latest_deadline)) { + latest_deadline = dl; highest_prio = rq_prio; + highest_cpu = cpu; highest_prio_rq = rq; } } - if (likely(highest_prio_rq)) { -#ifdef CONFIG_SMT_NICE - cpu = cpu_of(highest_prio_rq); - if (!smt_should_schedule(p, cpu)) - return; -#endif - if (can_preempt(p, highest_prio, latest_deadline)) { - /* - * If we have decided this task should preempt this CPU, - * set the task's CPU to match so there is no discrepancy - * in earliest_deadline_task which biases away tasks with - * a different CPU set. This means waking tasks are - * treated differently to rescheduling tasks. - */ - set_task_cpu(p, cpu); - resched_curr(highest_prio_rq); - } + if (unlikely(!highest_prio_rq)) + return; + if (!smt_schedule(p, highest_prio_rq)) + return; + if (can_preempt(p, highest_prio, latest_deadline)) { + /* + * If we have decided this task should preempt this CPU, + * set the task's CPU to match so there is no discrepancy + * in earliest_deadline_task which biases away tasks with + * a different CPU set. This means waking tasks are + * treated differently to rescheduling tasks in + * interactive mode. 
+ */ + set_task_cpu(p, highest_cpu); + resched_curr(highest_prio_rq); } } static int __set_cpus_allowed_ptr(struct task_struct *p, @@ -1723,7 +1825,7 @@ int sched_fork(unsigned long __maybe_unused clone_flags, struct task_struct *p) p->sched_reset_on_fork = 0; } - INIT_LIST_HEAD(&p->run_list); + p->node = NULL; #ifdef CONFIG_SCHED_INFO if (unlikely(sched_info_on())) memset(&p->sched_info, 0, sizeof(p->sched_info)); @@ -3072,6 +3174,8 @@ void scheduler_tick(void) /* grq lock not grabbed, so only update rq clock */ update_rq_clock(rq); update_cpu_clock_tick(rq, rq->curr); + update_load_avg(rq); + cpufreq_trigger(grq.niffies, rq->load_avg); if (!rq_idle(rq)) task_running_tick(rq); else @@ -3280,101 +3384,56 @@ found_middle: } /* - * O(n) lookup of all tasks in the global runqueue. The real brainfuck - * of lock contention and O(n). It's not really O(n) as only the queued, - * but not running tasks are scanned, and is O(n) queued in the worst case - * scenario only because the right task can be found before scanning all of - * them. - * Tasks are selected in this order: - * Real time tasks are selected purely by their static priority and in the - * order they were queued, so the lowest value idx, and the first queued task - * of that priority value is chosen. - * If no real time tasks are found, the SCHED_ISO priority is checked, and - * all SCHED_ISO tasks have the same priority value, so they're selected by - * the earliest deadline value. - * If no SCHED_ISO tasks are found, SCHED_NORMAL tasks are selected by the - * earliest deadline. - * Finally if no SCHED_NORMAL tasks are found, SCHED_IDLEPRIO tasks are - * selected by the earliest deadline. + * Task selection with skiplists is a simple matter of picking off the first + * task in the sorted list, an O(1) operation. The only time it takes longer + * is if tasks do not have suitable affinity and then we iterate over entries + * till we find the first that does. Worst case here is no tasks with suitable + * affinity and taking O(n). */ static inline struct task_struct *earliest_deadline_task(struct rq *rq, int cpu, struct task_struct *idle) { - struct task_struct *edt = NULL; - unsigned long idx = -1; + struct task_struct *edt = idle; + skiplist_node *node = grq.node; + u64 earliest_deadline = ~0ULL; - do { - struct list_head *queue; - struct task_struct *p; - u64 earliest_deadline; - - idx = next_sched_bit(grq.prio_bitmap, ++idx); - if (idx >= PRIO_LIMIT) - return idle; - queue = grq.queue + idx; - - if (idx < MAX_RT_PRIO) { - /* We found an rt task */ - list_for_each_entry(p, queue, run_list) { - /* Make sure cpu affinity is ok */ - if (needs_other_cpu(p, cpu)) - continue; - edt = p; - goto out_take; - } - /* - * None of the RT tasks at this priority can run on - * this cpu - */ + while ((node = node->next[0]) != grq.node) { + struct task_struct *p = node->value; + int tcpu; + + /* Make sure affinity is ok */ + if (needs_other_cpu(p, cpu)) continue; - } - /* - * No rt tasks. Find the earliest deadline task. Now we're in - * O(n) territory. 
- */ - earliest_deadline = ~0ULL; - list_for_each_entry(p, queue, run_list) { + if (!smt_schedule(p, rq)) + continue; + + if (!sched_interactive && (tcpu = task_cpu(p)) != cpu) { u64 dl; - /* Make sure cpu affinity is ok */ - if (needs_other_cpu(p, cpu)) + if (task_sticky(p) && scaling_rq(rq)) continue; - -#ifdef CONFIG_SMT_NICE - if (!smt_should_schedule(p, cpu)) + dl = p->deadline << locality_diff(tcpu, rq); + if (unlikely(!deadline_before(dl, earliest_deadline))) continue; -#endif - /* - * Soft affinity happens here by not scheduling a task - * with its sticky flag set that ran on a different CPU - * last when the CPU is scaling, or by greatly biasing - * against its deadline when not, based on cpu cache - * locality. - */ - if (sched_interactive) - dl = p->deadline; - else { - int tcpu = task_cpu(p); - - if (tcpu != cpu && task_sticky(p) && scaling_rq(rq)) - continue; - dl = p->deadline << locality_diff(tcpu, rq); - } - - if (deadline_before(dl, earliest_deadline)) { - earliest_deadline = dl; - edt = p; - } + earliest_deadline = dl; + edt = p; + /* We continue even though we've found the earliest + * deadline task as the locality offset means there + * may be a better candidate after it. */ + continue; } - } while (!edt); - -out_take: - take_task(cpu, edt); + /* This wouldn't happen if we encountered a better deadline from + * another CPU and have already set edt. */ + if (likely(p->deadline < earliest_deadline)) + edt = p; + break; + } + if (likely(edt != idle)) + take_task(cpu, edt); return edt; } - /* * Print scheduling while atomic bug: */ @@ -3454,44 +3513,47 @@ static void reset_rq_task(struct rq *rq, struct task_struct *p) } #ifdef CONFIG_SMT_NICE +static void check_no_siblings(struct rq __maybe_unused *this_rq) {} +static void wake_no_siblings(struct rq __maybe_unused *this_rq) {} +static void (*check_siblings)(struct rq *this_rq) = &check_no_siblings; +static void (*wake_siblings)(struct rq *this_rq) = &wake_no_siblings; + /* Iterate over smt siblings when we've scheduled a process on cpu and decide * whether they should continue running or be descheduled. */ -static void check_smt_siblings(int cpu) +static void check_smt_siblings(struct rq *this_rq) { int other_cpu; - for_each_cpu(other_cpu, thread_cpumask(cpu)) { + for_each_cpu(other_cpu, &this_rq->thread_mask) { struct task_struct *p; struct rq *rq; - if (other_cpu == cpu) - continue; rq = cpu_rq(other_cpu); if (rq_idle(rq)) continue; if (!rq->online) continue; p = rq->curr; - if (!smt_should_schedule(p, cpu)) { + if (!smt_should_schedule(p, this_rq)) { set_tsk_need_resched(p); smp_send_reschedule(other_cpu); } } } -static void wake_smt_siblings(int cpu) +static void wake_smt_siblings(struct rq *this_rq) { int other_cpu; if (!queued_notrunning()) return; - for_each_cpu(other_cpu, thread_cpumask(cpu)) { + for_each_cpu(other_cpu, &this_rq->thread_mask) { struct rq *rq; - if (other_cpu == cpu) - continue; rq = cpu_rq(other_cpu); + if (!rq->online) + continue; if (rq_idle(rq)) { struct task_struct *p = rq->curr; @@ -3501,8 +3563,8 @@ static void wake_smt_siblings(int cpu) } } #else -static void check_smt_siblings(int __maybe_unused cpu) {} -static void wake_smt_siblings(int __maybe_unused cpu) {} +static void check_siblings(struct rq __maybe_unused *this_rq) {} +static void wake_siblings(struct rq __maybe_unused *this_rq) {} #endif /* @@ -3639,7 +3701,7 @@ static void __sched notrace __schedule(bool preempt) * again. 
*/ set_rq_task(rq, prev); - check_smt_siblings(cpu); + check_siblings(rq); grq_unlock_irq(); goto rerun_prev_unlocked; } else @@ -3679,9 +3741,9 @@ static void __sched notrace __schedule(bool preempt) unstick_task(rq, prev); set_rq_task(rq, next); if (next != idle) - check_smt_siblings(cpu); + check_siblings(rq); else - wake_smt_siblings(cpu); + wake_siblings(rq); grq.nr_switches++; prev->on_cpu = false; next->on_cpu = true; @@ -3693,7 +3755,7 @@ static void __sched notrace __schedule(bool preempt) cpu = cpu_of(rq); idle = rq->idle; } else { - check_smt_siblings(cpu); + check_siblings(rq); grq_unlock_irq(); } @@ -7107,9 +7169,9 @@ int sched_cpu_dying(unsigned int cpu) * Cheaper version of the below functions in case support for SMT and MC is * compiled in but CPUs have no siblings. */ -static bool sole_cpu_idle(int cpu) +static bool sole_cpu_idle(struct rq *rq) { - return rq_idle(cpu_rq(cpu)); + return rq_idle(rq); } #endif #ifdef CONFIG_SCHED_SMT @@ -7118,9 +7180,9 @@ static const cpumask_t *thread_cpumask(int cpu) return topology_sibling_cpumask(cpu); } /* All this CPU's SMT siblings are idle */ -static bool siblings_cpu_idle(int cpu) +static bool siblings_cpu_idle(struct rq *rq) { - return cpumask_subset(thread_cpumask(cpu), &grq.cpu_idle_map); + return cpumask_subset(&rq->thread_mask, &grq.cpu_idle_map); } #endif #ifdef CONFIG_SCHED_MC @@ -7129,9 +7191,9 @@ static const cpumask_t *core_cpumask(int cpu) return topology_core_cpumask(cpu); } /* All this CPU's shared cache siblings are idle */ -static bool cache_cpu_idle(int cpu) +static bool cache_cpu_idle(struct rq *rq) { - return cpumask_subset(core_cpumask(cpu), &grq.cpu_idle_map); + return cpumask_subset(&rq->core_mask, &grq.cpu_idle_map); } #endif @@ -7150,6 +7212,9 @@ void __init sched_init_smp(void) { struct sched_domain *sd; int cpu, other_cpu; +#ifdef CONFIG_SCHED_SMT + bool smt_threads = false; +#endif cpumask_var_t non_isolated_cpus; @@ -7209,16 +7274,31 @@ void __init sched_init_smp(void) if (rq->cpu_locality[other_cpu] > 2) rq->cpu_locality[other_cpu] = 2; } - if (cpumask_weight(core_cpumask(cpu)) > 1) + if (cpumask_weight(core_cpumask(cpu)) > 1) { + cpumask_copy(&rq->core_mask, core_cpumask(cpu)); + cpumask_clear_cpu(cpu, &rq->core_mask); rq->cache_idle = cache_cpu_idle; + } #endif #ifdef CONFIG_SCHED_SMT for_each_cpu(other_cpu, thread_cpumask(cpu)) rq->cpu_locality[other_cpu] = 1; - if (cpumask_weight(thread_cpumask(cpu)) > 1) + if (cpumask_weight(thread_cpumask(cpu)) > 1) { + cpumask_copy(&rq->thread_mask, thread_cpumask(cpu)); + cpumask_clear_cpu(cpu, &rq->thread_mask); rq->siblings_idle = siblings_cpu_idle; + smt_threads = true; + } #endif } +#ifdef CONFIG_SMT_NICE + if (smt_threads) { + check_siblings = &check_smt_siblings; + wake_siblings = &wake_smt_siblings; + smt_schedule = &smt_should_schedule; + rq_load_avg = &smt_load_avg; + } +#endif grq_unlock_irq(); mutex_unlock(&sched_domains_mutex); @@ -7245,6 +7325,32 @@ int in_sched_functions(unsigned long addr) && addr < (unsigned long)__sched_text_end); } +#ifdef CONFIG_CGROUP_SCHED +/* task group related information */ +struct task_group { + struct cgroup_subsys_state css; + + struct rcu_head rcu; + struct list_head list; + + struct task_group *parent; + struct list_head siblings; + struct list_head children; +}; + +/* + * Default task group. + * Every task in system belongs to this group at bootup. 
+ */ +struct task_group root_task_group; +LIST_HEAD(task_groups); + +/* Cacheline aligned slab cache for task_group */ +static struct kmem_cache *task_group_cache __read_mostly; +/* task_group_lock serializes the addition/removal of task groups */ +static DEFINE_SPINLOCK(task_group_lock); +#endif /* CONFIG_CGROUP_SCHED */ + void __init sched_init(void) { #ifdef CONFIG_SMP @@ -7265,6 +7371,9 @@ void __init sched_init(void) grq.iso_ticks = 0; grq.iso_refractory = false; grq.noc = 1; + grq.node = skiplist_init(); + grq.sl = new_skiplist(grq.node); + #ifdef CONFIG_SMP init_defrootdomain(); grq.qnr = grq.idle_cpus = 0; @@ -7272,6 +7381,14 @@ void __init sched_init(void) #else uprq = &per_cpu(runqueues, 0); #endif + +#ifdef CONFIG_CGROUP_SCHED + task_group_cache = KMEM_CACHE(task_group, 0); + + list_add(&root_task_group.list, &task_groups); + INIT_LIST_HEAD(&root_task_group.children); + INIT_LIST_HEAD(&root_task_group.siblings); +#endif /* CONFIG_CGROUP_SCHED */ for_each_possible_cpu(i) { rq = cpu_rq(i); rq->grq_lock = &grq.lock; @@ -7316,11 +7433,6 @@ void __init sched_init(void) } #endif - for (i = 0; i < PRIO_LIMIT; i++) - INIT_LIST_HEAD(grq.queue + i); - /* delimiter for bitsearch */ - __set_bit(PRIO_LIMIT, grq.prio_bitmap); - #ifdef CONFIG_PREEMPT_NOTIFIERS INIT_HLIST_HEAD(&init_task.preempt_notifiers); #endif @@ -7702,3 +7814,127 @@ unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu) return smt_gain; } #endif + +#ifdef CONFIG_CGROUP_SCHED +static void sched_free_group(struct task_group *tg) +{ + kmem_cache_free(task_group_cache, tg); +} + +/* allocate runqueue etc for a new task group */ +struct task_group *sched_create_group(struct task_group *parent) +{ + struct task_group *tg; + + tg = kmem_cache_alloc(task_group_cache, GFP_KERNEL | __GFP_ZERO); + if (!tg) + return ERR_PTR(-ENOMEM); + + return tg; +} + +void sched_online_group(struct task_group *tg, struct task_group *parent) +{ + unsigned long flags; + + spin_lock_irqsave(&task_group_lock, flags); + list_add_rcu(&tg->list, &task_groups); + + WARN_ON(!parent); /* root should already exist */ + + tg->parent = parent; + INIT_LIST_HEAD(&tg->children); + list_add_rcu(&tg->siblings, &parent->children); + spin_unlock_irqrestore(&task_group_lock, flags); +} + +/* rcu callback to free various structures associated with a task group */ +static void sched_free_group_rcu(struct rcu_head *rhp) +{ + /* now it should be safe to free those cfs_rqs */ + sched_free_group(container_of(rhp, struct task_group, rcu)); +} + +void sched_destroy_group(struct task_group *tg) +{ + /* wait for possible concurrent references to cfs_rqs complete */ + call_rcu(&tg->rcu, sched_free_group_rcu); +} + +void sched_offline_group(struct task_group *tg) +{ + unsigned long flags; + + spin_lock_irqsave(&task_group_lock, flags); + list_del_rcu(&tg->list); + list_del_rcu(&tg->siblings); + spin_unlock_irqrestore(&task_group_lock, flags); +} + +static inline struct task_group *css_tg(struct cgroup_subsys_state *css) +{ + return css ? 
container_of(css, struct task_group, css) : NULL; +} + +static struct cgroup_subsys_state * +cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) +{ + struct task_group *parent = css_tg(parent_css); + struct task_group *tg; + + if (!parent) { + /* This is early initialization for the top cgroup */ + return &root_task_group.css; + } + + tg = sched_create_group(parent); + if (IS_ERR(tg)) + return ERR_PTR(-ENOMEM); + return &tg->css; +} + +static void cpu_cgroup_css_released(struct cgroup_subsys_state *css) +{ + struct task_group *tg = css_tg(css); + + sched_offline_group(tg); +} + +static void cpu_cgroup_css_free(struct cgroup_subsys_state *css) +{ + struct task_group *tg = css_tg(css); + + /* + * Relies on the RCU grace period between css_released() and this. + */ + sched_free_group(tg); +} + +static void cpu_cgroup_fork(struct task_struct *task) +{ +} + +static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) +{ + return 0; +} + +static void cpu_cgroup_attach(struct cgroup_taskset *tset) +{ +} + +static struct cftype cpu_files[] = { + { } /* terminate */ +}; + +struct cgroup_subsys cpu_cgrp_subsys = { + .css_alloc = cpu_cgroup_css_alloc, + .css_released = cpu_cgroup_css_released, + .css_free = cpu_cgroup_css_free, + .fork = cpu_cgroup_fork, + .can_attach = cpu_cgroup_can_attach, + .attach = cpu_cgroup_attach, + .legacy_cftypes = cpu_files, + .early_init = true, +}; +#endif /* CONFIG_CGROUP_SCHED */ diff --git a/kernel/sched/bfs_sched.h b/kernel/sched/bfs_sched.h index 9ab0ec66a..06b2cdca7 100644 --- a/kernel/sched/bfs_sched.h +++ b/kernel/sched/bfs_sched.h @@ -24,6 +24,8 @@ struct rq { int rq_prio; bool rq_running; /* There is a task running */ int soft_affined; /* Running or queued tasks with this set as their rq */ + u64 load_update; /* When we last updated load */ + unsigned long load_avg; /* Rolling load average */ #ifdef CONFIG_SMT_NICE struct mm_struct *rq_mm; int rq_smt_bias; /* Policy/nice level bias across smt siblings */ @@ -44,11 +46,13 @@ struct rq { struct sched_domain *sd; int *cpu_locality; /* CPU relative cache distance */ #ifdef CONFIG_SCHED_SMT - bool (*siblings_idle)(int cpu); + cpumask_t thread_mask; + bool (*siblings_idle)(struct rq *rq); /* See if all smt siblings are idle */ #endif /* CONFIG_SCHED_SMT */ #ifdef CONFIG_SCHED_MC - bool (*cache_idle)(int cpu); + cpumask_t core_mask; + bool (*cache_idle)(struct rq *rq); /* See if all cache siblings are idle */ #endif /* CONFIG_SCHED_MC */ u64 last_niffy; /* Last time this RQ updated grq.niffies */ @@ -199,14 +203,29 @@ static inline void cpufreq_trigger(u64 time, unsigned long util) { struct update_util_data *data; + if (util > SCHED_CAPACITY_SCALE) + util = SCHED_CAPACITY_SCALE; data = rcu_dereference_sched(*this_cpu_ptr(&cpufreq_update_util_data)); if (data) - data->func(data, time, util, 0); + data->func(data, time, util, SCHED_CAPACITY_SCALE); +} + +static inline void other_cpufreq_trigger(int cpu, u64 time, unsigned long util) +{ + struct update_util_data *data; + + data = rcu_dereference_sched(*per_cpu_ptr(&cpufreq_update_util_data, cpu)); + if (data) + data->func(data, time, util, SCHED_CAPACITY_SCALE); } #else static inline void cpufreq_trigger(u64 time, unsigned long util) { } + +static inline void other_cpufreq_trigger(int cpu, u64 time, unsigned long util) +{ +} #endif /* CONFIG_CPU_FREQ */ #ifdef arch_scale_freq_capacity diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 75f98c549..a24cfb41d 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -603,19 +603,25 @@ 
static void cputime_adjust(struct task_cputime *curr, stime = curr->stime; utime = curr->utime; - if (utime == 0) { - stime = rtime; + /* + * If either stime or both stime and utime are 0, assume all runtime is + * userspace. Once a task gets some ticks, the monotonicy code at + * 'update' will ensure things converge to the observed ratio. + */ + if (stime == 0) { + utime = rtime; goto update; } - if (stime == 0) { - utime = rtime; + if (utime == 0) { + stime = rtime; goto update; } stime = scale_stime((__force u64)stime, (__force u64)rtime, (__force u64)(stime + utime)); +update: /* * Make sure stime doesn't go backwards; this preserves monotonicity * for utime because rtime is monotonic. @@ -638,7 +644,6 @@ static void cputime_adjust(struct task_cputime *curr, stime = rtime - utime; } -update: prev->stime = stime; prev->utime = utime; out: diff --git a/kernel/skip_lists.c b/kernel/skip_lists.c new file mode 100644 index 000000000..4d82ee18b --- /dev/null +++ b/kernel/skip_lists.c @@ -0,0 +1,177 @@ +/* + Copyright (C) 2011,2016 Con Kolivas. + + Code based on example originally by William Pugh. + +Skip Lists are a probabilistic alternative to balanced trees, as +described in the June 1990 issue of CACM and were invented by +William Pugh in 1987. + +A couple of comments about this implementation: +The routine randomLevel has been hard-coded to generate random +levels using p=0.25. It can be easily changed. + +The insertion routine has been implemented so as to use the +dirty hack described in the CACM paper: if a random level is +generated that is more than the current maximum level, the +current maximum level plus one is used instead. + +Levels start at zero and go up to MaxLevel (which is equal to +MaxNumberOfLevels-1). + +The routines defined in this file are: + +init: defines slnode + +new_skiplist: returns a new, empty list + +randomLevel: Returns a random level based on a u64 random seed passed to it. +In BFS, the "niffy" time is used for this purpose. + +insert(l,key, value): inserts the binding (key, value) into l. This operation +occurs in O(log n) time. + +delnode(slnode, l, node): deletes any binding of key from the l based on the +actual node value. This operation occurs in O(k) time where k is the +number of levels of the node in question (max 16). The original delete +function occurred in O(log n) time and involved a search. + +BFS Notes: In this implementation of skiplists, there are bidirectional +next/prev pointers and the insert function returns a pointer to the actual +node the value is stored. The key here is chosen by the scheduler so as to +sort tasks according to the priority list requirements and is no longer used +by the scheduler after insertion. The scheduler lookup, however, occurs in +O(1) time because it is always the first item in the level 0 linked list. +Since the task struct stores a copy of the node pointer upon skiplist_insert, +it can also remove it much faster than the original implementation with the +aid of prev<->next pointer manipulation and no searching. 
+ +*/ + +#include <linux/slab.h> +#include <linux/skip_lists.h> + +#define MaxNumberOfLevels 16 +#define MaxLevel (MaxNumberOfLevels - 1) +#define newNode kmalloc(sizeof(skiplist_node), GFP_ATOMIC) + +skiplist_node *skiplist_init(void) +{ + skiplist_node *slnode = newNode; + int i; + + BUG_ON(!slnode); + slnode->key = 0xFFFFFFFFFFFFFFFF; + slnode->level = 0; + slnode->value = NULL; + for (i = 0; i < MaxNumberOfLevels; i++) + slnode->next[i] = slnode->prev[i] = slnode; + return slnode; +} + +skiplist *new_skiplist(skiplist_node *slnode) +{ + skiplist *l = kmalloc(sizeof(skiplist), GFP_ATOMIC); + + BUG_ON(!l); + l->entries = 0; + l->level = 0; + l->header = slnode; + return l; +} + +void free_skiplist(skiplist_node *slnode, skiplist *l) +{ + skiplist_node *p, *q; + + p = l->header; + do { + q = p->next[0]; + p->next[0]->prev[0] = q->prev[0]; + kfree(p); + p = q; + } while (p != slnode); + kfree(l); +} + +/* + * Returns a pseudo-random number based on the randseed value by masking out + * 0-15. As many levels are not required when only few values are on the list, + * we limit the height of the levels according to how many list entries there + * are in a cheap manner. The height of the levels may have been higher while + * there were more entries queued previously but as this code is used only by + * the scheduler, entries are short lived and will be torn down regularly. + * + * 00-03 entries - 1 level + * 04-07 entries - 2 levels + * 08-15 entries - 4 levels + * 15-31 entries - 7 levels + * 32+ entries - max(16) levels + */ +static inline unsigned int randomLevel(int entries, unsigned int randseed) +{ + unsigned int mask; + + if (entries > 31) + mask = 0xF; + else if (entries > 15) + mask = 0x7; + else if (entries > 7) + mask = 0x3; + else if (entries > 3) + mask = 0x1; + else + return 0; + + return randseed & mask; +} + +skiplist_node *skiplist_insert(skiplist_node *slnode, skiplist *l, keyType key, valueType value, unsigned int randseed) +{ + skiplist_node *update[MaxNumberOfLevels]; + skiplist_node *p, *q; + int k = l->level; + + p = slnode; + do { + while (q = p->next[k], q->key <= key) + p = q; + update[k] = p; + } while (--k >= 0); + + k = randomLevel(++l->entries, randseed); + if (k > l->level) { + k = ++l->level; + update[k] = slnode; + } + + q = newNode; + q->level = k; + q->key = key; + q->value = value; + do { + p = update[k]; + q->next[k] = p->next[k]; + p->next[k] = q; + q->prev[k] = p; + q->next[k]->prev[k] = q; + } while (--k >= 0); + return q; +} + +void skiplist_delnode(skiplist_node *slnode, skiplist *l, skiplist_node *node) +{ + int k, m = node->level; + + for (k = 0; k <= m; k++) { + node->prev[k]->next[k] = node->next[k]; + node->next[k]->prev[k] = node->prev[k]; + } + kfree(node); + if (m == l->level) { + while (l->header->next[m] == slnode && l->header->prev[m] == slnode && m > 0) + m--; + l->level = m; + } + l->entries--; +} |
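The new skip list is driven entirely from bfs.c, so for readers unfamiliar with the API added above, the following is a hedged kernel-context sketch of the same usage pattern: insert with a stored node, O(1) peek at the head, O(level) delete. The demo_* names are illustrative only; keyType and valueType are assumed to be u64 and void * based on how enqueue_task(), dequeue_task() and earliest_deadline_task() use them, and <linux/skip_lists.h> is the header referenced by the patch but outside this kernel/-limited diffstat.

#include <linux/types.h>
#include <linux/skip_lists.h>

struct demo_item {
	u64 deadline;
	skiplist_node *node;	/* stored at insert time, like task_struct->node */
};

static skiplist_node *demo_head;	/* plays the role of grq.node */
static skiplist *demo_sl;		/* plays the role of grq.sl */

static void demo_setup(void)
{
	demo_head = skiplist_init();		/* sentinel node, key = ~0ULL */
	demo_sl = new_skiplist(demo_head);
}

static void demo_queue(struct demo_item *item, u64 niffies)
{
	/* ~microsecond-resolution seed, as enqueue_task() derives from grq.niffies */
	unsigned int randseed = (niffies >> 10) & 0xFFFFFFFF;

	item->node = skiplist_insert(demo_head, demo_sl, item->deadline, item, randseed);
}

static struct demo_item *demo_peek(void)
{
	/* lowest key is always first on level 0: O(1), as in earliest_deadline_task() */
	if (demo_head->next[0] == demo_head)
		return NULL;			/* list is empty */
	return demo_head->next[0]->value;
}

static void demo_dequeue(struct demo_item *item)
{
	/* unlink via the stored node, no search, as in dequeue_task() */
	skiplist_delnode(demo_head, demo_sl, item->node);
	item->node = NULL;
}

Because insertion keeps the list sorted by key, walking from demo_head->next[0] visits entries in key order, which is exactly how earliest_deadline_task() falls back to later entries when the head task has unsuitable CPU affinity.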
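For reference, the ordering that enqueue_task() encodes into the skip list key can be restated compactly as below. This is a hedged summary, not code from the patch; demo_sl_id() is hypothetical, while ISO_PRIO and IDLE_PRIO are the existing BFS priority levels used in the hunk above.

/*
 * Key layout (lower key = earlier in the queue):
 *   realtime tasks     -> key = prio                       (FIFO per priority)
 *   SCHED_ISO tasks    -> key = ISO_PRIO                   (FIFO, after realtime)
 *   NORMAL/BATCH tasks -> key = deadline                   (earliest deadline first)
 *   SCHED_IDLEPRIO     -> key = deadline | 0x0F00000000000000
 *                         (still deadline sorted, but always after the others)
 */
static u64 demo_sl_id(const struct task_struct *p)
{
	if (p->prio <= ISO_PRIO)
		return p->prio;
	if (p->prio == IDLE_PRIO)
		return p->deadline | 0x0F00000000000000ULL;
	return p->deadline;
}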
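The weighting used by the new update_load_avg() can also be checked in isolation. The sketch below is a plain userspace reduction of the same integer arithmetic; demo_decay() and main() are illustrative only, and cur_load stands in for the soft_affined * rq_load_avg() term. Running it shows the average climbing towards a steady value near 1024 over a few tens of milliseconds, consistent with the rolling average of roughly 32ms described in the comment.

#include <stdio.h>

/* One update step: same integer arithmetic as update_load_avg() in bfs.c */
static long demo_decay(long load_avg, long cur_load, long us_interval)
{
	long load = load_avg - (load_avg * us_interval * 80 / 32768 / 128);

	if (load < 0)
		load = 0;
	return load + cur_load * us_interval * 80 / 32768 / 128;
}

int main(void)
{
	long avg = 0;
	int i;

	/* a fully loaded CPU (SCHED_CAPACITY_SCALE == 1024) sampled every 4ms */
	for (i = 0; i < 32; i++) {
		avg = demo_decay(avg, 1024, 4000);
		printf("after %3d ms: %ld\n", (i + 1) * 4, avg);
	}
	return 0;
}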