summaryrefslogtreecommitdiff
path: root/kernel/sched
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/sched')
-rw-r--r--kernel/sched/bfs.c500
-rw-r--r--kernel/sched/bfs_sched.h2
-rw-r--r--kernel/sched/core.c141
-rw-r--r--kernel/sched/cpuacct.c114
-rw-r--r--kernel/sched/cpudeadline.c2
-rw-r--r--kernel/sched/cpufreq_schedutil.c74
-rw-r--r--kernel/sched/cputime.c203
-rw-r--r--kernel/sched/deadline.c5
-rw-r--r--kernel/sched/debug.c2
-rw-r--r--kernel/sched/fair.c251
-rw-r--r--kernel/sched/idle.c4
-rw-r--r--kernel/sched/sched.h23
12 files changed, 727 insertions, 594 deletions
diff --git a/kernel/sched/bfs.c b/kernel/sched/bfs.c
index 67f93e752..bb5bac4b2 100644
--- a/kernel/sched/bfs.c
+++ b/kernel/sched/bfs.c
@@ -24,7 +24,7 @@
* 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri
* 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins,
* Thomas Gleixner, Mike Kravetz
- * now Brainfuck deadline scheduling policy by Con Kolivas deletes
+ * 2009-08-13 Brainfuck deadline scheduling policy by Con Kolivas deletes
* a whole lot of those previous things.
*/
@@ -137,7 +137,7 @@
void print_scheduler_version(void)
{
- printk(KERN_INFO "BFS CPU scheduler v0.502 by Con Kolivas.\n");
+ printk(KERN_INFO "BFS CPU scheduler v0.512 by Con Kolivas.\n");
}
/*
@@ -403,7 +403,6 @@ static inline void grq_lock_irq(void)
}
static inline void time_lock_grq(struct rq *rq)
- __acquires(grq.lock)
{
grq_lock();
update_clocks(rq);
@@ -429,86 +428,35 @@ static inline void grq_unlock_irqrestore(unsigned long *flags)
static inline struct rq
*task_grq_lock(struct task_struct *p, unsigned long *flags)
- __acquires(grq.lock)
+ __acquires(p->pi_lock)
{
- grq_lock_irqsave(flags);
+ raw_spin_lock_irqsave(&p->pi_lock, *flags);
+ grq_lock();
return task_rq(p);
}
static inline struct rq
*time_task_grq_lock(struct task_struct *p, unsigned long *flags)
- __acquires(grq.lock)
{
struct rq *rq = task_grq_lock(p, flags);
- update_clocks(rq);
- return rq;
-}
-static inline struct rq *task_grq_lock_irq(struct task_struct *p)
- __acquires(grq.lock)
-{
- grq_lock_irq();
- return task_rq(p);
-}
-
-static inline void time_task_grq_lock_irq(struct task_struct *p)
- __acquires(grq.lock)
-{
- struct rq *rq = task_grq_lock_irq(p);
update_clocks(rq);
+ return rq;
}
-static inline void task_grq_unlock_irq(void)
- __releases(grq.lock)
-{
- grq_unlock_irq();
-}
-
-static inline void task_grq_unlock(unsigned long *flags)
- __releases(grq.lock)
-{
- grq_unlock_irqrestore(flags);
-}
-
-/**
- * grunqueue_is_locked
- *
- * Returns true if the global runqueue is locked.
- * This interface allows printk to be called with the runqueue lock
- * held and know whether or not it is OK to wake up the klogd.
- */
-bool grunqueue_is_locked(void)
-{
- return raw_spin_is_locked(&grq.lock);
-}
-
-void grq_unlock_wait(void)
- __releases(grq.lock)
+static inline void task_grq_unlock(struct task_struct *p, unsigned long *flags)
+ __releases(p->pi_lock)
{
- smp_mb(); /* spin-unlock-wait is not a full memory barrier */
- raw_spin_unlock_wait(&grq.lock);
+ grq_unlock();
+ raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
}
static inline void time_grq_lock(struct rq *rq, unsigned long *flags)
- __acquires(grq.lock)
{
local_irq_save(*flags);
time_lock_grq(rq);
}
-static inline struct rq *__task_grq_lock(struct task_struct *p)
- __acquires(grq.lock)
-{
- grq_lock();
- return task_rq(p);
-}
-
-static inline void __task_grq_unlock(void)
- __releases(grq.lock)
-{
- grq_unlock();
-}
-
static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
{
}
@@ -540,6 +488,40 @@ static inline bool deadline_after(u64 deadline, u64 time)
}
/*
+ * Deadline is "now" in niffies + (offset by priority). Setting the deadline
+ * is the key to everything. It distributes cpu fairly amongst tasks of the
+ * same nice value, it proportions cpu according to nice level, it means the
+ * task that last woke up the longest ago has the earliest deadline, thus
+ * ensuring that interactive tasks get low latency on wake up. The CPU
+ * proportion works out to the square of the virtual deadline difference, so
+ * this equation will give nice 19 3% CPU compared to nice 0.
+ */
+static inline u64 prio_deadline_diff(int user_prio)
+{
+ return (prio_ratios[user_prio] * rr_interval * (MS_TO_NS(1) / 128));
+}
+
+static inline u64 task_deadline_diff(struct task_struct *p)
+{
+ return prio_deadline_diff(TASK_USER_PRIO(p));
+}
+
+static inline u64 static_deadline_diff(int static_prio)
+{
+ return prio_deadline_diff(USER_PRIO(static_prio));
+}
+
+static inline int longest_deadline_diff(void)
+{
+ return prio_deadline_diff(39);
+}
+
+static inline int ms_longest_deadline_diff(void)
+{
+ return NS_TO_MS(longest_deadline_diff());
+}
+
+/*
* A task that is not running or queued will not have a node set.
* A task that is queued but not running will have a node set.
* A task that is currently running will have ->on_cpu set but no node set.
@@ -561,14 +543,23 @@ static void dequeue_task(struct task_struct *p)
sched_info_dequeued(task_rq(p), p);
}
+#ifdef CONFIG_PREEMPT_RCU
+static bool rcu_read_critical(struct task_struct *p)
+{
+ return p->rcu_read_unlock_special.b.blocked;
+}
+#else /* CONFIG_PREEMPT_RCU */
+#define rcu_read_critical(p) (false)
+#endif /* CONFIG_PREEMPT_RCU */
+
/*
* To determine if it's safe for a task of SCHED_IDLEPRIO to actually run as
* an idle task, we ensure none of the following conditions are met.
*/
static bool idleprio_suitable(struct task_struct *p)
{
- return (!freezing(p) && !signal_pending(p) &&
- !(task_contributes_to_load(p)) && !(p->flags & (PF_EXITING)));
+ return (!(task_contributes_to_load(p)) && !(p->flags & (PF_EXITING)) &&
+ !signal_pending(p) && !rcu_read_critical(p) && !freezing(p));
}
/*
@@ -612,9 +603,13 @@ static void enqueue_task(struct task_struct *p, struct rq *rq)
sl_id = p->prio;
else {
sl_id = p->deadline;
- /* Set it to cope with 4 left shifts with locality_diff */
- if (p->prio == IDLE_PRIO)
- sl_id |= 0x0F00000000000000;
+ if (idleprio_task(p)) {
+ /* Set it to cope with 4 left shifts with locality_diff */
+ if (p->prio == IDLE_PRIO)
+ sl_id |= 0x00FF000000000000;
+ else
+ sl_id += longest_deadline_diff();
+ }
}
/*
* Some architectures don't have better than microsecond resolution
@@ -1008,15 +1003,18 @@ static inline void deactivate_task(struct task_struct *p, struct rq *rq)
#ifdef CONFIG_SMP
void set_task_cpu(struct task_struct *p, unsigned int cpu)
{
- unsigned int tcpu;
-
#ifdef CONFIG_LOCKDEP
/*
- * The caller should hold grq lock.
+ * The caller should hold either p->pi_lock or grq lock, when changing
+ * a task's CPU. ->pi_lock for waking tasks, grq lock for runnable tasks.
+ *
+ * Furthermore, all task_rq users should acquire both locks, see
+ * task_grq_lock().
*/
- WARN_ON_ONCE(debug_locks && !lockdep_is_held(&grq.lock));
+ WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
+ lockdep_is_held(&grq.lock)));
#endif
- if ((tcpu = task_cpu(p)) == cpu)
+ if (task_cpu(p) == cpu)
return;
trace_sched_migrate_task(p, cpu);
perf_event_task_migrate(p);
@@ -1027,6 +1025,7 @@ void set_task_cpu(struct task_struct *p, unsigned int cpu)
* per-task data have been completed by this moment.
*/
smp_wmb();
+
if (p->on_rq) {
struct rq *rq = task_rq(p);
@@ -1166,7 +1165,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
ncsw = 0;
if (!match_state || p->state == match_state)
ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
- task_grq_unlock(&flags);
+ task_grq_unlock(p, &flags);
/*
* If it changed from the expected state, bail out now.
@@ -1292,9 +1291,7 @@ static inline bool needs_other_cpu(struct task_struct *p, int cpu)
static void try_preempt(struct task_struct *p, struct rq *this_rq)
{
- int cpu, pcpu, highest_prio, highest_cpu;
- struct rq *highest_prio_rq;
- u64 latest_deadline;
+ int i, this_entries = this_rq->soft_affined;
cpumask_t tmp;
if (suitable_idle_cpus(p) && resched_best_idle(p))
@@ -1306,56 +1303,32 @@ static void try_preempt(struct task_struct *p, struct rq *this_rq)
cpumask_and(&tmp, &cpu_online_map, &p->cpus_allowed);
- /* See if this task can preempt the task on the current CPU first. */
- pcpu = cpu_of(this_rq);
- if (likely(cpumask_test_cpu(pcpu, &tmp))) {
- if (smt_schedule(p, this_rq) && can_preempt(p, this_rq->rq_prio, this_rq->rq_deadline)) {
- resched_curr(this_rq);
- return;
- }
- cpumask_clear_cpu(pcpu, &tmp);
- }
-
- highest_prio = latest_deadline = 0;
- highest_prio_rq = NULL;
-
- /* Now look for the CPU with the latest deadline */
- for_each_cpu(cpu, &tmp) {
- struct rq *rq;
- int rq_prio;
- u64 dl;
+ /*
+ * We iterate over CPUs in locality order using rq_order, finding the
+ * first one we can preempt if possible, thus staying closest in
+ * locality.
+ */
+ for (i = 0; i < num_possible_cpus(); i++) {
+ struct rq *rq = this_rq->rq_order[i];
- rq = cpu_rq(cpu);
- rq_prio = rq->rq_prio;
- if (rq_prio < highest_prio)
+ if (!cpumask_test_cpu(rq->cpu, &tmp))
continue;
- dl = rq->rq_deadline;
- if (!sched_interactive && pcpu != cpu)
- dl <<= locality_diff(pcpu, rq);
- if (rq_prio > highest_prio ||
- deadline_after(dl, latest_deadline)) {
- latest_deadline = dl;
- highest_prio = rq_prio;
- highest_cpu = cpu;
- highest_prio_rq = rq;
+ if (!sched_interactive && rq != this_rq && rq->soft_affined <= this_entries)
+ continue;
+ if (smt_schedule(p, rq) && can_preempt(p, rq->rq_prio, rq->rq_deadline)) {
+ /*
+ * If we have decided this task should preempt this CPU,
+ * set the task's CPU to match thereby speeding up matching
+ * this task in earliest_deadline_task.
+ */
+ set_task_cpu(p, rq->cpu);
+ resched_curr(rq);
+ return;
}
}
-
- if (unlikely(!highest_prio_rq))
- return;
- if (!smt_schedule(p, highest_prio_rq))
- return;
- if (can_preempt(p, highest_prio, latest_deadline)) {
- /*
- * If we have decided this task should preempt this CPU,
- * set the task's CPU to match thereby speeding up matching
- * this task in earliest_deadline_task.
- */
- set_task_cpu(p, highest_cpu);
- resched_curr(highest_prio_rq);
- }
}
+
static int __set_cpus_allowed_ptr(struct task_struct *p,
const struct cpumask *new_mask, bool check);
#else /* CONFIG_SMP */
@@ -1501,8 +1474,6 @@ static bool try_to_wake_up(struct task_struct *p, unsigned int state,
struct rq *rq;
int cpu;
- get_cpu();
-
/*
* If we are going to wake up a thread waiting for CONDITION we
* need to ensure that CONDITION=1 done by the caller can not be
@@ -1533,13 +1504,11 @@ static bool try_to_wake_up(struct task_struct *p, unsigned int state,
out_running:
ttwu_post_activation(p, rq, success);
out_unlock:
- task_grq_unlock(&flags);
+ task_grq_unlock(p, &flags);
if (schedstat_enabled())
ttwu_stat(p, cpu, wake_flags);
- put_cpu();
-
return success;
}
@@ -1629,6 +1598,13 @@ int sched_fork(unsigned long __maybe_unused clone_flags, struct task_struct *p)
skiplist_node_init(&p->node);
/*
+ * We mark the process as NEW here. This guarantees that
+ * nobody will actually run it, and a signal or other external
+ * event cannot wake it up and insert it on the runqueue either.
+ */
+ p->state = TASK_NEW;
+
+ /*
* Revert to default priority/policy on fork if requested.
*/
if (unlikely(p->sched_reset_on_fork)) {
@@ -1744,12 +1720,16 @@ static inline void init_schedstats(void) {}
*/
void wake_up_new_task(struct task_struct *p)
{
- struct task_struct *parent;
+ struct task_struct *parent, *rq_curr;
+ struct rq *rq, *new_rq;
unsigned long flags;
- struct rq *rq;
parent = p->parent;
rq = task_grq_lock(p, &flags);
+ if (unlikely(needs_other_cpu(p, task_cpu(p))))
+ set_task_cpu(p, cpumask_any(tsk_cpus_allowed(p)));
+ rq_curr = rq->curr;
+ p->state = TASK_RUNNING;
/*
* Reinit new task deadline as its creator deadline could have changed
@@ -1757,22 +1737,20 @@ void wake_up_new_task(struct task_struct *p)
*/
p->deadline = rq->rq_deadline;
- /*
- * If the task is a new process, current and parent are the same. If
- * the task is a new thread in the thread group, it will have much more
- * in common with current than with the parent.
- */
- set_task_cpu(p, task_cpu(rq->curr));
+ /* The new task might not be able to run on the same CPU as rq->curr */
+ if (unlikely(needs_other_cpu(p, task_cpu(p)))) {
+ set_task_cpu(p, cpumask_any(tsk_cpus_allowed(p)));
+ new_rq = task_rq(p);
+ } else
+ new_rq = rq;
/*
* Make sure we do not leak PI boosting priority to the child.
*/
- p->prio = rq->curr->normal_prio;
+ p->prio = rq_curr->normal_prio;
activate_task(p, rq);
trace_sched_wakeup_new(p);
- if (unlikely(p->policy == SCHED_FIFO))
- goto after_ts_init;
/*
* Share the timeslice between parent and child, thus the
@@ -1784,33 +1762,39 @@ void wake_up_new_task(struct task_struct *p)
* is always equal to current->deadline.
*/
p->last_ran = rq->rq_last_ran;
- if (likely(rq->rq_time_slice >= RESCHED_US * 2)) {
+ if (likely(rq_curr->policy != SCHED_FIFO)) {
rq->rq_time_slice /= 2;
- p->time_slice = rq->rq_time_slice;
-after_ts_init:
- if (rq->curr == parent && !suitable_idle_cpus(p)) {
+ if (unlikely(rq->rq_time_slice < RESCHED_US)) {
/*
- * The VM isn't cloned, so we're in a good position to
- * do child-runs-first in anticipation of an exec. This
- * usually avoids a lot of COW overhead.
+ * Forking task has run out of timeslice. Reschedule it and
+ * start its child with a new time slice and deadline. The
+ * child will end up running first because its deadline will
+ * be slightly earlier.
*/
- __set_tsk_resched(parent);
- } else
- try_preempt(p, rq);
- } else {
- if (rq->curr == parent) {
- /*
- * Forking task has run out of timeslice. Reschedule it and
- * start its child with a new time slice and deadline. The
- * child will end up running first because its deadline will
- * be slightly earlier.
- */
rq->rq_time_slice = 0;
- __set_tsk_resched(parent);
+ __set_tsk_resched(rq_curr);
+ time_slice_expired(p);
+ if (suitable_idle_cpus(p))
+ resched_best_idle(p);
+ else if (unlikely(rq != new_rq))
+ try_preempt(p, new_rq);
+ } else {
+ p->time_slice = rq->rq_time_slice;
+ if (rq_curr == parent && rq == new_rq && !suitable_idle_cpus(p)) {
+ /*
+ * The VM isn't cloned, so we're in a good position to
+ * do child-runs-first in anticipation of an exec. This
+ * usually avoids a lot of COW overhead.
+ */
+ __set_tsk_resched(rq_curr);
+ } else
+ try_preempt(p, new_rq);
}
+ } else {
time_slice_expired(p);
+ try_preempt(p, new_rq);
}
- task_grq_unlock(&flags);
+ task_grq_unlock(p, &flags);
}
#ifdef CONFIG_PREEMPT_NOTIFIERS
@@ -2724,7 +2708,7 @@ unsigned long long task_sched_runtime(struct task_struct *p)
rq = task_grq_lock(p, &flags);
ns = p->sched_time + do_task_delta_exec(p, rq);
- task_grq_unlock(&flags);
+ task_grq_unlock(p, &flags);
return ns;
}
@@ -2978,7 +2962,7 @@ static void task_running_tick(struct rq *rq)
grq_lock();
requeue_task(p);
- __set_tsk_resched(p);
+ resched_task(p);
grq_unlock();
}
@@ -3083,40 +3067,6 @@ static inline void preempt_latency_stop(int val) { }
#endif
/*
- * Deadline is "now" in niffies + (offset by priority). Setting the deadline
- * is the key to everything. It distributes cpu fairly amongst tasks of the
- * same nice value, it proportions cpu according to nice level, it means the
- * task that last woke up the longest ago has the earliest deadline, thus
- * ensuring that interactive tasks get low latency on wake up. The CPU
- * proportion works out to the square of the virtual deadline difference, so
- * this equation will give nice 19 3% CPU compared to nice 0.
- */
-static inline u64 prio_deadline_diff(int user_prio)
-{
- return (prio_ratios[user_prio] * rr_interval * (MS_TO_NS(1) / 128));
-}
-
-static inline u64 task_deadline_diff(struct task_struct *p)
-{
- return prio_deadline_diff(TASK_USER_PRIO(p));
-}
-
-static inline u64 static_deadline_diff(int static_prio)
-{
- return prio_deadline_diff(USER_PRIO(static_prio));
-}
-
-static inline int longest_deadline_diff(void)
-{
- return prio_deadline_diff(39);
-}
-
-static inline int ms_longest_deadline_diff(void)
-{
- return NS_TO_MS(longest_deadline_diff());
-}
-
-/*
* The time_slice is only refilled when it is empty and that is when we set a
* new deadline.
*/
@@ -3215,13 +3165,12 @@ found_middle:
static inline struct
task_struct *earliest_deadline_task(struct rq *rq, int cpu, struct task_struct *idle)
{
- struct task_struct *edt = idle;
skiplist_node *node = &grq.node;
+ struct task_struct *edt = idle;
u64 earliest_deadline = ~0ULL;
while ((node = node->next[0]) != &grq.node) {
struct task_struct *p = node->value;
- int tcpu;
/* Make sure affinity is ok */
if (needs_other_cpu(p, cpu))
@@ -3230,22 +3179,24 @@ task_struct *earliest_deadline_task(struct rq *rq, int cpu, struct task_struct *
if (!smt_schedule(p, rq))
continue;
- if (!sched_interactive && (tcpu = task_cpu(p)) != cpu) {
- u64 dl = p->deadline << locality_diff(tcpu, rq);
+ if (!sched_interactive) {
+ int tcpu;
+
+ if ((tcpu = task_cpu(p)) != cpu) {
+ u64 dl = p->deadline << locality_diff(tcpu, rq);
- if (unlikely(!deadline_before(dl, earliest_deadline)))
+ if (!deadline_before(dl, earliest_deadline))
+ continue;
+ earliest_deadline = dl;
+ edt = p;
+ /* We continue even though we've found the earliest
+ * deadline task as the locality offset means there
+ * may be a better candidate after it. */
continue;
- earliest_deadline = dl;
- edt = p;
- /* We continue even though we've found the earliest
- * deadline task as the locality offset means there
- * may be a better candidate after it. */
- continue;
+ }
}
- /* This wouldn't happen if we encountered a better deadline from
- * another CPU and have already set edt. */
- if (likely(p->deadline < earliest_deadline))
- edt = p;
+ /* We've encountered the best deadline local task */
+ edt = p;
break;
}
if (likely(edt != idle))
@@ -3275,6 +3226,9 @@ static noinline void __schedule_bug(struct task_struct *prev)
pr_cont("\n");
}
#endif
+ if (panic_on_warn)
+ panic("scheduling while atomic\n");
+
dump_stack();
add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
}
@@ -3316,10 +3270,6 @@ static inline void set_rq_task(struct rq *rq, struct task_struct *p)
rq->rq_mm = p->mm;
rq->rq_smt_bias = p->smt_bias;
#endif
- if (p != rq->idle)
- rq->rq_running = true;
- else
- rq->rq_running = false;
}
static void reset_rq_task(struct rq *rq, struct task_struct *p)
@@ -3353,7 +3303,7 @@ static void check_smt_siblings(struct rq *this_rq)
if (unlikely(!rq->online))
continue;
p = rq->curr;
- if (!smt_should_schedule(p, this_rq)) {
+ if (!smt_schedule(p, this_rq)) {
set_tsk_need_resched(p);
smp_send_reschedule(other_cpu);
}
@@ -3546,8 +3496,6 @@ static void __sched notrace __schedule(bool preempt)
trace_sched_switch(preempt, prev, next);
rq = context_switch(rq, prev, next); /* unlocks the grq */
- cpu = cpu_of(rq);
- idle = rq->idle;
} else {
check_siblings(rq);
grq_unlock_irq();
@@ -3766,8 +3714,8 @@ EXPORT_SYMBOL(default_wake_function);
void rt_mutex_setprio(struct task_struct *p, int prio)
{
unsigned long flags;
- int queued, oldprio;
struct rq *rq;
+ int oldprio;
BUG_ON(prio < 0 || prio > MAX_PRIO);
@@ -3793,19 +3741,18 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
trace_sched_pi_setprio(p, prio);
oldprio = p->prio;
- queued = task_queued(p);
- if (queued)
- dequeue_task(p);
p->prio = prio;
- if (task_running(p) && prio > oldprio)
- resched_task(p);
- if (queued) {
+ if (task_running(p)){
+ if (prio > oldprio)
+ resched_task(p);
+ } else if (task_queued(p)) {
+ dequeue_task(p);
enqueue_task(p, rq);
- try_preempt(p, rq);
+ if (prio < oldprio)
+ try_preempt(p, rq);
}
-
out_unlock:
- task_grq_unlock(&flags);
+ task_grq_unlock(p, &flags);
}
#endif
@@ -3821,7 +3768,7 @@ static inline void adjust_deadline(struct task_struct *p, int new_prio)
void set_user_nice(struct task_struct *p, long nice)
{
- int queued, new_static, old_static;
+ int new_static, old_static;
unsigned long flags;
struct rq *rq;
@@ -3843,16 +3790,14 @@ void set_user_nice(struct task_struct *p, long nice)
p->static_prio = new_static;
goto out_unlock;
}
- queued = task_queued(p);
- if (queued)
- dequeue_task(p);
adjust_deadline(p, new_static);
old_static = p->static_prio;
p->static_prio = new_static;
p->prio = effective_prio(p);
- if (queued) {
+ if (task_queued(p)) {
+ dequeue_task(p);
enqueue_task(p, rq);
if (new_static < old_static)
try_preempt(p, rq);
@@ -3862,7 +3807,7 @@ void set_user_nice(struct task_struct *p, long nice)
resched_task(p);
}
out_unlock:
- task_grq_unlock(&flags);
+ task_grq_unlock(p, &flags);
}
EXPORT_SYMBOL(set_user_nice);
@@ -4002,11 +3947,15 @@ static void __setscheduler(struct task_struct *p, struct rq *rq, int policy,
p->prio = rt_mutex_get_effective_prio(p, p->normal_prio);
} else
p->prio = p->normal_prio;
+
if (task_running(p)) {
reset_rq_task(rq, p);
- /* Resched only if we might now be preempted */
- if (p->prio > oldprio || p->rt_priority > oldrtprio)
- resched_task(p);
+ resched_task(p);
+ } else if (task_queued(p)) {
+ dequeue_task(p);
+ enqueue_task(p, rq);
+ if (p->prio < oldprio || p->rt_priority > oldrtprio)
+ try_preempt(p, rq);
}
}
@@ -4031,8 +3980,8 @@ __sched_setscheduler(struct task_struct *p, int policy,
const struct sched_param *param, bool user, bool pi)
{
struct sched_param zero_param = { .sched_priority = 0 };
- int queued, retval, oldpolicy = -1;
unsigned long flags, rlim_rtprio = 0;
+ int retval, oldpolicy = -1;
int reset_on_fork;
struct rq *rq;
@@ -4142,20 +4091,17 @@ recheck:
/*
* make sure no PI-waiters arrive (or leave) while we are
* changing the priority of the task:
- */
- raw_spin_lock_irqsave(&p->pi_lock, flags);
- /*
+ *
* To be able to change p->policy safely, the grunqueue lock must be
* held.
*/
- rq = __task_grq_lock(p);
+ rq = task_grq_lock(p, &flags);
/*
* Changing the policy of the stop threads its a very bad idea
*/
if (p == rq->stop) {
- __task_grq_unlock();
- raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+ task_grq_unlock(p, &flags);
return -EINVAL;
}
@@ -4165,31 +4111,21 @@ recheck:
if (unlikely(policy == p->policy && (!is_rt_policy(policy) ||
param->sched_priority == p->rt_priority))) {
- __task_grq_unlock();
- raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+ task_grq_unlock(p, &flags);
return 0;
}
/* recheck policy now with rq lock held */
if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
policy = oldpolicy = -1;
- __task_grq_unlock();
- raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+ task_grq_unlock(p, &flags);
goto recheck;
}
update_clocks(rq);
p->sched_reset_on_fork = reset_on_fork;
- queued = task_queued(p);
- if (queued)
- dequeue_task(p);
__setscheduler(p, rq, policy, param->sched_priority, pi);
- if (queued) {
- enqueue_task(p, rq);
- try_preempt(p, rq);
- }
- __task_grq_unlock();
- raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+ task_grq_unlock(p, &flags);
if (pi)
rt_mutex_adjust_pi(p);
@@ -4706,7 +4642,8 @@ out_unlock:
* @len: length in bytes of the bitmask pointed to by user_mask_ptr
* @user_mask_ptr: user-space pointer to hold the current cpu mask
*
- * Return: 0 on success. An error code otherwise.
+ * Return: size of CPU mask copied to user_mask_ptr on success. An
+ * error code otherwise.
*/
SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
unsigned long __user *, user_mask_ptr)
@@ -5113,6 +5050,8 @@ void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_ma
void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
{
cpumask_copy(tsk_cpus_allowed(p), new_mask);
+ if (needs_other_cpu(p, task_cpu(p)))
+ set_task_cpu(p, cpumask_any(tsk_cpus_allowed(p)));
}
#endif
@@ -5376,6 +5315,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
{
const struct cpumask *cpu_valid_mask = cpu_active_mask;
bool running_wrong = false;
+ struct cpumask old_mask;
bool queued = false;
unsigned long flags;
struct rq *rq;
@@ -5399,7 +5339,8 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
goto out;
}
- if (cpumask_equal(tsk_cpus_allowed(p), new_mask))
+ cpumask_copy(&old_mask, tsk_cpus_allowed(p));
+ if (cpumask_equal(&old_mask, new_mask))
goto out;
if (!cpumask_intersects(new_mask, cpu_valid_mask)) {
@@ -5436,12 +5377,16 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
set_task_cpu(p, cpumask_any_and(cpu_valid_mask, new_mask));
out:
- if (queued)
+ if (queued && !cpumask_subset(new_mask, &old_mask))
try_preempt(p, rq);
- task_grq_unlock(&flags);
-
if (running_wrong)
- preempt_schedule_common();
+ preempt_disable();
+ task_grq_unlock(p, &flags);
+
+ if (running_wrong) {
+ __schedule(true);
+ preempt_enable();
+ }
return ret;
}
@@ -5471,6 +5416,11 @@ static void bind_zero(int src_cpu)
cpumask_set_cpu(0, tsk_cpus_allowed(p));
p->zerobound = true;
bound++;
+ if (task_cpu(p) == src_cpu) {
+ set_task_cpu(p, 0);
+ if (task_running(p))
+ resched_task(p);
+ }
}
} while_each_thread(t, p);
@@ -7008,6 +6958,7 @@ void __init sched_init_smp(void)
#ifdef CONFIG_SCHED_SMT
bool smt_threads = false;
#endif
+ struct rq *rq;
cpumask_var_t non_isolated_cpus;
@@ -7045,7 +6996,7 @@ void __init sched_init_smp(void)
* nodes) are treated as very distant.
*/
for_each_online_cpu(cpu) {
- struct rq *rq = cpu_rq(cpu);
+ rq = cpu_rq(cpu);
/* First check if this cpu is in the same node */
for_each_domain(cpu, sd) {
@@ -7084,6 +7035,17 @@ void __init sched_init_smp(void)
}
#endif
}
+ for_each_possible_cpu(cpu) {
+ int total_cpus = 0, locality;
+
+ rq = cpu_rq(cpu);
+ for (locality = 0; locality <= 4; locality++) {
+ for_each_possible_cpu(other_cpu) {
+ if (rq->cpu_locality[other_cpu] == locality)
+ rq->rq_order[total_cpus++] = cpu_rq(other_cpu);
+ }
+ }
+ }
#ifdef CONFIG_SMT_NICE
if (smt_threads) {
check_siblings = &check_smt_siblings;
@@ -7095,7 +7057,8 @@ void __init sched_init_smp(void)
mutex_unlock(&sched_domains_mutex);
for_each_online_cpu(cpu) {
- struct rq *rq = cpu_rq(cpu);
+ rq = cpu_rq(cpu);
+
for_each_online_cpu(other_cpu) {
if (other_cpu <= cpu)
continue;
@@ -7220,6 +7183,10 @@ void __init sched_init(void)
else
rq->cpu_locality[j] = 4;
}
+ rq->rq_order = kmalloc(cpu_ids * sizeof(struct rq *), GFP_ATOMIC);
+ rq->rq_order[0] = rq;
+ for (j = 1; j < cpu_ids; j++)
+ rq->rq_order[j] = cpu_rq(j);
}
#endif
@@ -7323,7 +7290,6 @@ static inline void normalise_rt_tasks(void)
struct task_struct *g, *p;
unsigned long flags;
struct rq *rq;
- int queued;
read_lock(&tasklist_lock);
for_each_process_thread(g, p) {
@@ -7337,16 +7303,8 @@ static inline void normalise_rt_tasks(void)
continue;
rq = task_grq_lock(p, &flags);
- queued = task_queued(p);
- if (queued)
- dequeue_task(p);
__setscheduler(p, rq, SCHED_NORMAL, 0, false);
- if (queued) {
- enqueue_task(p, rq);
- try_preempt(p, rq);
- }
-
- task_grq_unlock(&flags);
+ task_grq_unlock(p, &flags);
}
read_unlock(&tasklist_lock);
}
diff --git a/kernel/sched/bfs_sched.h b/kernel/sched/bfs_sched.h
index e7fe1d0a5..00a16ba0a 100644
--- a/kernel/sched/bfs_sched.h
+++ b/kernel/sched/bfs_sched.h
@@ -22,7 +22,6 @@ struct rq {
int rq_time_slice;
u64 rq_last_ran;
int rq_prio;
- bool rq_running; /* There is a task running */
int soft_affined; /* Running or queued tasks with this set as their rq */
u64 load_update; /* When we last updated load */
unsigned long load_avg; /* Rolling load average */
@@ -43,6 +42,7 @@ struct rq {
struct root_domain *rd;
struct sched_domain *sd;
int *cpu_locality; /* CPU relative cache distance */
+ struct rq **rq_order; /* RQs ordered by relative cache distance */
#ifdef CONFIG_SCHED_SMT
cpumask_t thread_mask;
bool (*siblings_idle)(struct rq *rq);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 38eacc323..44817c640 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -74,6 +74,7 @@
#include <linux/context_tracking.h>
#include <linux/compiler.h>
#include <linux/frame.h>
+#include <linux/prefetch.h>
#include <asm/switch_to.h>
#include <asm/tlb.h>
@@ -1937,7 +1938,7 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
* chain to provide order. Instead we do:
*
* 1) smp_store_release(X->on_cpu, 0)
- * 2) smp_cond_acquire(!X->on_cpu)
+ * 2) smp_cond_load_acquire(!X->on_cpu)
*
* Example:
*
@@ -1948,7 +1949,7 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
* sched-out X
* smp_store_release(X->on_cpu, 0);
*
- * smp_cond_acquire(!X->on_cpu);
+ * smp_cond_load_acquire(&X->on_cpu, !VAL);
* X->state = WAKING
* set_task_cpu(X,2)
*
@@ -1974,7 +1975,7 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
* This means that any means of doing remote wakeups must order the CPU doing
* the wakeup against the CPU the task is going to end up running on. This,
* however, is already required for the regular Program-Order guarantee above,
- * since the waking CPU is the one issueing the ACQUIRE (smp_cond_acquire).
+ * since the waking CPU is the one issueing the ACQUIRE (smp_cond_load_acquire).
*
*/
@@ -2069,7 +2070,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
* This ensures that tasks getting woken will be fully ordered against
* their previous state and preserve Program Order.
*/
- smp_cond_acquire(!p->on_cpu);
+ smp_cond_load_acquire(&p->on_cpu, !VAL);
p->sched_contributes_to_load = !!task_contributes_to_load(p);
p->state = TASK_WAKING;
@@ -2364,11 +2365,11 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
__sched_fork(clone_flags, p);
/*
- * We mark the process as running here. This guarantees that
+ * We mark the process as NEW here. This guarantees that
* nobody will actually run it, and a signal or other external
* event cannot wake it up and insert it on the runqueue either.
*/
- p->state = TASK_RUNNING;
+ p->state = TASK_NEW;
/*
* Make sure we do not leak PI boosting priority to the child.
@@ -2405,8 +2406,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
p->sched_class = &fair_sched_class;
}
- if (p->sched_class->task_fork)
- p->sched_class->task_fork(p);
+ init_entity_runnable_average(&p->se);
/*
* The child is not yet in the pid-hash so no cgroup attach races,
@@ -2416,7 +2416,13 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
* Silence PROVE_RCU.
*/
raw_spin_lock_irqsave(&p->pi_lock, flags);
- set_task_cpu(p, cpu);
+ /*
+ * We're setting the cpu for the first time, we don't migrate,
+ * so use __set_task_cpu().
+ */
+ __set_task_cpu(p, cpu);
+ if (p->sched_class->task_fork)
+ p->sched_class->task_fork(p);
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
#ifdef CONFIG_SCHED_INFO
@@ -2548,16 +2554,18 @@ void wake_up_new_task(struct task_struct *p)
struct rq_flags rf;
struct rq *rq;
- /* Initialize new task's runnable average */
- init_entity_runnable_average(&p->se);
raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
+ p->state = TASK_RUNNING;
#ifdef CONFIG_SMP
/*
* Fork balancing, do it here and not earlier because:
* - cpus_allowed can change in the fork path
* - any previously selected cpu might disappear through hotplug
+ *
+ * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq,
+ * as we're not fully set-up yet.
*/
- set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
+ __set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
#endif
rq = __task_rq_lock(p, &rf);
post_init_entity_util_avg(&p->se);
@@ -2987,6 +2995,23 @@ EXPORT_PER_CPU_SYMBOL(kstat);
EXPORT_PER_CPU_SYMBOL(kernel_cpustat);
/*
+ * The function fair_sched_class.update_curr accesses the struct curr
+ * and its field curr->exec_start; when called from task_sched_runtime(),
+ * we observe a high rate of cache misses in practice.
+ * Prefetching this data results in improved performance.
+ */
+static inline void prefetch_curr_exec_start(struct task_struct *p)
+{
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ struct sched_entity *curr = (&p->se)->cfs_rq->curr;
+#else
+ struct sched_entity *curr = (&task_rq(p)->cfs)->curr;
+#endif
+ prefetch(curr);
+ prefetch(&curr->exec_start);
+}
+
+/*
* Return accounted runtime for the task.
* In case the task is currently running, return the runtime plus current's
* pending runtime that have not been accounted yet.
@@ -3020,6 +3045,7 @@ unsigned long long task_sched_runtime(struct task_struct *p)
* thread, breaking clock_gettime().
*/
if (task_current(rq, p) && task_on_rq_queued(p)) {
+ prefetch_curr_exec_start(p);
update_rq_clock(rq);
p->sched_class->update_curr(rq);
}
@@ -3183,6 +3209,9 @@ static noinline void __schedule_bug(struct task_struct *prev)
pr_cont("\n");
}
#endif
+ if (panic_on_warn)
+ panic("scheduling while atomic\n");
+
dump_stack();
add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
}
@@ -4774,7 +4803,8 @@ out_unlock:
* @len: length in bytes of the bitmask pointed to by user_mask_ptr
* @user_mask_ptr: user-space pointer to hold the current cpu mask
*
- * Return: 0 on success. An error code otherwise.
+ * Return: size of CPU mask copied to user_mask_ptr on success. An
+ * error code otherwise.
*/
SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
unsigned long __user *, user_mask_ptr)
@@ -7255,7 +7285,6 @@ static void sched_rq_cpu_starting(unsigned int cpu)
struct rq *rq = cpu_rq(cpu);
rq->calc_load_update = calc_load_update;
- account_reset_rq(rq);
update_max_interval();
}
@@ -7735,6 +7764,8 @@ void sched_online_group(struct task_group *tg, struct task_group *parent)
INIT_LIST_HEAD(&tg->children);
list_add_rcu(&tg->siblings, &parent->children);
spin_unlock_irqrestore(&task_group_lock, flags);
+
+ online_fair_sched_group(tg);
}
/* rcu callback to free various structures associated with a task group */
@@ -7763,27 +7794,9 @@ void sched_offline_group(struct task_group *tg)
spin_unlock_irqrestore(&task_group_lock, flags);
}
-/* change task's runqueue when it moves between groups.
- * The caller of this function should have put the task in its new group
- * by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to
- * reflect its new group.
- */
-void sched_move_task(struct task_struct *tsk)
+static void sched_change_group(struct task_struct *tsk, int type)
{
struct task_group *tg;
- int queued, running;
- struct rq_flags rf;
- struct rq *rq;
-
- rq = task_rq_lock(tsk, &rf);
-
- running = task_current(rq, tsk);
- queued = task_on_rq_queued(tsk);
-
- if (queued)
- dequeue_task(rq, tsk, DEQUEUE_SAVE | DEQUEUE_MOVE);
- if (unlikely(running))
- put_prev_task(rq, tsk);
/*
* All callers are synchronized by task_rq_lock(); we do not use RCU
@@ -7796,11 +7809,37 @@ void sched_move_task(struct task_struct *tsk)
tsk->sched_task_group = tg;
#ifdef CONFIG_FAIR_GROUP_SCHED
- if (tsk->sched_class->task_move_group)
- tsk->sched_class->task_move_group(tsk);
+ if (tsk->sched_class->task_change_group)
+ tsk->sched_class->task_change_group(tsk, type);
else
#endif
set_task_rq(tsk, task_cpu(tsk));
+}
+
+/*
+ * Change task's runqueue when it moves between groups.
+ *
+ * The caller of this function should have put the task in its new group by
+ * now. This function just updates tsk->se.cfs_rq and tsk->se.parent to reflect
+ * its new group.
+ */
+void sched_move_task(struct task_struct *tsk)
+{
+ int queued, running;
+ struct rq_flags rf;
+ struct rq *rq;
+
+ rq = task_rq_lock(tsk, &rf);
+
+ running = task_current(rq, tsk);
+ queued = task_on_rq_queued(tsk);
+
+ if (queued)
+ dequeue_task(rq, tsk, DEQUEUE_SAVE | DEQUEUE_MOVE);
+ if (unlikely(running))
+ put_prev_task(rq, tsk);
+
+ sched_change_group(tsk, TASK_MOVE_GROUP);
if (unlikely(running))
tsk->sched_class->set_curr_task(rq);
@@ -8228,15 +8267,27 @@ static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
sched_free_group(tg);
}
+/*
+ * This is called before wake_up_new_task(), therefore we really only
+ * have to set its group bits, all the other stuff does not apply.
+ */
static void cpu_cgroup_fork(struct task_struct *task)
{
- sched_move_task(task);
+ struct rq_flags rf;
+ struct rq *rq;
+
+ rq = task_rq_lock(task, &rf);
+
+ sched_change_group(task, TASK_SET_GROUP);
+
+ task_rq_unlock(rq, task, &rf);
}
static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)
{
struct task_struct *task;
struct cgroup_subsys_state *css;
+ int ret = 0;
cgroup_taskset_for_each(task, css, tset) {
#ifdef CONFIG_RT_GROUP_SCHED
@@ -8247,8 +8298,24 @@ static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)
if (task->sched_class != &fair_sched_class)
return -EINVAL;
#endif
+ /*
+ * Serialize against wake_up_new_task() such that if its
+ * running, we're sure to observe its full state.
+ */
+ raw_spin_lock_irq(&task->pi_lock);
+ /*
+ * Avoid calling sched_move_task() before wake_up_new_task()
+ * has happened. This would lead to problems with PELT, due to
+ * move wanting to detach+attach while we're not attached yet.
+ */
+ if (task->state == TASK_NEW)
+ ret = -EINVAL;
+ raw_spin_unlock_irq(&task->pi_lock);
+
+ if (ret)
+ break;
}
- return 0;
+ return ret;
}
static void cpu_cgroup_attach(struct cgroup_taskset *tset)
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
index 41f85c4d0..bc0b309c3 100644
--- a/kernel/sched/cpuacct.c
+++ b/kernel/sched/cpuacct.c
@@ -25,15 +25,13 @@ enum cpuacct_stat_index {
CPUACCT_STAT_NSTATS,
};
-enum cpuacct_usage_index {
- CPUACCT_USAGE_USER, /* ... user mode */
- CPUACCT_USAGE_SYSTEM, /* ... kernel mode */
-
- CPUACCT_USAGE_NRUSAGE,
+static const char * const cpuacct_stat_desc[] = {
+ [CPUACCT_STAT_USER] = "user",
+ [CPUACCT_STAT_SYSTEM] = "system",
};
struct cpuacct_usage {
- u64 usages[CPUACCT_USAGE_NRUSAGE];
+ u64 usages[CPUACCT_STAT_NSTATS];
};
/* track cpu usage of a group of tasks and its child groups */
@@ -108,16 +106,16 @@ static void cpuacct_css_free(struct cgroup_subsys_state *css)
}
static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu,
- enum cpuacct_usage_index index)
+ enum cpuacct_stat_index index)
{
struct cpuacct_usage *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
u64 data;
/*
- * We allow index == CPUACCT_USAGE_NRUSAGE here to read
+ * We allow index == CPUACCT_STAT_NSTATS here to read
* the sum of suages.
*/
- BUG_ON(index > CPUACCT_USAGE_NRUSAGE);
+ BUG_ON(index > CPUACCT_STAT_NSTATS);
#ifndef CONFIG_64BIT
/*
@@ -126,11 +124,11 @@ static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu,
raw_spin_lock_irq(&cpu_rq(cpu)->lock);
#endif
- if (index == CPUACCT_USAGE_NRUSAGE) {
+ if (index == CPUACCT_STAT_NSTATS) {
int i = 0;
data = 0;
- for (i = 0; i < CPUACCT_USAGE_NRUSAGE; i++)
+ for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
data += cpuusage->usages[i];
} else {
data = cpuusage->usages[index];
@@ -155,7 +153,7 @@ static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
raw_spin_lock_irq(&cpu_rq(cpu)->lock);
#endif
- for (i = 0; i < CPUACCT_USAGE_NRUSAGE; i++)
+ for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
cpuusage->usages[i] = val;
#ifndef CONFIG_64BIT
@@ -165,7 +163,7 @@ static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
/* return total cpu usage (in nanoseconds) of a group */
static u64 __cpuusage_read(struct cgroup_subsys_state *css,
- enum cpuacct_usage_index index)
+ enum cpuacct_stat_index index)
{
struct cpuacct *ca = css_ca(css);
u64 totalcpuusage = 0;
@@ -180,18 +178,18 @@ static u64 __cpuusage_read(struct cgroup_subsys_state *css,
static u64 cpuusage_user_read(struct cgroup_subsys_state *css,
struct cftype *cft)
{
- return __cpuusage_read(css, CPUACCT_USAGE_USER);
+ return __cpuusage_read(css, CPUACCT_STAT_USER);
}
static u64 cpuusage_sys_read(struct cgroup_subsys_state *css,
struct cftype *cft)
{
- return __cpuusage_read(css, CPUACCT_USAGE_SYSTEM);
+ return __cpuusage_read(css, CPUACCT_STAT_SYSTEM);
}
static u64 cpuusage_read(struct cgroup_subsys_state *css, struct cftype *cft)
{
- return __cpuusage_read(css, CPUACCT_USAGE_NRUSAGE);
+ return __cpuusage_read(css, CPUACCT_STAT_NSTATS);
}
static int cpuusage_write(struct cgroup_subsys_state *css, struct cftype *cft,
@@ -213,7 +211,7 @@ static int cpuusage_write(struct cgroup_subsys_state *css, struct cftype *cft,
}
static int __cpuacct_percpu_seq_show(struct seq_file *m,
- enum cpuacct_usage_index index)
+ enum cpuacct_stat_index index)
{
struct cpuacct *ca = css_ca(seq_css(m));
u64 percpu;
@@ -229,48 +227,78 @@ static int __cpuacct_percpu_seq_show(struct seq_file *m,
static int cpuacct_percpu_user_seq_show(struct seq_file *m, void *V)
{
- return __cpuacct_percpu_seq_show(m, CPUACCT_USAGE_USER);
+ return __cpuacct_percpu_seq_show(m, CPUACCT_STAT_USER);
}
static int cpuacct_percpu_sys_seq_show(struct seq_file *m, void *V)
{
- return __cpuacct_percpu_seq_show(m, CPUACCT_USAGE_SYSTEM);
+ return __cpuacct_percpu_seq_show(m, CPUACCT_STAT_SYSTEM);
}
static int cpuacct_percpu_seq_show(struct seq_file *m, void *V)
{
- return __cpuacct_percpu_seq_show(m, CPUACCT_USAGE_NRUSAGE);
+ return __cpuacct_percpu_seq_show(m, CPUACCT_STAT_NSTATS);
}
-static const char * const cpuacct_stat_desc[] = {
- [CPUACCT_STAT_USER] = "user",
- [CPUACCT_STAT_SYSTEM] = "system",
-};
+static int cpuacct_all_seq_show(struct seq_file *m, void *V)
+{
+ struct cpuacct *ca = css_ca(seq_css(m));
+ int index;
+ int cpu;
+
+ seq_puts(m, "cpu");
+ for (index = 0; index < CPUACCT_STAT_NSTATS; index++)
+ seq_printf(m, " %s", cpuacct_stat_desc[index]);
+ seq_puts(m, "\n");
+
+ for_each_possible_cpu(cpu) {
+ struct cpuacct_usage *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
+
+ seq_printf(m, "%d", cpu);
+
+ for (index = 0; index < CPUACCT_STAT_NSTATS; index++) {
+#ifndef CONFIG_64BIT
+ /*
+ * Take rq->lock to make 64-bit read safe on 32-bit
+ * platforms.
+ */
+ raw_spin_lock_irq(&cpu_rq(cpu)->lock);
+#endif
+
+ seq_printf(m, " %llu", cpuusage->usages[index]);
+
+#ifndef CONFIG_64BIT
+ raw_spin_unlock_irq(&cpu_rq(cpu)->lock);
+#endif
+ }
+ seq_puts(m, "\n");
+ }
+ return 0;
+}
static int cpuacct_stats_show(struct seq_file *sf, void *v)
{
struct cpuacct *ca = css_ca(seq_css(sf));
+ s64 val[CPUACCT_STAT_NSTATS];
int cpu;
- s64 val = 0;
+ int stat;
+ memset(val, 0, sizeof(val));
for_each_possible_cpu(cpu) {
- struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
- val += kcpustat->cpustat[CPUTIME_USER];
- val += kcpustat->cpustat[CPUTIME_NICE];
- }
- val = cputime64_to_clock_t(val);
- seq_printf(sf, "%s %lld\n", cpuacct_stat_desc[CPUACCT_STAT_USER], val);
+ u64 *cpustat = per_cpu_ptr(ca->cpustat, cpu)->cpustat;
- val = 0;
- for_each_possible_cpu(cpu) {
- struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
- val += kcpustat->cpustat[CPUTIME_SYSTEM];
- val += kcpustat->cpustat[CPUTIME_IRQ];
- val += kcpustat->cpustat[CPUTIME_SOFTIRQ];
+ val[CPUACCT_STAT_USER] += cpustat[CPUTIME_USER];
+ val[CPUACCT_STAT_USER] += cpustat[CPUTIME_NICE];
+ val[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_SYSTEM];
+ val[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_IRQ];
+ val[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_SOFTIRQ];
}
- val = cputime64_to_clock_t(val);
- seq_printf(sf, "%s %lld\n", cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val);
+ for (stat = 0; stat < CPUACCT_STAT_NSTATS; stat++) {
+ seq_printf(sf, "%s %lld\n",
+ cpuacct_stat_desc[stat],
+ cputime64_to_clock_t(val[stat]));
+ }
return 0;
}
@@ -302,6 +330,10 @@ static struct cftype files[] = {
.seq_show = cpuacct_percpu_sys_seq_show,
},
{
+ .name = "usage_all",
+ .seq_show = cpuacct_all_seq_show,
+ },
+ {
.name = "stat",
.seq_show = cpuacct_stats_show,
},
@@ -316,11 +348,11 @@ static struct cftype files[] = {
void cpuacct_charge(struct task_struct *tsk, u64 cputime)
{
struct cpuacct *ca;
- int index = CPUACCT_USAGE_SYSTEM;
+ int index = CPUACCT_STAT_SYSTEM;
struct pt_regs *regs = task_pt_regs(tsk);
if (regs && user_mode(regs))
- index = CPUACCT_USAGE_USER;
+ index = CPUACCT_STAT_USER;
rcu_read_lock();
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
index 5be588204..d4184498c 100644
--- a/kernel/sched/cpudeadline.c
+++ b/kernel/sched/cpudeadline.c
@@ -168,7 +168,7 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid)
if (old_idx == IDX_INVALID) {
cp->size++;
- cp->elements[cp->size - 1].dl = 0;
+ cp->elements[cp->size - 1].dl = dl;
cp->elements[cp->size - 1].cpu = cpu;
cp->elements[cpu].idx = cp->size - 1;
cpudl_change_key(cp, cp->size - 1, dl);
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 3d3ab8205..eba226d7c 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -51,6 +51,8 @@ struct sugov_cpu {
struct update_util_data update_util;
struct sugov_policy *sg_policy;
+ unsigned int cached_raw_freq;
+
/* The fields below are only needed when sharing a policy. */
unsigned long util;
unsigned long max;
@@ -110,7 +112,7 @@ static void sugov_update_commit(struct sugov_policy *sg_policy, u64 time,
/**
* get_next_freq - Compute a new frequency for a given cpufreq policy.
- * @policy: cpufreq policy object to compute the new frequency for.
+ * @sg_cpu: schedutil cpu object to compute the new frequency for.
* @util: Current CPU utilization.
* @max: CPU capacity.
*
@@ -125,14 +127,25 @@ static void sugov_update_commit(struct sugov_policy *sg_policy, u64 time,
* next_freq = C * curr_freq * util_raw / max
*
* Take C = 1.25 for the frequency tipping point at (util / max) = 0.8.
+ *
+ * The lowest driver-supported frequency which is equal or greater than the raw
+ * next_freq (as calculated above) is returned, subject to policy min/max and
+ * cpufreq driver limitations.
*/
-static unsigned int get_next_freq(struct cpufreq_policy *policy,
- unsigned long util, unsigned long max)
+static unsigned int get_next_freq(struct sugov_cpu *sg_cpu, unsigned long util,
+ unsigned long max)
{
+ struct sugov_policy *sg_policy = sg_cpu->sg_policy;
+ struct cpufreq_policy *policy = sg_policy->policy;
unsigned int freq = arch_scale_freq_invariant() ?
policy->cpuinfo.max_freq : policy->cur;
- return (freq + (freq >> 2)) * util / max;
+ freq = (freq + (freq >> 2)) * util / max;
+
+ if (freq == sg_cpu->cached_raw_freq && sg_policy->next_freq != UINT_MAX)
+ return sg_policy->next_freq;
+ sg_cpu->cached_raw_freq = freq;
+ return cpufreq_driver_resolve_freq(policy, freq);
}
static void sugov_update_single(struct update_util_data *hook, u64 time,
@@ -147,13 +160,14 @@ static void sugov_update_single(struct update_util_data *hook, u64 time,
return;
next_f = util == ULONG_MAX ? policy->cpuinfo.max_freq :
- get_next_freq(policy, util, max);
+ get_next_freq(sg_cpu, util, max);
sugov_update_commit(sg_policy, time, next_f);
}
-static unsigned int sugov_next_freq_shared(struct sugov_policy *sg_policy,
+static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu,
unsigned long util, unsigned long max)
{
+ struct sugov_policy *sg_policy = sg_cpu->sg_policy;
struct cpufreq_policy *policy = sg_policy->policy;
unsigned int max_f = policy->cpuinfo.max_freq;
u64 last_freq_update_time = sg_policy->last_freq_update_time;
@@ -193,7 +207,7 @@ static unsigned int sugov_next_freq_shared(struct sugov_policy *sg_policy,
}
}
- return get_next_freq(policy, util, max);
+ return get_next_freq(sg_cpu, util, max);
}
static void sugov_update_shared(struct update_util_data *hook, u64 time,
@@ -210,7 +224,7 @@ static void sugov_update_shared(struct update_util_data *hook, u64 time,
sg_cpu->last_update = time;
if (sugov_should_update_freq(sg_policy, time)) {
- next_f = sugov_next_freq_shared(sg_policy, util, max);
+ next_f = sugov_next_freq_shared(sg_cpu, util, max);
sugov_update_commit(sg_policy, time, next_f);
}
@@ -398,7 +412,7 @@ static int sugov_init(struct cpufreq_policy *policy)
return ret;
}
-static int sugov_exit(struct cpufreq_policy *policy)
+static void sugov_exit(struct cpufreq_policy *policy)
{
struct sugov_policy *sg_policy = policy->governor_data;
struct sugov_tunables *tunables = sg_policy->tunables;
@@ -416,7 +430,6 @@ static int sugov_exit(struct cpufreq_policy *policy)
mutex_unlock(&global_tunables_lock);
sugov_policy_free(sg_policy);
- return 0;
}
static int sugov_start(struct cpufreq_policy *policy)
@@ -438,6 +451,7 @@ static int sugov_start(struct cpufreq_policy *policy)
sg_cpu->util = ULONG_MAX;
sg_cpu->max = 0;
sg_cpu->last_update = 0;
+ sg_cpu->cached_raw_freq = 0;
cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util,
sugov_update_shared);
} else {
@@ -448,7 +462,7 @@ static int sugov_start(struct cpufreq_policy *policy)
return 0;
}
-static int sugov_stop(struct cpufreq_policy *policy)
+static void sugov_stop(struct cpufreq_policy *policy)
{
struct sugov_policy *sg_policy = policy->governor_data;
unsigned int cpu;
@@ -460,53 +474,29 @@ static int sugov_stop(struct cpufreq_policy *policy)
irq_work_sync(&sg_policy->irq_work);
cancel_work_sync(&sg_policy->work);
- return 0;
}
-static int sugov_limits(struct cpufreq_policy *policy)
+static void sugov_limits(struct cpufreq_policy *policy)
{
struct sugov_policy *sg_policy = policy->governor_data;
if (!policy->fast_switch_enabled) {
mutex_lock(&sg_policy->work_lock);
-
- if (policy->max < policy->cur)
- __cpufreq_driver_target(policy, policy->max,
- CPUFREQ_RELATION_H);
- else if (policy->min > policy->cur)
- __cpufreq_driver_target(policy, policy->min,
- CPUFREQ_RELATION_L);
-
+ cpufreq_policy_apply_limits(policy);
mutex_unlock(&sg_policy->work_lock);
}
sg_policy->need_freq_update = true;
- return 0;
-}
-
-int sugov_governor(struct cpufreq_policy *policy, unsigned int event)
-{
- if (event == CPUFREQ_GOV_POLICY_INIT) {
- return sugov_init(policy);
- } else if (policy->governor_data) {
- switch (event) {
- case CPUFREQ_GOV_POLICY_EXIT:
- return sugov_exit(policy);
- case CPUFREQ_GOV_START:
- return sugov_start(policy);
- case CPUFREQ_GOV_STOP:
- return sugov_stop(policy);
- case CPUFREQ_GOV_LIMITS:
- return sugov_limits(policy);
- }
- }
- return -EINVAL;
}
static struct cpufreq_governor schedutil_gov = {
.name = "schedutil",
- .governor = sugov_governor,
.owner = THIS_MODULE,
+ .init = sugov_init,
+ .exit = sugov_exit,
+ .start = sugov_start,
+ .stop = sugov_stop,
+ .limits = sugov_limits,
};
static int __init sugov_module_init(void)
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index a24cfb41d..a846cf89e 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -49,15 +49,12 @@ DEFINE_PER_CPU(seqcount_t, irq_time_seq);
*/
void irqtime_account_irq(struct task_struct *curr)
{
- unsigned long flags;
s64 delta;
int cpu;
if (!sched_clock_irqtime)
return;
- local_irq_save(flags);
-
cpu = smp_processor_id();
delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time);
__this_cpu_add(irq_start_time, delta);
@@ -75,44 +72,53 @@ void irqtime_account_irq(struct task_struct *curr)
__this_cpu_add(cpu_softirq_time, delta);
irq_time_write_end();
- local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(irqtime_account_irq);
-static int irqtime_account_hi_update(void)
+static cputime_t irqtime_account_hi_update(cputime_t maxtime)
{
u64 *cpustat = kcpustat_this_cpu->cpustat;
unsigned long flags;
- u64 latest_ns;
- int ret = 0;
+ cputime_t irq_cputime;
local_irq_save(flags);
- latest_ns = this_cpu_read(cpu_hardirq_time);
- if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_IRQ])
- ret = 1;
+ irq_cputime = nsecs_to_cputime64(this_cpu_read(cpu_hardirq_time)) -
+ cpustat[CPUTIME_IRQ];
+ irq_cputime = min(irq_cputime, maxtime);
+ cpustat[CPUTIME_IRQ] += irq_cputime;
local_irq_restore(flags);
- return ret;
+ return irq_cputime;
}
-static int irqtime_account_si_update(void)
+static cputime_t irqtime_account_si_update(cputime_t maxtime)
{
u64 *cpustat = kcpustat_this_cpu->cpustat;
unsigned long flags;
- u64 latest_ns;
- int ret = 0;
+ cputime_t softirq_cputime;
local_irq_save(flags);
- latest_ns = this_cpu_read(cpu_softirq_time);
- if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_SOFTIRQ])
- ret = 1;
+ softirq_cputime = nsecs_to_cputime64(this_cpu_read(cpu_softirq_time)) -
+ cpustat[CPUTIME_SOFTIRQ];
+ softirq_cputime = min(softirq_cputime, maxtime);
+ cpustat[CPUTIME_SOFTIRQ] += softirq_cputime;
local_irq_restore(flags);
- return ret;
+ return softirq_cputime;
}
#else /* CONFIG_IRQ_TIME_ACCOUNTING */
#define sched_clock_irqtime (0)
+static cputime_t irqtime_account_hi_update(cputime_t dummy)
+{
+ return 0;
+}
+
+static cputime_t irqtime_account_si_update(cputime_t dummy)
+{
+ return 0;
+}
+
#endif /* !CONFIG_IRQ_TIME_ACCOUNTING */
static inline void task_group_account_field(struct task_struct *p, int index,
@@ -257,29 +263,47 @@ void account_idle_time(cputime_t cputime)
cpustat[CPUTIME_IDLE] += (__force u64) cputime;
}
-static __always_inline bool steal_account_process_tick(void)
+/*
+ * When a guest is interrupted for a longer amount of time, missed clock
+ * ticks are not redelivered later. Due to that, this function may on
+ * occasion account more time than the calling functions think elapsed.
+ */
+static __always_inline cputime_t steal_account_process_time(cputime_t maxtime)
{
#ifdef CONFIG_PARAVIRT
if (static_key_false(&paravirt_steal_enabled)) {
+ cputime_t steal_cputime;
u64 steal;
- unsigned long steal_jiffies;
steal = paravirt_steal_clock(smp_processor_id());
steal -= this_rq()->prev_steal_time;
- /*
- * steal is in nsecs but our caller is expecting steal
- * time in jiffies. Lets cast the result to jiffies
- * granularity and account the rest on the next rounds.
- */
- steal_jiffies = nsecs_to_jiffies(steal);
- this_rq()->prev_steal_time += jiffies_to_nsecs(steal_jiffies);
+ steal_cputime = min(nsecs_to_cputime(steal), maxtime);
+ account_steal_time(steal_cputime);
+ this_rq()->prev_steal_time += cputime_to_nsecs(steal_cputime);
- account_steal_time(jiffies_to_cputime(steal_jiffies));
- return steal_jiffies;
+ return steal_cputime;
}
#endif
- return false;
+ return 0;
+}
+
+/*
+ * Account how much elapsed time was spent in steal, irq, or softirq time.
+ */
+static inline cputime_t account_other_time(cputime_t max)
+{
+ cputime_t accounted;
+
+ accounted = steal_account_process_time(max);
+
+ if (accounted < max)
+ accounted += irqtime_account_hi_update(max - accounted);
+
+ if (accounted < max)
+ accounted += irqtime_account_si_update(max - accounted);
+
+ return accounted;
}
/*
@@ -342,21 +366,23 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
struct rq *rq, int ticks)
{
- cputime_t scaled = cputime_to_scaled(cputime_one_jiffy);
- u64 cputime = (__force u64) cputime_one_jiffy;
- u64 *cpustat = kcpustat_this_cpu->cpustat;
+ u64 cputime = (__force u64) cputime_one_jiffy * ticks;
+ cputime_t scaled, other;
- if (steal_account_process_tick())
+ /*
+ * When returning from idle, many ticks can get accounted at
+ * once, including some ticks of steal, irq, and softirq time.
+ * Subtract those ticks from the amount of time accounted to
+ * idle, or potentially user or system time. Due to rounding,
+ * other time can exceed ticks occasionally.
+ */
+ other = account_other_time(ULONG_MAX);
+ if (other >= cputime)
return;
+ cputime -= other;
+ scaled = cputime_to_scaled(cputime);
- cputime *= ticks;
- scaled *= ticks;
-
- if (irqtime_account_hi_update()) {
- cpustat[CPUTIME_IRQ] += cputime;
- } else if (irqtime_account_si_update()) {
- cpustat[CPUTIME_SOFTIRQ] += cputime;
- } else if (this_cpu_ksoftirqd() == p) {
+ if (this_cpu_ksoftirqd() == p) {
/*
* ksoftirqd time do not get accounted in cpu_softirq_time.
* So, we have to handle it separately here.
@@ -406,6 +432,10 @@ void vtime_common_task_switch(struct task_struct *prev)
}
#endif
+#endif /* CONFIG_VIRT_CPU_ACCOUNTING */
+
+
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
/*
* Archs that account the whole time spent in the idle task
* (outside irq) as idle time can rely on this and just implement
@@ -415,33 +445,16 @@ void vtime_common_task_switch(struct task_struct *prev)
* vtime_account().
*/
#ifndef __ARCH_HAS_VTIME_ACCOUNT
-void vtime_common_account_irq_enter(struct task_struct *tsk)
+void vtime_account_irq_enter(struct task_struct *tsk)
{
- if (!in_interrupt()) {
- /*
- * If we interrupted user, context_tracking_in_user()
- * is 1 because the context tracking don't hook
- * on irq entry/exit. This way we know if
- * we need to flush user time on kernel entry.
- */
- if (context_tracking_in_user()) {
- vtime_account_user(tsk);
- return;
- }
-
- if (is_idle_task(tsk)) {
- vtime_account_idle(tsk);
- return;
- }
- }
- vtime_account_system(tsk);
+ if (!in_interrupt() && is_idle_task(tsk))
+ vtime_account_idle(tsk);
+ else
+ vtime_account_system(tsk);
}
-EXPORT_SYMBOL_GPL(vtime_common_account_irq_enter);
+EXPORT_SYMBOL_GPL(vtime_account_irq_enter);
#endif /* __ARCH_HAS_VTIME_ACCOUNT */
-#endif /* CONFIG_VIRT_CPU_ACCOUNTING */
-
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
{
*ut = p->utime;
@@ -466,7 +479,7 @@ void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime
*/
void account_process_tick(struct task_struct *p, int user_tick)
{
- cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
+ cputime_t cputime, scaled, steal;
struct rq *rq = this_rq();
if (vtime_accounting_cpu_enabled())
@@ -477,26 +490,21 @@ void account_process_tick(struct task_struct *p, int user_tick)
return;
}
- if (steal_account_process_tick())
+ cputime = cputime_one_jiffy;
+ steal = steal_account_process_time(ULONG_MAX);
+
+ if (steal >= cputime)
return;
+ cputime -= steal;
+ scaled = cputime_to_scaled(cputime);
+
if (user_tick)
- account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
+ account_user_time(p, cputime, scaled);
else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
- account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy,
- one_jiffy_scaled);
+ account_system_time(p, HARDIRQ_OFFSET, cputime, scaled);
else
- account_idle_time(cputime_one_jiffy);
-}
-
-/*
- * Account multiple ticks of steal time.
- * @p: the process from which the cpu time has been stolen
- * @ticks: number of stolen ticks
- */
-void account_steal_ticks(unsigned long ticks)
-{
- account_steal_time(jiffies_to_cputime(ticks));
+ account_idle_time(cputime);
}
/*
@@ -505,13 +513,21 @@ void account_steal_ticks(unsigned long ticks)
*/
void account_idle_ticks(unsigned long ticks)
{
+ cputime_t cputime, steal;
if (sched_clock_irqtime) {
irqtime_account_idle_ticks(ticks);
return;
}
- account_idle_time(jiffies_to_cputime(ticks));
+ cputime = jiffies_to_cputime(ticks);
+ steal = steal_account_process_time(ULONG_MAX);
+
+ if (steal >= cputime)
+ return;
+
+ cputime -= steal;
+ account_idle_time(cputime);
}
/*
@@ -686,12 +702,21 @@ static cputime_t vtime_delta(struct task_struct *tsk)
static cputime_t get_vtime_delta(struct task_struct *tsk)
{
unsigned long now = READ_ONCE(jiffies);
- unsigned long delta = now - tsk->vtime_snap;
+ cputime_t delta, other;
+ /*
+ * Unlike tick based timing, vtime based timing never has lost
+ * ticks, and no need for steal time accounting to make up for
+ * lost ticks. Vtime accounts a rounded version of actual
+ * elapsed time. Limit account_other_time to prevent rounding
+ * errors from causing elapsed vtime to go negative.
+ */
+ delta = jiffies_to_cputime(now - tsk->vtime_snap);
+ other = account_other_time(delta);
WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE);
tsk->vtime_snap = now;
- return jiffies_to_cputime(delta);
+ return delta - other;
}
static void __vtime_account_system(struct task_struct *tsk)
@@ -711,16 +736,6 @@ void vtime_account_system(struct task_struct *tsk)
write_seqcount_end(&tsk->vtime_seqcount);
}
-void vtime_gen_account_irq_exit(struct task_struct *tsk)
-{
- write_seqcount_begin(&tsk->vtime_seqcount);
- if (vtime_delta(tsk))
- __vtime_account_system(tsk);
- if (context_tracking_in_user())
- tsk->vtime_snap_whence = VTIME_USER;
- write_seqcount_end(&tsk->vtime_seqcount);
-}
-
void vtime_account_user(struct task_struct *tsk)
{
cputime_t delta_cpu;
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index fcb7f0217..1ce886728 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -658,8 +658,11 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
*
* XXX figure out if select_task_rq_dl() deals with offline cpus.
*/
- if (unlikely(!rq->online))
+ if (unlikely(!rq->online)) {
+ lockdep_unpin_lock(&rq->lock, rf.cookie);
rq = dl_task_offline_migration(rq, p);
+ rf.cookie = lockdep_pin_lock(&rq->lock);
+ }
/*
* Queueing this task back might have overloaded rq, check if we need
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 0368c393a..2a0a99952 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -879,9 +879,9 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
nr_switches = p->nvcsw + p->nivcsw;
-#ifdef CONFIG_SCHEDSTATS
P(se.nr_migrations);
+#ifdef CONFIG_SCHEDSTATS
if (schedstat_enabled()) {
u64 avg_atom, avg_per_cpu;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index b5743d5b0..4309c8e76 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -715,6 +715,11 @@ void init_entity_runnable_average(struct sched_entity *se)
/* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */
}
+static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
+static int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq);
+static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force);
+static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se);
+
/*
* With new tasks being created, their initial util_avgs are extrapolated
* based on the cfs_rq's current util_avg:
@@ -745,6 +750,8 @@ void post_init_entity_util_avg(struct sched_entity *se)
struct cfs_rq *cfs_rq = cfs_rq_of(se);
struct sched_avg *sa = &se->avg;
long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2;
+ u64 now = cfs_rq_clock_task(cfs_rq);
+ int tg_update;
if (cap > 0) {
if (cfs_rq->avg.util_avg != 0) {
@@ -758,16 +765,42 @@ void post_init_entity_util_avg(struct sched_entity *se)
}
sa->util_sum = sa->util_avg * LOAD_AVG_MAX;
}
+
+ if (entity_is_task(se)) {
+ struct task_struct *p = task_of(se);
+ if (p->sched_class != &fair_sched_class) {
+ /*
+ * For !fair tasks do:
+ *
+ update_cfs_rq_load_avg(now, cfs_rq, false);
+ attach_entity_load_avg(cfs_rq, se);
+ switched_from_fair(rq, p);
+ *
+ * such that the next switched_to_fair() has the
+ * expected state.
+ */
+ se->avg.last_update_time = now;
+ return;
+ }
+ }
+
+ tg_update = update_cfs_rq_load_avg(now, cfs_rq, false);
+ attach_entity_load_avg(cfs_rq, se);
+ if (tg_update)
+ update_tg_load_avg(cfs_rq, false);
}
-#else
+#else /* !CONFIG_SMP */
void init_entity_runnable_average(struct sched_entity *se)
{
}
void post_init_entity_util_avg(struct sched_entity *se)
{
}
-#endif
+static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
+{
+}
+#endif /* CONFIG_SMP */
/*
* Update the current task's runtime statistics.
@@ -1328,6 +1361,8 @@ static void task_numa_assign(struct task_numa_env *env,
{
if (env->best_task)
put_task_struct(env->best_task);
+ if (p)
+ get_task_struct(p);
env->best_task = p;
env->best_imp = imp;
@@ -1395,31 +1430,11 @@ static void task_numa_compare(struct task_numa_env *env,
long imp = env->p->numa_group ? groupimp : taskimp;
long moveimp = imp;
int dist = env->dist;
- bool assigned = false;
rcu_read_lock();
-
- raw_spin_lock_irq(&dst_rq->lock);
- cur = dst_rq->curr;
- /*
- * No need to move the exiting task or idle task.
- */
- if ((cur->flags & PF_EXITING) || is_idle_task(cur))
+ cur = task_rcu_dereference(&dst_rq->curr);
+ if (cur && ((cur->flags & PF_EXITING) || is_idle_task(cur)))
cur = NULL;
- else {
- /*
- * The task_struct must be protected here to protect the
- * p->numa_faults access in the task_weight since the
- * numa_faults could already be freed in the following path:
- * finish_task_switch()
- * --> put_task_struct()
- * --> __put_task_struct()
- * --> task_numa_free()
- */
- get_task_struct(cur);
- }
-
- raw_spin_unlock_irq(&dst_rq->lock);
/*
* Because we have preemption enabled we can get migrated around and
@@ -1502,7 +1517,6 @@ balance:
*/
if (!load_too_imbalanced(src_load, dst_load, env)) {
imp = moveimp - 1;
- put_task_struct(cur);
cur = NULL;
goto assign;
}
@@ -1528,16 +1542,9 @@ balance:
env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu);
assign:
- assigned = true;
task_numa_assign(env, cur, imp);
unlock:
rcu_read_unlock();
- /*
- * The dst_rq->curr isn't assigned. The protection for task_struct is
- * finished.
- */
- if (cur && !assigned)
- put_task_struct(cur);
}
static void task_numa_find_cpu(struct task_numa_env *env,
@@ -2891,8 +2898,6 @@ void set_task_rq_fair(struct sched_entity *se,
static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}
#endif /* CONFIG_FAIR_GROUP_SCHED */
-static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
-
static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
{
struct rq *rq = rq_of(cfs_rq);
@@ -2939,7 +2944,23 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
WRITE_ONCE(*ptr, res); \
} while (0)
-/* Group cfs_rq's load_avg is used for task_h_load and update_cfs_share */
+/**
+ * update_cfs_rq_load_avg - update the cfs_rq's load/util averages
+ * @now: current time, as per cfs_rq_clock_task()
+ * @cfs_rq: cfs_rq to update
+ * @update_freq: should we call cfs_rq_util_change() or will the call do so
+ *
+ * The cfs_rq avg is the direct sum of all its entities (blocked and runnable)
+ * avg. The immediate corollary is that all (fair) tasks must be attached, see
+ * post_init_entity_util_avg().
+ *
+ * cfs_rq->avg is used for task_h_load() and update_cfs_share() for example.
+ *
+ * Returns true if the load decayed or we removed utilization. It is expected
+ * that one calls update_tg_load_avg() on this condition, but after you've
+ * modified the cfs_rq avg (attach/detach), such that we propagate the new
+ * avg up.
+ */
static inline int
update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
{
@@ -2994,6 +3015,14 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg)
update_tg_load_avg(cfs_rq, 0);
}
+/**
+ * attach_entity_load_avg - attach this entity to its cfs_rq load avg
+ * @cfs_rq: cfs_rq to attach to
+ * @se: sched_entity to attach
+ *
+ * Must call update_cfs_rq_load_avg() before this, since we rely on
+ * cfs_rq->avg.last_update_time being current.
+ */
static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
if (!sched_feat(ATTACH_AGE_LOAD))
@@ -3002,6 +3031,8 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
/*
* If we got migrated (either between CPUs or between cgroups) we'll
* have aged the average right before clearing @last_update_time.
+ *
+ * Or we're fresh through post_init_entity_util_avg().
*/
if (se->avg.last_update_time) {
__update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
@@ -3023,6 +3054,14 @@ skip_aging:
cfs_rq_util_change(cfs_rq);
}
+/**
+ * detach_entity_load_avg - detach this entity from its cfs_rq load avg
+ * @cfs_rq: cfs_rq to detach from
+ * @se: sched_entity to detach
+ *
+ * Must call update_cfs_rq_load_avg() before this, since we rely on
+ * cfs_rq->avg.last_update_time being current.
+ */
static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
__update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
@@ -3107,11 +3146,14 @@ void remove_entity_load_avg(struct sched_entity *se)
u64 last_update_time;
/*
- * Newly created task or never used group entity should not be removed
- * from its (source) cfs_rq
+ * tasks cannot exit without having gone through wake_up_new_task() ->
+ * post_init_entity_util_avg() which will have added things to the
+ * cfs_rq, so we can remove unconditionally.
+ *
+ * Similarly for groups, they will have passed through
+ * post_init_entity_util_avg() before unregister_sched_fair_group()
+ * calls this.
*/
- if (se->avg.last_update_time == 0)
- return;
last_update_time = cfs_rq_last_update_time(cfs_rq);
@@ -3134,6 +3176,12 @@ static int idle_balance(struct rq *this_rq);
#else /* CONFIG_SMP */
+static inline int
+update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
+{
+ return 0;
+}
+
static inline void update_load_avg(struct sched_entity *se, int not_used)
{
struct cfs_rq *cfs_rq = cfs_rq_of(se);
@@ -3723,7 +3771,7 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
{
if (unlikely(cfs_rq->throttle_count))
- return cfs_rq->throttled_clock_task;
+ return cfs_rq->throttled_clock_task - cfs_rq->throttled_clock_task_time;
return rq_clock_task(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time;
}
@@ -3861,13 +3909,11 @@ static int tg_unthrottle_up(struct task_group *tg, void *data)
struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
cfs_rq->throttle_count--;
-#ifdef CONFIG_SMP
if (!cfs_rq->throttle_count) {
/* adjust cfs_rq_clock_task() */
cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -
cfs_rq->throttled_clock_task;
}
-#endif
return 0;
}
@@ -4220,26 +4266,6 @@ static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
if (!cfs_bandwidth_used())
return;
- /* Synchronize hierarchical throttle counter: */
- if (unlikely(!cfs_rq->throttle_uptodate)) {
- struct rq *rq = rq_of(cfs_rq);
- struct cfs_rq *pcfs_rq;
- struct task_group *tg;
-
- cfs_rq->throttle_uptodate = 1;
-
- /* Get closest up-to-date node, because leaves go first: */
- for (tg = cfs_rq->tg->parent; tg; tg = tg->parent) {
- pcfs_rq = tg->cfs_rq[cpu_of(rq)];
- if (pcfs_rq->throttle_uptodate)
- break;
- }
- if (tg) {
- cfs_rq->throttle_count = pcfs_rq->throttle_count;
- cfs_rq->throttled_clock_task = rq_clock_task(rq);
- }
- }
-
/* an active group must be handled by the update_curr()->put() path */
if (!cfs_rq->runtime_enabled || cfs_rq->curr)
return;
@@ -4254,6 +4280,23 @@ static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
throttle_cfs_rq(cfs_rq);
}
+static void sync_throttle(struct task_group *tg, int cpu)
+{
+ struct cfs_rq *pcfs_rq, *cfs_rq;
+
+ if (!cfs_bandwidth_used())
+ return;
+
+ if (!tg->parent)
+ return;
+
+ cfs_rq = tg->cfs_rq[cpu];
+ pcfs_rq = tg->parent->cfs_rq[cpu];
+
+ cfs_rq->throttle_count = pcfs_rq->throttle_count;
+ cfs_rq->throttled_clock_task = rq_clock_task(cpu_rq(cpu));
+}
+
/* conditionally throttle active cfs_rq's from put_prev_entity() */
static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
{
@@ -4393,6 +4436,7 @@ static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }
static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
+static inline void sync_throttle(struct task_group *tg, int cpu) {}
static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
@@ -4501,7 +4545,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
*
* note: in the case of encountering a throttled cfs_rq we will
* post the final h_nr_running increment below.
- */
+ */
if (cfs_rq_throttled(cfs_rq))
break;
cfs_rq->h_nr_running++;
@@ -8342,31 +8386,17 @@ static void task_fork_fair(struct task_struct *p)
{
struct cfs_rq *cfs_rq;
struct sched_entity *se = &p->se, *curr;
- int this_cpu = smp_processor_id();
struct rq *rq = this_rq();
- unsigned long flags;
-
- raw_spin_lock_irqsave(&rq->lock, flags);
+ raw_spin_lock(&rq->lock);
update_rq_clock(rq);
cfs_rq = task_cfs_rq(current);
curr = cfs_rq->curr;
-
- /*
- * Not only the cpu but also the task_group of the parent might have
- * been changed after parent->se.parent,cfs_rq were copied to
- * child->se.parent,cfs_rq. So call __set_task_cpu() to make those
- * of child point to valid ones.
- */
- rcu_read_lock();
- __set_task_cpu(p, this_cpu);
- rcu_read_unlock();
-
- update_curr(cfs_rq);
-
- if (curr)
+ if (curr) {
+ update_curr(cfs_rq);
se->vruntime = curr->vruntime;
+ }
place_entity(cfs_rq, se, 1);
if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
@@ -8379,8 +8409,7 @@ static void task_fork_fair(struct task_struct *p)
}
se->vruntime -= cfs_rq->min_vruntime;
-
- raw_spin_unlock_irqrestore(&rq->lock, flags);
+ raw_spin_unlock(&rq->lock);
}
/*
@@ -8436,6 +8465,8 @@ static void detach_task_cfs_rq(struct task_struct *p)
{
struct sched_entity *se = &p->se;
struct cfs_rq *cfs_rq = cfs_rq_of(se);
+ u64 now = cfs_rq_clock_task(cfs_rq);
+ int tg_update;
if (!vruntime_normalized(p)) {
/*
@@ -8447,13 +8478,18 @@ static void detach_task_cfs_rq(struct task_struct *p)
}
/* Catch up with the cfs_rq and remove our load when we leave */
+ tg_update = update_cfs_rq_load_avg(now, cfs_rq, false);
detach_entity_load_avg(cfs_rq, se);
+ if (tg_update)
+ update_tg_load_avg(cfs_rq, false);
}
static void attach_task_cfs_rq(struct task_struct *p)
{
struct sched_entity *se = &p->se;
struct cfs_rq *cfs_rq = cfs_rq_of(se);
+ u64 now = cfs_rq_clock_task(cfs_rq);
+ int tg_update;
#ifdef CONFIG_FAIR_GROUP_SCHED
/*
@@ -8464,7 +8500,10 @@ static void attach_task_cfs_rq(struct task_struct *p)
#endif
/* Synchronize task with its cfs_rq */
+ tg_update = update_cfs_rq_load_avg(now, cfs_rq, false);
attach_entity_load_avg(cfs_rq, se);
+ if (tg_update)
+ update_tg_load_avg(cfs_rq, false);
if (!vruntime_normalized(p))
se->vruntime += cfs_rq->min_vruntime;
@@ -8524,6 +8563,14 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
}
#ifdef CONFIG_FAIR_GROUP_SCHED
+static void task_set_group_fair(struct task_struct *p)
+{
+ struct sched_entity *se = &p->se;
+
+ set_task_rq(p, task_cpu(p));
+ se->depth = se->parent ? se->parent->depth + 1 : 0;
+}
+
static void task_move_group_fair(struct task_struct *p)
{
detach_task_cfs_rq(p);
@@ -8536,6 +8583,19 @@ static void task_move_group_fair(struct task_struct *p)
attach_task_cfs_rq(p);
}
+static void task_change_group_fair(struct task_struct *p, int type)
+{
+ switch (type) {
+ case TASK_SET_GROUP:
+ task_set_group_fair(p);
+ break;
+
+ case TASK_MOVE_GROUP:
+ task_move_group_fair(p);
+ break;
+ }
+}
+
void free_fair_sched_group(struct task_group *tg)
{
int i;
@@ -8587,10 +8647,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
init_cfs_rq(cfs_rq);
init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
init_entity_runnable_average(se);
-
- raw_spin_lock_irq(&rq->lock);
- post_init_entity_util_avg(se);
- raw_spin_unlock_irq(&rq->lock);
}
return 1;
@@ -8601,6 +8657,23 @@ err:
return 0;
}
+void online_fair_sched_group(struct task_group *tg)
+{
+ struct sched_entity *se;
+ struct rq *rq;
+ int i;
+
+ for_each_possible_cpu(i) {
+ rq = cpu_rq(i);
+ se = tg->se[i];
+
+ raw_spin_lock_irq(&rq->lock);
+ post_init_entity_util_avg(se);
+ sync_throttle(tg, i);
+ raw_spin_unlock_irq(&rq->lock);
+ }
+}
+
void unregister_fair_sched_group(struct task_group *tg)
{
unsigned long flags;
@@ -8705,6 +8778,8 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
return 1;
}
+void online_fair_sched_group(struct task_group *tg) { }
+
void unregister_fair_sched_group(struct task_group *tg) { }
#endif /* CONFIG_FAIR_GROUP_SCHED */
@@ -8764,7 +8839,7 @@ const struct sched_class fair_sched_class = {
.update_curr = update_curr_fair,
#ifdef CONFIG_FAIR_GROUP_SCHED
- .task_move_group = task_move_group_fair,
+ .task_change_group = task_change_group_fair,
#endif
};
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index e362a836c..1e855dcbd 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -205,6 +205,8 @@ exit_idle:
*/
static void cpu_idle_loop(void)
{
+ int cpu = smp_processor_id();
+
while (1) {
/*
* If the arch has a polling bit, we maintain an invariant:
@@ -223,7 +225,7 @@ static void cpu_idle_loop(void)
check_pgt_cache();
rmb();
- if (cpu_is_offline(smp_processor_id())) {
+ if (cpu_is_offline(cpu)) {
cpuhp_report_idle_dead();
arch_cpu_idle_dead();
}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 898c0d2f1..c64fc5114 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -321,6 +321,7 @@ extern int tg_nop(struct task_group *tg, void *data);
extern void free_fair_sched_group(struct task_group *tg);
extern int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent);
+extern void online_fair_sched_group(struct task_group *tg);
extern void unregister_fair_sched_group(struct task_group *tg);
extern void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
struct sched_entity *se, int cpu,
@@ -437,7 +438,7 @@ struct cfs_rq {
u64 throttled_clock, throttled_clock_task;
u64 throttled_clock_task_time;
- int throttled, throttle_count, throttle_uptodate;
+ int throttled, throttle_count;
struct list_head throttled_list;
#endif /* CONFIG_CFS_BANDWIDTH */
#endif /* CONFIG_FAIR_GROUP_SCHED */
@@ -1113,7 +1114,7 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
* In particular, the load of prev->state in finish_task_switch() must
* happen before this.
*
- * Pairs with the smp_cond_acquire() in try_to_wake_up().
+ * Pairs with the smp_cond_load_acquire() in try_to_wake_up().
*/
smp_store_release(&prev->on_cpu, 0);
#endif
@@ -1246,8 +1247,11 @@ struct sched_class {
void (*update_curr) (struct rq *rq);
+#define TASK_SET_GROUP 0
+#define TASK_MOVE_GROUP 1
+
#ifdef CONFIG_FAIR_GROUP_SCHED
- void (*task_move_group) (struct task_struct *p);
+ void (*task_change_group) (struct task_struct *p, int type);
#endif
};
@@ -1809,16 +1813,3 @@ static inline void cpufreq_trigger_update(u64 time) {}
#else /* arch_scale_freq_capacity */
#define arch_scale_freq_invariant() (false)
#endif
-
-static inline void account_reset_rq(struct rq *rq)
-{
-#ifdef CONFIG_IRQ_TIME_ACCOUNTING
- rq->prev_irq_time = 0;
-#endif
-#ifdef CONFIG_PARAVIRT
- rq->prev_steal_time = 0;
-#endif
-#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
- rq->prev_steal_time_rq = 0;
-#endif
-}