diff options
Diffstat (limited to 'kernel/sched')
-rw-r--r-- | kernel/sched/bfs.c | 500 | ||||
-rw-r--r-- | kernel/sched/bfs_sched.h | 2 | ||||
-rw-r--r-- | kernel/sched/core.c | 141 | ||||
-rw-r--r-- | kernel/sched/cpuacct.c | 114 | ||||
-rw-r--r-- | kernel/sched/cpudeadline.c | 2 | ||||
-rw-r--r-- | kernel/sched/cpufreq_schedutil.c | 74 | ||||
-rw-r--r-- | kernel/sched/cputime.c | 203 | ||||
-rw-r--r-- | kernel/sched/deadline.c | 5 | ||||
-rw-r--r-- | kernel/sched/debug.c | 2 | ||||
-rw-r--r-- | kernel/sched/fair.c | 251 | ||||
-rw-r--r-- | kernel/sched/idle.c | 4 | ||||
-rw-r--r-- | kernel/sched/sched.h | 23 |
12 files changed, 727 insertions, 594 deletions
diff --git a/kernel/sched/bfs.c b/kernel/sched/bfs.c index 67f93e752..bb5bac4b2 100644 --- a/kernel/sched/bfs.c +++ b/kernel/sched/bfs.c @@ -24,7 +24,7 @@ * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri * 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins, * Thomas Gleixner, Mike Kravetz - * now Brainfuck deadline scheduling policy by Con Kolivas deletes + * 2009-08-13 Brainfuck deadline scheduling policy by Con Kolivas deletes * a whole lot of those previous things. */ @@ -137,7 +137,7 @@ void print_scheduler_version(void) { - printk(KERN_INFO "BFS CPU scheduler v0.502 by Con Kolivas.\n"); + printk(KERN_INFO "BFS CPU scheduler v0.512 by Con Kolivas.\n"); } /* @@ -403,7 +403,6 @@ static inline void grq_lock_irq(void) } static inline void time_lock_grq(struct rq *rq) - __acquires(grq.lock) { grq_lock(); update_clocks(rq); @@ -429,86 +428,35 @@ static inline void grq_unlock_irqrestore(unsigned long *flags) static inline struct rq *task_grq_lock(struct task_struct *p, unsigned long *flags) - __acquires(grq.lock) + __acquires(p->pi_lock) { - grq_lock_irqsave(flags); + raw_spin_lock_irqsave(&p->pi_lock, *flags); + grq_lock(); return task_rq(p); } static inline struct rq *time_task_grq_lock(struct task_struct *p, unsigned long *flags) - __acquires(grq.lock) { struct rq *rq = task_grq_lock(p, flags); - update_clocks(rq); - return rq; -} -static inline struct rq *task_grq_lock_irq(struct task_struct *p) - __acquires(grq.lock) -{ - grq_lock_irq(); - return task_rq(p); -} - -static inline void time_task_grq_lock_irq(struct task_struct *p) - __acquires(grq.lock) -{ - struct rq *rq = task_grq_lock_irq(p); update_clocks(rq); + return rq; } -static inline void task_grq_unlock_irq(void) - __releases(grq.lock) -{ - grq_unlock_irq(); -} - -static inline void task_grq_unlock(unsigned long *flags) - __releases(grq.lock) -{ - grq_unlock_irqrestore(flags); -} - -/** - * grunqueue_is_locked - * - * Returns true if the global runqueue is locked. - * This interface allows printk to be called with the runqueue lock - * held and know whether or not it is OK to wake up the klogd. - */ -bool grunqueue_is_locked(void) -{ - return raw_spin_is_locked(&grq.lock); -} - -void grq_unlock_wait(void) - __releases(grq.lock) +static inline void task_grq_unlock(struct task_struct *p, unsigned long *flags) + __releases(p->pi_lock) { - smp_mb(); /* spin-unlock-wait is not a full memory barrier */ - raw_spin_unlock_wait(&grq.lock); + grq_unlock(); + raw_spin_unlock_irqrestore(&p->pi_lock, *flags); } static inline void time_grq_lock(struct rq *rq, unsigned long *flags) - __acquires(grq.lock) { local_irq_save(*flags); time_lock_grq(rq); } -static inline struct rq *__task_grq_lock(struct task_struct *p) - __acquires(grq.lock) -{ - grq_lock(); - return task_rq(p); -} - -static inline void __task_grq_unlock(void) - __releases(grq.lock) -{ - grq_unlock(); -} - static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) { } @@ -540,6 +488,40 @@ static inline bool deadline_after(u64 deadline, u64 time) } /* + * Deadline is "now" in niffies + (offset by priority). Setting the deadline + * is the key to everything. It distributes cpu fairly amongst tasks of the + * same nice value, it proportions cpu according to nice level, it means the + * task that last woke up the longest ago has the earliest deadline, thus + * ensuring that interactive tasks get low latency on wake up. The CPU + * proportion works out to the square of the virtual deadline difference, so + * this equation will give nice 19 3% CPU compared to nice 0. + */ +static inline u64 prio_deadline_diff(int user_prio) +{ + return (prio_ratios[user_prio] * rr_interval * (MS_TO_NS(1) / 128)); +} + +static inline u64 task_deadline_diff(struct task_struct *p) +{ + return prio_deadline_diff(TASK_USER_PRIO(p)); +} + +static inline u64 static_deadline_diff(int static_prio) +{ + return prio_deadline_diff(USER_PRIO(static_prio)); +} + +static inline int longest_deadline_diff(void) +{ + return prio_deadline_diff(39); +} + +static inline int ms_longest_deadline_diff(void) +{ + return NS_TO_MS(longest_deadline_diff()); +} + +/* * A task that is not running or queued will not have a node set. * A task that is queued but not running will have a node set. * A task that is currently running will have ->on_cpu set but no node set. @@ -561,14 +543,23 @@ static void dequeue_task(struct task_struct *p) sched_info_dequeued(task_rq(p), p); } +#ifdef CONFIG_PREEMPT_RCU +static bool rcu_read_critical(struct task_struct *p) +{ + return p->rcu_read_unlock_special.b.blocked; +} +#else /* CONFIG_PREEMPT_RCU */ +#define rcu_read_critical(p) (false) +#endif /* CONFIG_PREEMPT_RCU */ + /* * To determine if it's safe for a task of SCHED_IDLEPRIO to actually run as * an idle task, we ensure none of the following conditions are met. */ static bool idleprio_suitable(struct task_struct *p) { - return (!freezing(p) && !signal_pending(p) && - !(task_contributes_to_load(p)) && !(p->flags & (PF_EXITING))); + return (!(task_contributes_to_load(p)) && !(p->flags & (PF_EXITING)) && + !signal_pending(p) && !rcu_read_critical(p) && !freezing(p)); } /* @@ -612,9 +603,13 @@ static void enqueue_task(struct task_struct *p, struct rq *rq) sl_id = p->prio; else { sl_id = p->deadline; - /* Set it to cope with 4 left shifts with locality_diff */ - if (p->prio == IDLE_PRIO) - sl_id |= 0x0F00000000000000; + if (idleprio_task(p)) { + /* Set it to cope with 4 left shifts with locality_diff */ + if (p->prio == IDLE_PRIO) + sl_id |= 0x00FF000000000000; + else + sl_id += longest_deadline_diff(); + } } /* * Some architectures don't have better than microsecond resolution @@ -1008,15 +1003,18 @@ static inline void deactivate_task(struct task_struct *p, struct rq *rq) #ifdef CONFIG_SMP void set_task_cpu(struct task_struct *p, unsigned int cpu) { - unsigned int tcpu; - #ifdef CONFIG_LOCKDEP /* - * The caller should hold grq lock. + * The caller should hold either p->pi_lock or grq lock, when changing + * a task's CPU. ->pi_lock for waking tasks, grq lock for runnable tasks. + * + * Furthermore, all task_rq users should acquire both locks, see + * task_grq_lock(). */ - WARN_ON_ONCE(debug_locks && !lockdep_is_held(&grq.lock)); + WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) || + lockdep_is_held(&grq.lock))); #endif - if ((tcpu = task_cpu(p)) == cpu) + if (task_cpu(p) == cpu) return; trace_sched_migrate_task(p, cpu); perf_event_task_migrate(p); @@ -1027,6 +1025,7 @@ void set_task_cpu(struct task_struct *p, unsigned int cpu) * per-task data have been completed by this moment. */ smp_wmb(); + if (p->on_rq) { struct rq *rq = task_rq(p); @@ -1166,7 +1165,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state) ncsw = 0; if (!match_state || p->state == match_state) ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ - task_grq_unlock(&flags); + task_grq_unlock(p, &flags); /* * If it changed from the expected state, bail out now. @@ -1292,9 +1291,7 @@ static inline bool needs_other_cpu(struct task_struct *p, int cpu) static void try_preempt(struct task_struct *p, struct rq *this_rq) { - int cpu, pcpu, highest_prio, highest_cpu; - struct rq *highest_prio_rq; - u64 latest_deadline; + int i, this_entries = this_rq->soft_affined; cpumask_t tmp; if (suitable_idle_cpus(p) && resched_best_idle(p)) @@ -1306,56 +1303,32 @@ static void try_preempt(struct task_struct *p, struct rq *this_rq) cpumask_and(&tmp, &cpu_online_map, &p->cpus_allowed); - /* See if this task can preempt the task on the current CPU first. */ - pcpu = cpu_of(this_rq); - if (likely(cpumask_test_cpu(pcpu, &tmp))) { - if (smt_schedule(p, this_rq) && can_preempt(p, this_rq->rq_prio, this_rq->rq_deadline)) { - resched_curr(this_rq); - return; - } - cpumask_clear_cpu(pcpu, &tmp); - } - - highest_prio = latest_deadline = 0; - highest_prio_rq = NULL; - - /* Now look for the CPU with the latest deadline */ - for_each_cpu(cpu, &tmp) { - struct rq *rq; - int rq_prio; - u64 dl; + /* + * We iterate over CPUs in locality order using rq_order, finding the + * first one we can preempt if possible, thus staying closest in + * locality. + */ + for (i = 0; i < num_possible_cpus(); i++) { + struct rq *rq = this_rq->rq_order[i]; - rq = cpu_rq(cpu); - rq_prio = rq->rq_prio; - if (rq_prio < highest_prio) + if (!cpumask_test_cpu(rq->cpu, &tmp)) continue; - dl = rq->rq_deadline; - if (!sched_interactive && pcpu != cpu) - dl <<= locality_diff(pcpu, rq); - if (rq_prio > highest_prio || - deadline_after(dl, latest_deadline)) { - latest_deadline = dl; - highest_prio = rq_prio; - highest_cpu = cpu; - highest_prio_rq = rq; + if (!sched_interactive && rq != this_rq && rq->soft_affined <= this_entries) + continue; + if (smt_schedule(p, rq) && can_preempt(p, rq->rq_prio, rq->rq_deadline)) { + /* + * If we have decided this task should preempt this CPU, + * set the task's CPU to match thereby speeding up matching + * this task in earliest_deadline_task. + */ + set_task_cpu(p, rq->cpu); + resched_curr(rq); + return; } } - - if (unlikely(!highest_prio_rq)) - return; - if (!smt_schedule(p, highest_prio_rq)) - return; - if (can_preempt(p, highest_prio, latest_deadline)) { - /* - * If we have decided this task should preempt this CPU, - * set the task's CPU to match thereby speeding up matching - * this task in earliest_deadline_task. - */ - set_task_cpu(p, highest_cpu); - resched_curr(highest_prio_rq); - } } + static int __set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask, bool check); #else /* CONFIG_SMP */ @@ -1501,8 +1474,6 @@ static bool try_to_wake_up(struct task_struct *p, unsigned int state, struct rq *rq; int cpu; - get_cpu(); - /* * If we are going to wake up a thread waiting for CONDITION we * need to ensure that CONDITION=1 done by the caller can not be @@ -1533,13 +1504,11 @@ static bool try_to_wake_up(struct task_struct *p, unsigned int state, out_running: ttwu_post_activation(p, rq, success); out_unlock: - task_grq_unlock(&flags); + task_grq_unlock(p, &flags); if (schedstat_enabled()) ttwu_stat(p, cpu, wake_flags); - put_cpu(); - return success; } @@ -1629,6 +1598,13 @@ int sched_fork(unsigned long __maybe_unused clone_flags, struct task_struct *p) skiplist_node_init(&p->node); /* + * We mark the process as NEW here. This guarantees that + * nobody will actually run it, and a signal or other external + * event cannot wake it up and insert it on the runqueue either. + */ + p->state = TASK_NEW; + + /* * Revert to default priority/policy on fork if requested. */ if (unlikely(p->sched_reset_on_fork)) { @@ -1744,12 +1720,16 @@ static inline void init_schedstats(void) {} */ void wake_up_new_task(struct task_struct *p) { - struct task_struct *parent; + struct task_struct *parent, *rq_curr; + struct rq *rq, *new_rq; unsigned long flags; - struct rq *rq; parent = p->parent; rq = task_grq_lock(p, &flags); + if (unlikely(needs_other_cpu(p, task_cpu(p)))) + set_task_cpu(p, cpumask_any(tsk_cpus_allowed(p))); + rq_curr = rq->curr; + p->state = TASK_RUNNING; /* * Reinit new task deadline as its creator deadline could have changed @@ -1757,22 +1737,20 @@ void wake_up_new_task(struct task_struct *p) */ p->deadline = rq->rq_deadline; - /* - * If the task is a new process, current and parent are the same. If - * the task is a new thread in the thread group, it will have much more - * in common with current than with the parent. - */ - set_task_cpu(p, task_cpu(rq->curr)); + /* The new task might not be able to run on the same CPU as rq->curr */ + if (unlikely(needs_other_cpu(p, task_cpu(p)))) { + set_task_cpu(p, cpumask_any(tsk_cpus_allowed(p))); + new_rq = task_rq(p); + } else + new_rq = rq; /* * Make sure we do not leak PI boosting priority to the child. */ - p->prio = rq->curr->normal_prio; + p->prio = rq_curr->normal_prio; activate_task(p, rq); trace_sched_wakeup_new(p); - if (unlikely(p->policy == SCHED_FIFO)) - goto after_ts_init; /* * Share the timeslice between parent and child, thus the @@ -1784,33 +1762,39 @@ void wake_up_new_task(struct task_struct *p) * is always equal to current->deadline. */ p->last_ran = rq->rq_last_ran; - if (likely(rq->rq_time_slice >= RESCHED_US * 2)) { + if (likely(rq_curr->policy != SCHED_FIFO)) { rq->rq_time_slice /= 2; - p->time_slice = rq->rq_time_slice; -after_ts_init: - if (rq->curr == parent && !suitable_idle_cpus(p)) { + if (unlikely(rq->rq_time_slice < RESCHED_US)) { /* - * The VM isn't cloned, so we're in a good position to - * do child-runs-first in anticipation of an exec. This - * usually avoids a lot of COW overhead. + * Forking task has run out of timeslice. Reschedule it and + * start its child with a new time slice and deadline. The + * child will end up running first because its deadline will + * be slightly earlier. */ - __set_tsk_resched(parent); - } else - try_preempt(p, rq); - } else { - if (rq->curr == parent) { - /* - * Forking task has run out of timeslice. Reschedule it and - * start its child with a new time slice and deadline. The - * child will end up running first because its deadline will - * be slightly earlier. - */ rq->rq_time_slice = 0; - __set_tsk_resched(parent); + __set_tsk_resched(rq_curr); + time_slice_expired(p); + if (suitable_idle_cpus(p)) + resched_best_idle(p); + else if (unlikely(rq != new_rq)) + try_preempt(p, new_rq); + } else { + p->time_slice = rq->rq_time_slice; + if (rq_curr == parent && rq == new_rq && !suitable_idle_cpus(p)) { + /* + * The VM isn't cloned, so we're in a good position to + * do child-runs-first in anticipation of an exec. This + * usually avoids a lot of COW overhead. + */ + __set_tsk_resched(rq_curr); + } else + try_preempt(p, new_rq); } + } else { time_slice_expired(p); + try_preempt(p, new_rq); } - task_grq_unlock(&flags); + task_grq_unlock(p, &flags); } #ifdef CONFIG_PREEMPT_NOTIFIERS @@ -2724,7 +2708,7 @@ unsigned long long task_sched_runtime(struct task_struct *p) rq = task_grq_lock(p, &flags); ns = p->sched_time + do_task_delta_exec(p, rq); - task_grq_unlock(&flags); + task_grq_unlock(p, &flags); return ns; } @@ -2978,7 +2962,7 @@ static void task_running_tick(struct rq *rq) grq_lock(); requeue_task(p); - __set_tsk_resched(p); + resched_task(p); grq_unlock(); } @@ -3083,40 +3067,6 @@ static inline void preempt_latency_stop(int val) { } #endif /* - * Deadline is "now" in niffies + (offset by priority). Setting the deadline - * is the key to everything. It distributes cpu fairly amongst tasks of the - * same nice value, it proportions cpu according to nice level, it means the - * task that last woke up the longest ago has the earliest deadline, thus - * ensuring that interactive tasks get low latency on wake up. The CPU - * proportion works out to the square of the virtual deadline difference, so - * this equation will give nice 19 3% CPU compared to nice 0. - */ -static inline u64 prio_deadline_diff(int user_prio) -{ - return (prio_ratios[user_prio] * rr_interval * (MS_TO_NS(1) / 128)); -} - -static inline u64 task_deadline_diff(struct task_struct *p) -{ - return prio_deadline_diff(TASK_USER_PRIO(p)); -} - -static inline u64 static_deadline_diff(int static_prio) -{ - return prio_deadline_diff(USER_PRIO(static_prio)); -} - -static inline int longest_deadline_diff(void) -{ - return prio_deadline_diff(39); -} - -static inline int ms_longest_deadline_diff(void) -{ - return NS_TO_MS(longest_deadline_diff()); -} - -/* * The time_slice is only refilled when it is empty and that is when we set a * new deadline. */ @@ -3215,13 +3165,12 @@ found_middle: static inline struct task_struct *earliest_deadline_task(struct rq *rq, int cpu, struct task_struct *idle) { - struct task_struct *edt = idle; skiplist_node *node = &grq.node; + struct task_struct *edt = idle; u64 earliest_deadline = ~0ULL; while ((node = node->next[0]) != &grq.node) { struct task_struct *p = node->value; - int tcpu; /* Make sure affinity is ok */ if (needs_other_cpu(p, cpu)) @@ -3230,22 +3179,24 @@ task_struct *earliest_deadline_task(struct rq *rq, int cpu, struct task_struct * if (!smt_schedule(p, rq)) continue; - if (!sched_interactive && (tcpu = task_cpu(p)) != cpu) { - u64 dl = p->deadline << locality_diff(tcpu, rq); + if (!sched_interactive) { + int tcpu; + + if ((tcpu = task_cpu(p)) != cpu) { + u64 dl = p->deadline << locality_diff(tcpu, rq); - if (unlikely(!deadline_before(dl, earliest_deadline))) + if (!deadline_before(dl, earliest_deadline)) + continue; + earliest_deadline = dl; + edt = p; + /* We continue even though we've found the earliest + * deadline task as the locality offset means there + * may be a better candidate after it. */ continue; - earliest_deadline = dl; - edt = p; - /* We continue even though we've found the earliest - * deadline task as the locality offset means there - * may be a better candidate after it. */ - continue; + } } - /* This wouldn't happen if we encountered a better deadline from - * another CPU and have already set edt. */ - if (likely(p->deadline < earliest_deadline)) - edt = p; + /* We've encountered the best deadline local task */ + edt = p; break; } if (likely(edt != idle)) @@ -3275,6 +3226,9 @@ static noinline void __schedule_bug(struct task_struct *prev) pr_cont("\n"); } #endif + if (panic_on_warn) + panic("scheduling while atomic\n"); + dump_stack(); add_taint(TAINT_WARN, LOCKDEP_STILL_OK); } @@ -3316,10 +3270,6 @@ static inline void set_rq_task(struct rq *rq, struct task_struct *p) rq->rq_mm = p->mm; rq->rq_smt_bias = p->smt_bias; #endif - if (p != rq->idle) - rq->rq_running = true; - else - rq->rq_running = false; } static void reset_rq_task(struct rq *rq, struct task_struct *p) @@ -3353,7 +3303,7 @@ static void check_smt_siblings(struct rq *this_rq) if (unlikely(!rq->online)) continue; p = rq->curr; - if (!smt_should_schedule(p, this_rq)) { + if (!smt_schedule(p, this_rq)) { set_tsk_need_resched(p); smp_send_reschedule(other_cpu); } @@ -3546,8 +3496,6 @@ static void __sched notrace __schedule(bool preempt) trace_sched_switch(preempt, prev, next); rq = context_switch(rq, prev, next); /* unlocks the grq */ - cpu = cpu_of(rq); - idle = rq->idle; } else { check_siblings(rq); grq_unlock_irq(); @@ -3766,8 +3714,8 @@ EXPORT_SYMBOL(default_wake_function); void rt_mutex_setprio(struct task_struct *p, int prio) { unsigned long flags; - int queued, oldprio; struct rq *rq; + int oldprio; BUG_ON(prio < 0 || prio > MAX_PRIO); @@ -3793,19 +3741,18 @@ void rt_mutex_setprio(struct task_struct *p, int prio) trace_sched_pi_setprio(p, prio); oldprio = p->prio; - queued = task_queued(p); - if (queued) - dequeue_task(p); p->prio = prio; - if (task_running(p) && prio > oldprio) - resched_task(p); - if (queued) { + if (task_running(p)){ + if (prio > oldprio) + resched_task(p); + } else if (task_queued(p)) { + dequeue_task(p); enqueue_task(p, rq); - try_preempt(p, rq); + if (prio < oldprio) + try_preempt(p, rq); } - out_unlock: - task_grq_unlock(&flags); + task_grq_unlock(p, &flags); } #endif @@ -3821,7 +3768,7 @@ static inline void adjust_deadline(struct task_struct *p, int new_prio) void set_user_nice(struct task_struct *p, long nice) { - int queued, new_static, old_static; + int new_static, old_static; unsigned long flags; struct rq *rq; @@ -3843,16 +3790,14 @@ void set_user_nice(struct task_struct *p, long nice) p->static_prio = new_static; goto out_unlock; } - queued = task_queued(p); - if (queued) - dequeue_task(p); adjust_deadline(p, new_static); old_static = p->static_prio; p->static_prio = new_static; p->prio = effective_prio(p); - if (queued) { + if (task_queued(p)) { + dequeue_task(p); enqueue_task(p, rq); if (new_static < old_static) try_preempt(p, rq); @@ -3862,7 +3807,7 @@ void set_user_nice(struct task_struct *p, long nice) resched_task(p); } out_unlock: - task_grq_unlock(&flags); + task_grq_unlock(p, &flags); } EXPORT_SYMBOL(set_user_nice); @@ -4002,11 +3947,15 @@ static void __setscheduler(struct task_struct *p, struct rq *rq, int policy, p->prio = rt_mutex_get_effective_prio(p, p->normal_prio); } else p->prio = p->normal_prio; + if (task_running(p)) { reset_rq_task(rq, p); - /* Resched only if we might now be preempted */ - if (p->prio > oldprio || p->rt_priority > oldrtprio) - resched_task(p); + resched_task(p); + } else if (task_queued(p)) { + dequeue_task(p); + enqueue_task(p, rq); + if (p->prio < oldprio || p->rt_priority > oldrtprio) + try_preempt(p, rq); } } @@ -4031,8 +3980,8 @@ __sched_setscheduler(struct task_struct *p, int policy, const struct sched_param *param, bool user, bool pi) { struct sched_param zero_param = { .sched_priority = 0 }; - int queued, retval, oldpolicy = -1; unsigned long flags, rlim_rtprio = 0; + int retval, oldpolicy = -1; int reset_on_fork; struct rq *rq; @@ -4142,20 +4091,17 @@ recheck: /* * make sure no PI-waiters arrive (or leave) while we are * changing the priority of the task: - */ - raw_spin_lock_irqsave(&p->pi_lock, flags); - /* + * * To be able to change p->policy safely, the grunqueue lock must be * held. */ - rq = __task_grq_lock(p); + rq = task_grq_lock(p, &flags); /* * Changing the policy of the stop threads its a very bad idea */ if (p == rq->stop) { - __task_grq_unlock(); - raw_spin_unlock_irqrestore(&p->pi_lock, flags); + task_grq_unlock(p, &flags); return -EINVAL; } @@ -4165,31 +4111,21 @@ recheck: if (unlikely(policy == p->policy && (!is_rt_policy(policy) || param->sched_priority == p->rt_priority))) { - __task_grq_unlock(); - raw_spin_unlock_irqrestore(&p->pi_lock, flags); + task_grq_unlock(p, &flags); return 0; } /* recheck policy now with rq lock held */ if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { policy = oldpolicy = -1; - __task_grq_unlock(); - raw_spin_unlock_irqrestore(&p->pi_lock, flags); + task_grq_unlock(p, &flags); goto recheck; } update_clocks(rq); p->sched_reset_on_fork = reset_on_fork; - queued = task_queued(p); - if (queued) - dequeue_task(p); __setscheduler(p, rq, policy, param->sched_priority, pi); - if (queued) { - enqueue_task(p, rq); - try_preempt(p, rq); - } - __task_grq_unlock(); - raw_spin_unlock_irqrestore(&p->pi_lock, flags); + task_grq_unlock(p, &flags); if (pi) rt_mutex_adjust_pi(p); @@ -4706,7 +4642,8 @@ out_unlock: * @len: length in bytes of the bitmask pointed to by user_mask_ptr * @user_mask_ptr: user-space pointer to hold the current cpu mask * - * Return: 0 on success. An error code otherwise. + * Return: size of CPU mask copied to user_mask_ptr on success. An + * error code otherwise. */ SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, unsigned long __user *, user_mask_ptr) @@ -5113,6 +5050,8 @@ void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_ma void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) { cpumask_copy(tsk_cpus_allowed(p), new_mask); + if (needs_other_cpu(p, task_cpu(p))) + set_task_cpu(p, cpumask_any(tsk_cpus_allowed(p))); } #endif @@ -5376,6 +5315,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, { const struct cpumask *cpu_valid_mask = cpu_active_mask; bool running_wrong = false; + struct cpumask old_mask; bool queued = false; unsigned long flags; struct rq *rq; @@ -5399,7 +5339,8 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, goto out; } - if (cpumask_equal(tsk_cpus_allowed(p), new_mask)) + cpumask_copy(&old_mask, tsk_cpus_allowed(p)); + if (cpumask_equal(&old_mask, new_mask)) goto out; if (!cpumask_intersects(new_mask, cpu_valid_mask)) { @@ -5436,12 +5377,16 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, set_task_cpu(p, cpumask_any_and(cpu_valid_mask, new_mask)); out: - if (queued) + if (queued && !cpumask_subset(new_mask, &old_mask)) try_preempt(p, rq); - task_grq_unlock(&flags); - if (running_wrong) - preempt_schedule_common(); + preempt_disable(); + task_grq_unlock(p, &flags); + + if (running_wrong) { + __schedule(true); + preempt_enable(); + } return ret; } @@ -5471,6 +5416,11 @@ static void bind_zero(int src_cpu) cpumask_set_cpu(0, tsk_cpus_allowed(p)); p->zerobound = true; bound++; + if (task_cpu(p) == src_cpu) { + set_task_cpu(p, 0); + if (task_running(p)) + resched_task(p); + } } } while_each_thread(t, p); @@ -7008,6 +6958,7 @@ void __init sched_init_smp(void) #ifdef CONFIG_SCHED_SMT bool smt_threads = false; #endif + struct rq *rq; cpumask_var_t non_isolated_cpus; @@ -7045,7 +6996,7 @@ void __init sched_init_smp(void) * nodes) are treated as very distant. */ for_each_online_cpu(cpu) { - struct rq *rq = cpu_rq(cpu); + rq = cpu_rq(cpu); /* First check if this cpu is in the same node */ for_each_domain(cpu, sd) { @@ -7084,6 +7035,17 @@ void __init sched_init_smp(void) } #endif } + for_each_possible_cpu(cpu) { + int total_cpus = 0, locality; + + rq = cpu_rq(cpu); + for (locality = 0; locality <= 4; locality++) { + for_each_possible_cpu(other_cpu) { + if (rq->cpu_locality[other_cpu] == locality) + rq->rq_order[total_cpus++] = cpu_rq(other_cpu); + } + } + } #ifdef CONFIG_SMT_NICE if (smt_threads) { check_siblings = &check_smt_siblings; @@ -7095,7 +7057,8 @@ void __init sched_init_smp(void) mutex_unlock(&sched_domains_mutex); for_each_online_cpu(cpu) { - struct rq *rq = cpu_rq(cpu); + rq = cpu_rq(cpu); + for_each_online_cpu(other_cpu) { if (other_cpu <= cpu) continue; @@ -7220,6 +7183,10 @@ void __init sched_init(void) else rq->cpu_locality[j] = 4; } + rq->rq_order = kmalloc(cpu_ids * sizeof(struct rq *), GFP_ATOMIC); + rq->rq_order[0] = rq; + for (j = 1; j < cpu_ids; j++) + rq->rq_order[j] = cpu_rq(j); } #endif @@ -7323,7 +7290,6 @@ static inline void normalise_rt_tasks(void) struct task_struct *g, *p; unsigned long flags; struct rq *rq; - int queued; read_lock(&tasklist_lock); for_each_process_thread(g, p) { @@ -7337,16 +7303,8 @@ static inline void normalise_rt_tasks(void) continue; rq = task_grq_lock(p, &flags); - queued = task_queued(p); - if (queued) - dequeue_task(p); __setscheduler(p, rq, SCHED_NORMAL, 0, false); - if (queued) { - enqueue_task(p, rq); - try_preempt(p, rq); - } - - task_grq_unlock(&flags); + task_grq_unlock(p, &flags); } read_unlock(&tasklist_lock); } diff --git a/kernel/sched/bfs_sched.h b/kernel/sched/bfs_sched.h index e7fe1d0a5..00a16ba0a 100644 --- a/kernel/sched/bfs_sched.h +++ b/kernel/sched/bfs_sched.h @@ -22,7 +22,6 @@ struct rq { int rq_time_slice; u64 rq_last_ran; int rq_prio; - bool rq_running; /* There is a task running */ int soft_affined; /* Running or queued tasks with this set as their rq */ u64 load_update; /* When we last updated load */ unsigned long load_avg; /* Rolling load average */ @@ -43,6 +42,7 @@ struct rq { struct root_domain *rd; struct sched_domain *sd; int *cpu_locality; /* CPU relative cache distance */ + struct rq **rq_order; /* RQs ordered by relative cache distance */ #ifdef CONFIG_SCHED_SMT cpumask_t thread_mask; bool (*siblings_idle)(struct rq *rq); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 38eacc323..44817c640 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -74,6 +74,7 @@ #include <linux/context_tracking.h> #include <linux/compiler.h> #include <linux/frame.h> +#include <linux/prefetch.h> #include <asm/switch_to.h> #include <asm/tlb.h> @@ -1937,7 +1938,7 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) * chain to provide order. Instead we do: * * 1) smp_store_release(X->on_cpu, 0) - * 2) smp_cond_acquire(!X->on_cpu) + * 2) smp_cond_load_acquire(!X->on_cpu) * * Example: * @@ -1948,7 +1949,7 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) * sched-out X * smp_store_release(X->on_cpu, 0); * - * smp_cond_acquire(!X->on_cpu); + * smp_cond_load_acquire(&X->on_cpu, !VAL); * X->state = WAKING * set_task_cpu(X,2) * @@ -1974,7 +1975,7 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) * This means that any means of doing remote wakeups must order the CPU doing * the wakeup against the CPU the task is going to end up running on. This, * however, is already required for the regular Program-Order guarantee above, - * since the waking CPU is the one issueing the ACQUIRE (smp_cond_acquire). + * since the waking CPU is the one issueing the ACQUIRE (smp_cond_load_acquire). * */ @@ -2069,7 +2070,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) * This ensures that tasks getting woken will be fully ordered against * their previous state and preserve Program Order. */ - smp_cond_acquire(!p->on_cpu); + smp_cond_load_acquire(&p->on_cpu, !VAL); p->sched_contributes_to_load = !!task_contributes_to_load(p); p->state = TASK_WAKING; @@ -2364,11 +2365,11 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) __sched_fork(clone_flags, p); /* - * We mark the process as running here. This guarantees that + * We mark the process as NEW here. This guarantees that * nobody will actually run it, and a signal or other external * event cannot wake it up and insert it on the runqueue either. */ - p->state = TASK_RUNNING; + p->state = TASK_NEW; /* * Make sure we do not leak PI boosting priority to the child. @@ -2405,8 +2406,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) p->sched_class = &fair_sched_class; } - if (p->sched_class->task_fork) - p->sched_class->task_fork(p); + init_entity_runnable_average(&p->se); /* * The child is not yet in the pid-hash so no cgroup attach races, @@ -2416,7 +2416,13 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) * Silence PROVE_RCU. */ raw_spin_lock_irqsave(&p->pi_lock, flags); - set_task_cpu(p, cpu); + /* + * We're setting the cpu for the first time, we don't migrate, + * so use __set_task_cpu(). + */ + __set_task_cpu(p, cpu); + if (p->sched_class->task_fork) + p->sched_class->task_fork(p); raw_spin_unlock_irqrestore(&p->pi_lock, flags); #ifdef CONFIG_SCHED_INFO @@ -2548,16 +2554,18 @@ void wake_up_new_task(struct task_struct *p) struct rq_flags rf; struct rq *rq; - /* Initialize new task's runnable average */ - init_entity_runnable_average(&p->se); raw_spin_lock_irqsave(&p->pi_lock, rf.flags); + p->state = TASK_RUNNING; #ifdef CONFIG_SMP /* * Fork balancing, do it here and not earlier because: * - cpus_allowed can change in the fork path * - any previously selected cpu might disappear through hotplug + * + * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq, + * as we're not fully set-up yet. */ - set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0)); + __set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0)); #endif rq = __task_rq_lock(p, &rf); post_init_entity_util_avg(&p->se); @@ -2987,6 +2995,23 @@ EXPORT_PER_CPU_SYMBOL(kstat); EXPORT_PER_CPU_SYMBOL(kernel_cpustat); /* + * The function fair_sched_class.update_curr accesses the struct curr + * and its field curr->exec_start; when called from task_sched_runtime(), + * we observe a high rate of cache misses in practice. + * Prefetching this data results in improved performance. + */ +static inline void prefetch_curr_exec_start(struct task_struct *p) +{ +#ifdef CONFIG_FAIR_GROUP_SCHED + struct sched_entity *curr = (&p->se)->cfs_rq->curr; +#else + struct sched_entity *curr = (&task_rq(p)->cfs)->curr; +#endif + prefetch(curr); + prefetch(&curr->exec_start); +} + +/* * Return accounted runtime for the task. * In case the task is currently running, return the runtime plus current's * pending runtime that have not been accounted yet. @@ -3020,6 +3045,7 @@ unsigned long long task_sched_runtime(struct task_struct *p) * thread, breaking clock_gettime(). */ if (task_current(rq, p) && task_on_rq_queued(p)) { + prefetch_curr_exec_start(p); update_rq_clock(rq); p->sched_class->update_curr(rq); } @@ -3183,6 +3209,9 @@ static noinline void __schedule_bug(struct task_struct *prev) pr_cont("\n"); } #endif + if (panic_on_warn) + panic("scheduling while atomic\n"); + dump_stack(); add_taint(TAINT_WARN, LOCKDEP_STILL_OK); } @@ -4774,7 +4803,8 @@ out_unlock: * @len: length in bytes of the bitmask pointed to by user_mask_ptr * @user_mask_ptr: user-space pointer to hold the current cpu mask * - * Return: 0 on success. An error code otherwise. + * Return: size of CPU mask copied to user_mask_ptr on success. An + * error code otherwise. */ SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, unsigned long __user *, user_mask_ptr) @@ -7255,7 +7285,6 @@ static void sched_rq_cpu_starting(unsigned int cpu) struct rq *rq = cpu_rq(cpu); rq->calc_load_update = calc_load_update; - account_reset_rq(rq); update_max_interval(); } @@ -7735,6 +7764,8 @@ void sched_online_group(struct task_group *tg, struct task_group *parent) INIT_LIST_HEAD(&tg->children); list_add_rcu(&tg->siblings, &parent->children); spin_unlock_irqrestore(&task_group_lock, flags); + + online_fair_sched_group(tg); } /* rcu callback to free various structures associated with a task group */ @@ -7763,27 +7794,9 @@ void sched_offline_group(struct task_group *tg) spin_unlock_irqrestore(&task_group_lock, flags); } -/* change task's runqueue when it moves between groups. - * The caller of this function should have put the task in its new group - * by now. This function just updates tsk->se.cfs_rq and tsk->se.parent to - * reflect its new group. - */ -void sched_move_task(struct task_struct *tsk) +static void sched_change_group(struct task_struct *tsk, int type) { struct task_group *tg; - int queued, running; - struct rq_flags rf; - struct rq *rq; - - rq = task_rq_lock(tsk, &rf); - - running = task_current(rq, tsk); - queued = task_on_rq_queued(tsk); - - if (queued) - dequeue_task(rq, tsk, DEQUEUE_SAVE | DEQUEUE_MOVE); - if (unlikely(running)) - put_prev_task(rq, tsk); /* * All callers are synchronized by task_rq_lock(); we do not use RCU @@ -7796,11 +7809,37 @@ void sched_move_task(struct task_struct *tsk) tsk->sched_task_group = tg; #ifdef CONFIG_FAIR_GROUP_SCHED - if (tsk->sched_class->task_move_group) - tsk->sched_class->task_move_group(tsk); + if (tsk->sched_class->task_change_group) + tsk->sched_class->task_change_group(tsk, type); else #endif set_task_rq(tsk, task_cpu(tsk)); +} + +/* + * Change task's runqueue when it moves between groups. + * + * The caller of this function should have put the task in its new group by + * now. This function just updates tsk->se.cfs_rq and tsk->se.parent to reflect + * its new group. + */ +void sched_move_task(struct task_struct *tsk) +{ + int queued, running; + struct rq_flags rf; + struct rq *rq; + + rq = task_rq_lock(tsk, &rf); + + running = task_current(rq, tsk); + queued = task_on_rq_queued(tsk); + + if (queued) + dequeue_task(rq, tsk, DEQUEUE_SAVE | DEQUEUE_MOVE); + if (unlikely(running)) + put_prev_task(rq, tsk); + + sched_change_group(tsk, TASK_MOVE_GROUP); if (unlikely(running)) tsk->sched_class->set_curr_task(rq); @@ -8228,15 +8267,27 @@ static void cpu_cgroup_css_free(struct cgroup_subsys_state *css) sched_free_group(tg); } +/* + * This is called before wake_up_new_task(), therefore we really only + * have to set its group bits, all the other stuff does not apply. + */ static void cpu_cgroup_fork(struct task_struct *task) { - sched_move_task(task); + struct rq_flags rf; + struct rq *rq; + + rq = task_rq_lock(task, &rf); + + sched_change_group(task, TASK_SET_GROUP); + + task_rq_unlock(rq, task, &rf); } static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) { struct task_struct *task; struct cgroup_subsys_state *css; + int ret = 0; cgroup_taskset_for_each(task, css, tset) { #ifdef CONFIG_RT_GROUP_SCHED @@ -8247,8 +8298,24 @@ static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) if (task->sched_class != &fair_sched_class) return -EINVAL; #endif + /* + * Serialize against wake_up_new_task() such that if its + * running, we're sure to observe its full state. + */ + raw_spin_lock_irq(&task->pi_lock); + /* + * Avoid calling sched_move_task() before wake_up_new_task() + * has happened. This would lead to problems with PELT, due to + * move wanting to detach+attach while we're not attached yet. + */ + if (task->state == TASK_NEW) + ret = -EINVAL; + raw_spin_unlock_irq(&task->pi_lock); + + if (ret) + break; } - return 0; + return ret; } static void cpu_cgroup_attach(struct cgroup_taskset *tset) diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index 41f85c4d0..bc0b309c3 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -25,15 +25,13 @@ enum cpuacct_stat_index { CPUACCT_STAT_NSTATS, }; -enum cpuacct_usage_index { - CPUACCT_USAGE_USER, /* ... user mode */ - CPUACCT_USAGE_SYSTEM, /* ... kernel mode */ - - CPUACCT_USAGE_NRUSAGE, +static const char * const cpuacct_stat_desc[] = { + [CPUACCT_STAT_USER] = "user", + [CPUACCT_STAT_SYSTEM] = "system", }; struct cpuacct_usage { - u64 usages[CPUACCT_USAGE_NRUSAGE]; + u64 usages[CPUACCT_STAT_NSTATS]; }; /* track cpu usage of a group of tasks and its child groups */ @@ -108,16 +106,16 @@ static void cpuacct_css_free(struct cgroup_subsys_state *css) } static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu, - enum cpuacct_usage_index index) + enum cpuacct_stat_index index) { struct cpuacct_usage *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); u64 data; /* - * We allow index == CPUACCT_USAGE_NRUSAGE here to read + * We allow index == CPUACCT_STAT_NSTATS here to read * the sum of suages. */ - BUG_ON(index > CPUACCT_USAGE_NRUSAGE); + BUG_ON(index > CPUACCT_STAT_NSTATS); #ifndef CONFIG_64BIT /* @@ -126,11 +124,11 @@ static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu, raw_spin_lock_irq(&cpu_rq(cpu)->lock); #endif - if (index == CPUACCT_USAGE_NRUSAGE) { + if (index == CPUACCT_STAT_NSTATS) { int i = 0; data = 0; - for (i = 0; i < CPUACCT_USAGE_NRUSAGE; i++) + for (i = 0; i < CPUACCT_STAT_NSTATS; i++) data += cpuusage->usages[i]; } else { data = cpuusage->usages[index]; @@ -155,7 +153,7 @@ static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) raw_spin_lock_irq(&cpu_rq(cpu)->lock); #endif - for (i = 0; i < CPUACCT_USAGE_NRUSAGE; i++) + for (i = 0; i < CPUACCT_STAT_NSTATS; i++) cpuusage->usages[i] = val; #ifndef CONFIG_64BIT @@ -165,7 +163,7 @@ static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) /* return total cpu usage (in nanoseconds) of a group */ static u64 __cpuusage_read(struct cgroup_subsys_state *css, - enum cpuacct_usage_index index) + enum cpuacct_stat_index index) { struct cpuacct *ca = css_ca(css); u64 totalcpuusage = 0; @@ -180,18 +178,18 @@ static u64 __cpuusage_read(struct cgroup_subsys_state *css, static u64 cpuusage_user_read(struct cgroup_subsys_state *css, struct cftype *cft) { - return __cpuusage_read(css, CPUACCT_USAGE_USER); + return __cpuusage_read(css, CPUACCT_STAT_USER); } static u64 cpuusage_sys_read(struct cgroup_subsys_state *css, struct cftype *cft) { - return __cpuusage_read(css, CPUACCT_USAGE_SYSTEM); + return __cpuusage_read(css, CPUACCT_STAT_SYSTEM); } static u64 cpuusage_read(struct cgroup_subsys_state *css, struct cftype *cft) { - return __cpuusage_read(css, CPUACCT_USAGE_NRUSAGE); + return __cpuusage_read(css, CPUACCT_STAT_NSTATS); } static int cpuusage_write(struct cgroup_subsys_state *css, struct cftype *cft, @@ -213,7 +211,7 @@ static int cpuusage_write(struct cgroup_subsys_state *css, struct cftype *cft, } static int __cpuacct_percpu_seq_show(struct seq_file *m, - enum cpuacct_usage_index index) + enum cpuacct_stat_index index) { struct cpuacct *ca = css_ca(seq_css(m)); u64 percpu; @@ -229,48 +227,78 @@ static int __cpuacct_percpu_seq_show(struct seq_file *m, static int cpuacct_percpu_user_seq_show(struct seq_file *m, void *V) { - return __cpuacct_percpu_seq_show(m, CPUACCT_USAGE_USER); + return __cpuacct_percpu_seq_show(m, CPUACCT_STAT_USER); } static int cpuacct_percpu_sys_seq_show(struct seq_file *m, void *V) { - return __cpuacct_percpu_seq_show(m, CPUACCT_USAGE_SYSTEM); + return __cpuacct_percpu_seq_show(m, CPUACCT_STAT_SYSTEM); } static int cpuacct_percpu_seq_show(struct seq_file *m, void *V) { - return __cpuacct_percpu_seq_show(m, CPUACCT_USAGE_NRUSAGE); + return __cpuacct_percpu_seq_show(m, CPUACCT_STAT_NSTATS); } -static const char * const cpuacct_stat_desc[] = { - [CPUACCT_STAT_USER] = "user", - [CPUACCT_STAT_SYSTEM] = "system", -}; +static int cpuacct_all_seq_show(struct seq_file *m, void *V) +{ + struct cpuacct *ca = css_ca(seq_css(m)); + int index; + int cpu; + + seq_puts(m, "cpu"); + for (index = 0; index < CPUACCT_STAT_NSTATS; index++) + seq_printf(m, " %s", cpuacct_stat_desc[index]); + seq_puts(m, "\n"); + + for_each_possible_cpu(cpu) { + struct cpuacct_usage *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); + + seq_printf(m, "%d", cpu); + + for (index = 0; index < CPUACCT_STAT_NSTATS; index++) { +#ifndef CONFIG_64BIT + /* + * Take rq->lock to make 64-bit read safe on 32-bit + * platforms. + */ + raw_spin_lock_irq(&cpu_rq(cpu)->lock); +#endif + + seq_printf(m, " %llu", cpuusage->usages[index]); + +#ifndef CONFIG_64BIT + raw_spin_unlock_irq(&cpu_rq(cpu)->lock); +#endif + } + seq_puts(m, "\n"); + } + return 0; +} static int cpuacct_stats_show(struct seq_file *sf, void *v) { struct cpuacct *ca = css_ca(seq_css(sf)); + s64 val[CPUACCT_STAT_NSTATS]; int cpu; - s64 val = 0; + int stat; + memset(val, 0, sizeof(val)); for_each_possible_cpu(cpu) { - struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu); - val += kcpustat->cpustat[CPUTIME_USER]; - val += kcpustat->cpustat[CPUTIME_NICE]; - } - val = cputime64_to_clock_t(val); - seq_printf(sf, "%s %lld\n", cpuacct_stat_desc[CPUACCT_STAT_USER], val); + u64 *cpustat = per_cpu_ptr(ca->cpustat, cpu)->cpustat; - val = 0; - for_each_possible_cpu(cpu) { - struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu); - val += kcpustat->cpustat[CPUTIME_SYSTEM]; - val += kcpustat->cpustat[CPUTIME_IRQ]; - val += kcpustat->cpustat[CPUTIME_SOFTIRQ]; + val[CPUACCT_STAT_USER] += cpustat[CPUTIME_USER]; + val[CPUACCT_STAT_USER] += cpustat[CPUTIME_NICE]; + val[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_SYSTEM]; + val[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_IRQ]; + val[CPUACCT_STAT_SYSTEM] += cpustat[CPUTIME_SOFTIRQ]; } - val = cputime64_to_clock_t(val); - seq_printf(sf, "%s %lld\n", cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val); + for (stat = 0; stat < CPUACCT_STAT_NSTATS; stat++) { + seq_printf(sf, "%s %lld\n", + cpuacct_stat_desc[stat], + cputime64_to_clock_t(val[stat])); + } return 0; } @@ -302,6 +330,10 @@ static struct cftype files[] = { .seq_show = cpuacct_percpu_sys_seq_show, }, { + .name = "usage_all", + .seq_show = cpuacct_all_seq_show, + }, + { .name = "stat", .seq_show = cpuacct_stats_show, }, @@ -316,11 +348,11 @@ static struct cftype files[] = { void cpuacct_charge(struct task_struct *tsk, u64 cputime) { struct cpuacct *ca; - int index = CPUACCT_USAGE_SYSTEM; + int index = CPUACCT_STAT_SYSTEM; struct pt_regs *regs = task_pt_regs(tsk); if (regs && user_mode(regs)) - index = CPUACCT_USAGE_USER; + index = CPUACCT_STAT_USER; rcu_read_lock(); diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c index 5be588204..d4184498c 100644 --- a/kernel/sched/cpudeadline.c +++ b/kernel/sched/cpudeadline.c @@ -168,7 +168,7 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid) if (old_idx == IDX_INVALID) { cp->size++; - cp->elements[cp->size - 1].dl = 0; + cp->elements[cp->size - 1].dl = dl; cp->elements[cp->size - 1].cpu = cpu; cp->elements[cpu].idx = cp->size - 1; cpudl_change_key(cp, cp->size - 1, dl); diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 3d3ab8205..eba226d7c 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -51,6 +51,8 @@ struct sugov_cpu { struct update_util_data update_util; struct sugov_policy *sg_policy; + unsigned int cached_raw_freq; + /* The fields below are only needed when sharing a policy. */ unsigned long util; unsigned long max; @@ -110,7 +112,7 @@ static void sugov_update_commit(struct sugov_policy *sg_policy, u64 time, /** * get_next_freq - Compute a new frequency for a given cpufreq policy. - * @policy: cpufreq policy object to compute the new frequency for. + * @sg_cpu: schedutil cpu object to compute the new frequency for. * @util: Current CPU utilization. * @max: CPU capacity. * @@ -125,14 +127,25 @@ static void sugov_update_commit(struct sugov_policy *sg_policy, u64 time, * next_freq = C * curr_freq * util_raw / max * * Take C = 1.25 for the frequency tipping point at (util / max) = 0.8. + * + * The lowest driver-supported frequency which is equal or greater than the raw + * next_freq (as calculated above) is returned, subject to policy min/max and + * cpufreq driver limitations. */ -static unsigned int get_next_freq(struct cpufreq_policy *policy, - unsigned long util, unsigned long max) +static unsigned int get_next_freq(struct sugov_cpu *sg_cpu, unsigned long util, + unsigned long max) { + struct sugov_policy *sg_policy = sg_cpu->sg_policy; + struct cpufreq_policy *policy = sg_policy->policy; unsigned int freq = arch_scale_freq_invariant() ? policy->cpuinfo.max_freq : policy->cur; - return (freq + (freq >> 2)) * util / max; + freq = (freq + (freq >> 2)) * util / max; + + if (freq == sg_cpu->cached_raw_freq && sg_policy->next_freq != UINT_MAX) + return sg_policy->next_freq; + sg_cpu->cached_raw_freq = freq; + return cpufreq_driver_resolve_freq(policy, freq); } static void sugov_update_single(struct update_util_data *hook, u64 time, @@ -147,13 +160,14 @@ static void sugov_update_single(struct update_util_data *hook, u64 time, return; next_f = util == ULONG_MAX ? policy->cpuinfo.max_freq : - get_next_freq(policy, util, max); + get_next_freq(sg_cpu, util, max); sugov_update_commit(sg_policy, time, next_f); } -static unsigned int sugov_next_freq_shared(struct sugov_policy *sg_policy, +static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, unsigned long util, unsigned long max) { + struct sugov_policy *sg_policy = sg_cpu->sg_policy; struct cpufreq_policy *policy = sg_policy->policy; unsigned int max_f = policy->cpuinfo.max_freq; u64 last_freq_update_time = sg_policy->last_freq_update_time; @@ -193,7 +207,7 @@ static unsigned int sugov_next_freq_shared(struct sugov_policy *sg_policy, } } - return get_next_freq(policy, util, max); + return get_next_freq(sg_cpu, util, max); } static void sugov_update_shared(struct update_util_data *hook, u64 time, @@ -210,7 +224,7 @@ static void sugov_update_shared(struct update_util_data *hook, u64 time, sg_cpu->last_update = time; if (sugov_should_update_freq(sg_policy, time)) { - next_f = sugov_next_freq_shared(sg_policy, util, max); + next_f = sugov_next_freq_shared(sg_cpu, util, max); sugov_update_commit(sg_policy, time, next_f); } @@ -398,7 +412,7 @@ static int sugov_init(struct cpufreq_policy *policy) return ret; } -static int sugov_exit(struct cpufreq_policy *policy) +static void sugov_exit(struct cpufreq_policy *policy) { struct sugov_policy *sg_policy = policy->governor_data; struct sugov_tunables *tunables = sg_policy->tunables; @@ -416,7 +430,6 @@ static int sugov_exit(struct cpufreq_policy *policy) mutex_unlock(&global_tunables_lock); sugov_policy_free(sg_policy); - return 0; } static int sugov_start(struct cpufreq_policy *policy) @@ -438,6 +451,7 @@ static int sugov_start(struct cpufreq_policy *policy) sg_cpu->util = ULONG_MAX; sg_cpu->max = 0; sg_cpu->last_update = 0; + sg_cpu->cached_raw_freq = 0; cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util, sugov_update_shared); } else { @@ -448,7 +462,7 @@ static int sugov_start(struct cpufreq_policy *policy) return 0; } -static int sugov_stop(struct cpufreq_policy *policy) +static void sugov_stop(struct cpufreq_policy *policy) { struct sugov_policy *sg_policy = policy->governor_data; unsigned int cpu; @@ -460,53 +474,29 @@ static int sugov_stop(struct cpufreq_policy *policy) irq_work_sync(&sg_policy->irq_work); cancel_work_sync(&sg_policy->work); - return 0; } -static int sugov_limits(struct cpufreq_policy *policy) +static void sugov_limits(struct cpufreq_policy *policy) { struct sugov_policy *sg_policy = policy->governor_data; if (!policy->fast_switch_enabled) { mutex_lock(&sg_policy->work_lock); - - if (policy->max < policy->cur) - __cpufreq_driver_target(policy, policy->max, - CPUFREQ_RELATION_H); - else if (policy->min > policy->cur) - __cpufreq_driver_target(policy, policy->min, - CPUFREQ_RELATION_L); - + cpufreq_policy_apply_limits(policy); mutex_unlock(&sg_policy->work_lock); } sg_policy->need_freq_update = true; - return 0; -} - -int sugov_governor(struct cpufreq_policy *policy, unsigned int event) -{ - if (event == CPUFREQ_GOV_POLICY_INIT) { - return sugov_init(policy); - } else if (policy->governor_data) { - switch (event) { - case CPUFREQ_GOV_POLICY_EXIT: - return sugov_exit(policy); - case CPUFREQ_GOV_START: - return sugov_start(policy); - case CPUFREQ_GOV_STOP: - return sugov_stop(policy); - case CPUFREQ_GOV_LIMITS: - return sugov_limits(policy); - } - } - return -EINVAL; } static struct cpufreq_governor schedutil_gov = { .name = "schedutil", - .governor = sugov_governor, .owner = THIS_MODULE, + .init = sugov_init, + .exit = sugov_exit, + .start = sugov_start, + .stop = sugov_stop, + .limits = sugov_limits, }; static int __init sugov_module_init(void) diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index a24cfb41d..a846cf89e 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -49,15 +49,12 @@ DEFINE_PER_CPU(seqcount_t, irq_time_seq); */ void irqtime_account_irq(struct task_struct *curr) { - unsigned long flags; s64 delta; int cpu; if (!sched_clock_irqtime) return; - local_irq_save(flags); - cpu = smp_processor_id(); delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time); __this_cpu_add(irq_start_time, delta); @@ -75,44 +72,53 @@ void irqtime_account_irq(struct task_struct *curr) __this_cpu_add(cpu_softirq_time, delta); irq_time_write_end(); - local_irq_restore(flags); } EXPORT_SYMBOL_GPL(irqtime_account_irq); -static int irqtime_account_hi_update(void) +static cputime_t irqtime_account_hi_update(cputime_t maxtime) { u64 *cpustat = kcpustat_this_cpu->cpustat; unsigned long flags; - u64 latest_ns; - int ret = 0; + cputime_t irq_cputime; local_irq_save(flags); - latest_ns = this_cpu_read(cpu_hardirq_time); - if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_IRQ]) - ret = 1; + irq_cputime = nsecs_to_cputime64(this_cpu_read(cpu_hardirq_time)) - + cpustat[CPUTIME_IRQ]; + irq_cputime = min(irq_cputime, maxtime); + cpustat[CPUTIME_IRQ] += irq_cputime; local_irq_restore(flags); - return ret; + return irq_cputime; } -static int irqtime_account_si_update(void) +static cputime_t irqtime_account_si_update(cputime_t maxtime) { u64 *cpustat = kcpustat_this_cpu->cpustat; unsigned long flags; - u64 latest_ns; - int ret = 0; + cputime_t softirq_cputime; local_irq_save(flags); - latest_ns = this_cpu_read(cpu_softirq_time); - if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_SOFTIRQ]) - ret = 1; + softirq_cputime = nsecs_to_cputime64(this_cpu_read(cpu_softirq_time)) - + cpustat[CPUTIME_SOFTIRQ]; + softirq_cputime = min(softirq_cputime, maxtime); + cpustat[CPUTIME_SOFTIRQ] += softirq_cputime; local_irq_restore(flags); - return ret; + return softirq_cputime; } #else /* CONFIG_IRQ_TIME_ACCOUNTING */ #define sched_clock_irqtime (0) +static cputime_t irqtime_account_hi_update(cputime_t dummy) +{ + return 0; +} + +static cputime_t irqtime_account_si_update(cputime_t dummy) +{ + return 0; +} + #endif /* !CONFIG_IRQ_TIME_ACCOUNTING */ static inline void task_group_account_field(struct task_struct *p, int index, @@ -257,29 +263,47 @@ void account_idle_time(cputime_t cputime) cpustat[CPUTIME_IDLE] += (__force u64) cputime; } -static __always_inline bool steal_account_process_tick(void) +/* + * When a guest is interrupted for a longer amount of time, missed clock + * ticks are not redelivered later. Due to that, this function may on + * occasion account more time than the calling functions think elapsed. + */ +static __always_inline cputime_t steal_account_process_time(cputime_t maxtime) { #ifdef CONFIG_PARAVIRT if (static_key_false(¶virt_steal_enabled)) { + cputime_t steal_cputime; u64 steal; - unsigned long steal_jiffies; steal = paravirt_steal_clock(smp_processor_id()); steal -= this_rq()->prev_steal_time; - /* - * steal is in nsecs but our caller is expecting steal - * time in jiffies. Lets cast the result to jiffies - * granularity and account the rest on the next rounds. - */ - steal_jiffies = nsecs_to_jiffies(steal); - this_rq()->prev_steal_time += jiffies_to_nsecs(steal_jiffies); + steal_cputime = min(nsecs_to_cputime(steal), maxtime); + account_steal_time(steal_cputime); + this_rq()->prev_steal_time += cputime_to_nsecs(steal_cputime); - account_steal_time(jiffies_to_cputime(steal_jiffies)); - return steal_jiffies; + return steal_cputime; } #endif - return false; + return 0; +} + +/* + * Account how much elapsed time was spent in steal, irq, or softirq time. + */ +static inline cputime_t account_other_time(cputime_t max) +{ + cputime_t accounted; + + accounted = steal_account_process_time(max); + + if (accounted < max) + accounted += irqtime_account_hi_update(max - accounted); + + if (accounted < max) + accounted += irqtime_account_si_update(max - accounted); + + return accounted; } /* @@ -342,21 +366,23 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) static void irqtime_account_process_tick(struct task_struct *p, int user_tick, struct rq *rq, int ticks) { - cputime_t scaled = cputime_to_scaled(cputime_one_jiffy); - u64 cputime = (__force u64) cputime_one_jiffy; - u64 *cpustat = kcpustat_this_cpu->cpustat; + u64 cputime = (__force u64) cputime_one_jiffy * ticks; + cputime_t scaled, other; - if (steal_account_process_tick()) + /* + * When returning from idle, many ticks can get accounted at + * once, including some ticks of steal, irq, and softirq time. + * Subtract those ticks from the amount of time accounted to + * idle, or potentially user or system time. Due to rounding, + * other time can exceed ticks occasionally. + */ + other = account_other_time(ULONG_MAX); + if (other >= cputime) return; + cputime -= other; + scaled = cputime_to_scaled(cputime); - cputime *= ticks; - scaled *= ticks; - - if (irqtime_account_hi_update()) { - cpustat[CPUTIME_IRQ] += cputime; - } else if (irqtime_account_si_update()) { - cpustat[CPUTIME_SOFTIRQ] += cputime; - } else if (this_cpu_ksoftirqd() == p) { + if (this_cpu_ksoftirqd() == p) { /* * ksoftirqd time do not get accounted in cpu_softirq_time. * So, we have to handle it separately here. @@ -406,6 +432,10 @@ void vtime_common_task_switch(struct task_struct *prev) } #endif +#endif /* CONFIG_VIRT_CPU_ACCOUNTING */ + + +#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE /* * Archs that account the whole time spent in the idle task * (outside irq) as idle time can rely on this and just implement @@ -415,33 +445,16 @@ void vtime_common_task_switch(struct task_struct *prev) * vtime_account(). */ #ifndef __ARCH_HAS_VTIME_ACCOUNT -void vtime_common_account_irq_enter(struct task_struct *tsk) +void vtime_account_irq_enter(struct task_struct *tsk) { - if (!in_interrupt()) { - /* - * If we interrupted user, context_tracking_in_user() - * is 1 because the context tracking don't hook - * on irq entry/exit. This way we know if - * we need to flush user time on kernel entry. - */ - if (context_tracking_in_user()) { - vtime_account_user(tsk); - return; - } - - if (is_idle_task(tsk)) { - vtime_account_idle(tsk); - return; - } - } - vtime_account_system(tsk); + if (!in_interrupt() && is_idle_task(tsk)) + vtime_account_idle(tsk); + else + vtime_account_system(tsk); } -EXPORT_SYMBOL_GPL(vtime_common_account_irq_enter); +EXPORT_SYMBOL_GPL(vtime_account_irq_enter); #endif /* __ARCH_HAS_VTIME_ACCOUNT */ -#endif /* CONFIG_VIRT_CPU_ACCOUNTING */ - -#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) { *ut = p->utime; @@ -466,7 +479,7 @@ void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime */ void account_process_tick(struct task_struct *p, int user_tick) { - cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); + cputime_t cputime, scaled, steal; struct rq *rq = this_rq(); if (vtime_accounting_cpu_enabled()) @@ -477,26 +490,21 @@ void account_process_tick(struct task_struct *p, int user_tick) return; } - if (steal_account_process_tick()) + cputime = cputime_one_jiffy; + steal = steal_account_process_time(ULONG_MAX); + + if (steal >= cputime) return; + cputime -= steal; + scaled = cputime_to_scaled(cputime); + if (user_tick) - account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); + account_user_time(p, cputime, scaled); else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET)) - account_system_time(p, HARDIRQ_OFFSET, cputime_one_jiffy, - one_jiffy_scaled); + account_system_time(p, HARDIRQ_OFFSET, cputime, scaled); else - account_idle_time(cputime_one_jiffy); -} - -/* - * Account multiple ticks of steal time. - * @p: the process from which the cpu time has been stolen - * @ticks: number of stolen ticks - */ -void account_steal_ticks(unsigned long ticks) -{ - account_steal_time(jiffies_to_cputime(ticks)); + account_idle_time(cputime); } /* @@ -505,13 +513,21 @@ void account_steal_ticks(unsigned long ticks) */ void account_idle_ticks(unsigned long ticks) { + cputime_t cputime, steal; if (sched_clock_irqtime) { irqtime_account_idle_ticks(ticks); return; } - account_idle_time(jiffies_to_cputime(ticks)); + cputime = jiffies_to_cputime(ticks); + steal = steal_account_process_time(ULONG_MAX); + + if (steal >= cputime) + return; + + cputime -= steal; + account_idle_time(cputime); } /* @@ -686,12 +702,21 @@ static cputime_t vtime_delta(struct task_struct *tsk) static cputime_t get_vtime_delta(struct task_struct *tsk) { unsigned long now = READ_ONCE(jiffies); - unsigned long delta = now - tsk->vtime_snap; + cputime_t delta, other; + /* + * Unlike tick based timing, vtime based timing never has lost + * ticks, and no need for steal time accounting to make up for + * lost ticks. Vtime accounts a rounded version of actual + * elapsed time. Limit account_other_time to prevent rounding + * errors from causing elapsed vtime to go negative. + */ + delta = jiffies_to_cputime(now - tsk->vtime_snap); + other = account_other_time(delta); WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE); tsk->vtime_snap = now; - return jiffies_to_cputime(delta); + return delta - other; } static void __vtime_account_system(struct task_struct *tsk) @@ -711,16 +736,6 @@ void vtime_account_system(struct task_struct *tsk) write_seqcount_end(&tsk->vtime_seqcount); } -void vtime_gen_account_irq_exit(struct task_struct *tsk) -{ - write_seqcount_begin(&tsk->vtime_seqcount); - if (vtime_delta(tsk)) - __vtime_account_system(tsk); - if (context_tracking_in_user()) - tsk->vtime_snap_whence = VTIME_USER; - write_seqcount_end(&tsk->vtime_seqcount); -} - void vtime_account_user(struct task_struct *tsk) { cputime_t delta_cpu; diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index fcb7f0217..1ce886728 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -658,8 +658,11 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) * * XXX figure out if select_task_rq_dl() deals with offline cpus. */ - if (unlikely(!rq->online)) + if (unlikely(!rq->online)) { + lockdep_unpin_lock(&rq->lock, rf.cookie); rq = dl_task_offline_migration(rq, p); + rf.cookie = lockdep_pin_lock(&rq->lock); + } /* * Queueing this task back might have overloaded rq, check if we need diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 0368c393a..2a0a99952 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -879,9 +879,9 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) nr_switches = p->nvcsw + p->nivcsw; -#ifdef CONFIG_SCHEDSTATS P(se.nr_migrations); +#ifdef CONFIG_SCHEDSTATS if (schedstat_enabled()) { u64 avg_atom, avg_per_cpu; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b5743d5b0..4309c8e76 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -715,6 +715,11 @@ void init_entity_runnable_average(struct sched_entity *se) /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */ } +static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq); +static int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq); +static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force); +static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se); + /* * With new tasks being created, their initial util_avgs are extrapolated * based on the cfs_rq's current util_avg: @@ -745,6 +750,8 @@ void post_init_entity_util_avg(struct sched_entity *se) struct cfs_rq *cfs_rq = cfs_rq_of(se); struct sched_avg *sa = &se->avg; long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2; + u64 now = cfs_rq_clock_task(cfs_rq); + int tg_update; if (cap > 0) { if (cfs_rq->avg.util_avg != 0) { @@ -758,16 +765,42 @@ void post_init_entity_util_avg(struct sched_entity *se) } sa->util_sum = sa->util_avg * LOAD_AVG_MAX; } + + if (entity_is_task(se)) { + struct task_struct *p = task_of(se); + if (p->sched_class != &fair_sched_class) { + /* + * For !fair tasks do: + * + update_cfs_rq_load_avg(now, cfs_rq, false); + attach_entity_load_avg(cfs_rq, se); + switched_from_fair(rq, p); + * + * such that the next switched_to_fair() has the + * expected state. + */ + se->avg.last_update_time = now; + return; + } + } + + tg_update = update_cfs_rq_load_avg(now, cfs_rq, false); + attach_entity_load_avg(cfs_rq, se); + if (tg_update) + update_tg_load_avg(cfs_rq, false); } -#else +#else /* !CONFIG_SMP */ void init_entity_runnable_average(struct sched_entity *se) { } void post_init_entity_util_avg(struct sched_entity *se) { } -#endif +static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) +{ +} +#endif /* CONFIG_SMP */ /* * Update the current task's runtime statistics. @@ -1328,6 +1361,8 @@ static void task_numa_assign(struct task_numa_env *env, { if (env->best_task) put_task_struct(env->best_task); + if (p) + get_task_struct(p); env->best_task = p; env->best_imp = imp; @@ -1395,31 +1430,11 @@ static void task_numa_compare(struct task_numa_env *env, long imp = env->p->numa_group ? groupimp : taskimp; long moveimp = imp; int dist = env->dist; - bool assigned = false; rcu_read_lock(); - - raw_spin_lock_irq(&dst_rq->lock); - cur = dst_rq->curr; - /* - * No need to move the exiting task or idle task. - */ - if ((cur->flags & PF_EXITING) || is_idle_task(cur)) + cur = task_rcu_dereference(&dst_rq->curr); + if (cur && ((cur->flags & PF_EXITING) || is_idle_task(cur))) cur = NULL; - else { - /* - * The task_struct must be protected here to protect the - * p->numa_faults access in the task_weight since the - * numa_faults could already be freed in the following path: - * finish_task_switch() - * --> put_task_struct() - * --> __put_task_struct() - * --> task_numa_free() - */ - get_task_struct(cur); - } - - raw_spin_unlock_irq(&dst_rq->lock); /* * Because we have preemption enabled we can get migrated around and @@ -1502,7 +1517,6 @@ balance: */ if (!load_too_imbalanced(src_load, dst_load, env)) { imp = moveimp - 1; - put_task_struct(cur); cur = NULL; goto assign; } @@ -1528,16 +1542,9 @@ balance: env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu); assign: - assigned = true; task_numa_assign(env, cur, imp); unlock: rcu_read_unlock(); - /* - * The dst_rq->curr isn't assigned. The protection for task_struct is - * finished. - */ - if (cur && !assigned) - put_task_struct(cur); } static void task_numa_find_cpu(struct task_numa_env *env, @@ -2891,8 +2898,6 @@ void set_task_rq_fair(struct sched_entity *se, static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {} #endif /* CONFIG_FAIR_GROUP_SCHED */ -static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq); - static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq) { struct rq *rq = rq_of(cfs_rq); @@ -2939,7 +2944,23 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq) WRITE_ONCE(*ptr, res); \ } while (0) -/* Group cfs_rq's load_avg is used for task_h_load and update_cfs_share */ +/** + * update_cfs_rq_load_avg - update the cfs_rq's load/util averages + * @now: current time, as per cfs_rq_clock_task() + * @cfs_rq: cfs_rq to update + * @update_freq: should we call cfs_rq_util_change() or will the call do so + * + * The cfs_rq avg is the direct sum of all its entities (blocked and runnable) + * avg. The immediate corollary is that all (fair) tasks must be attached, see + * post_init_entity_util_avg(). + * + * cfs_rq->avg is used for task_h_load() and update_cfs_share() for example. + * + * Returns true if the load decayed or we removed utilization. It is expected + * that one calls update_tg_load_avg() on this condition, but after you've + * modified the cfs_rq avg (attach/detach), such that we propagate the new + * avg up. + */ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) { @@ -2994,6 +3015,14 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg) update_tg_load_avg(cfs_rq, 0); } +/** + * attach_entity_load_avg - attach this entity to its cfs_rq load avg + * @cfs_rq: cfs_rq to attach to + * @se: sched_entity to attach + * + * Must call update_cfs_rq_load_avg() before this, since we rely on + * cfs_rq->avg.last_update_time being current. + */ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { if (!sched_feat(ATTACH_AGE_LOAD)) @@ -3002,6 +3031,8 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s /* * If we got migrated (either between CPUs or between cgroups) we'll * have aged the average right before clearing @last_update_time. + * + * Or we're fresh through post_init_entity_util_avg(). */ if (se->avg.last_update_time) { __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)), @@ -3023,6 +3054,14 @@ skip_aging: cfs_rq_util_change(cfs_rq); } +/** + * detach_entity_load_avg - detach this entity from its cfs_rq load avg + * @cfs_rq: cfs_rq to detach from + * @se: sched_entity to detach + * + * Must call update_cfs_rq_load_avg() before this, since we rely on + * cfs_rq->avg.last_update_time being current. + */ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)), @@ -3107,11 +3146,14 @@ void remove_entity_load_avg(struct sched_entity *se) u64 last_update_time; /* - * Newly created task or never used group entity should not be removed - * from its (source) cfs_rq + * tasks cannot exit without having gone through wake_up_new_task() -> + * post_init_entity_util_avg() which will have added things to the + * cfs_rq, so we can remove unconditionally. + * + * Similarly for groups, they will have passed through + * post_init_entity_util_avg() before unregister_sched_fair_group() + * calls this. */ - if (se->avg.last_update_time == 0) - return; last_update_time = cfs_rq_last_update_time(cfs_rq); @@ -3134,6 +3176,12 @@ static int idle_balance(struct rq *this_rq); #else /* CONFIG_SMP */ +static inline int +update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) +{ + return 0; +} + static inline void update_load_avg(struct sched_entity *se, int not_used) { struct cfs_rq *cfs_rq = cfs_rq_of(se); @@ -3723,7 +3771,7 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq) { if (unlikely(cfs_rq->throttle_count)) - return cfs_rq->throttled_clock_task; + return cfs_rq->throttled_clock_task - cfs_rq->throttled_clock_task_time; return rq_clock_task(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time; } @@ -3861,13 +3909,11 @@ static int tg_unthrottle_up(struct task_group *tg, void *data) struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; cfs_rq->throttle_count--; -#ifdef CONFIG_SMP if (!cfs_rq->throttle_count) { /* adjust cfs_rq_clock_task() */ cfs_rq->throttled_clock_task_time += rq_clock_task(rq) - cfs_rq->throttled_clock_task; } -#endif return 0; } @@ -4220,26 +4266,6 @@ static void check_enqueue_throttle(struct cfs_rq *cfs_rq) if (!cfs_bandwidth_used()) return; - /* Synchronize hierarchical throttle counter: */ - if (unlikely(!cfs_rq->throttle_uptodate)) { - struct rq *rq = rq_of(cfs_rq); - struct cfs_rq *pcfs_rq; - struct task_group *tg; - - cfs_rq->throttle_uptodate = 1; - - /* Get closest up-to-date node, because leaves go first: */ - for (tg = cfs_rq->tg->parent; tg; tg = tg->parent) { - pcfs_rq = tg->cfs_rq[cpu_of(rq)]; - if (pcfs_rq->throttle_uptodate) - break; - } - if (tg) { - cfs_rq->throttle_count = pcfs_rq->throttle_count; - cfs_rq->throttled_clock_task = rq_clock_task(rq); - } - } - /* an active group must be handled by the update_curr()->put() path */ if (!cfs_rq->runtime_enabled || cfs_rq->curr) return; @@ -4254,6 +4280,23 @@ static void check_enqueue_throttle(struct cfs_rq *cfs_rq) throttle_cfs_rq(cfs_rq); } +static void sync_throttle(struct task_group *tg, int cpu) +{ + struct cfs_rq *pcfs_rq, *cfs_rq; + + if (!cfs_bandwidth_used()) + return; + + if (!tg->parent) + return; + + cfs_rq = tg->cfs_rq[cpu]; + pcfs_rq = tg->parent->cfs_rq[cpu]; + + cfs_rq->throttle_count = pcfs_rq->throttle_count; + cfs_rq->throttled_clock_task = rq_clock_task(cpu_rq(cpu)); +} + /* conditionally throttle active cfs_rq's from put_prev_entity() */ static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { @@ -4393,6 +4436,7 @@ static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq) static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {} static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; } static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {} +static inline void sync_throttle(struct task_group *tg, int cpu) {} static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) @@ -4501,7 +4545,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) * * note: in the case of encountering a throttled cfs_rq we will * post the final h_nr_running increment below. - */ + */ if (cfs_rq_throttled(cfs_rq)) break; cfs_rq->h_nr_running++; @@ -8342,31 +8386,17 @@ static void task_fork_fair(struct task_struct *p) { struct cfs_rq *cfs_rq; struct sched_entity *se = &p->se, *curr; - int this_cpu = smp_processor_id(); struct rq *rq = this_rq(); - unsigned long flags; - - raw_spin_lock_irqsave(&rq->lock, flags); + raw_spin_lock(&rq->lock); update_rq_clock(rq); cfs_rq = task_cfs_rq(current); curr = cfs_rq->curr; - - /* - * Not only the cpu but also the task_group of the parent might have - * been changed after parent->se.parent,cfs_rq were copied to - * child->se.parent,cfs_rq. So call __set_task_cpu() to make those - * of child point to valid ones. - */ - rcu_read_lock(); - __set_task_cpu(p, this_cpu); - rcu_read_unlock(); - - update_curr(cfs_rq); - - if (curr) + if (curr) { + update_curr(cfs_rq); se->vruntime = curr->vruntime; + } place_entity(cfs_rq, se, 1); if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) { @@ -8379,8 +8409,7 @@ static void task_fork_fair(struct task_struct *p) } se->vruntime -= cfs_rq->min_vruntime; - - raw_spin_unlock_irqrestore(&rq->lock, flags); + raw_spin_unlock(&rq->lock); } /* @@ -8436,6 +8465,8 @@ static void detach_task_cfs_rq(struct task_struct *p) { struct sched_entity *se = &p->se; struct cfs_rq *cfs_rq = cfs_rq_of(se); + u64 now = cfs_rq_clock_task(cfs_rq); + int tg_update; if (!vruntime_normalized(p)) { /* @@ -8447,13 +8478,18 @@ static void detach_task_cfs_rq(struct task_struct *p) } /* Catch up with the cfs_rq and remove our load when we leave */ + tg_update = update_cfs_rq_load_avg(now, cfs_rq, false); detach_entity_load_avg(cfs_rq, se); + if (tg_update) + update_tg_load_avg(cfs_rq, false); } static void attach_task_cfs_rq(struct task_struct *p) { struct sched_entity *se = &p->se; struct cfs_rq *cfs_rq = cfs_rq_of(se); + u64 now = cfs_rq_clock_task(cfs_rq); + int tg_update; #ifdef CONFIG_FAIR_GROUP_SCHED /* @@ -8464,7 +8500,10 @@ static void attach_task_cfs_rq(struct task_struct *p) #endif /* Synchronize task with its cfs_rq */ + tg_update = update_cfs_rq_load_avg(now, cfs_rq, false); attach_entity_load_avg(cfs_rq, se); + if (tg_update) + update_tg_load_avg(cfs_rq, false); if (!vruntime_normalized(p)) se->vruntime += cfs_rq->min_vruntime; @@ -8524,6 +8563,14 @@ void init_cfs_rq(struct cfs_rq *cfs_rq) } #ifdef CONFIG_FAIR_GROUP_SCHED +static void task_set_group_fair(struct task_struct *p) +{ + struct sched_entity *se = &p->se; + + set_task_rq(p, task_cpu(p)); + se->depth = se->parent ? se->parent->depth + 1 : 0; +} + static void task_move_group_fair(struct task_struct *p) { detach_task_cfs_rq(p); @@ -8536,6 +8583,19 @@ static void task_move_group_fair(struct task_struct *p) attach_task_cfs_rq(p); } +static void task_change_group_fair(struct task_struct *p, int type) +{ + switch (type) { + case TASK_SET_GROUP: + task_set_group_fair(p); + break; + + case TASK_MOVE_GROUP: + task_move_group_fair(p); + break; + } +} + void free_fair_sched_group(struct task_group *tg) { int i; @@ -8587,10 +8647,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) init_cfs_rq(cfs_rq); init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]); init_entity_runnable_average(se); - - raw_spin_lock_irq(&rq->lock); - post_init_entity_util_avg(se); - raw_spin_unlock_irq(&rq->lock); } return 1; @@ -8601,6 +8657,23 @@ err: return 0; } +void online_fair_sched_group(struct task_group *tg) +{ + struct sched_entity *se; + struct rq *rq; + int i; + + for_each_possible_cpu(i) { + rq = cpu_rq(i); + se = tg->se[i]; + + raw_spin_lock_irq(&rq->lock); + post_init_entity_util_avg(se); + sync_throttle(tg, i); + raw_spin_unlock_irq(&rq->lock); + } +} + void unregister_fair_sched_group(struct task_group *tg) { unsigned long flags; @@ -8705,6 +8778,8 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) return 1; } +void online_fair_sched_group(struct task_group *tg) { } + void unregister_fair_sched_group(struct task_group *tg) { } #endif /* CONFIG_FAIR_GROUP_SCHED */ @@ -8764,7 +8839,7 @@ const struct sched_class fair_sched_class = { .update_curr = update_curr_fair, #ifdef CONFIG_FAIR_GROUP_SCHED - .task_move_group = task_move_group_fair, + .task_change_group = task_change_group_fair, #endif }; diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index e362a836c..1e855dcbd 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -205,6 +205,8 @@ exit_idle: */ static void cpu_idle_loop(void) { + int cpu = smp_processor_id(); + while (1) { /* * If the arch has a polling bit, we maintain an invariant: @@ -223,7 +225,7 @@ static void cpu_idle_loop(void) check_pgt_cache(); rmb(); - if (cpu_is_offline(smp_processor_id())) { + if (cpu_is_offline(cpu)) { cpuhp_report_idle_dead(); arch_cpu_idle_dead(); } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 898c0d2f1..c64fc5114 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -321,6 +321,7 @@ extern int tg_nop(struct task_group *tg, void *data); extern void free_fair_sched_group(struct task_group *tg); extern int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent); +extern void online_fair_sched_group(struct task_group *tg); extern void unregister_fair_sched_group(struct task_group *tg); extern void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, struct sched_entity *se, int cpu, @@ -437,7 +438,7 @@ struct cfs_rq { u64 throttled_clock, throttled_clock_task; u64 throttled_clock_task_time; - int throttled, throttle_count, throttle_uptodate; + int throttled, throttle_count; struct list_head throttled_list; #endif /* CONFIG_CFS_BANDWIDTH */ #endif /* CONFIG_FAIR_GROUP_SCHED */ @@ -1113,7 +1114,7 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) * In particular, the load of prev->state in finish_task_switch() must * happen before this. * - * Pairs with the smp_cond_acquire() in try_to_wake_up(). + * Pairs with the smp_cond_load_acquire() in try_to_wake_up(). */ smp_store_release(&prev->on_cpu, 0); #endif @@ -1246,8 +1247,11 @@ struct sched_class { void (*update_curr) (struct rq *rq); +#define TASK_SET_GROUP 0 +#define TASK_MOVE_GROUP 1 + #ifdef CONFIG_FAIR_GROUP_SCHED - void (*task_move_group) (struct task_struct *p); + void (*task_change_group) (struct task_struct *p, int type); #endif }; @@ -1809,16 +1813,3 @@ static inline void cpufreq_trigger_update(u64 time) {} #else /* arch_scale_freq_capacity */ #define arch_scale_freq_invariant() (false) #endif - -static inline void account_reset_rq(struct rq *rq) -{ -#ifdef CONFIG_IRQ_TIME_ACCOUNTING - rq->prev_irq_time = 0; -#endif -#ifdef CONFIG_PARAVIRT - rq->prev_steal_time = 0; -#endif -#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING - rq->prev_steal_time_rq = 0; -#endif -} |