From 03dd4cb26d967f9588437b0fc9cc0e8353322bb7 Mon Sep 17 00:00:00 2001 From: André Fabian Silva Delgado Date: Fri, 25 Mar 2016 03:53:42 -0300 Subject: Linux-libre 4.5-gnu --- kernel/sched/fair.c | 599 +++++++++++++++++++++++++++++++++++----------------- 1 file changed, 400 insertions(+), 199 deletions(-) (limited to 'kernel/sched/fair.c') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 82e905862..ac7fb39c3 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -20,8 +20,8 @@ * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra */ -#include #include +#include #include #include #include @@ -763,16 +763,52 @@ static void update_curr_fair(struct rq *rq) update_curr(cfs_rq_of(&rq->curr->se)); } +#ifdef CONFIG_SCHEDSTATS static inline void update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) { - schedstat_set(se->statistics.wait_start, rq_clock(rq_of(cfs_rq))); + u64 wait_start = rq_clock(rq_of(cfs_rq)); + + if (entity_is_task(se) && task_on_rq_migrating(task_of(se)) && + likely(wait_start > se->statistics.wait_start)) + wait_start -= se->statistics.wait_start; + + se->statistics.wait_start = wait_start; +} + +static void +update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + struct task_struct *p; + u64 delta; + + delta = rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start; + + if (entity_is_task(se)) { + p = task_of(se); + if (task_on_rq_migrating(p)) { + /* + * Preserve migrating task's wait time so wait_start + * time stamp can be adjusted to accumulate wait time + * prior to migration. + */ + se->statistics.wait_start = delta; + return; + } + trace_sched_stat_wait(p, delta); + } + + se->statistics.wait_max = max(se->statistics.wait_max, delta); + se->statistics.wait_count++; + se->statistics.wait_sum += delta; + se->statistics.wait_start = 0; } /* * Task is being enqueued - update stats: */ -static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) +static inline void +update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) { /* * Are we enqueueing a waiting task? (for current tasks @@ -782,25 +818,8 @@ static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) update_stats_wait_start(cfs_rq, se); } -static void -update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ - schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max, - rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start)); - schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1); - schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum + - rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start); -#ifdef CONFIG_SCHEDSTATS - if (entity_is_task(se)) { - trace_sched_stat_wait(task_of(se), - rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start); - } -#endif - schedstat_set(se->statistics.wait_start, 0); -} - static inline void -update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) +update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { /* * Mark the end of the wait period if dequeueing a @@ -808,8 +827,41 @@ update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) */ if (se != cfs_rq->curr) update_stats_wait_end(cfs_rq, se); + + if (flags & DEQUEUE_SLEEP) { + if (entity_is_task(se)) { + struct task_struct *tsk = task_of(se); + + if (tsk->state & TASK_INTERRUPTIBLE) + se->statistics.sleep_start = rq_clock(rq_of(cfs_rq)); + if (tsk->state & TASK_UNINTERRUPTIBLE) + se->statistics.block_start = rq_clock(rq_of(cfs_rq)); + } + } + +} +#else +static inline void +update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ +} + +static inline void +update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ } +static inline void +update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ +} + +static inline void +update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) +{ +} +#endif + /* * We are picking a new current task - update its stats: */ @@ -905,10 +957,11 @@ struct numa_group { spinlock_t lock; /* nr_tasks, tasks */ int nr_tasks; pid_t gid; + int active_nodes; struct rcu_head rcu; - nodemask_t active_nodes; unsigned long total_faults; + unsigned long max_faults_cpu; /* * Faults_cpu is used to decide whether memory should move * towards the CPU. As a consequence, these stats are weighted @@ -967,6 +1020,18 @@ static inline unsigned long group_faults_cpu(struct numa_group *group, int nid) group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 1)]; } +/* + * A node triggering more than 1/3 as many NUMA faults as the maximum is + * considered part of a numa group's pseudo-interleaving set. Migrations + * between these nodes are slowed down, to allow things to settle down. + */ +#define ACTIVE_NODE_FRACTION 3 + +static bool numa_is_active_node(int nid, struct numa_group *ng) +{ + return group_faults_cpu(ng, nid) * ACTIVE_NODE_FRACTION > ng->max_faults_cpu; +} + /* Handle placement on systems where not all nodes are directly connected. */ static unsigned long score_nearby_nodes(struct task_struct *p, int nid, int maxdist, bool task) @@ -1116,27 +1181,23 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page, return true; /* - * Do not migrate if the destination is not a node that - * is actively used by this numa group. + * Destination node is much more heavily used than the source + * node? Allow migration. */ - if (!node_isset(dst_nid, ng->active_nodes)) - return false; - - /* - * Source is a node that is not actively used by this - * numa group, while the destination is. Migrate. - */ - if (!node_isset(src_nid, ng->active_nodes)) + if (group_faults_cpu(ng, dst_nid) > group_faults_cpu(ng, src_nid) * + ACTIVE_NODE_FRACTION) return true; /* - * Both source and destination are nodes in active - * use by this numa group. Maximize memory bandwidth - * by migrating from more heavily used groups, to less - * heavily used ones, spreading the load around. - * Use a 1/4 hysteresis to avoid spurious page movement. + * Distribute memory according to CPU & memory use on each node, + * with 3/4 hysteresis to avoid unnecessary memory migrations: + * + * faults_cpu(dst) 3 faults_cpu(src) + * --------------- * - > --------------- + * faults_mem(dst) 4 faults_mem(src) */ - return group_faults(p, dst_nid) < (group_faults(p, src_nid) * 3 / 4); + return group_faults_cpu(ng, dst_nid) * group_faults(p, src_nid) * 3 > + group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4; } static unsigned long weighted_cpuload(const int cpu); @@ -1218,8 +1279,6 @@ static void task_numa_assign(struct task_numa_env *env, { if (env->best_task) put_task_struct(env->best_task); - if (p) - get_task_struct(p); env->best_task = p; env->best_imp = imp; @@ -1287,20 +1346,30 @@ static void task_numa_compare(struct task_numa_env *env, long imp = env->p->numa_group ? groupimp : taskimp; long moveimp = imp; int dist = env->dist; + bool assigned = false; rcu_read_lock(); raw_spin_lock_irq(&dst_rq->lock); cur = dst_rq->curr; /* - * No need to move the exiting task, and this ensures that ->curr - * wasn't reaped and thus get_task_struct() in task_numa_assign() - * is safe under RCU read lock. - * Note that rcu_read_lock() itself can't protect from the final - * put_task_struct() after the last schedule(). + * No need to move the exiting task or idle task. */ if ((cur->flags & PF_EXITING) || is_idle_task(cur)) cur = NULL; + else { + /* + * The task_struct must be protected here to protect the + * p->numa_faults access in the task_weight since the + * numa_faults could already be freed in the following path: + * finish_task_switch() + * --> put_task_struct() + * --> __put_task_struct() + * --> task_numa_free() + */ + get_task_struct(cur); + } + raw_spin_unlock_irq(&dst_rq->lock); /* @@ -1384,6 +1453,7 @@ balance: */ if (!load_too_imbalanced(src_load, dst_load, env)) { imp = moveimp - 1; + put_task_struct(cur); cur = NULL; goto assign; } @@ -1409,9 +1479,16 @@ balance: env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu); assign: + assigned = true; task_numa_assign(env, cur, imp); unlock: rcu_read_unlock(); + /* + * The dst_rq->curr isn't assigned. The protection for task_struct is + * finished. + */ + if (cur && !assigned) + put_task_struct(cur); } static void task_numa_find_cpu(struct task_numa_env *env, @@ -1466,7 +1543,7 @@ static int task_numa_migrate(struct task_struct *p) .best_task = NULL, .best_imp = 0, - .best_cpu = -1 + .best_cpu = -1, }; struct sched_domain *sd; unsigned long taskweight, groupweight; @@ -1518,8 +1595,7 @@ static int task_numa_migrate(struct task_struct *p) * multiple NUMA nodes; in order to better consolidate the group, * we need to check other locations. */ - if (env.best_cpu == -1 || (p->numa_group && - nodes_weight(p->numa_group->active_nodes) > 1)) { + if (env.best_cpu == -1 || (p->numa_group && p->numa_group->active_nodes > 1)) { for_each_online_node(nid) { if (nid == env.src_nid || nid == p->numa_preferred_nid) continue; @@ -1554,12 +1630,14 @@ static int task_numa_migrate(struct task_struct *p) * trying for a better one later. Do not set the preferred node here. */ if (p->numa_group) { + struct numa_group *ng = p->numa_group; + if (env.best_cpu == -1) nid = env.src_nid; else nid = env.dst_nid; - if (node_isset(nid, p->numa_group->active_nodes)) + if (ng->active_nodes > 1 && numa_is_active_node(env.dst_nid, ng)) sched_setnuma(p, env.dst_nid); } @@ -1609,20 +1687,15 @@ static void numa_migrate_preferred(struct task_struct *p) } /* - * Find the nodes on which the workload is actively running. We do this by + * Find out how many nodes on the workload is actively running on. Do this by * tracking the nodes from which NUMA hinting faults are triggered. This can * be different from the set of nodes where the workload's memory is currently * located. - * - * The bitmask is used to make smarter decisions on when to do NUMA page - * migrations, To prevent flip-flopping, and excessive page migrations, nodes - * are added when they cause over 6/16 of the maximum number of faults, but - * only removed when they drop below 3/16. */ -static void update_numa_active_node_mask(struct numa_group *numa_group) +static void numa_group_count_active_nodes(struct numa_group *numa_group) { unsigned long faults, max_faults = 0; - int nid; + int nid, active_nodes = 0; for_each_online_node(nid) { faults = group_faults_cpu(numa_group, nid); @@ -1632,12 +1705,12 @@ static void update_numa_active_node_mask(struct numa_group *numa_group) for_each_online_node(nid) { faults = group_faults_cpu(numa_group, nid); - if (!node_isset(nid, numa_group->active_nodes)) { - if (faults > max_faults * 6 / 16) - node_set(nid, numa_group->active_nodes); - } else if (faults < max_faults * 3 / 16) - node_clear(nid, numa_group->active_nodes); + if (faults * ACTIVE_NODE_FRACTION > max_faults) + active_nodes++; } + + numa_group->max_faults_cpu = max_faults; + numa_group->active_nodes = active_nodes; } /* @@ -1928,7 +2001,7 @@ static void task_numa_placement(struct task_struct *p) update_task_scan_period(p, fault_types[0], fault_types[1]); if (p->numa_group) { - update_numa_active_node_mask(p->numa_group); + numa_group_count_active_nodes(p->numa_group); spin_unlock_irq(group_lock); max_nid = preferred_group_nid(p, max_group_nid); } @@ -1972,14 +2045,14 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags, return; atomic_set(&grp->refcount, 1); + grp->active_nodes = 1; + grp->max_faults_cpu = 0; spin_lock_init(&grp->lock); grp->gid = p->pid; /* Second half of the array tracks nids where faults happen */ grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES * nr_node_ids; - node_set(task_node(current), grp->active_nodes); - for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) grp->faults[i] = p->numa_faults[i]; @@ -2093,6 +2166,7 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags) bool migrated = flags & TNF_MIGRATED; int cpu_node = task_node(current); int local = !!(flags & TNF_FAULT_LOCAL); + struct numa_group *ng; int priv; if (!static_branch_likely(&sched_numa_balancing)) @@ -2133,9 +2207,10 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags) * actively using should be counted as local. This allows the * scan rate to slow down when a workload has settled down. */ - if (!priv && !local && p->numa_group && - node_isset(cpu_node, p->numa_group->active_nodes) && - node_isset(mem_node, p->numa_group->active_nodes)) + ng = p->numa_group; + if (!priv && !local && ng && ng->active_nodes > 1 && + numa_is_active_node(cpu_node, ng) && + numa_is_active_node(mem_node, ng)) local = 1; task_numa_placement(p); @@ -2180,6 +2255,7 @@ void task_numa_work(struct callback_head *work) unsigned long migrate, next_scan, now = jiffies; struct task_struct *p = current; struct mm_struct *mm = p->mm; + u64 runtime = p->se.sum_exec_runtime; struct vm_area_struct *vma; unsigned long start, end; unsigned long nr_pte_updates = 0; @@ -2302,6 +2378,17 @@ out: else reset_ptenuma_scan(p); up_read(&mm->mmap_sem); + + /* + * Make sure tasks use at least 32x as much time to run other code + * than they used here, to limit NUMA PTE scanning overhead to 3% max. + * Usually update_task_scan_period slows down scanning enough; on an + * overloaded system we need to limit overhead on a per task basis. + */ + if (unlikely(p->se.sum_exec_runtime != runtime)) { + u64 diff = p->se.sum_exec_runtime - runtime; + p->node_stamp += 32 * diff; + } } /* @@ -2695,12 +2782,64 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) { long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib; + /* + * No need to update load_avg for root_task_group as it is not used. + */ + if (cfs_rq->tg == &root_task_group) + return; + if (force || abs(delta) > cfs_rq->tg_load_avg_contrib / 64) { atomic_long_add(delta, &cfs_rq->tg->load_avg); cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg; } } +/* + * Called within set_task_rq() right before setting a task's cpu. The + * caller only guarantees p->pi_lock is held; no other assumptions, + * including the state of rq->lock, should be made. + */ +void set_task_rq_fair(struct sched_entity *se, + struct cfs_rq *prev, struct cfs_rq *next) +{ + if (!sched_feat(ATTACH_AGE_LOAD)) + return; + + /* + * We are supposed to update the task to "current" time, then its up to + * date and ready to go to new CPU/cfs_rq. But we have difficulty in + * getting what current time is, so simply throw away the out-of-date + * time. This will result in the wakee task is less decayed, but giving + * the wakee more load sounds not bad. + */ + if (se->avg.last_update_time && prev) { + u64 p_last_update_time; + u64 n_last_update_time; + +#ifndef CONFIG_64BIT + u64 p_last_update_time_copy; + u64 n_last_update_time_copy; + + do { + p_last_update_time_copy = prev->load_last_update_time_copy; + n_last_update_time_copy = next->load_last_update_time_copy; + + smp_rmb(); + + p_last_update_time = prev->avg.last_update_time; + n_last_update_time = next->avg.last_update_time; + + } while (p_last_update_time != p_last_update_time_copy || + n_last_update_time != n_last_update_time_copy); +#else + p_last_update_time = prev->avg.last_update_time; + n_last_update_time = next->avg.last_update_time; +#endif + __update_load_avg(p_last_update_time, cpu_of(rq_of(prev)), + &se->avg, 0, 0, NULL); + se->avg.last_update_time = n_last_update_time; + } +} #else /* CONFIG_FAIR_GROUP_SCHED */ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {} #endif /* CONFIG_FAIR_GROUP_SCHED */ @@ -2834,48 +2973,48 @@ dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) max_t(s64, cfs_rq->runnable_load_sum - se->avg.load_sum, 0); } -/* - * Task first catches up with cfs_rq, and then subtract - * itself from the cfs_rq (task must be off the queue now). - */ -void remove_entity_load_avg(struct sched_entity *se) -{ - struct cfs_rq *cfs_rq = cfs_rq_of(se); - u64 last_update_time; - #ifndef CONFIG_64BIT +static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq) +{ u64 last_update_time_copy; + u64 last_update_time; do { last_update_time_copy = cfs_rq->load_last_update_time_copy; smp_rmb(); last_update_time = cfs_rq->avg.last_update_time; } while (last_update_time != last_update_time_copy); -#else - last_update_time = cfs_rq->avg.last_update_time; -#endif - __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL); - atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg); - atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg); + return last_update_time; } - -/* - * Update the rq's load with the elapsed running time before entering - * idle. if the last scheduled task is not a CFS task, idle_enter will - * be the only way to update the runnable statistic. - */ -void idle_enter_fair(struct rq *this_rq) +#else +static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq) { + return cfs_rq->avg.last_update_time; } +#endif /* - * Update the rq's load with the elapsed idle time before a task is - * scheduled. if the newly scheduled task is not a CFS task, idle_exit will - * be the only way to update the runnable statistic. + * Task first catches up with cfs_rq, and then subtract + * itself from the cfs_rq (task must be off the queue now). */ -void idle_exit_fair(struct rq *this_rq) +void remove_entity_load_avg(struct sched_entity *se) { + struct cfs_rq *cfs_rq = cfs_rq_of(se); + u64 last_update_time; + + /* + * Newly created task or never used group entity should not be removed + * from its (source) cfs_rq + */ + if (se->avg.last_update_time == 0) + return; + + last_update_time = cfs_rq_last_update_time(cfs_rq); + + __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL); + atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg); + atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg); } static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq) @@ -3020,6 +3159,26 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) static void check_enqueue_throttle(struct cfs_rq *cfs_rq); +static inline void check_schedstat_required(void) +{ +#ifdef CONFIG_SCHEDSTATS + if (schedstat_enabled()) + return; + + /* Force schedstat enabled if a dependent tracepoint is active */ + if (trace_sched_stat_wait_enabled() || + trace_sched_stat_sleep_enabled() || + trace_sched_stat_iowait_enabled() || + trace_sched_stat_blocked_enabled() || + trace_sched_stat_runtime_enabled()) { + pr_warn_once("Scheduler tracepoints stat_sleep, stat_iowait, " + "stat_blocked and stat_runtime require the " + "kernel parameter schedstats=enabled or " + "kernel.sched_schedstats=1\n"); + } +#endif +} + static void enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { @@ -3040,11 +3199,15 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) if (flags & ENQUEUE_WAKEUP) { place_entity(cfs_rq, se, 0); - enqueue_sleeper(cfs_rq, se); + if (schedstat_enabled()) + enqueue_sleeper(cfs_rq, se); } - update_stats_enqueue(cfs_rq, se); - check_spread(cfs_rq, se); + check_schedstat_required(); + if (schedstat_enabled()) { + update_stats_enqueue(cfs_rq, se); + check_spread(cfs_rq, se); + } if (se != cfs_rq->curr) __enqueue_entity(cfs_rq, se); se->on_rq = 1; @@ -3111,19 +3274,8 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) update_curr(cfs_rq); dequeue_entity_load_avg(cfs_rq, se); - update_stats_dequeue(cfs_rq, se); - if (flags & DEQUEUE_SLEEP) { -#ifdef CONFIG_SCHEDSTATS - if (entity_is_task(se)) { - struct task_struct *tsk = task_of(se); - - if (tsk->state & TASK_INTERRUPTIBLE) - se->statistics.sleep_start = rq_clock(rq_of(cfs_rq)); - if (tsk->state & TASK_UNINTERRUPTIBLE) - se->statistics.block_start = rq_clock(rq_of(cfs_rq)); - } -#endif - } + if (schedstat_enabled()) + update_stats_dequeue(cfs_rq, se, flags); clear_buddies(cfs_rq, se); @@ -3197,7 +3349,8 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) * a CPU. So account for the time it spent waiting on the * runqueue. */ - update_stats_wait_end(cfs_rq, se); + if (schedstat_enabled()) + update_stats_wait_end(cfs_rq, se); __dequeue_entity(cfs_rq, se); update_load_avg(se, 1); } @@ -3210,7 +3363,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) * least twice that of our own weight (i.e. dont track it * when there are only lesser-weight tasks around): */ - if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) { + if (schedstat_enabled() && rq_of(cfs_rq)->load.weight >= 2*se->load.weight) { se->statistics.slice_max = max(se->statistics.slice_max, se->sum_exec_runtime - se->prev_sum_exec_runtime); } @@ -3293,9 +3446,13 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) /* throttle cfs_rqs exceeding runtime */ check_cfs_rq_runtime(cfs_rq); - check_spread(cfs_rq, prev); + if (schedstat_enabled()) { + check_spread(cfs_rq, prev); + if (prev->on_rq) + update_stats_wait_start(cfs_rq, prev); + } + if (prev->on_rq) { - update_stats_wait_start(cfs_rq, prev); /* Put 'current' back into the tree. */ __enqueue_entity(cfs_rq, prev); /* in !on_rq case, update occurred at dequeue */ @@ -4265,42 +4422,37 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) */ /* - * The exact cpuload at various idx values, calculated at every tick would be - * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load + * The exact cpuload calculated at every tick would be: + * + * load' = (1 - 1/2^i) * load + (1/2^i) * cur_load * - * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called - * on nth tick when cpu may be busy, then we have: - * load = ((2^idx - 1) / 2^idx)^(n-1) * load - * load = (2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load + * If a cpu misses updates for n ticks (as it was idle) and update gets + * called on the n+1-th tick when cpu may be busy, then we have: + * + * load_n = (1 - 1/2^i)^n * load_0 + * load_n+1 = (1 - 1/2^i) * load_n + (1/2^i) * cur_load * * decay_load_missed() below does efficient calculation of - * load = ((2^idx - 1) / 2^idx)^(n-1) * load - * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load + * + * load' = (1 - 1/2^i)^n * load + * + * Because x^(n+m) := x^n * x^m we can decompose any x^n in power-of-2 factors. + * This allows us to precompute the above in said factors, thereby allowing the + * reduction of an arbitrary n in O(log_2 n) steps. (See also + * fixed_power_int()) * * The calculation is approximated on a 128 point scale. - * degrade_zero_ticks is the number of ticks after which load at any - * particular idx is approximated to be zero. - * degrade_factor is a precomputed table, a row for each load idx. - * Each column corresponds to degradation factor for a power of two ticks, - * based on 128 point scale. - * Example: - * row 2, col 3 (=12) says that the degradation at load idx 2 after - * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8). - * - * With this power of 2 load factors, we can degrade the load n times - * by looking at 1 bits in n and doing as many mult/shift instead of - * n mult/shifts needed by the exact degradation. */ #define DEGRADE_SHIFT 7 -static const unsigned char - degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128}; -static const unsigned char - degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = { - {0, 0, 0, 0, 0, 0, 0, 0}, - {64, 32, 8, 0, 0, 0, 0, 0}, - {96, 72, 40, 12, 1, 0, 0}, - {112, 98, 75, 43, 15, 1, 0}, - {120, 112, 98, 76, 45, 16, 2} }; + +static const u8 degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128}; +static const u8 degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = { + { 0, 0, 0, 0, 0, 0, 0, 0 }, + { 64, 32, 8, 0, 0, 0, 0, 0 }, + { 96, 72, 40, 12, 1, 0, 0, 0 }, + { 112, 98, 75, 43, 15, 1, 0, 0 }, + { 120, 112, 98, 76, 45, 16, 2, 0 } +}; /* * Update cpu_load for any missed ticks, due to tickless idle. The backlog @@ -4331,14 +4483,46 @@ decay_load_missed(unsigned long load, unsigned long missed_updates, int idx) return load; } -/* +/** + * __update_cpu_load - update the rq->cpu_load[] statistics + * @this_rq: The rq to update statistics for + * @this_load: The current load + * @pending_updates: The number of missed updates + * @active: !0 for NOHZ_FULL + * * Update rq->cpu_load[] statistics. This function is usually called every - * scheduler tick (TICK_NSEC). With tickless idle this will not be called - * every tick. We fix it up based on jiffies. + * scheduler tick (TICK_NSEC). + * + * This function computes a decaying average: + * + * load[i]' = (1 - 1/2^i) * load[i] + (1/2^i) * load + * + * Because of NOHZ it might not get called on every tick which gives need for + * the @pending_updates argument. + * + * load[i]_n = (1 - 1/2^i) * load[i]_n-1 + (1/2^i) * load_n-1 + * = A * load[i]_n-1 + B ; A := (1 - 1/2^i), B := (1/2^i) * load + * = A * (A * load[i]_n-2 + B) + B + * = A * (A * (A * load[i]_n-3 + B) + B) + B + * = A^3 * load[i]_n-3 + (A^2 + A + 1) * B + * = A^n * load[i]_0 + (A^(n-1) + A^(n-2) + ... + 1) * B + * = A^n * load[i]_0 + ((1 - A^n) / (1 - A)) * B + * = (1 - 1/2^i)^n * (load[i]_0 - load) + load + * + * In the above we've assumed load_n := load, which is true for NOHZ_FULL as + * any change in load would have resulted in the tick being turned back on. + * + * For regular NOHZ, this reduces to: + * + * load[i]_n = (1 - 1/2^i)^n * load[i]_0 + * + * see decay_load_misses(). For NOHZ_FULL we get to subtract and add the extra + * term. See the @active paramter. */ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load, - unsigned long pending_updates) + unsigned long pending_updates, int active) { + unsigned long tickless_load = active ? this_rq->cpu_load[0] : 0; int i, scale; this_rq->nr_load_updates++; @@ -4352,6 +4536,15 @@ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load, old_load = this_rq->cpu_load[i]; old_load = decay_load_missed(old_load, pending_updates - 1, i); + if (tickless_load) { + old_load -= decay_load_missed(tickless_load, pending_updates - 1, i); + /* + * old_load can never be a negative value because a + * decayed tickless_load cannot be greater than the + * original tickless_load. + */ + old_load += tickless_load; + } new_load = this_load; /* * Round up the averaging division if load is increasing. This @@ -4374,6 +4567,25 @@ static unsigned long weighted_cpuload(const int cpu) } #ifdef CONFIG_NO_HZ_COMMON +static void __update_cpu_load_nohz(struct rq *this_rq, + unsigned long curr_jiffies, + unsigned long load, + int active) +{ + unsigned long pending_updates; + + pending_updates = curr_jiffies - this_rq->last_load_update_tick; + if (pending_updates) { + this_rq->last_load_update_tick = curr_jiffies; + /* + * In the regular NOHZ case, we were idle, this means load 0. + * In the NOHZ_FULL case, we were non-idle, we should consider + * its weighted load. + */ + __update_cpu_load(this_rq, load, pending_updates, active); + } +} + /* * There is no sane way to deal with nohz on smp when using jiffies because the * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading @@ -4391,46 +4603,31 @@ static unsigned long weighted_cpuload(const int cpu) * Called from nohz_idle_balance() to update the load ratings before doing the * idle balance. */ -static void update_idle_cpu_load(struct rq *this_rq) +static void update_cpu_load_idle(struct rq *this_rq) { - unsigned long curr_jiffies = READ_ONCE(jiffies); - unsigned long load = weighted_cpuload(cpu_of(this_rq)); - unsigned long pending_updates; - /* * bail if there's load or we're actually up-to-date. */ - if (load || curr_jiffies == this_rq->last_load_update_tick) + if (weighted_cpuload(cpu_of(this_rq))) return; - pending_updates = curr_jiffies - this_rq->last_load_update_tick; - this_rq->last_load_update_tick = curr_jiffies; - - __update_cpu_load(this_rq, load, pending_updates); + __update_cpu_load_nohz(this_rq, READ_ONCE(jiffies), 0, 0); } /* * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed. */ -void update_cpu_load_nohz(void) +void update_cpu_load_nohz(int active) { struct rq *this_rq = this_rq(); unsigned long curr_jiffies = READ_ONCE(jiffies); - unsigned long pending_updates; + unsigned long load = active ? weighted_cpuload(cpu_of(this_rq)) : 0; if (curr_jiffies == this_rq->last_load_update_tick) return; raw_spin_lock(&this_rq->lock); - pending_updates = curr_jiffies - this_rq->last_load_update_tick; - if (pending_updates) { - this_rq->last_load_update_tick = curr_jiffies; - /* - * We were idle, this means load 0, the current load might be - * !0 due to remote wakeups and the sort. - */ - __update_cpu_load(this_rq, 0, pending_updates); - } + __update_cpu_load_nohz(this_rq, curr_jiffies, load, active); raw_spin_unlock(&this_rq->lock); } #endif /* CONFIG_NO_HZ */ @@ -4442,10 +4639,10 @@ void update_cpu_load_active(struct rq *this_rq) { unsigned long load = weighted_cpuload(cpu_of(this_rq)); /* - * See the mess around update_idle_cpu_load() / update_cpu_load_nohz(). + * See the mess around update_cpu_load_idle() / update_cpu_load_nohz(). */ this_rq->last_load_update_tick = jiffies; - __update_cpu_load(this_rq, load, 1); + __update_cpu_load(this_rq, load, 1, 1); } /* @@ -5032,8 +5229,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f /* * Called immediately before a task is migrated to a new cpu; task_cpu(p) and * cfs_rq_of(p) references at time of call are still valid and identify the - * previous cpu. However, the caller only guarantees p->pi_lock is held; no - * other assumptions, including the state of rq->lock, should be made. + * previous cpu. The caller guarantees p->pi_lock or task_rq(p)->lock is held. */ static void migrate_task_rq_fair(struct task_struct *p) { @@ -5746,8 +5942,8 @@ static void detach_task(struct task_struct *p, struct lb_env *env) { lockdep_assert_held(&env->src_rq->lock); - deactivate_task(env->src_rq, p, 0); p->on_rq = TASK_ON_RQ_MIGRATING; + deactivate_task(env->src_rq, p, 0); set_task_cpu(p, env->dst_cpu); } @@ -5880,8 +6076,8 @@ static void attach_task(struct rq *rq, struct task_struct *p) lockdep_assert_held(&rq->lock); BUG_ON(task_rq(p) != rq); - p->on_rq = TASK_ON_RQ_QUEUED; activate_task(rq, p, 0); + p->on_rq = TASK_ON_RQ_QUEUED; check_preempt_curr(rq, p, 0); } @@ -6327,7 +6523,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, bool *overload) { unsigned long load; - int i; + int i, nr_running; memset(sgs, 0, sizeof(*sgs)); @@ -6344,7 +6540,8 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->group_util += cpu_util(i); sgs->sum_nr_running += rq->cfs.h_nr_running; - if (rq->nr_running > 1) + nr_running = rq->nr_running; + if (nr_running > 1) *overload = true; #ifdef CONFIG_NUMA_BALANCING @@ -6352,7 +6549,10 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->nr_preferred_running += rq->nr_preferred_running; #endif sgs->sum_weighted_load += weighted_cpuload(i); - if (idle_cpu(i)) + /* + * No need to call idle_cpu() if nr_running is not 0 + */ + if (!nr_running && idle_cpu(i)) sgs->idle_cpus++; } @@ -7273,8 +7473,6 @@ static int idle_balance(struct rq *this_rq) int pulled_task = 0; u64 curr_cost = 0; - idle_enter_fair(this_rq); - /* * We must set idle_stamp _before_ calling idle_balance(), such that we * measure the duration of idle_balance() as idle time. @@ -7355,10 +7553,8 @@ out: if (this_rq->nr_running != this_rq->cfs.h_nr_running) pulled_task = -1; - if (pulled_task) { - idle_exit_fair(this_rq); + if (pulled_task) this_rq->idle_stamp = 0; - } return pulled_task; } @@ -7737,7 +7933,7 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) if (time_after_eq(jiffies, rq->next_balance)) { raw_spin_lock_irq(&rq->lock); update_rq_clock(rq); - update_idle_cpu_load(rq); + update_cpu_load_idle(rq); raw_spin_unlock_irq(&rq->lock); rebalance_domains(rq, CPU_IDLE); } @@ -8123,11 +8319,8 @@ void free_fair_sched_group(struct task_group *tg) for_each_possible_cpu(i) { if (tg->cfs_rq) kfree(tg->cfs_rq[i]); - if (tg->se) { - if (tg->se[i]) - remove_entity_load_avg(tg->se[i]); + if (tg->se) kfree(tg->se[i]); - } } kfree(tg->cfs_rq); @@ -8175,21 +8368,29 @@ err: return 0; } -void unregister_fair_sched_group(struct task_group *tg, int cpu) +void unregister_fair_sched_group(struct task_group *tg) { - struct rq *rq = cpu_rq(cpu); unsigned long flags; + struct rq *rq; + int cpu; - /* - * Only empty task groups can be destroyed; so we can speculatively - * check on_list without danger of it being re-added. - */ - if (!tg->cfs_rq[cpu]->on_list) - return; + for_each_possible_cpu(cpu) { + if (tg->se[cpu]) + remove_entity_load_avg(tg->se[cpu]); - raw_spin_lock_irqsave(&rq->lock, flags); - list_del_leaf_cfs_rq(tg->cfs_rq[cpu]); - raw_spin_unlock_irqrestore(&rq->lock, flags); + /* + * Only empty task groups can be destroyed; so we can speculatively + * check on_list without danger of it being re-added. + */ + if (!tg->cfs_rq[cpu]->on_list) + continue; + + rq = cpu_rq(cpu); + + raw_spin_lock_irqsave(&rq->lock, flags); + list_del_leaf_cfs_rq(tg->cfs_rq[cpu]); + raw_spin_unlock_irqrestore(&rq->lock, flags); + } } void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, @@ -8271,7 +8472,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) return 1; } -void unregister_fair_sched_group(struct task_group *tg, int cpu) { } +void unregister_fair_sched_group(struct task_group *tg) { } #endif /* CONFIG_FAIR_GROUP_SCHED */ -- cgit v1.2.3-54-g00ecf