diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2019-11-26 15:23:14 -0800 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2019-11-26 15:23:14 -0800 |
commit | 77a05940eee7e9891cd6add7a690a3e762ee21b0 (patch) | |
tree | 74f4f9b315659093059261a2da8adf7c1aa2a92f /kernel | |
parent | 3f59dbcace56fae7e4ed303bab90f1bedadcfdf4 (diff) | |
parent | de881a341c4143650fa50ce95cf450a5c94faa9f (diff) | |
download | linux-77a05940eee7e9891cd6add7a690a3e762ee21b0.tar.gz linux-77a05940eee7e9891cd6add7a690a3e762ee21b0.tar.bz2 linux-77a05940eee7e9891cd6add7a690a3e762ee21b0.zip |
Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler updates from Ingo Molnar:
"The biggest changes in this cycle were:
- Make kcpustat vtime aware (Frederic Weisbecker)
- Rework the CFS load_balance() logic (Vincent Guittot)
- Misc cleanups, smaller enhancements, fixes.
The load-balancing rework is the most intrusive change: it replaces
the old heuristics that have become less meaningful after the
introduction of the PELT metrics, with a grounds-up load-balancing
algorithm.
As such it's not really an iterative series, but replaces the old
load-balancing logic with the new one. We hope there are no
performance regressions left - but statistically it's highly probable
that there *is* going to be some workload that is hurting from these
chnages. If so then we'd prefer to have a look at that workload and
fix its scheduling, instead of reverting the changes"
* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (46 commits)
rackmeter: Use vtime aware kcpustat accessor
leds: Use all-in-one vtime aware kcpustat accessor
cpufreq: Use vtime aware kcpustat accessors for user time
procfs: Use all-in-one vtime aware kcpustat accessor
sched/vtime: Bring up complete kcpustat accessor
sched/cputime: Support other fields on kcpustat_field()
sched/cpufreq: Move the cfs_rq_util_change() call to cpufreq_update_util()
sched/fair: Add comments for group_type and balancing at SD_NUMA level
sched/fair: Fix rework of find_idlest_group()
sched/uclamp: Fix overzealous type replacement
sched/Kconfig: Fix spelling mistake in user-visible help text
sched/core: Further clarify sched_class::set_next_task()
sched/fair: Use mul_u32_u32()
sched/core: Simplify sched_class::pick_next_task()
sched/core: Optimize pick_next_task()
sched/core: Make pick_next_task_idle() more consistent
sched/fair: Better document newidle_balance()
leds: Use vtime aware kcpustat accessor to fetch CPUTIME_SYSTEM
cpufreq: Use vtime aware kcpustat accessor to fetch CPUTIME_SYSTEM
procfs: Use vtime aware kcpustat accessor to fetch CPUTIME_SYSTEM
...
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/Kconfig.preempt | 2 | ||||
-rw-r--r-- | kernel/context_tracking.c | 6 | ||||
-rw-r--r-- | kernel/sched/core.c | 18 | ||||
-rw-r--r-- | kernel/sched/cputime.c | 288 | ||||
-rw-r--r-- | kernel/sched/deadline.c | 12 | ||||
-rw-r--r-- | kernel/sched/fair.c | 1427 | ||||
-rw-r--r-- | kernel/sched/features.h | 1 | ||||
-rw-r--r-- | kernel/sched/idle.c | 10 | ||||
-rw-r--r-- | kernel/sched/rt.c | 12 | ||||
-rw-r--r-- | kernel/sched/sched.h | 25 | ||||
-rw-r--r-- | kernel/sched/stop_task.c | 9 | ||||
-rw-r--r-- | kernel/sched/topology.c | 9 | ||||
-rw-r--r-- | kernel/time/tick-sched.c | 2 |
13 files changed, 1177 insertions, 644 deletions
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt index deff97217496..bf82259cff96 100644 --- a/kernel/Kconfig.preempt +++ b/kernel/Kconfig.preempt @@ -65,7 +65,7 @@ config PREEMPT_RT preemptible priority-inheritance aware variants, enforcing interrupt threading and introducing mechanisms to break up long non-preemptible sections. This makes the kernel, except for very - low level and critical code pathes (entry code, scheduler, low + low level and critical code paths (entry code, scheduler, low level interrupt handling) fully preemptible and brings most execution contexts under scheduler control. diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index be01a4d627c9..0296b4bda8f1 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -25,8 +25,8 @@ #define CREATE_TRACE_POINTS #include <trace/events/context_tracking.h> -DEFINE_STATIC_KEY_FALSE(context_tracking_enabled); -EXPORT_SYMBOL_GPL(context_tracking_enabled); +DEFINE_STATIC_KEY_FALSE(context_tracking_key); +EXPORT_SYMBOL_GPL(context_tracking_key); DEFINE_PER_CPU(struct context_tracking, context_tracking); EXPORT_SYMBOL_GPL(context_tracking); @@ -192,7 +192,7 @@ void __init context_tracking_cpu_set(int cpu) if (!per_cpu(context_tracking.active, cpu)) { per_cpu(context_tracking.active, cpu) = true; - static_branch_inc(&context_tracking_enabled); + static_branch_inc(&context_tracking_key); } if (initialized) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 80b60ca7767f..d82e2f6ac41d 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -811,7 +811,7 @@ static inline unsigned int uclamp_bucket_base_value(unsigned int clamp_value) return UCLAMP_BUCKET_DELTA * uclamp_bucket_id(clamp_value); } -static inline enum uclamp_id uclamp_none(enum uclamp_id clamp_id) +static inline unsigned int uclamp_none(enum uclamp_id clamp_id) { if (clamp_id == UCLAMP_MIN) return 0; @@ -854,7 +854,7 @@ static inline void uclamp_idle_reset(struct rq *rq, enum uclamp_id clamp_id, } static inline -enum uclamp_id uclamp_rq_max_value(struct rq *rq, enum uclamp_id clamp_id, +unsigned int uclamp_rq_max_value(struct rq *rq, enum uclamp_id clamp_id, unsigned int clamp_value) { struct uclamp_bucket *bucket = rq->uclamp[clamp_id].bucket; @@ -919,7 +919,7 @@ uclamp_eff_get(struct task_struct *p, enum uclamp_id clamp_id) return uc_req; } -enum uclamp_id uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id) +unsigned int uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id) { struct uclamp_se uc_eff; @@ -3918,13 +3918,15 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) prev->sched_class == &fair_sched_class) && rq->nr_running == rq->cfs.h_nr_running)) { - p = fair_sched_class.pick_next_task(rq, prev, rf); + p = pick_next_task_fair(rq, prev, rf); if (unlikely(p == RETRY_TASK)) goto restart; /* Assumes fair_sched_class->next == idle_sched_class */ - if (unlikely(!p)) - p = idle_sched_class.pick_next_task(rq, prev, rf); + if (!p) { + put_prev_task(rq, prev); + p = pick_next_task_idle(rq); + } return p; } @@ -3948,7 +3950,7 @@ restart: put_prev_task(rq, prev); for_each_class(class) { - p = class->pick_next_task(rq, NULL, NULL); + p = class->pick_next_task(rq); if (p) return p; } @@ -6217,7 +6219,7 @@ static struct task_struct *__pick_migrate_task(struct rq *rq) struct task_struct *next; for_each_class(class) { - next = class->pick_next_task(rq, NULL, NULL); + next = class->pick_next_task(rq); if (next) { next->sched_class->put_prev_task(rq, next); return next; diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 46ed4e1383e2..d43318a489f2 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -405,27 +405,25 @@ static inline void irqtime_account_process_tick(struct task_struct *p, int user_ /* * Use precise platform statistics if available: */ -#ifdef CONFIG_VIRT_CPU_ACCOUNTING +#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE + # ifndef __ARCH_HAS_VTIME_TASK_SWITCH -void vtime_common_task_switch(struct task_struct *prev) +void vtime_task_switch(struct task_struct *prev) { if (is_idle_task(prev)) vtime_account_idle(prev); else - vtime_account_system(prev); + vtime_account_kernel(prev); vtime_flush(prev); arch_vtime_task_switch(prev); } # endif -#endif /* CONFIG_VIRT_CPU_ACCOUNTING */ - -#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE /* * Archs that account the whole time spent in the idle task * (outside irq) as idle time can rely on this and just implement - * vtime_account_system() and vtime_account_idle(). Archs that + * vtime_account_kernel() and vtime_account_idle(). Archs that * have other meaning of the idle time (s390 only includes the * time spent by the CPU when it's in low power mode) must override * vtime_account(). @@ -436,7 +434,7 @@ void vtime_account_irq_enter(struct task_struct *tsk) if (!in_interrupt() && is_idle_task(tsk)) vtime_account_idle(tsk); else - vtime_account_system(tsk); + vtime_account_kernel(tsk); } EXPORT_SYMBOL_GPL(vtime_account_irq_enter); #endif /* __ARCH_HAS_VTIME_ACCOUNT */ @@ -477,7 +475,7 @@ void account_process_tick(struct task_struct *p, int user_tick) u64 cputime, steal; struct rq *rq = this_rq(); - if (vtime_accounting_cpu_enabled()) + if (vtime_accounting_enabled_this_cpu()) return; if (sched_clock_irqtime) { @@ -711,8 +709,8 @@ static u64 get_vtime_delta(struct vtime *vtime) return delta - other; } -static void __vtime_account_system(struct task_struct *tsk, - struct vtime *vtime) +static void vtime_account_system(struct task_struct *tsk, + struct vtime *vtime) { vtime->stime += get_vtime_delta(vtime); if (vtime->stime >= TICK_NSEC) { @@ -731,7 +729,17 @@ static void vtime_account_guest(struct task_struct *tsk, } } -void vtime_account_system(struct task_struct *tsk) +static void __vtime_account_kernel(struct task_struct *tsk, + struct vtime *vtime) +{ + /* We might have scheduled out from guest path */ + if (vtime->state == VTIME_GUEST) + vtime_account_guest(tsk, vtime); + else + vtime_account_system(tsk, vtime); +} + +void vtime_account_kernel(struct task_struct *tsk) { struct vtime *vtime = &tsk->vtime; @@ -739,11 +747,7 @@ void vtime_account_system(struct task_struct *tsk) return; write_seqcount_begin(&vtime->seqcount); - /* We might have scheduled out from guest path */ - if (tsk->flags & PF_VCPU) - vtime_account_guest(tsk, vtime); - else - __vtime_account_system(tsk, vtime); + __vtime_account_kernel(tsk, vtime); write_seqcount_end(&vtime->seqcount); } @@ -752,7 +756,7 @@ void vtime_user_enter(struct task_struct *tsk) struct vtime *vtime = &tsk->vtime; write_seqcount_begin(&vtime->seqcount); - __vtime_account_system(tsk, vtime); + vtime_account_system(tsk, vtime); vtime->state = VTIME_USER; write_seqcount_end(&vtime->seqcount); } @@ -782,8 +786,9 @@ void vtime_guest_enter(struct task_struct *tsk) * that can thus safely catch up with a tickless delta. */ write_seqcount_begin(&vtime->seqcount); - __vtime_account_system(tsk, vtime); + vtime_account_system(tsk, vtime); tsk->flags |= PF_VCPU; + vtime->state = VTIME_GUEST; write_seqcount_end(&vtime->seqcount); } EXPORT_SYMBOL_GPL(vtime_guest_enter); @@ -795,6 +800,7 @@ void vtime_guest_exit(struct task_struct *tsk) write_seqcount_begin(&vtime->seqcount); vtime_account_guest(tsk, vtime); tsk->flags &= ~PF_VCPU; + vtime->state = VTIME_SYS; write_seqcount_end(&vtime->seqcount); } EXPORT_SYMBOL_GPL(vtime_guest_exit); @@ -804,19 +810,30 @@ void vtime_account_idle(struct task_struct *tsk) account_idle_time(get_vtime_delta(&tsk->vtime)); } -void arch_vtime_task_switch(struct task_struct *prev) +void vtime_task_switch_generic(struct task_struct *prev) { struct vtime *vtime = &prev->vtime; write_seqcount_begin(&vtime->seqcount); + if (vtime->state == VTIME_IDLE) + vtime_account_idle(prev); + else + __vtime_account_kernel(prev, vtime); vtime->state = VTIME_INACTIVE; + vtime->cpu = -1; write_seqcount_end(&vtime->seqcount); vtime = ¤t->vtime; write_seqcount_begin(&vtime->seqcount); - vtime->state = VTIME_SYS; + if (is_idle_task(current)) + vtime->state = VTIME_IDLE; + else if (current->flags & PF_VCPU) + vtime->state = VTIME_GUEST; + else + vtime->state = VTIME_SYS; vtime->starttime = sched_clock(); + vtime->cpu = smp_processor_id(); write_seqcount_end(&vtime->seqcount); } @@ -827,8 +844,9 @@ void vtime_init_idle(struct task_struct *t, int cpu) local_irq_save(flags); write_seqcount_begin(&vtime->seqcount); - vtime->state = VTIME_SYS; + vtime->state = VTIME_IDLE; vtime->starttime = sched_clock(); + vtime->cpu = cpu; write_seqcount_end(&vtime->seqcount); local_irq_restore(flags); } @@ -846,7 +864,7 @@ u64 task_gtime(struct task_struct *t) seq = read_seqcount_begin(&vtime->seqcount); gtime = t->gtime; - if (vtime->state == VTIME_SYS && t->flags & PF_VCPU) + if (vtime->state == VTIME_GUEST) gtime += vtime->gtime + vtime_delta(vtime); } while (read_seqcount_retry(&vtime->seqcount, seq)); @@ -877,20 +895,230 @@ void task_cputime(struct task_struct *t, u64 *utime, u64 *stime) *utime = t->utime; *stime = t->stime; - /* Task is sleeping, nothing to add */ - if (vtime->state == VTIME_INACTIVE || is_idle_task(t)) + /* Task is sleeping or idle, nothing to add */ + if (vtime->state < VTIME_SYS) continue; delta = vtime_delta(vtime); /* - * Task runs either in user or kernel space, add pending nohz time to - * the right place. + * Task runs either in user (including guest) or kernel space, + * add pending nohz time to the right place. */ - if (vtime->state == VTIME_USER || t->flags & PF_VCPU) - *utime += vtime->utime + delta; - else if (vtime->state == VTIME_SYS) + if (vtime->state == VTIME_SYS) *stime += vtime->stime + delta; + else + *utime += vtime->utime + delta; + } while (read_seqcount_retry(&vtime->seqcount, seq)); +} + +static int vtime_state_check(struct vtime *vtime, int cpu) +{ + /* + * We raced against a context switch, fetch the + * kcpustat task again. + */ + if (vtime->cpu != cpu && vtime->cpu != -1) + return -EAGAIN; + + /* + * Two possible things here: + * 1) We are seeing the scheduling out task (prev) or any past one. + * 2) We are seeing the scheduling in task (next) but it hasn't + * passed though vtime_task_switch() yet so the pending + * cputime of the prev task may not be flushed yet. + * + * Case 1) is ok but 2) is not. So wait for a safe VTIME state. + */ + if (vtime->state == VTIME_INACTIVE) + return -EAGAIN; + + return 0; +} + +static u64 kcpustat_user_vtime(struct vtime *vtime) +{ + if (vtime->state == VTIME_USER) + return vtime->utime + vtime_delta(vtime); + else if (vtime->state == VTIME_GUEST) + return vtime->gtime + vtime_delta(vtime); + return 0; +} + +static int kcpustat_field_vtime(u64 *cpustat, + struct task_struct *tsk, + enum cpu_usage_stat usage, + int cpu, u64 *val) +{ + struct vtime *vtime = &tsk->vtime; + unsigned int seq; + int err; + + do { + seq = read_seqcount_begin(&vtime->seqcount); + + err = vtime_state_check(vtime, cpu); + if (err < 0) + return err; + + *val = cpustat[usage]; + + /* + * Nice VS unnice cputime accounting may be inaccurate if + * the nice value has changed since the last vtime update. + * But proper fix would involve interrupting target on nice + * updates which is a no go on nohz_full (although the scheduler + * may still interrupt the target if rescheduling is needed...) + */ + switch (usage) { + case CPUTIME_SYSTEM: + if (vtime->state == VTIME_SYS) + *val += vtime->stime + vtime_delta(vtime); + break; + case CPUTIME_USER: + if (task_nice(tsk) <= 0) + *val += kcpustat_user_vtime(vtime); + break; + case CPUTIME_NICE: + if (task_nice(tsk) > 0) + *val += kcpustat_user_vtime(vtime); + break; + case CPUTIME_GUEST: + if (vtime->state == VTIME_GUEST && task_nice(tsk) <= 0) + *val += vtime->gtime + vtime_delta(vtime); + break; + case CPUTIME_GUEST_NICE: + if (vtime->state == VTIME_GUEST && task_nice(tsk) > 0) + *val += vtime->gtime + vtime_delta(vtime); + break; + default: + break; + } + } while (read_seqcount_retry(&vtime->seqcount, seq)); + + return 0; +} + +u64 kcpustat_field(struct kernel_cpustat *kcpustat, + enum cpu_usage_stat usage, int cpu) +{ + u64 *cpustat = kcpustat->cpustat; + struct rq *rq; + u64 val; + int err; + + if (!vtime_accounting_enabled_cpu(cpu)) + return cpustat[usage]; + + rq = cpu_rq(cpu); + + for (;;) { + struct task_struct *curr; + + rcu_read_lock(); + curr = rcu_dereference(rq->curr); + if (WARN_ON_ONCE(!curr)) { + rcu_read_unlock(); + return cpustat[usage]; + } + + err = kcpustat_field_vtime(cpustat, curr, usage, cpu, &val); + rcu_read_unlock(); + + if (!err) + return val; + + cpu_relax(); + } +} +EXPORT_SYMBOL_GPL(kcpustat_field); + +static int kcpustat_cpu_fetch_vtime(struct kernel_cpustat *dst, + const struct kernel_cpustat *src, + struct task_struct *tsk, int cpu) +{ + struct vtime *vtime = &tsk->vtime; + unsigned int seq; + int err; + + do { + u64 *cpustat; + u64 delta; + + seq = read_seqcount_begin(&vtime->seqcount); + + err = vtime_state_check(vtime, cpu); + if (err < 0) + return err; + + *dst = *src; + cpustat = dst->cpustat; + + /* Task is sleeping, dead or idle, nothing to add */ + if (vtime->state < VTIME_SYS) + continue; + + delta = vtime_delta(vtime); + + /* + * Task runs either in user (including guest) or kernel space, + * add pending nohz time to the right place. + */ + if (vtime->state == VTIME_SYS) { + cpustat[CPUTIME_SYSTEM] += vtime->stime + delta; + } else if (vtime->state == VTIME_USER) { + if (task_nice(tsk) > 0) + cpustat[CPUTIME_NICE] += vtime->utime + delta; + else + cpustat[CPUTIME_USER] += vtime->utime + delta; + } else { + WARN_ON_ONCE(vtime->state != VTIME_GUEST); + if (task_nice(tsk) > 0) { + cpustat[CPUTIME_GUEST_NICE] += vtime->gtime + delta; + cpustat[CPUTIME_NICE] += vtime->gtime + delta; + } else { + cpustat[CPUTIME_GUEST] += vtime->gtime + delta; + cpustat[CPUTIME_USER] += vtime->gtime + delta; + } + } } while (read_seqcount_retry(&vtime->seqcount, seq)); + + return err; +} + +void kcpustat_cpu_fetch(struct kernel_cpustat *dst, int cpu) +{ + const struct kernel_cpustat *src = &kcpustat_cpu(cpu); + struct rq *rq; + int err; + + if (!vtime_accounting_enabled_cpu(cpu)) { + *dst = *src; + return; + } + + rq = cpu_rq(cpu); + + for (;;) { + struct task_struct *curr; + + rcu_read_lock(); + curr = rcu_dereference(rq->curr); + if (WARN_ON_ONCE(!curr)) { + rcu_read_unlock(); + *dst = *src; + return; + } + + err = kcpustat_cpu_fetch_vtime(dst, src, curr, cpu); + rcu_read_unlock(); + + if (!err) + return; + + cpu_relax(); + } } +EXPORT_SYMBOL_GPL(kcpustat_cpu_fetch); + #endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */ diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index a8a08030a8f7..43323f875cb9 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1743,13 +1743,16 @@ static void start_hrtick_dl(struct rq *rq, struct task_struct *p) } #endif -static void set_next_task_dl(struct rq *rq, struct task_struct *p) +static void set_next_task_dl(struct rq *rq, struct task_struct *p, bool first) { p->se.exec_start = rq_clock_task(rq); /* You can't push away the running task */ dequeue_pushable_dl_task(rq, p); + if (!first) + return; + if (hrtick_enabled(rq)) start_hrtick_dl(rq, p); @@ -1770,22 +1773,19 @@ static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq, return rb_entry(left, struct sched_dl_entity, rb_node); } -static struct task_struct * -pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) +static struct task_struct *pick_next_task_dl(struct rq *rq) { struct sched_dl_entity *dl_se; struct dl_rq *dl_rq = &rq->dl; struct task_struct *p; - WARN_ON_ONCE(prev || rf); - if (!sched_dl_runnable(rq)) return NULL; dl_se = pick_next_dl_entity(rq, dl_rq); BUG_ON(!dl_se); p = dl_task_of(dl_se); - set_next_task_dl(rq, p); + set_next_task_dl(rq, p, true); return p; } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 69a81a5709ff..08a233e97a01 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -229,8 +229,7 @@ static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight } } - /* hint to use a 32x32->64 mul */ - fact = (u64)(u32)fact * lw->inv_weight; + fact = mul_u32_u32(fact, lw->inv_weight); while (fact >> 32) { fact >>= 1; @@ -1474,7 +1473,12 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page, group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4; } -static unsigned long cpu_runnable_load(struct rq *rq); +static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq); + +static unsigned long cpu_runnable_load(struct rq *rq) +{ + return cfs_rq_runnable_load_avg(&rq->cfs); +} /* Cached statistics for all CPUs within a node */ struct numa_stats { @@ -3504,9 +3508,6 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) cfs_rq->load_last_update_time_copy = sa->last_update_time; #endif - if (decayed) - cfs_rq_util_change(cfs_rq, 0); - return decayed; } @@ -3616,8 +3617,12 @@ static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s attach_entity_load_avg(cfs_rq, se, SCHED_CPUFREQ_MIGRATION); update_tg_load_avg(cfs_rq, 0); - } else if (decayed && (flags & UPDATE_TG)) - update_tg_load_avg(cfs_rq, 0); + } else if (decayed) { + cfs_rq_util_change(cfs_rq, 0); + + if (flags & UPDATE_TG) + update_tg_load_avg(cfs_rq, 0); + } } #ifndef CONFIG_64BIT @@ -3764,10 +3769,21 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep) return; /* + * Reset EWMA on utilization increases, the moving average is used only + * to smooth utilization decreases. + */ + ue.enqueued = (task_util(p) | UTIL_AVG_UNCHANGED); + if (sched_feat(UTIL_EST_FASTUP)) { + if (ue.ewma < ue.enqueued) { + ue.ewma = ue.enqueued; + goto done; + } + } + + /* * Skip update of task's estimated utilization when its EWMA is * already ~1% close to its last activation value. */ - ue.enqueued = (task_util(p) | UTIL_AVG_UNCHANGED); last_ewma_diff = ue.enqueued - ue.ewma; if (within_margin(last_ewma_diff, (SCHED_CAPACITY_SCALE / 100))) return; @@ -3800,6 +3816,7 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep) ue.ewma <<= UTIL_EST_WEIGHT_SHIFT; ue.ewma += last_ewma_diff; ue.ewma >>= UTIL_EST_WEIGHT_SHIFT; +done: WRITE_ONCE(p->se.avg.util_est, ue); } @@ -5370,26 +5387,45 @@ static int sched_idle_cpu(int cpu) rq->nr_running); } -static unsigned long cpu_runnable_load(struct rq *rq) +static unsigned long cpu_load(struct rq *rq) { - return cfs_rq_runnable_load_avg(&rq->cfs); + return cfs_rq_load_avg(&rq->cfs); } -static unsigned long capacity_of(int cpu) +/* + * cpu_load_without - compute CPU load without any contributions from *p + * @cpu: the CPU which load is requested + * @p: the task which load should be discounted + * + * The load of a CPU is defined by the load of tasks currently enqueued on that + * CPU as well as tasks which are currently sleeping after an execution on that + * CPU. + * + * This method returns the load of the specified CPU by discounting the load of + * the specified task, whenever the task is currently contributing to the CPU + * load. + */ +static unsigned long cpu_load_without(struct rq *rq, struct task_struct *p) { - return cpu_rq(cpu)->cpu_capacity; -} + struct cfs_rq *cfs_rq; + unsigned int load; -static unsigned long cpu_avg_load_per_task(int cpu) -{ - struct rq *rq = cpu_rq(cpu); - unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running); - unsigned long load_avg = cpu_runnable_load(rq); + /* Task has no contribution or is new */ + if (cpu_of(rq) != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time)) + return cpu_load(rq); - if (nr_running) - return load_avg / nr_running; + cfs_rq = &rq->cfs; + load = READ_ONCE(cfs_rq->avg.load_avg); - return 0; + /* Discount task's util from CPU's util */ + lsub_positive(&load, task_h_load(p)); + + return load; +} + +static unsigned long capacity_of(int cpu) +{ + return cpu_rq(cpu)->cpu_capacity; } static void record_wakee(struct task_struct *p) @@ -5482,7 +5518,7 @@ wake_affine_weight(struct sched_domain *sd, struct task_struct *p, s64 this_eff_load, prev_eff_load; unsigned long task_load; - this_eff_load = cpu_runnable_load(cpu_rq(this_cpu)); + this_eff_load = cpu_load(cpu_rq(this_cpu)); if (sync) { unsigned long current_load = task_h_load(current); @@ -5500,7 +5536,7 @@ wake_affine_weight(struct sched_domain *sd, struct task_struct *p, this_eff_load *= 100; this_eff_load *= capacity_of(prev_cpu); - prev_eff_load = cpu_runnable_load(cpu_rq(prev_cpu)); + prev_eff_load = cpu_load(cpu_rq(prev_cpu)); prev_eff_load -= task_load; if (sched_feat(WA_BIAS)) prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2; @@ -5538,149 +5574,9 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, return target; } -static unsigned long cpu_util_without(int cpu, struct task_struct *p); - -static unsigned long capacity_spare_without(int cpu, struct task_struct *p) -{ - return max_t(long, capacity_of(cpu) - cpu_util_without(cpu, p), 0); -} - -/* - * find_idlest_group finds and returns the least busy CPU group within the - * domain. - * - * Assumes p is allowed on at least one CPU in sd. - */ static struct sched_group * find_idlest_group(struct sched_domain *sd, struct task_struct *p, - int this_cpu, int sd_flag) -{ - struct sched_group *idlest = NULL, *group = sd->groups; - struct sched_group *most_spare_sg = NULL; - unsigned long min_runnable_load = ULONG_MAX; - unsigned long this_runnable_load = ULONG_MAX; - unsigned long min_avg_load = ULONG_MAX, this_avg_load = ULONG_MAX; - unsigned long most_spare = 0, this_spare = 0; - int imbalance_scale = 100 + (sd->imbalance_pct-100)/2; - unsigned long imbalance = scale_load_down(NICE_0_LOAD) * - (sd->imbalance_pct-100) / 100; - - do { - unsigned long load, avg_load, runnable_load; - unsigned long spare_cap, max_spare_cap; - int local_group; - int i; - - /* Skip over this group if it has no CPUs allowed */ - if (!cpumask_intersects(sched_group_span(group), - p->cpus_ptr)) - continue; - - local_group = cpumask_test_cpu(this_cpu, - sched_group_span(group)); - - /* - * Tally up the load of all CPUs in the group and find - * the group containing the CPU with most spare capacity. - */ - avg_load = 0; - runnable_load = 0; - max_spare_cap = 0; - - for_each_cpu(i, sched_group_span(group)) { - load = cpu_runnable_load(cpu_rq(i)); - runnable_load += load; - - avg_load += cfs_rq_load_avg(&cpu_rq(i)->cfs); - - spare_cap = capacity_spare_without(i, p); - - if (spare_cap > max_spare_cap) - max_spare_cap = spare_cap; - } - - /* Adjust by relative CPU capacity of the group */ - avg_load = (avg_load * SCHED_CAPACITY_SCALE) / - group->sgc->capacity; - runnable_load = (runnable_load * SCHED_CAPACITY_SCALE) / - group->sgc->capacity; - - if (local_group) { - this_runnable_load = runnable_load; - this_avg_load = avg_load; - this_spare = max_spare_cap; - } else { - if (min_runnable_load > (runnable_load + imbalance)) { - /* - * The runnable load is significantly smaller - * so we can pick this new CPU: - */ - min_runnable_load = runnable_load; - min_avg_load = avg_load; - idlest = group; - } else if ((runnable_load < (min_runnable_load + imbalance)) && - (100*min_avg_load > imbalance_scale*avg_load)) { - /* - * The runnable loads are close so take the - * blocked load into account through avg_load: - */ - min_avg_load = avg_load; - idlest = group; - } - - if (most_spare < max_spare_cap) { - most_spare = max_spare_cap; - most_spare_sg = group; - } - } - } while (group = group->next, group != sd->groups); - - /* - * The cross-over point between using spare capacity or least load - * is too conservative for high utilization tasks on partially - * utilized systems if we require spare_capacity > task_util(p), - * so we allow for some task stuffing by using - * spare_capacity > task_util(p)/2. - * - * Spare capacity can't be used for fork because the utilization has - * not been set yet, we must first select a rq to compute the initial - * utilization. - */ - if (sd_flag & SD_BALANCE_FORK) - goto skip_spare; - - if (this_spare > task_util(p) / 2 && - imbalance_scale*this_spare > 100*most_spare) - return NULL; - - if (most_spare > task_util(p) / 2) - return most_spare_sg; - -skip_spare: - if (!idlest) - return NULL; - - /* - * When comparing groups across NUMA domains, it's possible for the - * local domain to be very lightly loaded relative to the remote - * domains but "imbalance" skews the comparison making remote CPUs - * look much more favourable. When considering cross-domain, add - * imbalance to the runnable load on the remote node and consider - * staying local. - */ - if ((sd->flags & SD_NUMA) && - min_runnable_load + imbalance >= this_runnable_load) - return NULL; - - if (min_runnable_load > (this_runnable_load + imbalance)) - return NULL; - - if ((this_runnable_load < (min_runnable_load + imbalance)) && - (100*this_avg_load < imbalance_scale*min_avg_load)) - return NULL; - - return idlest; -} + int this_cpu, int sd_flag); /* * find_idlest_group_cpu - find the idlest CPU among the CPUs in the group. @@ -5729,7 +5625,7 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this continue; } - load = cpu_runnable_load(cpu_rq(i)); + load = cpu_load(cpu_rq(i)); if (load < min_load) { min_load = load; least_loaded_cpu = i; @@ -5753,7 +5649,7 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p return prev_cpu; /* - * We need task's util for capacity_spare_without, sync it up to + * We need task's util for cpu_util_without, sync it up to * prev_cpu's last_update_time. */ if (!(sd_flag & SD_BALANCE_FORK)) @@ -6746,7 +6642,7 @@ preempt: set_last_buddy(se); } -static struct task_struct * +struct task_struct * pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) { struct cfs_rq *cfs_rq = &rq->cfs; @@ -6890,6 +6786,11 @@ idle: return NULL; } +static struct task_struct *__pick_next_task_fair(struct rq *rq) +{ + return pick_next_task_fair(rq, NULL, NULL); +} + /* * Account for a descheduled task: */ @@ -7079,11 +6980,49 @@ static unsigned long __read_mostly max_load_balance_interval = HZ/10; enum fbq_type { regular, remote, all }; +/* + * 'group_type' describes the group of CPUs at the moment of load balancing. + * + * The enum is ordered by pulling priority, with the group with lowest priority + * first so the group_type can simply be compared when selecting the busiest + * group. See update_sd_pick_busiest(). + */ enum group_type { - group_other = 0, + /* The group has spare capacity that can be used to run more tasks. */ + group_has_spare = 0, + /* + * The group is fully used and the tasks don't compete for more CPU + * cycles. Nevertheless, some tasks might wait before running. + */ + group_fully_busy, + /* + * SD_ASYM_CPUCAPACITY only: One task doesn't fit with CPU's capacity + * and must be migrated to a more powerful CPU. + */ group_misfit_task, + /* + * SD_ASYM_PACKING only: One local CPU with higher capacity is available, + * and the task should be migrated to it instead of running on the + * current CPU. + */ |