| field | value | date |
|---|---|---|
| author | Linus Torvalds <torvalds@linux-foundation.org> | 2022-12-12 15:33:42 -0800 |
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2022-12-12 15:33:42 -0800 |
| commit | bf57ae2165bad1cb273095a5c09708ab503cd874 (patch) | |
| tree | 17633a2dc20054112c798ec2f82f495d962e2156 /kernel | |
| parent | add76959575736c194b3118d96e43f8cd7bcec82 (diff) | |
| parent | d6962c4fe8f96f7d384d6489b6b5ab5bf3e35991 (diff) | |
Merge tag 'sched-core-2022-12-12' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler updates from Ingo Molnar:
- Implement persistent user-requested affinity: introduce
affinity_context::user_mask and unconditionally preserve the
user-requested CPU affinity masks, for long-lived tasks to better
interact with cpusets & CPU hotplug events over longer timespans,
without destroying the original affinity intent if the underlying
topology changes.
- Uclamp updates: fix relationship between uclamp and fits_capacity()
- PSI fixes
- Misc fixes & updates
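
The first item in the summary replaces the old `(new_mask, flags)` argument pair threaded through the affinity paths with a single `struct affinity_context`. The sketch below is reconstructed from the initializers and field accesses visible in the core.c diff further down; the authoritative definition lives in kernel/sched/sched.h, which is changed by this merge but not shown in this excerpt.

```c
/*
 * Sketch of the new aggregate argument, inferred from the diff below;
 * the real definition is in kernel/sched/sched.h.
 */
struct affinity_context {
	const struct cpumask	*new_mask;	/* mask to apply now */
	struct cpumask		*user_mask;	/* user-requested mask to stash/swap */
	unsigned int		flags;		/* SCA_USER, SCA_CHECK, SCA_MIGRATE_* */
};

/* Caller pattern, mirroring migrate_disable_switch() in the diff: */
static void migrate_disable_switch_example(struct rq *rq, struct task_struct *p)
{
	struct affinity_context ac = {
		.new_mask = cpumask_of(rq->cpu),
		.flags    = SCA_MIGRATE_DISABLE,
	};

	__do_set_cpus_allowed(p, &ac);
}
```

Passing the user-requested mask by pointer is what lets set_cpus_allowed_common() simply swap() it into p->user_cpus_ptr when SCA_USER is set, so the original request survives later cpuset- or hotplug-driven mask changes.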
* tag 'sched-core-2022-12-12' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
sched: Clear ttwu_pending after enqueue_task()
sched/psi: Use task->psi_flags to clear in CPU migration
sched/psi: Stop relying on timer_pending() for poll_work rescheduling
sched/psi: Fix avgs_work re-arm in psi_avgs_work()
sched/psi: Fix possible missing or delayed pending event
sched: Always clear user_cpus_ptr in do_set_cpus_allowed()
sched: Enforce user requested affinity
sched: Always preserve the user requested cpumask
sched: Introduce affinity_context
sched: Add __releases annotations to affine_move_task()
sched/fair: Check if prev_cpu has highest spare cap in feec()
sched/fair: Consider capacity inversion in util_fits_cpu()
sched/fair: Detect capacity inversion
sched/uclamp: Cater for uclamp in find_energy_efficient_cpu()'s early exit condition
sched/uclamp: Make cpu_overutilized() use util_fits_cpu()
sched/uclamp: Make asym_fits_capacity() use util_fits_cpu()
sched/uclamp: Make select_idle_capacity() use util_fits_cpu()
sched/uclamp: Fix fits_capacity() check in feec()
sched/uclamp: Make task_fits_capacity() use util_fits_cpu()
sched/uclamp: Fix relationship between uclamp and migration margin
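
Several of the uclamp entries above route runqueue clamp reads and writes through two new accessors, uclamp_rq_get() and uclamp_rq_set(), which the core.c hunks below adopt. Judging by the one-for-one replacement of the old READ_ONCE()/WRITE_ONCE() call sites, they are most likely thin wrappers of the following shape; this is a sketch, the real definitions sit in the kernel/sched/sched.h part of the diffstat.

```c
/*
 * Likely shape of the new rq clamp accessors, inferred from the
 * converted call sites in kernel/sched/core.c below (sketch only).
 */
static inline unsigned long uclamp_rq_get(struct rq *rq,
					  enum uclamp_id clamp_id)
{
	return READ_ONCE(rq->uclamp[clamp_id].value);
}

static inline void uclamp_rq_set(struct rq *rq, enum uclamp_id clamp_id,
				 unsigned int value)
{
	WRITE_ONCE(rq->uclamp[clamp_id].value, value);
}
```

Centralising the accesses lets other call sites in this series, for example cpu_overutilized() in fair.c, read the per-rq clamps without open-coding the memory-ordering annotations.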
Diffstat (limited to 'kernel')
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | kernel/sched/core.c | 259 |
| -rw-r--r-- | kernel/sched/deadline.c | 7 |
| -rw-r--r-- | kernel/sched/fair.c | 303 |
| -rw-r--r-- | kernel/sched/psi.c | 100 |
| -rw-r--r-- | kernel/sched/sched.h | 92 |
| -rw-r--r-- | kernel/sched/stats.h | 22 |
6 files changed, 603 insertions, 180 deletions
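
The bulk of the fair.c churn below introduces util_fits_cpu(), which folds uclamp_min/uclamp_max into the old fits_capacity() check. The following is a condensed, userspace-compilable model of the decision it makes, ignoring thermal pressure and capacity inversion for brevity; it is illustrative only, not the kernel code.

```c
#include <stdbool.h>

#define SCHED_CAPACITY_SCALE	1024UL
/* fits_capacity() keeps ~20% headroom: util must stay below 80% of capacity */
#define fits_capacity(util, cap)	((util) * 1280 < (cap) * 1024)

/* Simplified model of util_fits_cpu() from the fair.c diff below. */
static bool util_fits_cpu_model(unsigned long util,
				unsigned long uclamp_min,
				unsigned long uclamp_max,
				unsigned long capacity_orig)
{
	/* 1) Does the raw utilization fit with the usual margin? */
	bool fits = fits_capacity(util, capacity_orig);

	/*
	 * 2) A capped task is forced to fit any CPU whose raw capacity covers
	 *    uclamp_max. The one exception is the top capacity level, where
	 *    the check must not mask a genuinely overutilized CPU.
	 */
	if (!(capacity_orig == SCHED_CAPACITY_SCALE &&
	      uclamp_max == SCHED_CAPACITY_SCALE))
		fits = fits || (uclamp_max <= capacity_orig);

	/*
	 * 3) A boosted task additionally needs the CPU to offer at least
	 *    uclamp_min worth of capacity (uclamp_max wins if they conflict).
	 */
	if (uclamp_min > uclamp_max)
		uclamp_min = uclamp_max;
	if (util < uclamp_min && capacity_orig != SCHED_CAPACITY_SCALE)
		fits = fits && (uclamp_min <= capacity_orig);

	return fits;
}
```

For example, a task running at util of roughly 700 but capped to uclamp_max = 400 is still treated as fitting a capacity-512 CPU (step 2), whereas a mostly idle task boosted to uclamp_min = 800 is not (step 3); with uclamp left at its defaults both cases reduce to plain fits_capacity().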
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index daff72f00385..78b2d5cabcc5 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1392,7 +1392,7 @@ static inline void uclamp_idle_reset(struct rq *rq, enum uclamp_id clamp_id, if (!(rq->uclamp_flags & UCLAMP_FLAG_IDLE)) return; - WRITE_ONCE(rq->uclamp[clamp_id].value, clamp_value); + uclamp_rq_set(rq, clamp_id, clamp_value); } static inline @@ -1543,8 +1543,8 @@ static inline void uclamp_rq_inc_id(struct rq *rq, struct task_struct *p, if (bucket->tasks == 1 || uc_se->value > bucket->value) bucket->value = uc_se->value; - if (uc_se->value > READ_ONCE(uc_rq->value)) - WRITE_ONCE(uc_rq->value, uc_se->value); + if (uc_se->value > uclamp_rq_get(rq, clamp_id)) + uclamp_rq_set(rq, clamp_id, uc_se->value); } /* @@ -1610,7 +1610,7 @@ static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p, if (likely(bucket->tasks)) return; - rq_clamp = READ_ONCE(uc_rq->value); + rq_clamp = uclamp_rq_get(rq, clamp_id); /* * Defensive programming: this should never happen. If it happens, * e.g. due to future modification, warn and fixup the expected value. @@ -1618,7 +1618,7 @@ static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p, SCHED_WARN_ON(bucket->value > rq_clamp); if (bucket->value >= rq_clamp) { bkt_clamp = uclamp_rq_max_value(rq, clamp_id, uc_se->value); - WRITE_ONCE(uc_rq->value, bkt_clamp); + uclamp_rq_set(rq, clamp_id, bkt_clamp); } } @@ -2053,7 +2053,7 @@ static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags) if (!(flags & ENQUEUE_RESTORE)) { sched_info_enqueue(rq, p); - psi_enqueue(p, flags & ENQUEUE_WAKEUP); + psi_enqueue(p, (flags & ENQUEUE_WAKEUP) && !(flags & ENQUEUE_MIGRATED)); } uclamp_rq_inc(rq, p); @@ -2189,14 +2189,18 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) #ifdef CONFIG_SMP static void -__do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask, u32 flags); +__do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx); static int __set_cpus_allowed_ptr(struct task_struct *p, - const struct cpumask *new_mask, - u32 flags); + struct affinity_context *ctx); static void migrate_disable_switch(struct rq *rq, struct task_struct *p) { + struct affinity_context ac = { + .new_mask = cpumask_of(rq->cpu), + .flags = SCA_MIGRATE_DISABLE, + }; + if (likely(!p->migration_disabled)) return; @@ -2206,7 +2210,7 @@ static void migrate_disable_switch(struct rq *rq, struct task_struct *p) /* * Violates locking rules! see comment in __do_set_cpus_allowed(). */ - __do_set_cpus_allowed(p, cpumask_of(rq->cpu), SCA_MIGRATE_DISABLE); + __do_set_cpus_allowed(p, &ac); } void migrate_disable(void) @@ -2228,6 +2232,10 @@ EXPORT_SYMBOL_GPL(migrate_disable); void migrate_enable(void) { struct task_struct *p = current; + struct affinity_context ac = { + .new_mask = &p->cpus_mask, + .flags = SCA_MIGRATE_ENABLE, + }; if (p->migration_disabled > 1) { p->migration_disabled--; @@ -2243,7 +2251,7 @@ void migrate_enable(void) */ preempt_disable(); if (p->cpus_ptr != &p->cpus_mask) - __set_cpus_allowed_ptr(p, &p->cpus_mask, SCA_MIGRATE_ENABLE); + __set_cpus_allowed_ptr(p, &ac); /* * Mustn't clear migration_disabled() until cpus_ptr points back at the * regular cpus_mask, otherwise things that race (eg. @@ -2523,19 +2531,25 @@ out_unlock: * sched_class::set_cpus_allowed must do the below, but is not required to * actually call this function. 
*/ -void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask, u32 flags) +void set_cpus_allowed_common(struct task_struct *p, struct affinity_context *ctx) { - if (flags & (SCA_MIGRATE_ENABLE | SCA_MIGRATE_DISABLE)) { - p->cpus_ptr = new_mask; + if (ctx->flags & (SCA_MIGRATE_ENABLE | SCA_MIGRATE_DISABLE)) { + p->cpus_ptr = ctx->new_mask; return; } - cpumask_copy(&p->cpus_mask, new_mask); - p->nr_cpus_allowed = cpumask_weight(new_mask); + cpumask_copy(&p->cpus_mask, ctx->new_mask); + p->nr_cpus_allowed = cpumask_weight(ctx->new_mask); + + /* + * Swap in a new user_cpus_ptr if SCA_USER flag set + */ + if (ctx->flags & SCA_USER) + swap(p->user_cpus_ptr, ctx->user_mask); } static void -__do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask, u32 flags) +__do_set_cpus_allowed(struct task_struct *p, struct affinity_context *ctx) { struct rq *rq = task_rq(p); bool queued, running; @@ -2552,7 +2566,7 @@ __do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask, u32 * * XXX do further audits, this smells like something putrid. */ - if (flags & SCA_MIGRATE_DISABLE) + if (ctx->flags & SCA_MIGRATE_DISABLE) SCHED_WARN_ON(!p->on_cpu); else lockdep_assert_held(&p->pi_lock); @@ -2571,7 +2585,7 @@ __do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask, u32 if (running) put_prev_task(rq, p); - p->sched_class->set_cpus_allowed(p, new_mask, flags); + p->sched_class->set_cpus_allowed(p, ctx); if (queued) enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK); @@ -2579,14 +2593,27 @@ __do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask, u32 set_next_task(rq, p); } +/* + * Used for kthread_bind() and select_fallback_rq(), in both cases the user + * affinity (if any) should be destroyed too. + */ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) { - __do_set_cpus_allowed(p, new_mask, 0); + struct affinity_context ac = { + .new_mask = new_mask, + .user_mask = NULL, + .flags = SCA_USER, /* clear the user requested mask */ + }; + + __do_set_cpus_allowed(p, &ac); + kfree(ac.user_mask); } int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src, int node) { + unsigned long flags; + if (!src->user_cpus_ptr) return 0; @@ -2594,7 +2621,10 @@ int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src, if (!dst->user_cpus_ptr) return -ENOMEM; + /* Use pi_lock to protect content of user_cpus_ptr */ + raw_spin_lock_irqsave(&src->pi_lock, flags); cpumask_copy(dst->user_cpus_ptr, src->user_cpus_ptr); + raw_spin_unlock_irqrestore(&src->pi_lock, flags); return 0; } @@ -2690,6 +2720,8 @@ void release_user_cpus_ptr(struct task_struct *p) */ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flags *rf, int dest_cpu, unsigned int flags) + __releases(rq->lock) + __releases(p->pi_lock) { struct set_affinity_pending my_pending = { }, *pending = NULL; bool stop_pending, complete = false; @@ -2832,8 +2864,7 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag * Called with both p->pi_lock and rq->lock held; drops both before returning. 
*/ static int __set_cpus_allowed_ptr_locked(struct task_struct *p, - const struct cpumask *new_mask, - u32 flags, + struct affinity_context *ctx, struct rq *rq, struct rq_flags *rf) __releases(rq->lock) @@ -2842,7 +2873,6 @@ static int __set_cpus_allowed_ptr_locked(struct task_struct *p, const struct cpumask *cpu_allowed_mask = task_cpu_possible_mask(p); const struct cpumask *cpu_valid_mask = cpu_active_mask; bool kthread = p->flags & PF_KTHREAD; - struct cpumask *user_mask = NULL; unsigned int dest_cpu; int ret = 0; @@ -2862,7 +2892,7 @@ static int __set_cpus_allowed_ptr_locked(struct task_struct *p, cpu_valid_mask = cpu_online_mask; } - if (!kthread && !cpumask_subset(new_mask, cpu_allowed_mask)) { + if (!kthread && !cpumask_subset(ctx->new_mask, cpu_allowed_mask)) { ret = -EINVAL; goto out; } @@ -2871,18 +2901,18 @@ static int __set_cpus_allowed_ptr_locked(struct task_struct *p, * Must re-check here, to close a race against __kthread_bind(), * sched_setaffinity() is not guaranteed to observe the flag. */ - if ((flags & SCA_CHECK) && (p->flags & PF_NO_SETAFFINITY)) { + if ((ctx->flags & SCA_CHECK) && (p->flags & PF_NO_SETAFFINITY)) { ret = -EINVAL; goto out; } - if (!(flags & SCA_MIGRATE_ENABLE)) { - if (cpumask_equal(&p->cpus_mask, new_mask)) + if (!(ctx->flags & SCA_MIGRATE_ENABLE)) { + if (cpumask_equal(&p->cpus_mask, ctx->new_mask)) goto out; if (WARN_ON_ONCE(p == current && is_migration_disabled(p) && - !cpumask_test_cpu(task_cpu(p), new_mask))) { + !cpumask_test_cpu(task_cpu(p), ctx->new_mask))) { ret = -EBUSY; goto out; } @@ -2893,22 +2923,15 @@ static int __set_cpus_allowed_ptr_locked(struct task_struct *p, * for groups of tasks (ie. cpuset), so that load balancing is not * immediately required to distribute the tasks within their new mask. */ - dest_cpu = cpumask_any_and_distribute(cpu_valid_mask, new_mask); + dest_cpu = cpumask_any_and_distribute(cpu_valid_mask, ctx->new_mask); if (dest_cpu >= nr_cpu_ids) { ret = -EINVAL; goto out; } - __do_set_cpus_allowed(p, new_mask, flags); - - if (flags & SCA_USER) - user_mask = clear_user_cpus_ptr(p); - - ret = affine_move_task(rq, p, rf, dest_cpu, flags); + __do_set_cpus_allowed(p, ctx); - kfree(user_mask); - - return ret; + return affine_move_task(rq, p, rf, dest_cpu, ctx->flags); out: task_rq_unlock(rq, p, rf); @@ -2926,25 +2949,41 @@ out: * call is not atomic; no spinlocks may be held. */ static int __set_cpus_allowed_ptr(struct task_struct *p, - const struct cpumask *new_mask, u32 flags) + struct affinity_context *ctx) { struct rq_flags rf; struct rq *rq; rq = task_rq_lock(p, &rf); - return __set_cpus_allowed_ptr_locked(p, new_mask, flags, rq, &rf); + /* + * Masking should be skipped if SCA_USER or any of the SCA_MIGRATE_* + * flags are set. + */ + if (p->user_cpus_ptr && + !(ctx->flags & (SCA_USER | SCA_MIGRATE_ENABLE | SCA_MIGRATE_DISABLE)) && + cpumask_and(rq->scratch_mask, ctx->new_mask, p->user_cpus_ptr)) + ctx->new_mask = rq->scratch_mask; + + return __set_cpus_allowed_ptr_locked(p, ctx, rq, &rf); } int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) { - return __set_cpus_allowed_ptr(p, new_mask, 0); + struct affinity_context ac = { + .new_mask = new_mask, + .flags = 0, + }; + + return __set_cpus_allowed_ptr(p, &ac); } EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); /* * Change a given task's CPU affinity to the intersection of its current - * affinity mask and @subset_mask, writing the resulting mask to @new_mask - * and pointing @p->user_cpus_ptr to a copy of the old mask. 
+ * affinity mask and @subset_mask, writing the resulting mask to @new_mask. + * If user_cpus_ptr is defined, use it as the basis for restricting CPU + * affinity or use cpu_online_mask instead. + * * If the resulting mask is empty, leave the affinity unchanged and return * -EINVAL. */ @@ -2952,17 +2991,14 @@ static int restrict_cpus_allowed_ptr(struct task_struct *p, struct cpumask *new_mask, const struct cpumask *subset_mask) { - struct cpumask *user_mask = NULL; + struct affinity_context ac = { + .new_mask = new_mask, + .flags = 0, + }; struct rq_flags rf; struct rq *rq; int err; - if (!p->user_cpus_ptr) { - user_mask = kmalloc(cpumask_size(), GFP_KERNEL); - if (!user_mask) - return -ENOMEM; - } - rq = task_rq_lock(p, &rf); /* @@ -2975,31 +3011,21 @@ static int restrict_cpus_allowed_ptr(struct task_struct *p, goto err_unlock; } - if (!cpumask_and(new_mask, &p->cpus_mask, subset_mask)) { + if (!cpumask_and(new_mask, task_user_cpus(p), subset_mask)) { err = -EINVAL; goto err_unlock; } - /* - * We're about to butcher the task affinity, so keep track of what - * the user asked for in case we're able to restore it later on. - */ - if (user_mask) { - cpumask_copy(user_mask, p->cpus_ptr); - p->user_cpus_ptr = user_mask; - } - - return __set_cpus_allowed_ptr_locked(p, new_mask, 0, rq, &rf); + return __set_cpus_allowed_ptr_locked(p, &ac, rq, &rf); err_unlock: task_rq_unlock(rq, p, &rf); - kfree(user_mask); return err; } /* * Restrict the CPU affinity of task @p so that it is a subset of - * task_cpu_possible_mask() and point @p->user_cpu_ptr to a copy of the + * task_cpu_possible_mask() and point @p->user_cpus_ptr to a copy of the * old affinity mask. If the resulting mask is empty, we warn and walk * up the cpuset hierarchy until we find a suitable mask. */ @@ -3043,34 +3069,29 @@ out_free_mask: } static int -__sched_setaffinity(struct task_struct *p, const struct cpumask *mask); +__sched_setaffinity(struct task_struct *p, struct affinity_context *ctx); /* * Restore the affinity of a task @p which was previously restricted by a - * call to force_compatible_cpus_allowed_ptr(). This will clear (and free) - * @p->user_cpus_ptr. + * call to force_compatible_cpus_allowed_ptr(). * * It is the caller's responsibility to serialise this with any calls to * force_compatible_cpus_allowed_ptr(@p). */ void relax_compatible_cpus_allowed_ptr(struct task_struct *p) { - struct cpumask *user_mask = p->user_cpus_ptr; - unsigned long flags; + struct affinity_context ac = { + .new_mask = task_user_cpus(p), + .flags = 0, + }; + int ret; /* - * Try to restore the old affinity mask. If this fails, then - * we free the mask explicitly to avoid it being inherited across - * a subsequent fork(). + * Try to restore the old affinity mask with __sched_setaffinity(). + * Cpuset masking will be done there too. 
*/ - if (!user_mask || !__sched_setaffinity(p, user_mask)) - return; - - raw_spin_lock_irqsave(&p->pi_lock, flags); - user_mask = clear_user_cpus_ptr(p); - raw_spin_unlock_irqrestore(&p->pi_lock, flags); - - kfree(user_mask); + ret = __sched_setaffinity(p, &ac); + WARN_ON_ONCE(ret); } void set_task_cpu(struct task_struct *p, unsigned int new_cpu) @@ -3548,10 +3569,9 @@ void sched_set_stop_task(int cpu, struct task_struct *stop) #else /* CONFIG_SMP */ static inline int __set_cpus_allowed_ptr(struct task_struct *p, - const struct cpumask *new_mask, - u32 flags) + struct affinity_context *ctx) { - return set_cpus_allowed_ptr(p, new_mask); + return set_cpus_allowed_ptr(p, ctx->new_mask); } static inline void migrate_disable_switch(struct rq *rq, struct task_struct *p) { } @@ -3719,13 +3739,6 @@ void sched_ttwu_pending(void *arg) if (!llist) return; - /* - * rq::ttwu_pending racy indication of out-standing wakeups. - * Races such that false-negatives are possible, since they - * are shorter lived that false-positives would be. - */ - WRITE_ONCE(rq->ttwu_pending, 0); - rq_lock_irqsave(rq, &rf); update_rq_clock(rq); @@ -3739,6 +3752,17 @@ void sched_ttwu_pending(void *arg) ttwu_do_activate(rq, p, p->sched_remote_wakeup ? WF_MIGRATED : 0, &rf); } + /* + * Must be after enqueueing at least once task such that + * idle_cpu() does not observe a false-negative -- if it does, + * it is possible for select_idle_siblings() to stack a number + * of tasks on this CPU during that window. + * + * It is ok to clear ttwu_pending when another task pending. + * We will receive IPI after local irq enabled and then enqueue it. + * Since now nr_running > 0, idle_cpu() will always get correct result. + */ + WRITE_ONCE(rq->ttwu_pending, 0); rq_unlock_irqrestore(rq, &rf); } @@ -8106,7 +8130,7 @@ int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask) #endif static int -__sched_setaffinity(struct task_struct *p, const struct cpumask *mask) +__sched_setaffinity(struct task_struct *p, struct affinity_context *ctx) { int retval; cpumask_var_t cpus_allowed, new_mask; @@ -8120,13 +8144,16 @@ __sched_setaffinity(struct task_struct *p, const struct cpumask *mask) } cpuset_cpus_allowed(p, cpus_allowed); - cpumask_and(new_mask, mask, cpus_allowed); + cpumask_and(new_mask, ctx->new_mask, cpus_allowed); + + ctx->new_mask = new_mask; + ctx->flags |= SCA_CHECK; retval = dl_task_check_affinity(p, new_mask); if (retval) goto out_free_new_mask; -again: - retval = __set_cpus_allowed_ptr(p, new_mask, SCA_CHECK | SCA_USER); + + retval = __set_cpus_allowed_ptr(p, ctx); if (retval) goto out_free_new_mask; @@ -8137,7 +8164,24 @@ again: * Just reset the cpumask to the cpuset's cpus_allowed. */ cpumask_copy(new_mask, cpus_allowed); - goto again; + + /* + * If SCA_USER is set, a 2nd call to __set_cpus_allowed_ptr() + * will restore the previous user_cpus_ptr value. + * + * In the unlikely event a previous user_cpus_ptr exists, + * we need to further restrict the mask to what is allowed + * by that old user_cpus_ptr. 
+ */ + if (unlikely((ctx->flags & SCA_USER) && ctx->user_mask)) { + bool empty = !cpumask_and(new_mask, new_mask, + ctx->user_mask); + + if (WARN_ON_ONCE(empty)) + cpumask_copy(new_mask, cpus_allowed); + } + __set_cpus_allowed_ptr(p, ctx); + retval = -EINVAL; } out_free_new_mask: @@ -8149,6 +8193,8 @@ out_free_cpus_allowed: long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) { + struct affinity_context ac; + struct cpumask *user_mask; struct task_struct *p; int retval; @@ -8183,7 +8229,21 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) if (retval) goto out_put_task; - retval = __sched_setaffinity(p, in_mask); + user_mask = kmalloc(cpumask_size(), GFP_KERNEL); + if (!user_mask) { + retval = -ENOMEM; + goto out_put_task; + } + cpumask_copy(user_mask, in_mask); + ac = (struct affinity_context){ + .new_mask = in_mask, + .user_mask = user_mask, + .flags = SCA_USER, + }; + + retval = __sched_setaffinity(p, &ac); + kfree(ac.user_mask); + out_put_task: put_task_struct(p); return retval; @@ -8964,6 +9024,12 @@ void show_state_filter(unsigned int state_filter) */ void __init init_idle(struct task_struct *idle, int cpu) { +#ifdef CONFIG_SMP + struct affinity_context ac = (struct affinity_context) { + .new_mask = cpumask_of(cpu), + .flags = 0, + }; +#endif struct rq *rq = cpu_rq(cpu); unsigned long flags; @@ -8988,7 +9054,7 @@ void __init init_idle(struct task_struct *idle, int cpu) * * And since this is boot we can forgo the serialization. */ - set_cpus_allowed_common(idle, cpumask_of(cpu), 0); + set_cpus_allowed_common(idle, &ac); #endif /* * We're having a chicken and egg problem, even though we are @@ -9775,6 +9841,7 @@ void __init sched_init(void) rq->core_cookie = 0UL; #endif + zalloc_cpumask_var_node(&rq->scratch_mask, GFP_KERNEL, cpu_to_node(i)); } set_load_weight(&init_task, false); diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 9ae8f41e3372..0d97d54276cc 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -2485,8 +2485,7 @@ static void task_woken_dl(struct rq *rq, struct task_struct *p) } static void set_cpus_allowed_dl(struct task_struct *p, - const struct cpumask *new_mask, - u32 flags) + struct affinity_context *ctx) { struct root_domain *src_rd; struct rq *rq; @@ -2501,7 +2500,7 @@ static void set_cpus_allowed_dl(struct task_struct *p, * update. We already made space for us in the destination * domain (see cpuset_can_attach()). 
*/ - if (!cpumask_intersects(src_rd->span, new_mask)) { + if (!cpumask_intersects(src_rd->span, ctx->new_mask)) { struct dl_bw *src_dl_b; src_dl_b = dl_bw_of(cpu_of(rq)); @@ -2515,7 +2514,7 @@ static void set_cpus_allowed_dl(struct task_struct *p, raw_spin_unlock(&src_dl_b->lock); } - set_cpus_allowed_common(p, new_mask, flags); + set_cpus_allowed_common(p, ctx); } /* Assumes rq->lock is held */ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index e4a0b8bd941c..4cc56c91e06e 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4280,14 +4280,16 @@ static inline unsigned long task_util_est(struct task_struct *p) } #ifdef CONFIG_UCLAMP_TASK -static inline unsigned long uclamp_task_util(struct task_struct *p) +static inline unsigned long uclamp_task_util(struct task_struct *p, + unsigned long uclamp_min, + unsigned long uclamp_max) { - return clamp(task_util_est(p), - uclamp_eff_value(p, UCLAMP_MIN), - uclamp_eff_value(p, UCLAMP_MAX)); + return clamp(task_util_est(p), uclamp_min, uclamp_max); } #else -static inline unsigned long uclamp_task_util(struct task_struct *p) +static inline unsigned long uclamp_task_util(struct task_struct *p, + unsigned long uclamp_min, + unsigned long uclamp_max) { return task_util_est(p); } @@ -4426,10 +4428,139 @@ done: trace_sched_util_est_se_tp(&p->se); } -static inline int task_fits_capacity(struct task_struct *p, - unsigned long capacity) +static inline int util_fits_cpu(unsigned long util, + unsigned long uclamp_min, + unsigned long uclamp_max, + int cpu) { - return fits_capacity(uclamp_task_util(p), capacity); + unsigned long capacity_orig, capacity_orig_thermal; + unsigned long capacity = capacity_of(cpu); + bool fits, uclamp_max_fits; + + /* + * Check if the real util fits without any uclamp boost/cap applied. + */ + fits = fits_capacity(util, capacity); + + if (!uclamp_is_used()) + return fits; + + /* + * We must use capacity_orig_of() for comparing against uclamp_min and + * uclamp_max. We only care about capacity pressure (by using + * capacity_of()) for comparing against the real util. + * + * If a task is boosted to 1024 for example, we don't want a tiny + * pressure to skew the check whether it fits a CPU or not. + * + * Similarly if a task is capped to capacity_orig_of(little_cpu), it + * should fit a little cpu even if there's some pressure. + * + * Only exception is for thermal pressure since it has a direct impact + * on available OPP of the system. + * + * We honour it for uclamp_min only as a drop in performance level + * could result in not getting the requested minimum performance level. + * + * For uclamp_max, we can tolerate a drop in performance level as the + * goal is to cap the task. So it's okay if it's getting less. + * + * In case of capacity inversion we should honour the inverted capacity + * for both uclamp_min and uclamp_max all the time. + */ + capacity_orig = cpu_in_capacity_inversion(cpu); + if (capacity_orig) { + capacity_orig_thermal = capacity_orig; + } else { + capacity_orig = capacity_orig_of(cpu); + capacity_orig_thermal = capacity_orig - arch_scale_thermal_pressure(cpu); + } + + /* + * We want to force a task to fit a cpu as implied by uclamp_max. + * But we do have some corner cases to cater for.. 
+ * + * + * C=z + * | ___ + * | C=y | | + * |_ _ _ _ _ _ _ _ _ ___ _ _ _ | _ | _ _ _ _ _ uclamp_max + * | C=x | | | | + * | ___ | | | | + * | | | | | | | (util somewhere in this region) + * | | | | | | | + * | | | | | | | + * +---------------------------------------- + * cpu0 cpu1 cpu2 + * + * In the above example if a task is capped to a specific performance + * point, y, then when: + * + * * util = 80% of x then it does not fit on cpu0 and should migrate + * to cpu1 + * * util = 80% of y then it is forced to fit on cpu1 to honour + * uclamp_max request. + * + * which is what we're enforcing here. A task always fits if + * uclamp_max <= capacity_orig. But when uclamp_max > capacity_orig, + * the normal upmigration rules should withhold still. + * + * Only exception is when we are on max capacity, then we need to be + * careful not to block overutilized state. This is so because: + * + * 1. There's no concept of capping at max_capacity! We can't go + * beyond this performance level anyway. + * 2. The system is being saturated when we're operating near + * max capacity, it doesn't make sense to block overutilized. + */ + uclamp_max_fits = (capacity_orig == SCHED_CAPACITY_SCALE) && (uclamp_max == SCHED_CAPACITY_SCALE); + uclamp_max_fits = !uclamp_max_fits && (uclamp_max <= capacity_orig); + fits = fits || uclamp_max_fits; + + /* + * + * C=z + * | ___ (region a, capped, util >= uclamp_max) + * | C=y | | + * |_ _ _ _ _ _ _ _ _ ___ _ _ _ | _ | _ _ _ _ _ uclamp_max + * | C=x | | | | + * | ___ | | | | (region b, uclamp_min <= util <= uclamp_max) + * |_ _ _|_ _|_ _ _ _| _ | _ _ _| _ | _ _ _ _ _ uclamp_min + * | | | | | | | + * | | | | | | | (region c, boosted, util < uclamp_min) + * +---------------------------------------- + * cpu0 cpu1 cpu2 + * + * a) If util > uclamp_max, then we're capped, we don't care about + * actual fitness value here. We only care if uclamp_max fits + * capacity without taking margin/pressure into account. + * See comment above. + * + * b) If uclamp_min <= util <= uclamp_max, then the normal + * fits_capacity() rules apply. Except we need to ensure that we + * enforce we remain within uclamp_max, see comment above. + * + * c) If util < uclamp_min, then we are boosted. Same as (b) but we + * need to take into account the boosted value fits the CPU without + * taking margin/pressure into account. + * + * Cases (a) and (b) are handled in the 'fits' variable already. We + * just need to consider an extra check for case (c) after ensuring we + * handle the case uclamp_min > uclamp_max. 
+ */ + uclamp_min = min(uclamp_min, uclamp_max); + if (util < uclamp_min && capacity_orig != SCHED_CAPACITY_SCALE) + fits = fits && (uclamp_min <= capacity_orig_thermal); + + return fits; +} + +static inline int task_fits_cpu(struct task_struct *p, int cpu) +{ + unsigned long uclamp_min = uclamp_eff_value(p, UCLAMP_MIN); + unsigned long uclamp_max = uclamp_eff_value(p, UCLAMP_MAX); + unsigned long util = task_util_est(p); + return util_fits_cpu(util, uclamp_min, uclamp_max, cpu); } static inline void update_misfit_status(struct task_struct *p, struct rq *rq) @@ -4442,7 +4573,7 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq) return; } - if (task_fits_capacity(p, capacity_of(cpu_of(rq)))) { + if (task_fits_cpu(p, cpu_of(rq))) { rq->misfit_task_load = 0; return; } @@ -5862,7 +5993,10 @@ static inline void hrtick_update(struct rq *rq) #ifdef CONFIG_SMP static inline bool cpu_overutilized(int cpu) { - return !fits_capacity(cpu_util_cfs(cpu), capacity_of(cpu)); + unsigned long rq_util_min = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MIN); + unsigned long rq_util_max = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MAX); + + return !util_fits_cpu(cpu_util_cfs(cpu), rq_util_min, rq_util_max, cpu); } static inline void update_overutilized_status(struct rq *rq) @@ -6654,21 +6788,23 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool static int select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target) { - unsigned long task_util, best_cap = 0; + unsigned long task_util, util_min, util_max, best_cap = 0; int cpu, best_cpu = -1; struct cpumask *cpus; cpus = this_cpu_cpumask_var_ptr(select_rq_mask); cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); - task_util = uclamp_task_util(p); + task_util = task_util_est(p); + util_min = uclamp_eff_value(p, UCLAMP_MIN); + util_max = uclamp_eff_value(p, UCLAMP_MAX); for_each_cpu_wrap(cpu, cpus, target) { unsigned long cpu_cap = capacity_of(cpu); if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu)) continue; - if (fits_capacity(task_util, cpu_cap)) + if (util_fits_cpu(task_util, util_min, util_max, cpu)) return cpu; if (cpu_cap > best_cap) { @@ -6680,10 +6816,13 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target) return best_cpu; } -static inline bool asym_fits_capacity(unsigned long task_util, int cpu) +static inline bool asym_fits_cpu(unsigned long util, + unsigned long util_min, + unsigned long util_max, + int cpu) { if (sched_asym_cpucap_active()) - return fits_capacity(task_util, capacity_of(cpu)); + return util_fits_cpu(util, util_min, util_max, cpu); return true; } @@ -6695,7 +6834,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) { bool has_idle_core = false; struct sched_domain *sd; - unsigned long task_util; + unsigned long task_util, util_min, util_max; int i, recent_used_cpu; /* @@ -6704,7 +6843,9 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) */ if (sched_asym_cpucap_active()) { sync_entity_load_avg(&p->se); - task_util = uclamp_task_util(p); + task_util = task_util_est(p); + util_min = uclamp_eff_value(p, UCLAMP_MIN); + util_max = uclamp_eff_value(p, UCLAMP_MAX); } /* @@ -6713,7 +6854,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) lockdep_assert_irqs_disabled(); if ((available_idle_cpu(target) || sched_idle_cpu(target)) && - asym_fits_capacity(task_util, target)) + asym_fits_cpu(task_util, util_min, util_max, target)) return target; /* @@ -6721,7 +6862,7 
@@ static int select_idle_sibling(struct task_struct *p, int prev, int target) */ if (prev != target && cpus_share_cache(prev, target) && (available_idle_cpu(prev) || sched_idle_cpu(prev)) && - asym_fits_capacity(task_util, prev)) + asym_fits_cpu(task_util, util_min, util_max, prev)) return prev; /* @@ -6736,7 +6877,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) in_task() && prev == smp_processor_id() && this_rq()->nr_running <= 1 && - asym_fits_capacity(task_util, prev)) { + asym_fits_cpu(task_util, util_min, util_max, prev)) { return prev; } @@ -6748,7 +6889,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) cpus_share_cache(recent_used_cpu, target) && (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) && cpumask_test_cpu(p->recent_used_cpu, p->cpus_ptr) && - asym_fits_capacity(task_util, recent_used_cpu)) { + asym_fits_cpu(task_util, util_min, util_max, recent_used_cpu)) { return recent_used_cpu; } @@ -7044,6 +7185,8 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) { struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_rq_mask); unsigned long prev_delta = ULONG_MAX, best_delta = ULONG_MAX; + unsigned long p_util_min = uclamp_is_used() ? uclamp_eff_value(p, UCLAMP_MIN) : 0; + unsigned long p_util_max = uclamp_is_used() ? uclamp_eff_value(p, UCLAMP_MAX) : 1024; struct root_domain *rd = this_rq()->rd; int cpu, best_energy_cpu, target = -1; struct sched_domain *sd; @@ -7068,7 +7211,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) target = prev_cpu; sync_entity_load_avg(&p->se); - if (!task_util_est(p)) + if (!uclamp_task_util(p, p_util_min, p_util_max)) goto unlock; eenv_task_busy_time(&eenv, p, prev_cpu); @@ -7076,7 +7219,9 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) for (; pd; pd = pd->next) { unsigned long cpu_cap, cpu_thermal_cap, util; unsigned long cur_delta, max_spare_cap = 0; - bool compute_prev_delta = false; |
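
The restrict/relax hunks above (restrict_cpus_allowed_ptr() and relax_compatible_cpus_allowed_ptr()) lean on a small task_user_cpus() helper that this merge also adds to kernel/sched/sched.h but that does not appear in this excerpt. Its likely shape, falling back to cpu_possible_mask when the task never asked for an explicit affinity, is sketched here.

```c
/*
 * Sketch of the task_user_cpus() helper used by the affinity hunks above;
 * the real definition lives in kernel/sched/sched.h.
 */
static inline const struct cpumask *task_user_cpus(struct task_struct *p)
{
	if (!p->user_cpus_ptr)
		return cpu_possible_mask;	/* no user-requested restriction */
	return p->user_cpus_ptr;
}
```

This is what makes the user-requested mask persistent: when cpusets or hotplug force a narrower mask, restrict_cpus_allowed_ptr() intersects against the saved request rather than the current cpus_mask, and relax_compatible_cpus_allowed_ptr() later re-applies the saved request through __sched_setaffinity().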
