author     Linus Torvalds <torvalds@linux-foundation.org>  2023-02-20 17:41:08 -0800
committer  Linus Torvalds <torvalds@linux-foundation.org>  2023-02-20 17:41:08 -0800
commit     1f2d9ffc7a5f916935749ffc6e93fb33bfe94d2f (patch)
tree       a5dabaa924d50867cbe347e20a7643b2850f11c0 /kernel/sched
parent     a2f0e7eee1344eb9f91b22bc72d9eb0a52b849c9 (diff)
parent     7c4a5b89a0b5a57a64b601775b296abf77a9fe97 (diff)
Merge tag 'sched-core-2023-02-20' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler updates from Ingo Molnar:

 - Improve the scalability of the CFS bandwidth unthrottling logic with
   large number of CPUs.

 - Fix & rework various cpuidle routines, simplify interaction with the
   generic scheduler code. Add __cpuidle methods as noinstr to objtool's
   noinstr detection and fix boatloads of cpuidle bugs & quirks.

 - Add new ABI: introduce MEMBARRIER_CMD_GET_REGISTRATIONS, to query
   previously issued registrations.

 - Limit scheduler slice duration to the sysctl_sched_latency period, to
   improve scheduling granularity with a large number of SCHED_IDLE tasks.

 - Debuggability enhancement on sys_exit(): warn about disabled IRQs, but
   also enable them to prevent a cascade of followup problems and repeat
   warnings.

 - Fix the rescheduling logic in prio_changed_dl().

 - Micro-optimize cpufreq and sched-util methods.

 - Micro-optimize ttwu_runnable()

 - Micro-optimize the idle-scanning in update_numa_stats(),
   select_idle_capacity() and steal_cookie_task().

 - Update the RSEQ code & self-tests

 - Constify various scheduler methods

 - Remove unused methods

 - Refine __init tags

 - Documentation updates

 - Misc other cleanups, fixes

* tag 'sched-core-2023-02-20' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (110 commits)
  sched/rt: pick_next_rt_entity(): check list_entry
  sched/deadline: Add more reschedule cases to prio_changed_dl()
  sched/fair: sanitize vruntime of entity being placed
  sched/fair: Remove capacity inversion detection
  sched/fair: unlink misfit task from cpu overutilized
  objtool: mem*() are not uaccess safe
  cpuidle: Fix poll_idle() noinstr annotation
  sched/clock: Make local_clock() noinstr
  sched/clock/x86: Mark sched_clock() noinstr
  x86/pvclock: Improve atomic update of last_value in pvclock_clocksource_read()
  x86/atomics: Always inline arch_atomic64*()
  cpuidle: tracing, preempt: Squash _rcuidle tracing
  cpuidle: tracing: Warn about !rcu_is_watching()
  cpuidle: lib/bug: Disable rcu_is_watching() during WARN/BUG
  cpuidle: drivers: firmware: psci: Dont instrument suspend code
  KVM: selftests: Fix build of rseq test
  exit: Detect and fix irq disabled state in oops
  cpuidle, arm64: Fix the ARM64 cpuidle logic
  cpuidle: mvebu: Fix duplicate flags assignment
  sched/fair: Limit sched slice duration
  ...
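One item above adds a new ABI, MEMBARRIER_CMD_GET_REGISTRATIONS. As a hedged illustration (not part of the merge diff below), a userspace process could query its previously issued membarrier registrations roughly as follows; the sys_membarrier() wrapper name and the reading of the result as a bitmask of MEMBARRIER_CMD_REGISTER_* commands are assumptions to verify against the uapi header and the membarrier(2) man page:

/*
 * Hedged illustration only: query previously issued membarrier
 * registrations via the MEMBARRIER_CMD_GET_REGISTRATIONS command
 * introduced by this pull request. The return value is assumed to be
 * a bitmask of the MEMBARRIER_CMD_REGISTER_* commands the process has
 * registered (0 if none); verify against the uapi header / man page.
 */
#include <linux/membarrier.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

/* No glibc wrapper is assumed; call the raw syscall. */
static long sys_membarrier(int cmd, unsigned int flags, int cpu_id)
{
    return syscall(__NR_membarrier, cmd, flags, cpu_id);
}

int main(void)
{
#ifdef MEMBARRIER_CMD_GET_REGISTRATIONS
    long regs = sys_membarrier(MEMBARRIER_CMD_GET_REGISTRATIONS, 0, 0);

    if (regs < 0) {
        perror("membarrier");
        return 1;
    }
    printf("membarrier registration mask: 0x%lx\n", regs);
#else
    puts("MEMBARRIER_CMD_GET_REGISTRATIONS not defined in this uapi header");
#endif
    return 0;
}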
Diffstat (limited to 'kernel/sched')
-rw-r--r--  kernel/sched/clock.c               27
-rw-r--r--  kernel/sched/core.c               134
-rw-r--r--  kernel/sched/cpufreq_schedutil.c   43
-rw-r--r--  kernel/sched/cputime.c              4
-rw-r--r--  kernel/sched/deadline.c            42
-rw-r--r--  kernel/sched/fair.c               389
-rw-r--r--  kernel/sched/idle.c                47
-rw-r--r--  kernel/sched/membarrier.c          39
-rw-r--r--  kernel/sched/rt.c                   5
-rw-r--r--  kernel/sched/sched.h              107
-rw-r--r--  kernel/sched/topology.c             4
11 files changed, 565 insertions, 276 deletions
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index e374c0c923da..5732fa75ebab 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -93,7 +93,7 @@ struct sched_clock_data {
static DEFINE_PER_CPU_SHARED_ALIGNED(struct sched_clock_data, sched_clock_data);
-notrace static inline struct sched_clock_data *this_scd(void)
+static __always_inline struct sched_clock_data *this_scd(void)
{
return this_cpu_ptr(&sched_clock_data);
}
@@ -244,12 +244,12 @@ late_initcall(sched_clock_init_late);
* min, max except they take wrapping into account
*/
-notrace static inline u64 wrap_min(u64 x, u64 y)
+static __always_inline u64 wrap_min(u64 x, u64 y)
{
return (s64)(x - y) < 0 ? x : y;
}
-notrace static inline u64 wrap_max(u64 x, u64 y)
+static __always_inline u64 wrap_max(u64 x, u64 y)
{
return (s64)(x - y) > 0 ? x : y;
}
@@ -260,7 +260,7 @@ notrace static inline u64 wrap_max(u64 x, u64 y)
* - filter out backward motion
* - use the GTOD tick value to create a window to filter crazy TSC values
*/
-notrace static u64 sched_clock_local(struct sched_clock_data *scd)
+static __always_inline u64 sched_clock_local(struct sched_clock_data *scd)
{
u64 now, clock, old_clock, min_clock, max_clock, gtod;
s64 delta;
@@ -287,13 +287,28 @@ again:
clock = wrap_max(clock, min_clock);
clock = wrap_min(clock, max_clock);
- if (!try_cmpxchg64(&scd->clock, &old_clock, clock))
+ if (!arch_try_cmpxchg64(&scd->clock, &old_clock, clock))
goto again;
return clock;
}
-notrace static u64 sched_clock_remote(struct sched_clock_data *scd)
+noinstr u64 local_clock(void)
+{
+ u64 clock;
+
+ if (static_branch_likely(&__sched_clock_stable))
+ return sched_clock() + __sched_clock_offset;
+
+ preempt_disable_notrace();
+ clock = sched_clock_local(this_scd());
+ preempt_enable_notrace();
+
+ return clock;
+}
+EXPORT_SYMBOL_GPL(local_clock);
+
+static notrace u64 sched_clock_remote(struct sched_clock_data *scd)
{
struct sched_clock_data *my_scd = this_scd();
u64 this_clock, remote_clock;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 2a4918a1faa9..fb49dbf61273 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -152,7 +152,7 @@ __read_mostly int scheduler_running;
DEFINE_STATIC_KEY_FALSE(__sched_core_enabled);
/* kernel prio, less is more */
-static inline int __task_prio(struct task_struct *p)
+static inline int __task_prio(const struct task_struct *p)
{
if (p->sched_class == &stop_sched_class) /* trumps deadline */
return -2;
@@ -174,7 +174,8 @@ static inline int __task_prio(struct task_struct *p)
*/
/* real prio, less is less */
-static inline bool prio_less(struct task_struct *a, struct task_struct *b, bool in_fi)
+static inline bool prio_less(const struct task_struct *a,
+ const struct task_struct *b, bool in_fi)
{
int pa = __task_prio(a), pb = __task_prio(b);
@@ -194,7 +195,8 @@ static inline bool prio_less(struct task_struct *a, struct task_struct *b, bool
return false;
}
-static inline bool __sched_core_less(struct task_struct *a, struct task_struct *b)
+static inline bool __sched_core_less(const struct task_struct *a,
+ const struct task_struct *b)
{
if (a->core_cookie < b->core_cookie)
return true;
@@ -3675,14 +3677,39 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
}
/*
- * Mark the task runnable and perform wakeup-preemption.
+ * Mark the task runnable.
*/
-static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags,
- struct rq_flags *rf)
+static inline void ttwu_do_wakeup(struct task_struct *p)
{
- check_preempt_curr(rq, p, wake_flags);
WRITE_ONCE(p->__state, TASK_RUNNING);
trace_sched_wakeup(p);
+}
+
+static void
+ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
+ struct rq_flags *rf)
+{
+ int en_flags = ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK;
+
+ lockdep_assert_rq_held(rq);
+
+ if (p->sched_contributes_to_load)
+ rq->nr_uninterruptible--;
+
+#ifdef CONFIG_SMP
+ if (wake_flags & WF_MIGRATED)
+ en_flags |= ENQUEUE_MIGRATED;
+ else
+#endif
+ if (p->in_iowait) {
+ delayacct_blkio_end(p);
+ atomic_dec(&task_rq(p)->nr_iowait);
+ }
+
+ activate_task(rq, p, en_flags);
+ check_preempt_curr(rq, p, wake_flags);
+
+ ttwu_do_wakeup(p);
#ifdef CONFIG_SMP
if (p->sched_class->task_woken) {
@@ -3712,31 +3739,6 @@ static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags,
#endif
}
-static void
-ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
- struct rq_flags *rf)
-{
- int en_flags = ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK;
-
- lockdep_assert_rq_held(rq);
-
- if (p->sched_contributes_to_load)
- rq->nr_uninterruptible--;
-
-#ifdef CONFIG_SMP
- if (wake_flags & WF_MIGRATED)
- en_flags |= ENQUEUE_MIGRATED;
- else
-#endif
- if (p->in_iowait) {
- delayacct_blkio_end(p);
- atomic_dec(&task_rq(p)->nr_iowait);
- }
-
- activate_task(rq, p, en_flags);
- ttwu_do_wakeup(rq, p, wake_flags, rf);
-}
-
/*
* Consider @p being inside a wait loop:
*
@@ -3770,9 +3772,15 @@ static int ttwu_runnable(struct task_struct *p, int wake_flags)
rq = __task_rq_lock(p, &rf);
if (task_on_rq_queued(p)) {
- /* check_preempt_curr() may use rq clock */
- update_rq_clock(rq);
- ttwu_do_wakeup(rq, p, wake_flags, &rf);
+ if (!task_on_cpu(rq, p)) {
+ /*
+ * When on_rq && !on_cpu the task is preempted, see if
+ * it should preempt the task that is current now.
+ */
+ update_rq_clock(rq);
+ check_preempt_curr(rq, p, wake_flags);
+ }
+ ttwu_do_wakeup(p);
ret = 1;
}
__task_rq_unlock(rq, &rf);
@@ -4138,8 +4146,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
goto out;
trace_sched_waking(p);
- WRITE_ONCE(p->__state, TASK_RUNNING);
- trace_sched_wakeup(p);
+ ttwu_do_wakeup(p);
goto out;
}
@@ -5104,6 +5111,7 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev,
sched_info_switch(rq, prev, next);
perf_event_task_sched_out(prev, next);
rseq_preempt(prev);
+ switch_mm_cid(prev, next);
fire_sched_out_preempt_notifiers(prev, next);
kmap_local_sched_out();
prepare_task(next);
@@ -6260,7 +6268,7 @@ static bool steal_cookie_task(int cpu, struct sched_domain *sd)
{
int i;
- for_each_cpu_wrap(i, sched_domain_span(sd), cpu) {
+ for_each_cpu_wrap(i, sched_domain_span(sd), cpu + 1) {
if (i == cpu)
continue;
@@ -11365,3 +11373,53 @@ void call_trace_sched_update_nr_running(struct rq *rq, int count)
{
trace_sched_update_nr_running_tp(rq, count);
}
+
+#ifdef CONFIG_SCHED_MM_CID
+void sched_mm_cid_exit_signals(struct task_struct *t)
+{
+ struct mm_struct *mm = t->mm;
+ unsigned long flags;
+
+ if (!mm)
+ return;
+ local_irq_save(flags);
+ mm_cid_put(mm, t->mm_cid);
+ t->mm_cid = -1;
+ t->mm_cid_active = 0;
+ local_irq_restore(flags);
+}
+
+void sched_mm_cid_before_execve(struct task_struct *t)
+{
+ struct mm_struct *mm = t->mm;
+ unsigned long flags;
+
+ if (!mm)
+ return;
+ local_irq_save(flags);
+ mm_cid_put(mm, t->mm_cid);
+ t->mm_cid = -1;
+ t->mm_cid_active = 0;
+ local_irq_restore(flags);
+}
+
+void sched_mm_cid_after_execve(struct task_struct *t)
+{
+ struct mm_struct *mm = t->mm;
+ unsigned long flags;
+
+ if (!mm)
+ return;
+ local_irq_save(flags);
+ t->mm_cid = mm_cid_get(mm);
+ t->mm_cid_active = 1;
+ local_irq_restore(flags);
+ rseq_set_notify_resume(t);
+}
+
+void sched_mm_cid_fork(struct task_struct *t)
+{
+ WARN_ON_ONCE(!t->mm || t->mm_cid != -1);
+ t->mm_cid_active = 1;
+}
+#endif
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 1207c78f85c1..5c840151f3bb 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -48,7 +48,6 @@ struct sugov_cpu {
unsigned long util;
unsigned long bw_dl;
- unsigned long max;
/* The field below is for single-CPU policies only: */
#ifdef CONFIG_NO_HZ_COMMON
@@ -158,7 +157,6 @@ static void sugov_get_util(struct sugov_cpu *sg_cpu)
{
struct rq *rq = cpu_rq(sg_cpu->cpu);
- sg_cpu->max = arch_scale_cpu_capacity(sg_cpu->cpu);
sg_cpu->bw_dl = cpu_bw_dl(rq);
sg_cpu->util = effective_cpu_util(sg_cpu->cpu, cpu_util_cfs(sg_cpu->cpu),
FREQUENCY_UTIL, NULL);
@@ -238,6 +236,7 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, u64 time,
* sugov_iowait_apply() - Apply the IO boost to a CPU.
* @sg_cpu: the sugov data for the cpu to boost
* @time: the update time from the caller
+ * @max_cap: the max CPU capacity
*
* A CPU running a task which woken up after an IO operation can have its
* utilization boosted to speed up the completion of those IO operations.
@@ -251,7 +250,8 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, u64 time,
* This mechanism is designed to boost high frequently IO waiting tasks, while
* being more conservative on tasks which does sporadic IO operations.
*/
-static void sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time)
+static void sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time,
+ unsigned long max_cap)
{
unsigned long boost;
@@ -280,7 +280,7 @@ static void sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time)
* sg_cpu->util is already in capacity scale; convert iowait_boost
* into the same scale so we can compare.
*/
- boost = (sg_cpu->iowait_boost * sg_cpu->max) >> SCHED_CAPACITY_SHIFT;
+ boost = (sg_cpu->iowait_boost * max_cap) >> SCHED_CAPACITY_SHIFT;
boost = uclamp_rq_util_with(cpu_rq(sg_cpu->cpu), boost, NULL);
if (sg_cpu->util < boost)
sg_cpu->util = boost;
@@ -310,7 +310,8 @@ static inline void ignore_dl_rate_limit(struct sugov_cpu *sg_cpu)
}
static inline bool sugov_update_single_common(struct sugov_cpu *sg_cpu,
- u64 time, unsigned int flags)
+ u64 time, unsigned long max_cap,
+ unsigned int flags)
{
sugov_iowait_boost(sg_cpu, time, flags);
sg_cpu->last_update = time;
@@ -321,7 +322,7 @@ static inline bool sugov_update_single_common(struct sugov_cpu *sg_cpu,
return false;
sugov_get_util(sg_cpu);
- sugov_iowait_apply(sg_cpu, time);
+ sugov_iowait_apply(sg_cpu, time, max_cap);
return true;
}
@@ -332,12 +333,15 @@ static void sugov_update_single_freq(struct update_util_data *hook, u64 time,
struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
struct sugov_policy *sg_policy = sg_cpu->sg_policy;
unsigned int cached_freq = sg_policy->cached_raw_freq;
+ unsigned long max_cap;
unsigned int next_f;
- if (!sugov_update_single_common(sg_cpu, time, flags))
+ max_cap = arch_scale_cpu_capacity(sg_cpu->cpu);
+
+ if (!sugov_update_single_common(sg_cpu, time, max_cap, flags))
return;
- next_f = get_next_freq(sg_policy, sg_cpu->util, sg_cpu->max);
+ next_f = get_next_freq(sg_policy, sg_cpu->util, max_cap);
/*
* Do not reduce the frequency if the CPU has not been idle
* recently, as the reduction is likely to be premature then.
@@ -374,6 +378,7 @@ static void sugov_update_single_perf(struct update_util_data *hook, u64 time,
{
struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
unsigned long prev_util = sg_cpu->util;
+ unsigned long max_cap;
/*
* Fall back to the "frequency" path if frequency invariance is not
@@ -385,7 +390,9 @@ static void sugov_update_single_perf(struct update_util_data *hook, u64 time,
return;
}
- if (!sugov_update_single_common(sg_cpu, time, flags))
+ max_cap = arch_scale_cpu_capacity(sg_cpu->cpu);
+
+ if (!sugov_update_single_common(sg_cpu, time, max_cap, flags))
return;
/*
@@ -399,7 +406,7 @@ static void sugov_update_single_perf(struct update_util_data *hook, u64 time,
sg_cpu->util = prev_util;
cpufreq_driver_adjust_perf(sg_cpu->cpu, map_util_perf(sg_cpu->bw_dl),
- map_util_perf(sg_cpu->util), sg_cpu->max);
+ map_util_perf(sg_cpu->util), max_cap);
sg_cpu->sg_policy->last_freq_update_time = time;
}
@@ -408,25 +415,21 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time)
{
struct sugov_policy *sg_policy = sg_cpu->sg_policy;
struct cpufreq_policy *policy = sg_policy->policy;
- unsigned long util = 0, max = 1;
+ unsigned long util = 0, max_cap;
unsigned int j;
+ max_cap = arch_scale_cpu_capacity(sg_cpu->cpu);
+
for_each_cpu(j, policy->cpus) {
struct sugov_cpu *j_sg_cpu = &per_cpu(sugov_cpu, j);
- unsigned long j_util, j_max;
sugov_get_util(j_sg_cpu);
- sugov_iowait_apply(j_sg_cpu, time);
- j_util = j_sg_cpu->util;
- j_max = j_sg_cpu->max;
+ sugov_iowait_apply(j_sg_cpu, time, max_cap);
- if (j_util * max > j_max * util) {
- util = j_util;
- max = j_max;
- }
+ util = max(j_sg_cpu->util, util);
}
- return get_next_freq(sg_policy, util, max);
+ return get_next_freq(sg_policy, util, max_cap);
}
static void
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 95fc77853743..af7952f12e6c 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -3,6 +3,10 @@
* Simple CPU accounting cgroup controller
*/
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
+ #include <asm/cputime.h>
+#endif
+
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
/*
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 0d97d54276cc..71b24371a6f7 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -2663,17 +2663,20 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)
static void prio_changed_dl(struct rq *rq, struct task_struct *p,
int oldprio)
{
- if (task_on_rq_queued(p) || task_current(rq, p)) {
+ if (!task_on_rq_queued(p))
+ return;
+
#ifdef CONFIG_SMP
- /*
- * This might be too much, but unfortunately
- * we don't have the old deadline value, and
- * we can't argue if the task is increasing
- * or lowering its prio, so...
- */
- if (!rq->dl.overloaded)
- deadline_queue_pull_task(rq);
+ /*
+ * This might be too much, but unfortunately
+ * we don't have the old deadline value, and
+ * we can't argue if the task is increasing
+ * or lowering its prio, so...
+ */
+ if (!rq->dl.overloaded)
+ deadline_queue_pull_task(rq);
+ if (task_current(rq, p)) {
/*
* If we now have a earlier deadline task than p,
* then reschedule, provided p is still on this
@@ -2681,15 +2684,24 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p,
*/
if (dl_time_before(rq->dl.earliest_dl.curr, p->dl.deadline))
resched_curr(rq);
-#else
+ } else {
/*
- * Again, we don't know if p has a earlier
- * or later deadline, so let's blindly set a
- * (maybe not needed) rescheduling point.
+ * Current may not be deadline in case p was throttled but we
+ * have just replenished it (e.g. rt_mutex_setprio()).
+ *
+ * Otherwise, if p was given an earlier deadline, reschedule.
*/
- resched_curr(rq);
-#endif /* CONFIG_SMP */
+ if (!dl_task(rq->curr) ||
+ dl_time_before(p->dl.deadline, rq->curr->dl.deadline))
+ resched_curr(rq);
}
+#else
+ /*
+ * We don't know if p has a earlier or later deadline, so let's blindly
+ * set a (maybe not needed) rescheduling point.
+ */
+ resched_curr(rq);
+#endif
}
DEFINE_SCHED_CLASS(dl) = {
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 0f8736991427..ff4dbbae3b10 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -468,7 +468,7 @@ is_same_group(struct sched_entity *se, struct sched_entity *pse)
return NULL;
}
-static inline struct sched_entity *parent_entity(struct sched_entity *se)
+static inline struct sched_entity *parent_entity(const struct sched_entity *se)
{
return se->parent;
}
@@ -595,8 +595,8 @@ static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
return min_vruntime;
}
-static inline bool entity_before(struct sched_entity *a,
- struct sched_entity *b)
+static inline bool entity_before(const struct sched_entity *a,
+ const struct sched_entity *b)
{
return (s64)(a->vruntime - b->vruntime) < 0;
}
@@ -1804,7 +1804,7 @@ static void update_numa_stats(struct task_numa_env *env,
ns->nr_running += rq->cfs.h_nr_running;
ns->compute_capacity += capacity_of(cpu);
- if (find_idle && !rq->nr_running && idle_cpu(cpu)) {
+ if (find_idle && idle_core < 0 && !rq->nr_running && idle_cpu(cpu)) {
if (READ_ONCE(rq->numa_migrate_on) ||
!cpumask_test_cpu(cpu, env->p->cpus_ptr))
continue;
@@ -1836,7 +1836,7 @@ static void task_numa_assign(struct task_numa_env *env,
int start = env->dst_cpu;
/* Find alternative idle CPU. */
- for_each_cpu_wrap(cpu, cpumask_of_node(env->dst_nid), start) {
+ for_each_cpu_wrap(cpu, cpumask_of_node(env->dst_nid), start + 1) {
if (cpu == env->best_cpu || !idle_cpu(cpu) ||
!cpumask_test_cpu(cpu, env->p->cpus_ptr)) {
continue;
@@ -4476,17 +4476,9 @@ static inline int util_fits_cpu(unsigned long util,
*
* For uclamp_max, we can tolerate a drop in performance level as the
* goal is to cap the task. So it's okay if it's getting less.
- *
- * In case of capacity inversion we should honour the inverted capacity
- * for both uclamp_min and uclamp_max all the time.
*/
- capacity_orig = cpu_in_capacity_inversion(cpu);
- if (capacity_orig) {
- capacity_orig_thermal = capacity_orig;
- } else {
- capacity_orig = capacity_orig_of(cpu);
- capacity_orig_thermal = capacity_orig - arch_scale_thermal_pressure(cpu);
- }
+ capacity_orig = capacity_orig_of(cpu);
+ capacity_orig_thermal = capacity_orig - arch_scale_thermal_pressure(cpu);
/*
* We want to force a task to fit a cpu as implied by uclamp_max.
@@ -4561,8 +4553,8 @@ static inline int util_fits_cpu(unsigned long util,
* handle the case uclamp_min > uclamp_max.
*/
uclamp_min = min(uclamp_min, uclamp_max);
- if (util < uclamp_min && capacity_orig != SCHED_CAPACITY_SCALE)
- fits = fits && (uclamp_min <= capacity_orig_thermal);
+ if (fits && (util < uclamp_min) && (uclamp_min > capacity_orig_thermal))
+ return -1;
return fits;
}
@@ -4572,7 +4564,11 @@ static inline int task_fits_cpu(struct task_struct *p, int cpu)
unsigned long uclamp_min = uclamp_eff_value(p, UCLAMP_MIN);
unsigned long uclamp_max = uclamp_eff_value(p, UCLAMP_MAX);
unsigned long util = task_util_est(p);
- return util_fits_cpu(util, uclamp_min, uclamp_max, cpu);
+ /*
+ * Return true only if the cpu fully fits the task requirements, which
+ * include the utilization but also the performance hints.
+ */
+ return (util_fits_cpu(util, uclamp_min, uclamp_max, cpu) > 0);
}
static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
@@ -4656,6 +4652,7 @@ static void
place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
{
u64 vruntime = cfs_rq->min_vruntime;
+ u64 sleep_time;
/*
* The 'current' period is already promised to the current tasks,
@@ -4685,8 +4682,18 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
vruntime -= thresh;
}
- /* ensure we never gain time by being placed backwards. */
- se->vruntime = max_vruntime(se->vruntime, vruntime);
+ /*
+ * Pull vruntime of the entity being placed to the base level of
+ * cfs_rq, to prevent boosting it if placed backwards. If the entity
+ * slept for a long time, don't even try to compare its vruntime with
+ * the base as it may be too far off and the comparison may get
+ * inversed due to s64 overflow.
+ */
+ sleep_time = rq_clock_task(rq_of(cfs_rq)) - se->exec_start;
+ if ((s64)sleep_time > 60LL * NSEC_PER_SEC)
+ se->vruntime = vruntime;
+ else
+ se->vruntime = max_vruntime(se->vruntime, vruntime);
}
static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
@@ -4896,7 +4903,13 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
struct sched_entity *se;
s64 delta;
- ideal_runtime = sched_slice(cfs_rq, curr);
+ /*
+ * When many tasks blow up the sched_period; it is possible that
+ * sched_slice() reports unusually large results (when many tasks are
+ * very light for example). Therefore impose a maximum.
+ */
+ ideal_runtime = min_t(u64, sched_slice(cfs_rq, curr), sysctl_sched_latency);
+
delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
if (delta_exec > ideal_runtime) {
resched_curr(rq_of(cfs_rq));
@@ -5461,22 +5474,105 @@ unthrottle_throttle:
resched_curr(rq);
}
-static void distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)
+#ifdef CONFIG_SMP
+static void __cfsb_csd_unthrottle(void *arg)
{
- struct cfs_rq *cfs_rq;
+ struct cfs_rq *cursor, *tmp;
+ struct rq *rq = arg;
+ struct rq_flags rf;
+
+ rq_lock(rq, &rf);
+
+ /*
+ * Since we hold rq lock we're safe from concurrent manipulation of
+ * the CSD list. However, this RCU critical section annotates the
+ * fact that we pair with sched_free_group_rcu(), so that we cannot
+ * race with group being freed in the window between removing it
+ * from the list and advancing to the next entry in the list.
+ */
+ rcu_read_lock();
+
+ list_for_each_entry_safe(cursor, tmp, &rq->cfsb_csd_list,
+ throttled_csd_list) {
+ list_del_init(&cursor->throttled_csd_list);
+
+ if (cfs_rq_throttled(cursor))
+ unthrottle_cfs_rq(cursor);
+ }
+
+ rcu_read_unlock();
+
+ rq_unlock(rq, &rf);
+}
+
+static inline void __unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq)
+{
+ struct rq *rq = rq_of(cfs_rq);
+ bool first;
+
+ if (rq == this_rq()) {
+ unthrottle_cfs_rq(cfs_rq);
+ return;
+ }
+
+ /* Already enqueued */
+ if (SCHED_WARN_ON(!list_empty(&cfs_rq->throttled_csd_list)))
+ return;
+
+ first = list_empty(&rq->cfsb_csd_list);
+ list_add_tail(&cfs_rq->throttled_csd_list, &rq->cfsb_csd_list);
+ if (first)
+ smp_call_function_single_async(cpu_of(rq), &rq->cfsb_csd);
+}
+#else
+static inline void __unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq)
+{
+ unthrottle_cfs_rq(cfs_rq);
+}
+#endif
+
+static void unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq)
+{
+ lockdep_assert_rq_held(rq_of(cfs_rq));
+
+ if (SCHED_WARN_ON(!cfs_rq_throttled(cfs_rq) ||
+ cfs_rq->runtime_remaining <= 0))
+ return;
+
+ __unthrottle_cfs_rq_async(cfs_rq);
+}
+
+static bool distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)
+{
+ struct cfs_rq *local_unthrottle = NULL;
+ int this_cpu = smp_processor_id();
u64 runtime, remaining = 1;
+ bool throttled = false;
+ struct cfs_rq *cfs_rq;
+ struct rq_flags rf;
+ struct rq *rq;
rcu_read_lock();
list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
throttled_list) {
- struct rq *rq = rq_of(cfs_rq);
- struct rq_flags rf;
+ rq = rq_of(cfs_rq);
+
+ if (!remaining) {
+ throttled = true;
+ break;
+ }
rq_lock_irqsave(rq, &rf);
if (!cfs_rq_throttled(cfs_rq))
goto next;
- /* By the above check, this should never be true */
+#ifdef CONFIG_SMP
+ /* Already queued for async unthrottle */
+ if (!list_empty(&cfs_rq->throttled_csd_list))
+ goto next;
+#endif
+
+ /* By the above checks, this should never be true */
SCHED_WARN_ON(cfs_rq->runtime_remaining > 0);
raw_spin_lock(&cfs_b->lock);
@@ -5490,16 +5586,30 @@ static void distribute_cfs_runtime(struct cfs_bandwidth *cfs_b)
cfs_rq->runtime_remaining += runtime;
/* we check whether we're throttled above */
- if (cfs_rq->runtime_remaining > 0)
- unthrottle_cfs_rq(cfs_rq);
+ if (cfs_rq->runtime_remaining > 0) {
+ if (cpu_of(rq) != this_cpu ||
+ SCHED_WARN_ON(local_unthrottle))
+ unthrottle_cfs_rq_async(cfs_rq);
+ else
+ local_unthrottle = cfs_rq;
+ } else {
+ throttled = true;
+ }
next:
rq_unlock_irqrestore(rq, &rf);
-
- if (!remaining)
- break;
}
rcu_read_unlock();
+
+ if (local_unthrottle) {
+ rq = cpu_rq(this_cpu);
+ rq_lock_irqsave(rq, &rf);
+ if (cfs_rq_throttled(local_unthrottle))
+ unthrottle_cfs_rq(local_unthrottle);
+ rq_unlock_irqrestore(rq, &rf);
+ }
+
+ return throttled;
}
/*
@@ -5544,10 +5654,8 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, u
while (throttled && cfs_b->runtime > 0) {
raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
/* we can't nest cfs_b->lock while distributing bandwidth */
- distribute_cfs_runtime(cfs_b);
+ throttled = distribute_cfs_runtime(cfs_b);
raw_spin_lock_irqsave(&cfs_b->lock, flags);
-
- throttled = !list_empty(&cfs_b->throttled_cfs_rq);
}
/*
@@ -5824,6 +5932,9 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
{
cfs_rq->runtime_enabled = 0;
INIT_LIST_HEAD(&cfs_rq->throttled_list);
+#ifdef CONFIG_SMP
+ INIT_LIST_HEAD(&cfs_rq->throttled_csd_list);
+#endif
}
void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
@@ -5840,12 +5951,38 @@ void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
{
+ int __maybe_unused i;
+
/* init_cfs_bandwidth() was not called */
if (!cfs_b->throttled_cfs_rq.next)
return;
hrtimer_cancel(&cfs_b->period_timer);
hrtimer_cancel(&cfs_b->slack_timer);
+
+ /*
+ * It is possible that we still have some cfs_rq's pending on a CSD
+ * list, though this race is very rare. In order for this to occur, we
+ * must have raced with the last task leaving the group while there
+ * exist throttled cfs_rq(s), and the period_timer must have queued the
+ * CSD item but the remote cpu has not yet processed it. To handle this,
+ * we can simply flush all pending CSD work inline here. We're
+ * guaranteed at this point that no additional cfs_rq of this group can
+ * join a CSD list.
+ */
+#ifdef CONFIG_SMP
+ for_each_possible_cpu(i) {
+ struct rq *rq = cpu_rq(i);
+ unsigned long flags;
+
+ if (list_empty(&rq->cfsb_csd_list))
+ continue;
+
+ local_irq_save(flags);
+ __cfsb_csd_unthrottle(rq);
+ local_irq_restore(flags);
+ }
+#endif
}
/*
@@ -6008,6 +6145,7 @@ static inline bool cpu_overutilized(int cpu)
unsigned long rq_util_min = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MIN);
unsigned long rq_util_max = uclamp_rq_get(cpu_rq(cpu), UCLAMP_MAX);
+ /* Return true only if the utilization doesn't fit CPU's capacity */
return !util_fits_cpu(cpu_util_cfs(cpu), rq_util_min, rq_util_max, cpu);
}
@@ -6801,6 +6939,7 @@ static int
select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
{
unsigned long task_util, util_min, util_max, best_cap = 0;
+ int fits, best_fits = 0;
int cpu, best_cpu = -1;
struct cpumask *cpus;
@@ -6811,17 +6950,33 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
util_min = uclamp_eff_value(p, UCLAMP_MIN);
util_max = uclamp_eff_value(p, UCLAMP_MAX);
- for_each_cpu_wrap(cpu, cpus, target) {
+ for_each_cpu_wrap(cpu, cpus, target + 1) {
unsigned long cpu_cap = capacity_of(cpu);
if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu))
continue;
- if (util_fits_cpu(task_util, util_min, util_max, cpu))
+
+ fits = util_fits_cpu(task_util, util_min, util_max, cpu);
+
+ /* This CPU fits with all requirements */
+ if (fits > 0)
return cpu;
+ /*
+ * Only the min performance hint (i.e. uclamp_min) doesn't fit.
+ * Look for the CPU with best capacity.
+ */
+ else if (fits < 0)
+ cpu_cap = capacity_orig_of(cpu) - thermal_load_avg(cpu_rq(cpu));
- if (cpu_cap > best_cap) {
+ /*
+ * First, select CPU which fits better (-1 being better than 0).
+ * Then, select the one with best capacity at same level.
+ */
+ if ((fits < best_fits) ||
+ ((fits == best_fits) && (cpu_cap > best_cap))) {
best_cap = cpu_cap;
best_cpu = cpu;
+ best_fits = fits;
}
}
@@ -6834,7 +6989,11 @@ static inline bool asym_fits_cpu(unsigned long util,
int cpu)
{
if (sched_asym_cpucap_active())
- return util_fits_cpu(util, util_min, util_max, cpu);
+ /*
+ * Return true only if the cpu fully fits the task requirements
+ * which include the utilization and the performance hints.
+ */
+ return (util_fits_cpu(util, util_min, util_max, cpu) > 0);
return true;
}
@@ -7201,6 +7360,9 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
unsigned long p_util_max = uclamp_is_used() ? uclamp_eff_value(p, UCLAMP_MAX) : 1024;
struct root_domain *rd = this_rq()->rd;
int cpu, best_energy_cpu, target = -1;
+ int prev_fits = -1, best_fits = -1;
+ unsigned long best_thermal_cap = 0;
+ unsigned long prev_thermal_cap = 0;
struct sched_domain *sd;
struct perf_domain *pd;