summary refs log tree commit diff
path: root/kernel/sched
diff options
context:
space:
mode:
author Mark Brown <broonie@kernel.org> 2018-01-25 18:16:26 +0000
committer Mark Brown <broonie@kernel.org> 2018-01-25 18:16:26 +0000
commit 0b5eca67bd2d0e6f6d0ccdc316aced0cc4bf2e9f (patch)
tree b6597ffe774a67ab5b8eebd57c9a723732d3a39c /kernel/sched
parent 3bb0f7c31b1aedd0f85c675297031281799145d7 (diff)
parent 93a00c467fe998bf5716cbc9cabc127046054782 (diff)
download linux-0b5eca67bd2d0e6f6d0ccdc316aced0cc4bf2e9f.tar.gz
linux-0b5eca67bd2d0e6f6d0ccdc316aced0cc4bf2e9f.tar.bz2
linux-0b5eca67bd2d0e6f6d0ccdc316aced0cc4bf2e9f.zip
Merge branches 'topic/twl4030' and 'topic/twl6040' of https://git.kernel.org/pub/scm/linux/kernel/git/broonie/sound into asoc-twl-breakage
Diffstat (limited to 'kernel/sched')
-rw-r--r-- kernel/sched/Makefile 1
-rw-r--r-- kernel/sched/clock.c 2
-rw-r--r-- kernel/sched/core.c 271
-rw-r--r-- kernel/sched/cpuacct.h 18
-rw-r--r-- kernel/sched/cpufreq_schedutil.c 12
-rw-r--r-- kernel/sched/cputime.c 17
-rw-r--r-- kernel/sched/deadline.c 23
-rw-r--r-- kernel/sched/debug.c 18
-rw-r--r-- kernel/sched/fair.c 1051
-rw-r--r-- kernel/sched/idle.c 4
-rw-r--r-- kernel/sched/isolation.c 155
-rw-r--r-- kernel/sched/rt.c 318
-rw-r--r-- kernel/sched/sched.h 75
-rw-r--r-- kernel/sched/stop_task.c 2
-rw-r--r-- kernel/sched/topology.c 49
-rw-r--r-- kernel/sched/wait_bit.c 18
16 files changed, 1334 insertions, 700 deletions
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index a9ee16bbc693..e2f9d4feff40 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -27,3 +27,4 @@ obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o
obj-$(CONFIG_CPU_FREQ) += cpufreq.o
obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o
obj-$(CONFIG_MEMBARRIER) += membarrier.o
+obj-$(CONFIG_CPU_ISOLATION) += isolation.o
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index ca0f8fc945c6..e086babe6c61 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -388,7 +388,7 @@ void sched_clock_tick(void)
if (unlikely(!sched_clock_running))
return;
- WARN_ON_ONCE(!irqs_disabled());
+ lockdep_assert_irqs_disabled();
scd = this_scd();
__scd_stamp(scd);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d17c5da523a0..75554f366fd3 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -16,6 +16,7 @@
#include <linux/init_task.h>
#include <linux/context_tracking.h>
#include <linux/rcupdate_wait.h>
+#include <linux/compat.h>
#include <linux/blkdev.h>
#include <linux/kprobes.h>
@@ -26,6 +27,7 @@
#include <linux/profile.h>
#include <linux/security.h>
#include <linux/syscalls.h>
+#include <linux/sched/isolation.h>
#include <asm/switch_to.h>
#include <asm/tlb.h>
@@ -42,18 +44,21 @@
DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
+#if defined(CONFIG_SCHED_DEBUG) && defined(HAVE_JUMP_LABEL)
/*
* Debugging: various feature bits
+ *
+ * If SCHED_DEBUG is disabled, each compilation unit has its own copy of
+ * sysctl_sched_features, defined in sched.h, to allow constants propagation
+ * at compile time and compiler optimization based on features default.
*/
-
#define SCHED_FEAT(name, enabled) \
(1UL << __SCHED_FEAT_##name) * enabled |
-
const_debug unsigned int sysctl_sched_features =
#include "features.h"
0;
-
#undef SCHED_FEAT
+#endif
/*
* Number of tasks to iterate in a single balance run.
@@ -83,9 +88,6 @@ __read_mostly int scheduler_running;
*/
int sysctl_sched_rt_runtime = 950000;
-/* CPUs with isolated domains */
-cpumask_var_t cpu_isolated_map;
-
/*
* __task_rq_lock - lock the rq @p resides on.
*/
@@ -505,8 +507,7 @@ void resched_cpu(int cpu)
struct rq *rq = cpu_rq(cpu);
unsigned long flags;
- if (!raw_spin_trylock_irqsave(&rq->lock, flags))
- return;
+ raw_spin_lock_irqsave(&rq->lock, flags);
resched_curr(rq);
raw_spin_unlock_irqrestore(&rq->lock, flags);
}
@@ -526,7 +527,7 @@ int get_nohz_timer_target(void)
int i, cpu = smp_processor_id();
struct sched_domain *sd;
- if (!idle_cpu(cpu) && is_housekeeping_cpu(cpu))
+ if (!idle_cpu(cpu) && housekeeping_cpu(cpu, HK_FLAG_TIMER))
return cpu;
rcu_read_lock();
@@ -535,15 +536,15 @@ int get_nohz_timer_target(void)
if (cpu == i)
continue;
- if (!idle_cpu(i) && is_housekeeping_cpu(i)) {
+ if (!idle_cpu(i) && housekeeping_cpu(i, HK_FLAG_TIMER)) {
cpu = i;
goto unlock;
}
}
}
- if (!is_housekeeping_cpu(cpu))
- cpu = housekeeping_any_cpu();
+ if (!housekeeping_cpu(cpu, HK_FLAG_TIMER))
+ cpu = housekeeping_any_cpu(HK_FLAG_TIMER);
unlock:
rcu_read_unlock();
return cpu;
@@ -733,7 +734,7 @@ int tg_nop(struct task_group *tg, void *data)
}
#endif
-static void set_load_weight(struct task_struct *p)
+static void set_load_weight(struct task_struct *p, bool update_load)
{
int prio = p->static_prio - MAX_RT_PRIO;
struct load_weight *load = &p->se.load;
@@ -747,8 +748,16 @@ static void set_load_weight(struct task_struct *p)
return;
}
- load->weight = scale_load(sched_prio_to_weight[prio]);
- load->inv_weight = sched_prio_to_wmult[prio];
+ /*
+ * SCHED_OTHER tasks have to update their load when changing their
+ * weight
+ */
+ if (update_load && p->sched_class == &fair_sched_class) {
+ reweight_task(p, prio);
+ } else {
+ load->weight = scale_load(sched_prio_to_weight[prio]);
+ load->inv_weight = sched_prio_to_wmult[prio];
+ }
}
static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
@@ -2358,7 +2367,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
p->static_prio = NICE_TO_PRIO(0);
p->prio = p->normal_prio = __normal_prio(p);
- set_load_weight(p);
+ set_load_weight(p, false);
/*
* We don't need the reset flag anymore after the fork. It has
@@ -3805,7 +3814,7 @@ void set_user_nice(struct task_struct *p, long nice)
put_prev_task(rq, p);
p->static_prio = NICE_TO_PRIO(nice);
- set_load_weight(p);
+ set_load_weight(p, true);
old_prio = p->prio;
p->prio = effective_prio(p);
delta = p->prio - old_prio;
@@ -3962,7 +3971,7 @@ static void __setscheduler_params(struct task_struct *p,
*/
p->rt_priority = attr->sched_priority;
p->normal_prio = normal_prio(p);
- set_load_weight(p);
+ set_load_weight(p, true);
}
/* Actually do priority change: must hold pi & rq lock. */
@@ -4842,6 +4851,7 @@ int __sched _cond_resched(void)
preempt_schedule_common();
return 1;
}
+ rcu_all_qs();
return 0;
}
EXPORT_SYMBOL(_cond_resched);
@@ -5098,13 +5108,11 @@ SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
* Return: On success, 0 and the timeslice is in @interval. Otherwise,
* an error code.
*/
-SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
- struct timespec __user *, interval)
+static int sched_rr_get_interval(pid_t pid, struct timespec64 *t)
{
struct task_struct *p;
unsigned int time_slice;
struct rq_flags rf;
- struct timespec t;
struct rq *rq;
int retval;
@@ -5128,15 +5136,40 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
task_rq_unlock(rq, p, &rf);
rcu_read_unlock();
- jiffies_to_timespec(time_slice, &t);
- retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
- return retval;
+ jiffies_to_timespec64(time_slice, t);
+ return 0;
out_unlock:
rcu_read_unlock();
return retval;
}
+/*
+ * Native sched_rr_get_interval() system call.
+ *
+ * Obtains the timeslice through the static sched_rr_get_interval() helper
+ * and, only when the helper returned 0, hands it to put_timespec64() whose
+ * result becomes the syscall result. A non-zero helper result is returned
+ * unchanged and nothing is written to @interval.
+ */
+SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
+		struct timespec __user *, interval)
+{
+	struct timespec64 t;
+	int retval;
+
+	retval = sched_rr_get_interval(pid, &t);
+	if (retval)
+		return retval;
+
+	return put_timespec64(&t, interval);
+}
+
+#ifdef CONFIG_COMPAT
+/*
+ * Compat flavour of sched_rr_get_interval().
+ *
+ * Same flow as the native entry point: the static helper fills in a
+ * timespec64 and, when it returned 0, compat_put_timespec64() stores it to
+ * @interval. A non-zero helper result is returned unchanged.
+ */
+COMPAT_SYSCALL_DEFINE2(sched_rr_get_interval,
+		       compat_pid_t, pid,
+		       struct compat_timespec __user *, interval)
+{
+	struct timespec64 t;
+	int retval;
+
+	retval = sched_rr_get_interval(pid, &t);
+	if (retval)
+		return retval;
+
+	return compat_put_timespec64(&t, interval);
+}
+#endif
+
void sched_show_task(struct task_struct *p)
{
unsigned long free = 0;
@@ -5165,6 +5198,7 @@ void sched_show_task(struct task_struct *p)
show_stack(p, NULL);
put_task_stack(p);
}
+EXPORT_SYMBOL_GPL(sched_show_task);
static inline bool
state_filter_match(unsigned long state_filter, struct task_struct *p)
@@ -5726,10 +5760,6 @@ static inline void sched_init_smt(void) { }
void __init sched_init_smp(void)
{
- cpumask_var_t non_isolated_cpus;
-
- alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
-
sched_init_numa();
/*
@@ -5739,16 +5769,12 @@ void __init sched_init_smp(void)
*/
mutex_lock(&sched_domains_mutex);
sched_init_domains(cpu_active_mask);
- cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
- if (cpumask_empty(non_isolated_cpus))
- cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
mutex_unlock(&sched_domains_mutex);
/* Move init over to a non-isolated CPU */
- if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0)
+ if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_DOMAIN)) < 0)
BUG();
sched_init_granularity();
- free_cpumask_var(non_isolated_cpus);
init_sched_rt_class();
init_sched_dl_class();
@@ -5933,7 +5959,7 @@ void __init sched_init(void)
atomic_set(&rq->nr_iowait, 0);
}
- set_load_weight(&init_task);
+ set_load_weight(&init_task, false);
/*
* The boot idle thread does lazy MMU switching as well:
@@ -5952,9 +5978,6 @@ void __init sched_init(void)
calc_load_update = jiffies + LOAD_FREQ;
#ifdef CONFIG_SMP
- /* May be allocated at isolcpus cmdline parse time */
- if (cpu_isolated_map == NULL)
- zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
idle_thread_set_boot_cpu();
set_cpu_rq_start_time(smp_processor_id());
#endif
@@ -6621,7 +6644,7 @@ static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
return ret;
}
-static int cpu_stats_show(struct seq_file *sf, void *v)
+static int cpu_cfs_stat_show(struct seq_file *sf, void *v)
{
struct task_group *tg = css_tg(seq_css(sf));
struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
@@ -6661,7 +6684,7 @@ static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css,
}
#endif /* CONFIG_RT_GROUP_SCHED */
-static struct cftype cpu_files[] = {
+static struct cftype cpu_legacy_files[] = {
#ifdef CONFIG_FAIR_GROUP_SCHED
{
.name = "shares",
@@ -6682,7 +6705,7 @@ static struct cftype cpu_files[] = {
},
{
.name = "stat",
- .seq_show = cpu_stats_show,
+ .seq_show = cpu_cfs_stat_show,
},
#endif
#ifdef CONFIG_RT_GROUP_SCHED
@@ -6700,16 +6723,182 @@ static struct cftype cpu_files[] = {
{ } /* Terminate */
};
+/*
+ * Append the CFS bandwidth counters of the task group behind @css to @sf.
+ *
+ * Prints nr_periods, nr_throttled and throttled_usec, the latter being
+ * cfs_bandwidth::throttled_time divided by NSEC_PER_USEC. Without
+ * CONFIG_CFS_BANDWIDTH nothing is printed. Always returns 0.
+ */
+static int cpu_extra_stat_show(struct seq_file *sf,
+			       struct cgroup_subsys_state *css)
+{
+#ifdef CONFIG_CFS_BANDWIDTH
+	{
+		struct cfs_bandwidth *bw = &css_tg(css)->cfs_bandwidth;
+		u64 usec = bw->throttled_time;
+
+		/* do_div() divides its first argument in place */
+		do_div(usec, NSEC_PER_USEC);
+
+		seq_printf(sf, "nr_periods %d\n"
+			   "nr_throttled %d\n"
+			   "throttled_usec %llu\n",
+			   bw->nr_periods, bw->nr_throttled, usec);
+	}
+#endif
+	return 0;
+}
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+/*
+ * Report the group's shares on the cgroup weight scale: the scaled-down
+ * shares value multiplied by CGROUP_WEIGHT_DFL and divided by 1024, rounded
+ * to the closest integer.
+ */
+static u64 cpu_weight_read_u64(struct cgroup_subsys_state *css,
+			       struct cftype *cft)
+{
+	u64 scaled = scale_load_down(css_tg(css)->shares);
+
+	scaled *= CGROUP_WEIGHT_DFL;
+
+	return DIV_ROUND_CLOSEST_ULL(scaled, 1024);
+}
+
+/*
+ * Set the group's shares from a cgroup weight.
+ *
+ * cgroup weight knobs should use the common MIN, DFL and MAX values which
+ * are 1, 100 and 10000 respectively. While it loses a bit of range on both
+ * ends, it maps pretty well onto the shares value used by scheduler and the
+ * round-trip conversions preserve the original value over the entire range.
+ *
+ * Returns -ERANGE when @weight lies outside
+ * [CGROUP_WEIGHT_MIN, CGROUP_WEIGHT_MAX], otherwise the result of
+ * sched_group_set_shares().
+ */
+static int cpu_weight_write_u64(struct cgroup_subsys_state *css,
+				struct cftype *cft, u64 weight)
+{
+	u64 shares;
+
+	if (weight < CGROUP_WEIGHT_MIN || weight > CGROUP_WEIGHT_MAX)
+		return -ERANGE;
+
+	/* weight * 1024 / CGROUP_WEIGHT_DFL, rounded to closest */
+	shares = DIV_ROUND_CLOSEST_ULL(weight * 1024, CGROUP_WEIGHT_DFL);
+
+	return sched_group_set_shares(css_tg(css), scale_load(shares));
+}
+
+/*
+ * Report the nice level whose sched_prio_to_weight[] entry is closest to
+ * the group's scaled-down shares.
+ */
+static s64 cpu_weight_nice_read_s64(struct cgroup_subsys_state *css,
+				    struct cftype *cft)
+{
+	unsigned long weight = scale_load_down(css_tg(css)->shares);
+	int best = INT_MAX;
+	int idx = 0;
+
+	/*
+	 * Walk the table for as long as the distance to the weight keeps
+	 * shrinking; stop at the first entry that is not strictly closer
+	 * than the one before it.
+	 */
+	while (idx < ARRAY_SIZE(sched_prio_to_weight)) {
+		int delta = abs(sched_prio_to_weight[idx] - weight);
+
+		if (delta >= best)
+			break;
+
+		best = delta;
+		idx++;
+	}
+
+	/* idx now sits one past the closest entry */
+	return PRIO_TO_NICE(idx - 1 + MAX_RT_PRIO);
+}
+
+/*
+ * Set the group's shares from a nice level.
+ *
+ * Returns -ERANGE when @nice lies outside [MIN_NICE, MAX_NICE]; otherwise
+ * looks up the matching sched_prio_to_weight[] entry and returns the result
+ * of sched_group_set_shares().
+ */
+static int cpu_weight_nice_write_s64(struct cgroup_subsys_state *css,
+				     struct cftype *cft, s64 nice)
+{
+	unsigned long weight;
+	int idx;
+
+	if (!(nice >= MIN_NICE && nice <= MAX_NICE))
+		return -ERANGE;
+
+	idx = NICE_TO_PRIO(nice) - MAX_RT_PRIO;
+	weight = sched_prio_to_weight[idx];
+
+	return sched_group_set_shares(css_tg(css), scale_load(weight));
+}
+#endif
+
+/*
+ * Emit "<quota> <period>\n" to @sf, printing the literal "max" in place of
+ * the quota when @quota is negative.
+ */
+static void __maybe_unused cpu_period_quota_print(struct seq_file *sf,
+						  long period, long quota)
+{
+	if (quota >= 0)
+		seq_printf(sf, "%ld", quota);
+	else
+		seq_puts(sf, "max");
+
+	seq_printf(sf, " %ld\n", period);
+}
+
+/*
+ * Parse "<quota|max> [<period>]" from @buf.
+ *
+ * The first whitespace-delimited token is the quota: either a decimal
+ * number, which is multiplied by NSEC_PER_USEC, or the literal "max", which
+ * yields RUNTIME_INF. An optional second number is the period.
+ *
+ * caller should put the current value in *@periodp before calling: when
+ * @buf carries no period the caller's value is kept, and in either case
+ * *@periodp is multiplied by NSEC_PER_USEC before returning.
+ *
+ * Returns 0 on success, -EINVAL when no token could be read or the token is
+ * neither a number nor "max".
+ */
+static int __maybe_unused cpu_period_quota_parse(char *buf,
+						 u64 *periodp, u64 *quotap)
+{
+	char tok[21];	/* U64_MAX */
+
+	/*
+	 * Bound the token to 20 characters so that it, plus the terminating
+	 * NUL, always fits in tok[]; an unbounded %s would write past the
+	 * end of the array on a long first word. Require at least one
+	 * successful conversion so that tok is never used uninitialized,
+	 * whatever "no match" value sscanf() reports.
+	 */
+	if (sscanf(buf, "%20s %llu", tok, periodp) < 1)
+		return -EINVAL;
+
+	*periodp *= NSEC_PER_USEC;
+
+	if (sscanf(tok, "%llu", quotap))
+		*quotap *= NSEC_PER_USEC;
+	else if (!strcmp(tok, "max"))
+		*quotap = RUNTIME_INF;
+	else
+		return -EINVAL;
+
+	return 0;
+}
+
+#ifdef CONFIG_CFS_BANDWIDTH
+/*
+ * seq_show handler of the "max" file: prints the task group's CFS quota and
+ * period via cpu_period_quota_print(). Always returns 0.
+ */
+static int cpu_max_show(struct seq_file *sf, void *v)
+{
+	struct task_group *tg = css_tg(seq_css(sf));
+	long period = tg_get_cfs_period(tg);
+	long quota = tg_get_cfs_quota(tg);
+
+	cpu_period_quota_print(sf, period, quota);
+
+	return 0;
+}
+
+/*
+ * write handler of the "max" file.
+ *
+ * Seeds the period with the group's current value, lets
+ * cpu_period_quota_parse() interpret @buf and, when parsing succeeded,
+ * applies the result with tg_set_cfs_bandwidth().
+ *
+ * Returns @nbytes on success, otherwise the non-zero value returned by the
+ * parser or by tg_set_cfs_bandwidth().
+ */
+static ssize_t cpu_max_write(struct kernfs_open_file *of,
+			     char *buf, size_t nbytes, loff_t off)
+{
+	struct task_group *tg = css_tg(of_css(of));
+	u64 period = tg_get_cfs_period(tg);
+	u64 quota;
+	int ret;
+
+	ret = cpu_period_quota_parse(buf, &period, &quota);
+	if (ret)
+		return ret;
+
+	ret = tg_set_cfs_bandwidth(tg, period, quota);
+	if (ret)
+		return ret;
+
+	return nbytes;
+}
+#endif
+
+/*
+ * Control files installed as cpu_cgrp_subsys.dfl_cftypes. Every entry is
+ * flagged CFTYPE_NOT_ON_ROOT.
+ */
+static struct cftype cpu_files[] = {
+#ifdef CONFIG_FAIR_GROUP_SCHED
+	/* group shares expressed as a cgroup weight */
+	{
+		.name = "weight",
+		.read_u64 = cpu_weight_read_u64,
+		.write_u64 = cpu_weight_write_u64,
+		.flags = CFTYPE_NOT_ON_ROOT,
+	},
+	/* group shares expressed as a nice level */
+	{
+		.name = "weight.nice",
+		.read_s64 = cpu_weight_nice_read_s64,
+		.write_s64 = cpu_weight_nice_write_s64,
+		.flags = CFTYPE_NOT_ON_ROOT,
+	},
+#endif
+#ifdef CONFIG_CFS_BANDWIDTH
+	/* CFS bandwidth quota and period */
+	{
+		.name = "max",
+		.seq_show = cpu_max_show,
+		.write = cpu_max_write,
+		.flags = CFTYPE_NOT_ON_ROOT,
+	},
+#endif
+	{ }	/* terminate */
+};
+
struct cgroup_subsys cpu_cgrp_subsys = {
.css_alloc = cpu_cgroup_css_alloc,
.css_online = cpu_cgroup_css_online,
.css_released = cpu_cgroup_css_released,
.css_free = cpu_cgroup_css_free,
+ .css_extra_stat_show = cpu_extra_stat_show,
.fork = cpu_cgroup_fork,
.can_attach = cpu_cgroup_can_attach,
.attach = cpu_cgroup_attach,
- .legacy_cftypes = cpu_files,
+ .legacy_cftypes = cpu_legacy_files,
+ .dfl_cftypes = cpu_files,
.early_init = true,
+ .threaded = true,
};
#endif /* CONFIG_CGROUP_SCHED */
diff --git a/kernel/sched/cpuacct.h b/kernel/sched/cpuacct.h
deleted file mode 100644
index a8358a57a316..000000000000
--- a/kernel/sched/cpuacct.h
+++ /dev/null
@@ -1,18 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifdef CONFIG_CGROUP_CPUACCT
-
-extern void cpuacct_charge(struct task_struct *tsk, u64 cputime);
-extern void cpuacct_account_field(struct task_struct *tsk, int index, u64 val);
-
-#else
-
-static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime)
-{
-}
-
-static inline void
-cpuacct_account_field(struct task_struct *tsk, int index, u64 val)
-{
-}
-
-#endif
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 9209d83ecdcf..2f52ec0f1539 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -282,8 +282,12 @@ static void sugov_update_single(struct update_util_data *hook, u64 time,
* Do not reduce the frequency if the CPU has not been idle
* recently, as the reduction is likely to be premature then.
*/
- if (busy && next_f < sg_policy->next_freq)
+ if (busy && next_f < sg_policy->next_freq) {
next_f = sg_policy->next_freq;
+
+ /* Reset cached freq as next_freq has changed */
+ sg_policy->cached_raw_freq = 0;
+ }
}
sugov_update_commit(sg_policy, time, next_f);
}
@@ -649,6 +653,7 @@ static int sugov_start(struct cpufreq_policy *policy)
struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu);
memset(sg_cpu, 0, sizeof(*sg_cpu));
+ sg_cpu->cpu = cpu;
sg_cpu->sg_policy = sg_policy;
sg_cpu->flags = SCHED_CPUFREQ_RT;
sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq;
@@ -714,11 +719,6 @@ struct cpufreq_governor *cpufreq_default_governor(void)
static int __init sugov_register(void)
{
- int cpu;
-
- for_each_possible_cpu(cpu)
- per_cpu(sugov_cpu, cpu).cpu = cpu;
-
return cpufreq_register_governor(&schedutil_gov);
}
fs_initcall(sugov_register);
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 14d2dbf97c53..bac6ac9a4ec7 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -109,7 +109,7 @@ static inline void task_group_account_field(struct task_struct *p, int index,
*/
__this_cpu_add(kernel_cpustat.cpustat[index], tmp);
- cpuacct_account_field(p, index, tmp);
+ cgroup_account_cputime_field(p, index, tmp);
}
/*
@@ -259,8 +259,7 @@ static inline u64 account_other_time(u64 max)
{
u64 accounted;
- /* Shall be converted to a lockdep-enabled lightweight check */
- WARN_ON_ONCE(!irqs_disabled());
+ lockdep_assert_irqs_disabled();
accounted = steal_account_process_time(max);
@@ -447,6 +446,13 @@ void vtime_account_irq_enter(struct task_struct *tsk)
EXPORT_SYMBOL_GPL(vtime_account_irq_enter);
#endif /* __ARCH_HAS_VTIME_ACCOUNT */
+void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev,
+ u64 *ut, u64 *st) /* @prev is not used by this variant */
+{
+ *ut = curr->utime; /* hand back the user time from @curr unmodified */
+ *st = curr->stime; /* hand back the system time from @curr unmodified */
+}
+
void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st)
{
*ut = p->utime;
@@ -585,9 +591,8 @@ drop_precision:
*
* Assuming that rtime_i+1 >= rtime_i.
*/
-static void cputime_adjust(struct task_cputime *curr,
- struct prev_cputime *prev,
- u64 *ut, u64 *st)
+void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev,
+ u64 *ut, u64 *st)
{
u64 rtime, stime, utime;
unsigned long flags;
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 4ae5c1ea90e2..2473736c7616 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -243,7 +243,7 @@ static void task_non_contending(struct task_struct *p)
if (p->state == TASK_DEAD)
sub_rq_bw(p->dl.dl_bw, &rq->dl);
raw_spin_lock(&dl_b->lock);
- __dl_clear(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p)));
+ __dl_sub(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p)));
__dl_clear_params(p);
raw_spin_unlock(&dl_b->lock);
}
@@ -1144,7 +1144,7 @@ static void update_curr_dl(struct rq *rq)
account_group_exec_runtime(curr, delta_exec);
curr->se.exec_start = rq_clock_task(rq);
- cpuacct_charge(curr, delta_exec);
+ cgroup_account_cputime(curr, delta_exec);
sched_rt_avg_update(rq, delta_exec);
@@ -1210,7 +1210,7 @@ static enum hrtimer_restart inactive_task_timer(struct hrtimer *timer)
}
raw_spin_lock(&dl_b->lock);
- __dl_clear(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p)));
+ __dl_sub(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p)));
raw_spin_unlock(&dl_b->lock);
__dl_clear_params(p);
@@ -1365,6 +1365,10 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se,
update_dl_entity(dl_se, pi_se);
} else if (flags & ENQUEUE_REPLENISH) {
replenish_dl_entity(dl_se, pi_se);
+ } else if ((flags & ENQUEUE_RESTORE) &&
+ dl_time_before(dl_se->deadline,
+ rq_clock(rq_of_dl_rq(dl_rq_of_se(dl_se))))) {
+ setup_new_dl_entity(dl_se);
}
__enqueue_dl_entity(dl_se);
@@ -2167,7 +2171,7 @@ static void set_cpus_allowed_dl(struct task_struct *p,
* until we complete the update.
*/
raw_spin_lock(&src_dl_b->lock);
- __dl_clear(src_dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p)));
+ __dl_sub(src_dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p)));
raw_spin_unlock(&src_dl_b->lock);
}
@@ -2256,13 +2260,6 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)
return;
}
- /*
- * If p is boosted we already updated its params in
- * rt_mutex_setprio()->enqueue_task(..., ENQUEUE_REPLENISH),
- * p's deadline being now already after rq_clock(rq).
- */
- if (dl_time_before(p->dl.deadline, rq_clock(rq)))
- setup_new_dl_entity(&p->dl);
if (rq->curr != p) {
#ifdef CONFIG_SMP
@@ -2452,7 +2449,7 @@ int sched_dl_overflow(struct task_struct *p, int policy,
if (dl_policy(policy) && !task_has_dl_policy(p) &&
!__dl_overflow(dl_b, cpus, 0, new_bw)) {
if (hrtimer_active(&p->dl.inactive_timer))
- __dl_clear(dl_b, p->dl.dl_bw, cpus);
+ __dl_sub(dl_b, p->dl.dl_bw, cpus);
__dl_add(dl_b, new_bw, cpus);
err = 0;
} else if (dl_policy(policy) && task_has_dl_policy(p) &&
@@ -2464,7 +2461,7 @@ int sched_dl_overflow(struct task_struct *p, int policy,
* But this would require to set the task's "inactive
* timer" when the task is not inactive.
*/
- __dl_clear(dl_b, p->dl.dl_bw, cpus);
+ __dl_sub(dl_b, p->dl.dl_bw, cpus);
__dl_add(dl_b, new_bw, cpus);
dl_change_utilization(p, new_bw);
err = 0;
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 2f93e4a2d9f6..1ca0130ed4f9 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -441,9 +441,11 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
P_SCHEDSTAT(se->statistics.wait_count);
}
P(se->load.weight);
+ P(se->runnable_weight);
#ifdef CONFIG_SMP
P(se->avg.load_avg);
P(se->avg.util_avg);
+ P(se->avg.runnable_load_avg);
#endif
#undef PN_SCHEDSTAT
@@ -558,16 +560,19 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running);
SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight);
#ifdef CONFIG_SMP
+ SEQ_printf(m, " .%-30s: %ld\n", "runnable_weight", cfs_rq->runnable_weight);
SEQ_printf(m, " .%-30s: %lu\n", "load_avg",
cfs_rq->avg.load_avg);
SEQ_printf(m, " .%-30s: %lu\n", "runnable_load_avg",
- cfs_rq->runnable_load_avg);
+ cfs_rq->avg.runnable_load_avg);
SEQ_printf(m, " .%-30s: %lu\n", "util_avg",
cfs_rq->avg.util_avg);
- SEQ_printf(m, " .%-30s: %ld\n", "removed_load_avg",
- atomic_long_read(&cfs_rq->removed_load_avg));
- SEQ_printf(m, " .%-30s: %ld\n", "removed_util_avg",
- atomic_long_read(&cfs_rq->removed_util_avg));
+ SEQ_printf(m, " .%-30s: %ld\n", "removed.load_avg",
+ cfs_rq->removed.load_avg);
+ SEQ_printf(m, " .%-30s: %ld\n", "removed.util_avg",
+ cfs_rq->removed.util_avg);
+ SEQ_printf(m, " .%-30s: %ld\n", "removed.runnable_sum",
+ cfs_rq->removed.runnable_sum);
#ifdef CONFIG_FAIR_GROUP_SCHED
SEQ_printf(m, " .%-30s: %lu\n", "tg_load_avg_contrib",
cfs_rq->tg_load_avg_contrib);
@@ -1004,10 +1009,13 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
"nr_involuntary_switches", (long long)p->nivcsw);
P(se.load.weight);
+ P(se.runnable_weight);
#ifdef CONFIG_SMP
P(se.avg.load_sum);
+ P(se.avg.runnable_load_sum);
P(se.avg.util_sum);
P(se.avg.load_avg);
+ P(se.avg.runnable_load_avg);
P(se.avg.util_avg);
P(se.avg.last_update_time);
#endif
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 5c09ddf8c832..4037e19bbca2 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -33,6 +33,7 @@
#include <linux/mempolicy.h>
#include <linux/migrate.h>
#include <linux/task_work.h>
+#include <linux/sched/isolation.h>
#include <trace/events/sched.h>
@@ -717,13 +718,8 @@ void init_entity_runnable_average(struct sched_entity *se)
{
struct sched_avg *sa = &se->avg;
- sa->last_update_time = 0;
- /*
- * sched_avg's period_contrib should be strictly less then 1024, so
- * we give it 1023 to make sure it is almost a period (1024us), and
- * will definitely be update (after enqueue).
- */
- sa->period_contrib = 1023;
+ memset(sa, 0, sizeof(*sa));
+
/*
* Tasks are initialized with full load to be seen as heavy tasks until
* they get a chance to stabilize to their real load level.
@@ -731,13 +727,10 @@ void init_entity_runnable_average(struct sched_entity *se)
* nothing has been attached to the task group yet.
*/
if (entity_is_task(se))
- sa->load_avg = scale_load_down(se->load.weight);
- sa->load_sum = sa->load_avg * LOAD_AVG_MAX;
- /*
- * At this point, util_avg won't be used in select_task_rq_fair anyway
- */
- sa->util_avg = 0;
- sa->util_sum = 0;
+ sa->runnable_load_avg = sa->load_avg = scale_load_down(se->load.weight);
+
+ se->runnable_weight = se->load.weight;
+
/* when this task is enqueued, it will contribute to its cfs_rq's load_avg */
}
@@ -785,7 +778,6 @@ void post_init_entity_util_avg(struct sched_entity *se)
} else {
sa->util_avg = cap;
}
- sa->util_sum = sa->util_avg * LOAD_AVG_MAX;
}
if (entity_is_task(se)) {
@@ -852,7 +844,7 @@ static void update_curr(struct cfs_rq *cfs_rq)
struct task_struct *curtask = task_of(curr);
trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
- cpuacct_charge(curtask, delta_exec);
+ cgroup_account_cputime(curtask, delta_exec);
account_group_exec_runtime(curtask, delta_exec);
}
@@ -2026,7 +2018,7 @@ static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period)
delta = runtime - p->last_sum_exec_runtime;
*period = now - p->last_task_numa_placement;
} else {
- delta = p->se.avg.load_sum / p->se.load.weight;
+ delta = p->se.avg.load_sum;
*period = LOAD_AVG_MAX;
}
@@ -2693,18 +2685,226 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
cfs_rq->nr_running--;
}
+/*
+ * Signed add and clamp on underflow.
+ *
+ * Explicitly do a load-store to ensure the intermediate value never hits
+ * memory. This allows lockless observations without ever seeing the negative
+ * values.
+ *
+ * @_ptr and @_val are each evaluated exactly once. *@_ptr is loaded with
+ * READ_ONCE() and the final result is stored with a single WRITE_ONCE().
+ * When @_val is negative and the sum compares greater than the value that
+ * was loaded, the addition wrapped around and 0 is stored instead.
+ *
+ * The expansion declares locals named ptr, val, res and var, so argument
+ * expressions must not refer to variables with those names.
+ */
+#define add_positive(_ptr, _val) do { \
+ typeof(_ptr) ptr = (_ptr); \
+ typeof(_val) val = (_val); \
+ typeof(*ptr) res, var = READ_ONCE(*ptr); \
+ \
+ res = var + val; \
+ \
+ if (val < 0 && res > var) \
+ res = 0; \
+ \
+ WRITE_ONCE(*ptr, res); \
+} while (0)
+
+/*
+ * Unsigned subtract and clamp on underflow.
+ *
+ * Explicitly do a load-store to ensure the intermediate value never hits
+ * memory. This allows lockless observations without ever seeing the negative
+ * values.
+ *
+ * @_ptr and @_val are each evaluated exactly once; @_val is converted to
+ * the type of *@_ptr. *@_ptr is loaded with READ_ONCE() and the final result
+ * is stored with a single WRITE_ONCE(). When the difference compares greater
+ * than the value that was loaded, the subtraction wrapped around and 0 is
+ * stored instead.
+ *
+ * The expansion declares locals named ptr, val, res and var, so argument
+ * expressions must not refer to variables with those names.
+ */
+#define sub_positive(_ptr, _val) do { \
+ typeof(_ptr) ptr = (_ptr); \
+ typeof(*ptr) val = (_val); \
+ typeof(*ptr) res, var = READ_ONCE(*ptr); \
+ res = var - val; \
+ if (res > var) \
+ res = 0; \
+ WRITE_ONCE(*ptr, res); \
+} while (0)
+
+#ifdef CONFIG_SMP
+/*
+ * XXX we want to get rid of these helpers and use the full load resolution.
+ */
+/* Return @se's load weight after scale_load_down(). */
+static inline long se_weight(struct sched_entity *se)
+{
+	long weight = scale_load_down(se->load.weight);
+
+	return weight;
+}
+
+/* Return @se's runnable weight after scale_load_down(). */
+static inline long se_runnable(struct sched_entity *se)
+{
+	long runnable = scale_load_down(se->runnable_weight);
+
+	return runnable;
+}
+
+/*
+ * Add @se's runnable contribution to @cfs_rq: its runnable weight, its
+ * runnable_load_avg, and its runnable_load_sum multiplied by the
+ * scaled-down runnable weight.
+ */
+static inline void
+enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+	const struct sched_avg *sa = &se->avg;
+
+	cfs_rq->runnable_weight += se->runnable_weight;
+
+	cfs_rq->avg.runnable_load_avg += sa->runnable_load_avg;
+	cfs_rq->avg.runnable_load_sum += se_runnable(se) * sa->runnable_load_sum;
+}
+
+/*
+ * Remove @se's runnable contribution from @cfs_rq. The runnable weight is
+ * subtracted directly; the average and the weighted sum go through
+ * sub_positive(), which clamps at 0 instead of wrapping.
+ */
+static inline void
+dequeue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+	const struct sched_avg *sa = &se->avg;
+
+	cfs_rq->runnable_weight -= se->runnable_weight;
+
+	sub_positive(&cfs_rq->avg.runnable_load_avg, sa->runnable_load_avg);
+	sub_positive(&cfs_rq->avg.runnable_load_sum,
+		     se_runnable(se) * sa->runnable_load_sum);
+}
+
+/*
+ * Add @se's load contribution to @cfs_rq: its load_avg, and its load_sum
+ * multiplied by the scaled-down load weight.
+ */
+static inline void
+enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+	const struct sched_avg *sa = &se->avg;
+
+	cfs_rq->avg.load_avg += sa->load_avg;
+	cfs_rq->avg.load_sum += se_weight(se) * sa->load_sum;
+}
+
+/*
+ * Remove @se's load contribution from @cfs_rq. Both the average and the
+ * weighted sum go through sub_positive(), which clamps at 0 instead of
+ * wrapping.
+ */
+static inline void
+dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+	const struct sched_avg *sa = &se->avg;
+
+	sub_positive(&cfs_rq->avg.load_avg, sa->load_avg);
+	sub_positive(&cfs_rq->avg.load_sum, se_weight(se) * sa->load_sum);
+}
+#else
+/* !CONFIG_SMP: the load-average enqueue/dequeue helpers do nothing. */
+static inline void enqueue_runnable_load_avg(struct cfs_rq *cfs_rq,
+					     struct sched_entity *se)
+{
+}
+
+static inline void dequeue_runnable_load_avg(struct cfs_rq *cfs_rq,
+					     struct sched_entity *se)
+{
+}
+
+static inline void enqueue_load_avg(struct cfs_rq *cfs_rq,
+				    struct sched_entity *se)
+{
+}
+
+static inline void dequeue_load_avg(struct cfs_rq *cfs_rq,
+				    struct sched_entity *se)
+{
+}
+#endif
+
+static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
+ unsigned long weight, unsigned long runnable)
+{
+ if (se->on_rq) {
+ /* commit outstanding execution time */
+ if (cfs_rq->curr == se)
+ update_curr(cfs_rq);
+ account_entity_dequeue(cfs_rq, se);
+ dequeue_runnable_load_avg(cfs_rq, se);
+ }
+ dequeue_load_avg(cfs_rq, se);
+
+ se->runnable_weight = runnable;
+ update_load_set(&se->load, weight