| author | Mark Brown <broonie@kernel.org> | 2018-01-25 18:16:26 +0000 |
|---|---|---|
| committer | Mark Brown <broonie@kernel.org> | 2018-01-25 18:16:26 +0000 |
| commit | 0b5eca67bd2d0e6f6d0ccdc316aced0cc4bf2e9f (patch) | |
| tree | b6597ffe774a67ab5b8eebd57c9a723732d3a39c /kernel/sched | |
| parent | 3bb0f7c31b1aedd0f85c675297031281799145d7 (diff) | |
| parent | 93a00c467fe998bf5716cbc9cabc127046054782 (diff) | |
| download | linux-0b5eca67bd2d0e6f6d0ccdc316aced0cc4bf2e9f.tar.gz linux-0b5eca67bd2d0e6f6d0ccdc316aced0cc4bf2e9f.tar.bz2 linux-0b5eca67bd2d0e6f6d0ccdc316aced0cc4bf2e9f.zip | |
Merge branches 'topic/twl4030' and 'topic/twl6040' of https://git.kernel.org/pub/scm/linux/kernel/git/broonie/sound into asoc-twl-breakage
Diffstat (limited to 'kernel/sched')
| -rw-r--r-- | kernel/sched/Makefile | 1 |
| -rw-r--r-- | kernel/sched/clock.c | 2 |
| -rw-r--r-- | kernel/sched/core.c | 271 |
| -rw-r--r-- | kernel/sched/cpuacct.h | 18 |
| -rw-r--r-- | kernel/sched/cpufreq_schedutil.c | 12 |
| -rw-r--r-- | kernel/sched/cputime.c | 17 |
| -rw-r--r-- | kernel/sched/deadline.c | 23 |
| -rw-r--r-- | kernel/sched/debug.c | 18 |
| -rw-r--r-- | kernel/sched/fair.c | 1051 |
| -rw-r--r-- | kernel/sched/idle.c | 4 |
| -rw-r--r-- | kernel/sched/isolation.c | 155 |
| -rw-r--r-- | kernel/sched/rt.c | 318 |
| -rw-r--r-- | kernel/sched/sched.h | 75 |
| -rw-r--r-- | kernel/sched/stop_task.c | 2 |
| -rw-r--r-- | kernel/sched/topology.c | 49 |
| -rw-r--r-- | kernel/sched/wait_bit.c | 18 |
16 files changed, 1334 insertions, 700 deletions
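Among the kernel/sched/core.c changes in the diff below is the cgroup v2 `cpu.weight` interface (`cpu_weight_read_u64()` / `cpu_weight_write_u64()`), which maps the cgroup weight range [1, 10000] (default 100) onto the scheduler's internal shares scale (default 1024) using nearest-integer rounding so that the round-trip conversion preserves values. The following is a minimal userspace sketch of that mapping only; the helper names `weight_to_shares()` and `shares_to_weight()` are illustrative and not part of the patch.

```c
#include <stdio.h>

#define CGROUP_WEIGHT_DFL 100ULL   /* cgroup v2 default cpu.weight        */
#define SCHED_SHARES_DFL  1024ULL  /* scheduler default task-group shares */

/* Round-to-nearest division, like the kernel's DIV_ROUND_CLOSEST_ULL(). */
static unsigned long long div_round_closest(unsigned long long x,
					     unsigned long long d)
{
	return (x + d / 2) / d;
}

/* cgroup weight [1, 10000] -> scheduler shares (illustrative helper) */
static unsigned long long weight_to_shares(unsigned long long weight)
{
	return div_round_closest(weight * SCHED_SHARES_DFL, CGROUP_WEIGHT_DFL);
}

/* scheduler shares -> cgroup weight (illustrative helper) */
static unsigned long long shares_to_weight(unsigned long long shares)
{
	return div_round_closest(shares * CGROUP_WEIGHT_DFL, SCHED_SHARES_DFL);
}

int main(void)
{
	/*
	 * The default weight maps to the default shares value, and the
	 * round trip gives back the original weight.
	 */
	unsigned long long w = 100, s = weight_to_shares(w);

	printf("weight %llu -> shares %llu -> weight %llu\n",
	       w, s, shares_to_weight(s));
	return 0;
}
```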
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index a9ee16bbc693..e2f9d4feff40 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -27,3 +27,4 @@ obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o obj-$(CONFIG_CPU_FREQ) += cpufreq.o obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o obj-$(CONFIG_MEMBARRIER) += membarrier.o +obj-$(CONFIG_CPU_ISOLATION) += isolation.o diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index ca0f8fc945c6..e086babe6c61 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -388,7 +388,7 @@ void sched_clock_tick(void) if (unlikely(!sched_clock_running)) return; - WARN_ON_ONCE(!irqs_disabled()); + lockdep_assert_irqs_disabled(); scd = this_scd(); __scd_stamp(scd); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d17c5da523a0..75554f366fd3 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -16,6 +16,7 @@ #include <linux/init_task.h> #include <linux/context_tracking.h> #include <linux/rcupdate_wait.h> +#include <linux/compat.h> #include <linux/blkdev.h> #include <linux/kprobes.h> @@ -26,6 +27,7 @@ #include <linux/profile.h> #include <linux/security.h> #include <linux/syscalls.h> +#include <linux/sched/isolation.h> #include <asm/switch_to.h> #include <asm/tlb.h> @@ -42,18 +44,21 @@ DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); +#if defined(CONFIG_SCHED_DEBUG) && defined(HAVE_JUMP_LABEL) /* * Debugging: various feature bits + * + * If SCHED_DEBUG is disabled, each compilation unit has its own copy of + * sysctl_sched_features, defined in sched.h, to allow constants propagation + * at compile time and compiler optimization based on features default. */ - #define SCHED_FEAT(name, enabled) \ (1UL << __SCHED_FEAT_##name) * enabled | - const_debug unsigned int sysctl_sched_features = #include "features.h" 0; - #undef SCHED_FEAT +#endif /* * Number of tasks to iterate in a single balance run. @@ -83,9 +88,6 @@ __read_mostly int scheduler_running; */ int sysctl_sched_rt_runtime = 950000; -/* CPUs with isolated domains */ -cpumask_var_t cpu_isolated_map; - /* * __task_rq_lock - lock the rq @p resides on. 
*/ @@ -505,8 +507,7 @@ void resched_cpu(int cpu) struct rq *rq = cpu_rq(cpu); unsigned long flags; - if (!raw_spin_trylock_irqsave(&rq->lock, flags)) - return; + raw_spin_lock_irqsave(&rq->lock, flags); resched_curr(rq); raw_spin_unlock_irqrestore(&rq->lock, flags); } @@ -526,7 +527,7 @@ int get_nohz_timer_target(void) int i, cpu = smp_processor_id(); struct sched_domain *sd; - if (!idle_cpu(cpu) && is_housekeeping_cpu(cpu)) + if (!idle_cpu(cpu) && housekeeping_cpu(cpu, HK_FLAG_TIMER)) return cpu; rcu_read_lock(); @@ -535,15 +536,15 @@ int get_nohz_timer_target(void) if (cpu == i) continue; - if (!idle_cpu(i) && is_housekeeping_cpu(i)) { + if (!idle_cpu(i) && housekeeping_cpu(i, HK_FLAG_TIMER)) { cpu = i; goto unlock; } } } - if (!is_housekeeping_cpu(cpu)) - cpu = housekeeping_any_cpu(); + if (!housekeeping_cpu(cpu, HK_FLAG_TIMER)) + cpu = housekeeping_any_cpu(HK_FLAG_TIMER); unlock: rcu_read_unlock(); return cpu; @@ -733,7 +734,7 @@ int tg_nop(struct task_group *tg, void *data) } #endif -static void set_load_weight(struct task_struct *p) +static void set_load_weight(struct task_struct *p, bool update_load) { int prio = p->static_prio - MAX_RT_PRIO; struct load_weight *load = &p->se.load; @@ -747,8 +748,16 @@ static void set_load_weight(struct task_struct *p) return; } - load->weight = scale_load(sched_prio_to_weight[prio]); - load->inv_weight = sched_prio_to_wmult[prio]; + /* + * SCHED_OTHER tasks have to update their load when changing their + * weight + */ + if (update_load && p->sched_class == &fair_sched_class) { + reweight_task(p, prio); + } else { + load->weight = scale_load(sched_prio_to_weight[prio]); + load->inv_weight = sched_prio_to_wmult[prio]; + } } static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags) @@ -2358,7 +2367,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) p->static_prio = NICE_TO_PRIO(0); p->prio = p->normal_prio = __normal_prio(p); - set_load_weight(p); + set_load_weight(p, false); /* * We don't need the reset flag anymore after the fork. It has @@ -3805,7 +3814,7 @@ void set_user_nice(struct task_struct *p, long nice) put_prev_task(rq, p); p->static_prio = NICE_TO_PRIO(nice); - set_load_weight(p); + set_load_weight(p, true); old_prio = p->prio; p->prio = effective_prio(p); delta = p->prio - old_prio; @@ -3962,7 +3971,7 @@ static void __setscheduler_params(struct task_struct *p, */ p->rt_priority = attr->sched_priority; p->normal_prio = normal_prio(p); - set_load_weight(p); + set_load_weight(p, true); } /* Actually do priority change: must hold pi & rq lock. */ @@ -4842,6 +4851,7 @@ int __sched _cond_resched(void) preempt_schedule_common(); return 1; } + rcu_all_qs(); return 0; } EXPORT_SYMBOL(_cond_resched); @@ -5098,13 +5108,11 @@ SYSCALL_DEFINE1(sched_get_priority_min, int, policy) * Return: On success, 0 and the timeslice is in @interval. Otherwise, * an error code. */ -SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, - struct timespec __user *, interval) +static int sched_rr_get_interval(pid_t pid, struct timespec64 *t) { struct task_struct *p; unsigned int time_slice; struct rq_flags rf; - struct timespec t; struct rq *rq; int retval; @@ -5128,15 +5136,40 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, task_rq_unlock(rq, p, &rf); rcu_read_unlock(); - jiffies_to_timespec(time_slice, &t); - retval = copy_to_user(interval, &t, sizeof(t)) ? 
-EFAULT : 0; - return retval; + jiffies_to_timespec64(time_slice, t); + return 0; out_unlock: rcu_read_unlock(); return retval; } +SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, + struct timespec __user *, interval) +{ + struct timespec64 t; + int retval = sched_rr_get_interval(pid, &t); + + if (retval == 0) + retval = put_timespec64(&t, interval); + + return retval; +} + +#ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE2(sched_rr_get_interval, + compat_pid_t, pid, + struct compat_timespec __user *, interval) +{ + struct timespec64 t; + int retval = sched_rr_get_interval(pid, &t); + + if (retval == 0) + retval = compat_put_timespec64(&t, interval); + return retval; +} +#endif + void sched_show_task(struct task_struct *p) { unsigned long free = 0; @@ -5165,6 +5198,7 @@ void sched_show_task(struct task_struct *p) show_stack(p, NULL); put_task_stack(p); } +EXPORT_SYMBOL_GPL(sched_show_task); static inline bool state_filter_match(unsigned long state_filter, struct task_struct *p) @@ -5726,10 +5760,6 @@ static inline void sched_init_smt(void) { } void __init sched_init_smp(void) { - cpumask_var_t non_isolated_cpus; - - alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); - sched_init_numa(); /* @@ -5739,16 +5769,12 @@ void __init sched_init_smp(void) */ mutex_lock(&sched_domains_mutex); sched_init_domains(cpu_active_mask); - cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); - if (cpumask_empty(non_isolated_cpus)) - cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); mutex_unlock(&sched_domains_mutex); /* Move init over to a non-isolated CPU */ - if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0) + if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_DOMAIN)) < 0) BUG(); sched_init_granularity(); - free_cpumask_var(non_isolated_cpus); init_sched_rt_class(); init_sched_dl_class(); @@ -5933,7 +5959,7 @@ void __init sched_init(void) atomic_set(&rq->nr_iowait, 0); } - set_load_weight(&init_task); + set_load_weight(&init_task, false); /* * The boot idle thread does lazy MMU switching as well: @@ -5952,9 +5978,6 @@ void __init sched_init(void) calc_load_update = jiffies + LOAD_FREQ; #ifdef CONFIG_SMP - /* May be allocated at isolcpus cmdline parse time */ - if (cpu_isolated_map == NULL) - zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); idle_thread_set_boot_cpu(); set_cpu_rq_start_time(smp_processor_id()); #endif @@ -6621,7 +6644,7 @@ static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota) return ret; } -static int cpu_stats_show(struct seq_file *sf, void *v) +static int cpu_cfs_stat_show(struct seq_file *sf, void *v) { struct task_group *tg = css_tg(seq_css(sf)); struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; @@ -6661,7 +6684,7 @@ static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css, } #endif /* CONFIG_RT_GROUP_SCHED */ -static struct cftype cpu_files[] = { +static struct cftype cpu_legacy_files[] = { #ifdef CONFIG_FAIR_GROUP_SCHED { .name = "shares", @@ -6682,7 +6705,7 @@ static struct cftype cpu_files[] = { }, { .name = "stat", - .seq_show = cpu_stats_show, + .seq_show = cpu_cfs_stat_show, }, #endif #ifdef CONFIG_RT_GROUP_SCHED @@ -6700,16 +6723,182 @@ static struct cftype cpu_files[] = { { } /* Terminate */ }; +static int cpu_extra_stat_show(struct seq_file *sf, + struct cgroup_subsys_state *css) +{ +#ifdef CONFIG_CFS_BANDWIDTH + { + struct task_group *tg = css_tg(css); + struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; + u64 throttled_usec; + + throttled_usec = cfs_b->throttled_time; + do_div(throttled_usec, 
NSEC_PER_USEC); + + seq_printf(sf, "nr_periods %d\n" + "nr_throttled %d\n" + "throttled_usec %llu\n", + cfs_b->nr_periods, cfs_b->nr_throttled, + throttled_usec); + } +#endif + return 0; +} + +#ifdef CONFIG_FAIR_GROUP_SCHED +static u64 cpu_weight_read_u64(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + struct task_group *tg = css_tg(css); + u64 weight = scale_load_down(tg->shares); + + return DIV_ROUND_CLOSEST_ULL(weight * CGROUP_WEIGHT_DFL, 1024); +} + +static int cpu_weight_write_u64(struct cgroup_subsys_state *css, + struct cftype *cft, u64 weight) +{ + /* + * cgroup weight knobs should use the common MIN, DFL and MAX + * values which are 1, 100 and 10000 respectively. While it loses + * a bit of range on both ends, it maps pretty well onto the shares + * value used by scheduler and the round-trip conversions preserve + * the original value over the entire range. + */ + if (weight < CGROUP_WEIGHT_MIN || weight > CGROUP_WEIGHT_MAX) + return -ERANGE; + + weight = DIV_ROUND_CLOSEST_ULL(weight * 1024, CGROUP_WEIGHT_DFL); + + return sched_group_set_shares(css_tg(css), scale_load(weight)); +} + +static s64 cpu_weight_nice_read_s64(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + unsigned long weight = scale_load_down(css_tg(css)->shares); + int last_delta = INT_MAX; + int prio, delta; + + /* find the closest nice value to the current weight */ + for (prio = 0; prio < ARRAY_SIZE(sched_prio_to_weight); prio++) { + delta = abs(sched_prio_to_weight[prio] - weight); + if (delta >= last_delta) + break; + last_delta = delta; + } + + return PRIO_TO_NICE(prio - 1 + MAX_RT_PRIO); +} + +static int cpu_weight_nice_write_s64(struct cgroup_subsys_state *css, + struct cftype *cft, s64 nice) +{ + unsigned long weight; + + if (nice < MIN_NICE || nice > MAX_NICE) + return -ERANGE; + + weight = sched_prio_to_weight[NICE_TO_PRIO(nice) - MAX_RT_PRIO]; + return sched_group_set_shares(css_tg(css), scale_load(weight)); +} +#endif + +static void __maybe_unused cpu_period_quota_print(struct seq_file *sf, + long period, long quota) +{ + if (quota < 0) + seq_puts(sf, "max"); + else + seq_printf(sf, "%ld", quota); + + seq_printf(sf, " %ld\n", period); +} + +/* caller should put the current value in *@periodp before calling */ +static int __maybe_unused cpu_period_quota_parse(char *buf, + u64 *periodp, u64 *quotap) +{ + char tok[21]; /* U64_MAX */ + + if (!sscanf(buf, "%s %llu", tok, periodp)) + return -EINVAL; + + *periodp *= NSEC_PER_USEC; + + if (sscanf(tok, "%llu", quotap)) + *quotap *= NSEC_PER_USEC; + else if (!strcmp(tok, "max")) + *quotap = RUNTIME_INF; + else + return -EINVAL; + + return 0; +} + +#ifdef CONFIG_CFS_BANDWIDTH +static int cpu_max_show(struct seq_file *sf, void *v) +{ + struct task_group *tg = css_tg(seq_css(sf)); + + cpu_period_quota_print(sf, tg_get_cfs_period(tg), tg_get_cfs_quota(tg)); + return 0; +} + +static ssize_t cpu_max_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct task_group *tg = css_tg(of_css(of)); + u64 period = tg_get_cfs_period(tg); + u64 quota; + int ret; + + ret = cpu_period_quota_parse(buf, &period, "a); + if (!ret) + ret = tg_set_cfs_bandwidth(tg, period, quota); + return ret ?: nbytes; +} +#endif + +static struct cftype cpu_files[] = { +#ifdef CONFIG_FAIR_GROUP_SCHED + { + .name = "weight", + .flags = CFTYPE_NOT_ON_ROOT, + .read_u64 = cpu_weight_read_u64, + .write_u64 = cpu_weight_write_u64, + }, + { + .name = "weight.nice", + .flags = CFTYPE_NOT_ON_ROOT, + .read_s64 = cpu_weight_nice_read_s64, + .write_s64 = 
cpu_weight_nice_write_s64, + }, +#endif +#ifdef CONFIG_CFS_BANDWIDTH + { + .name = "max", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = cpu_max_show, + .write = cpu_max_write, + }, +#endif + { } /* terminate */ +}; + struct cgroup_subsys cpu_cgrp_subsys = { .css_alloc = cpu_cgroup_css_alloc, .css_online = cpu_cgroup_css_online, .css_released = cpu_cgroup_css_released, .css_free = cpu_cgroup_css_free, + .css_extra_stat_show = cpu_extra_stat_show, .fork = cpu_cgroup_fork, .can_attach = cpu_cgroup_can_attach, .attach = cpu_cgroup_attach, - .legacy_cftypes = cpu_files, + .legacy_cftypes = cpu_legacy_files, + .dfl_cftypes = cpu_files, .early_init = true, + .threaded = true, }; #endif /* CONFIG_CGROUP_SCHED */ diff --git a/kernel/sched/cpuacct.h b/kernel/sched/cpuacct.h deleted file mode 100644 index a8358a57a316..000000000000 --- a/kernel/sched/cpuacct.h +++ /dev/null @@ -1,18 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifdef CONFIG_CGROUP_CPUACCT - -extern void cpuacct_charge(struct task_struct *tsk, u64 cputime); -extern void cpuacct_account_field(struct task_struct *tsk, int index, u64 val); - -#else - -static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) -{ -} - -static inline void -cpuacct_account_field(struct task_struct *tsk, int index, u64 val) -{ -} - -#endif diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 9209d83ecdcf..2f52ec0f1539 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -282,8 +282,12 @@ static void sugov_update_single(struct update_util_data *hook, u64 time, * Do not reduce the frequency if the CPU has not been idle * recently, as the reduction is likely to be premature then. */ - if (busy && next_f < sg_policy->next_freq) + if (busy && next_f < sg_policy->next_freq) { next_f = sg_policy->next_freq; + + /* Reset cached freq as next_freq has changed */ + sg_policy->cached_raw_freq = 0; + } } sugov_update_commit(sg_policy, time, next_f); } @@ -649,6 +653,7 @@ static int sugov_start(struct cpufreq_policy *policy) struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu); memset(sg_cpu, 0, sizeof(*sg_cpu)); + sg_cpu->cpu = cpu; sg_cpu->sg_policy = sg_policy; sg_cpu->flags = SCHED_CPUFREQ_RT; sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq; @@ -714,11 +719,6 @@ struct cpufreq_governor *cpufreq_default_governor(void) static int __init sugov_register(void) { - int cpu; - - for_each_possible_cpu(cpu) - per_cpu(sugov_cpu, cpu).cpu = cpu; - return cpufreq_register_governor(&schedutil_gov); } fs_initcall(sugov_register); diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 14d2dbf97c53..bac6ac9a4ec7 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -109,7 +109,7 @@ static inline void task_group_account_field(struct task_struct *p, int index, */ __this_cpu_add(kernel_cpustat.cpustat[index], tmp); - cpuacct_account_field(p, index, tmp); + cgroup_account_cputime_field(p, index, tmp); } /* @@ -259,8 +259,7 @@ static inline u64 account_other_time(u64 max) { u64 accounted; - /* Shall be converted to a lockdep-enabled lightweight check */ - WARN_ON_ONCE(!irqs_disabled()); + lockdep_assert_irqs_disabled(); accounted = steal_account_process_time(max); @@ -447,6 +446,13 @@ void vtime_account_irq_enter(struct task_struct *tsk) EXPORT_SYMBOL_GPL(vtime_account_irq_enter); #endif /* __ARCH_HAS_VTIME_ACCOUNT */ +void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev, + u64 *ut, u64 *st) +{ + *ut = curr->utime; + *st = curr->stime; +} + void 
task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st) { *ut = p->utime; @@ -585,9 +591,8 @@ drop_precision: * * Assuming that rtime_i+1 >= rtime_i. */ -static void cputime_adjust(struct task_cputime *curr, - struct prev_cputime *prev, - u64 *ut, u64 *st) +void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev, + u64 *ut, u64 *st) { u64 rtime, stime, utime; unsigned long flags; diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 4ae5c1ea90e2..2473736c7616 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -243,7 +243,7 @@ static void task_non_contending(struct task_struct *p) if (p->state == TASK_DEAD) sub_rq_bw(p->dl.dl_bw, &rq->dl); raw_spin_lock(&dl_b->lock); - __dl_clear(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p))); + __dl_sub(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p))); __dl_clear_params(p); raw_spin_unlock(&dl_b->lock); } @@ -1144,7 +1144,7 @@ static void update_curr_dl(struct rq *rq) account_group_exec_runtime(curr, delta_exec); curr->se.exec_start = rq_clock_task(rq); - cpuacct_charge(curr, delta_exec); + cgroup_account_cputime(curr, delta_exec); sched_rt_avg_update(rq, delta_exec); @@ -1210,7 +1210,7 @@ static enum hrtimer_restart inactive_task_timer(struct hrtimer *timer) } raw_spin_lock(&dl_b->lock); - __dl_clear(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p))); + __dl_sub(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p))); raw_spin_unlock(&dl_b->lock); __dl_clear_params(p); @@ -1365,6 +1365,10 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se, update_dl_entity(dl_se, pi_se); } else if (flags & ENQUEUE_REPLENISH) { replenish_dl_entity(dl_se, pi_se); + } else if ((flags & ENQUEUE_RESTORE) && + dl_time_before(dl_se->deadline, + rq_clock(rq_of_dl_rq(dl_rq_of_se(dl_se))))) { + setup_new_dl_entity(dl_se); } __enqueue_dl_entity(dl_se); @@ -2167,7 +2171,7 @@ static void set_cpus_allowed_dl(struct task_struct *p, * until we complete the update. */ raw_spin_lock(&src_dl_b->lock); - __dl_clear(src_dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p))); + __dl_sub(src_dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p))); raw_spin_unlock(&src_dl_b->lock); } @@ -2256,13 +2260,6 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p) return; } - /* - * If p is boosted we already updated its params in - * rt_mutex_setprio()->enqueue_task(..., ENQUEUE_REPLENISH), - * p's deadline being now already after rq_clock(rq). - */ - if (dl_time_before(p->dl.deadline, rq_clock(rq))) - setup_new_dl_entity(&p->dl); if (rq->curr != p) { #ifdef CONFIG_SMP @@ -2452,7 +2449,7 @@ int sched_dl_overflow(struct task_struct *p, int policy, if (dl_policy(policy) && !task_has_dl_policy(p) && !__dl_overflow(dl_b, cpus, 0, new_bw)) { if (hrtimer_active(&p->dl.inactive_timer)) - __dl_clear(dl_b, p->dl.dl_bw, cpus); + __dl_sub(dl_b, p->dl.dl_bw, cpus); __dl_add(dl_b, new_bw, cpus); err = 0; } else if (dl_policy(policy) && task_has_dl_policy(p) && @@ -2464,7 +2461,7 @@ int sched_dl_overflow(struct task_struct *p, int policy, * But this would require to set the task's "inactive * timer" when the task is not inactive. 
*/ - __dl_clear(dl_b, p->dl.dl_bw, cpus); + __dl_sub(dl_b, p->dl.dl_bw, cpus); __dl_add(dl_b, new_bw, cpus); dl_change_utilization(p, new_bw); err = 0; diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 2f93e4a2d9f6..1ca0130ed4f9 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -441,9 +441,11 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group P_SCHEDSTAT(se->statistics.wait_count); } P(se->load.weight); + P(se->runnable_weight); #ifdef CONFIG_SMP P(se->avg.load_avg); P(se->avg.util_avg); + P(se->avg.runnable_load_avg); #endif #undef PN_SCHEDSTAT @@ -558,16 +560,19 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running); SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); #ifdef CONFIG_SMP + SEQ_printf(m, " .%-30s: %ld\n", "runnable_weight", cfs_rq->runnable_weight); SEQ_printf(m, " .%-30s: %lu\n", "load_avg", cfs_rq->avg.load_avg); SEQ_printf(m, " .%-30s: %lu\n", "runnable_load_avg", - cfs_rq->runnable_load_avg); + cfs_rq->avg.runnable_load_avg); SEQ_printf(m, " .%-30s: %lu\n", "util_avg", cfs_rq->avg.util_avg); - SEQ_printf(m, " .%-30s: %ld\n", "removed_load_avg", - atomic_long_read(&cfs_rq->removed_load_avg)); - SEQ_printf(m, " .%-30s: %ld\n", "removed_util_avg", - atomic_long_read(&cfs_rq->removed_util_avg)); + SEQ_printf(m, " .%-30s: %ld\n", "removed.load_avg", + cfs_rq->removed.load_avg); + SEQ_printf(m, " .%-30s: %ld\n", "removed.util_avg", + cfs_rq->removed.util_avg); + SEQ_printf(m, " .%-30s: %ld\n", "removed.runnable_sum", + cfs_rq->removed.runnable_sum); #ifdef CONFIG_FAIR_GROUP_SCHED SEQ_printf(m, " .%-30s: %lu\n", "tg_load_avg_contrib", cfs_rq->tg_load_avg_contrib); @@ -1004,10 +1009,13 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, "nr_involuntary_switches", (long long)p->nivcsw); P(se.load.weight); + P(se.runnable_weight); #ifdef CONFIG_SMP P(se.avg.load_sum); + P(se.avg.runnable_load_sum); P(se.avg.util_sum); P(se.avg.load_avg); + P(se.avg.runnable_load_avg); P(se.avg.util_avg); P(se.avg.last_update_time); #endif diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 5c09ddf8c832..4037e19bbca2 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -33,6 +33,7 @@ #include <linux/mempolicy.h> #include <linux/migrate.h> #include <linux/task_work.h> +#include <linux/sched/isolation.h> #include <trace/events/sched.h> @@ -717,13 +718,8 @@ void init_entity_runnable_average(struct sched_entity *se) { struct sched_avg *sa = &se->avg; - sa->last_update_time = 0; - /* - * sched_avg's period_contrib should be strictly less then 1024, so - * we give it 1023 to make sure it is almost a period (1024us), and - * will definitely be update (after enqueue). - */ - sa->period_contrib = 1023; + memset(sa, 0, sizeof(*sa)); + /* * Tasks are intialized with full load to be seen as heavy tasks until * they get a chance to stabilize to their real load level. @@ -731,13 +727,10 @@ void init_entity_runnable_average(struct sched_entity *se) * nothing has been attached to the task group yet. 
*/ if (entity_is_task(se)) - sa->load_avg = scale_load_down(se->load.weight); - sa->load_sum = sa->load_avg * LOAD_AVG_MAX; - /* - * At this point, util_avg won't be used in select_task_rq_fair anyway - */ - sa->util_avg = 0; - sa->util_sum = 0; + sa->runnable_load_avg = sa->load_avg = scale_load_down(se->load.weight); + + se->runnable_weight = se->load.weight; + /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */ } @@ -785,7 +778,6 @@ void post_init_entity_util_avg(struct sched_entity *se) } else { sa->util_avg = cap; } - sa->util_sum = sa->util_avg * LOAD_AVG_MAX; } if (entity_is_task(se)) { @@ -852,7 +844,7 @@ static void update_curr(struct cfs_rq *cfs_rq) struct task_struct *curtask = task_of(curr); trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime); - cpuacct_charge(curtask, delta_exec); + cgroup_account_cputime(curtask, delta_exec); account_group_exec_runtime(curtask, delta_exec); } @@ -2026,7 +2018,7 @@ static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period) delta = runtime - p->last_sum_exec_runtime; *period = now - p->last_task_numa_placement; } else { - delta = p->se.avg.load_sum / p->se.load.weight; + delta = p->se.avg.load_sum; *period = LOAD_AVG_MAX; } @@ -2693,18 +2685,226 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) cfs_rq->nr_running--; } +/* + * Signed add and clamp on underflow. + * + * Explicitly do a load-store to ensure the intermediate value never hits + * memory. This allows lockless observations without ever seeing the negative + * values. + */ +#define add_positive(_ptr, _val) do { \ + typeof(_ptr) ptr = (_ptr); \ + typeof(_val) val = (_val); \ + typeof(*ptr) res, var = READ_ONCE(*ptr); \ + \ + res = var + val; \ + \ + if (val < 0 && res > var) \ + res = 0; \ + \ + WRITE_ONCE(*ptr, res); \ +} while (0) + +/* + * Unsigned subtract and clamp on underflow. + * + * Explicitly do a load-store to ensure the intermediate value never hits + * memory. This allows lockless observations without ever seeing the negative + * values. + */ +#define sub_positive(_ptr, _val) do { \ + typeof(_ptr) ptr = (_ptr); \ + typeof(*ptr) val = (_val); \ + typeof(*ptr) res, var = READ_ONCE(*ptr); \ + res = var - val; \ + if (res > var) \ + res = 0; \ + WRITE_ONCE(*ptr, res); \ +} while (0) + +#ifdef CONFIG_SMP +/* + * XXX we want to get rid of these helpers and use the full load resolution. 
+ */ +static inline long se_weight(struct sched_entity *se) +{ + return scale_load_down(se->load.weight); +} + +static inline long se_runnable(struct sched_entity *se) +{ + return scale_load_down(se->runnable_weight); +} + +static inline void +enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + cfs_rq->runnable_weight += se->runnable_weight; + + cfs_rq->avg.runnable_load_avg += se->avg.runnable_load_avg; + cfs_rq->avg.runnable_load_sum += se_runnable(se) * se->avg.runnable_load_sum; +} + +static inline void +dequeue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + cfs_rq->runnable_weight -= se->runnable_weight; + + sub_positive(&cfs_rq->avg.runnable_load_avg, se->avg.runnable_load_avg); + sub_positive(&cfs_rq->avg.runnable_load_sum, + se_runnable(se) * se->avg.runnable_load_sum); +} + +static inline void +enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + cfs_rq->avg.load_avg += se->avg.load_avg; + cfs_rq->avg.load_sum += se_weight(se) * se->avg.load_sum; +} + +static inline void +dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg); + sub_positive(&cfs_rq->avg.load_sum, se_weight(se) * se->avg.load_sum); +} +#else +static inline void +enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { } +static inline void +dequeue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { } +static inline void +enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { } +static inline void +dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { } +#endif + +static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, + unsigned long weight, unsigned long runnable) +{ + if (se->on_rq) { + /* commit outstanding execution time */ + if (cfs_rq->curr == se) + update_curr(cfs_rq); + account_entity_dequeue(cfs_rq, se); + dequeue_runnable_load_avg(cfs_rq, se); + } + dequeue_load_avg(cfs_rq, se); + + se->runnable_weight = runnable; + update_load_set(&se->load, weight |
