48 files changed, 1898 insertions, 785 deletions
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 36d6ce7cc886..b93aaa374266 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -3903,6 +3903,13 @@
 			Format: {"off"}
 			Disable Hardware Transactional Memory
 
+	preempt=	[KNL]
+			Select preemption mode if you have CONFIG_PREEMPT_DYNAMIC
+			none - Limited to cond_resched() calls
+			voluntary - Limited to cond_resched() and might_sleep() calls
+			full - Any section that isn't explicitly preempt disabled
+			       can be preempted anytime.
+
 	print-fatal-signals=
 			[KNL] debug: print fatal signals
 
diff --git a/Documentation/scheduler/schedutil.txt b/Documentation/scheduler/schedutil.txt
new file mode 100644
index 000000000000..78f6b91e2291
--- /dev/null
+++ b/Documentation/scheduler/schedutil.txt
@@ -0,0 +1,169 @@
+
+
+NOTE: all this assumes a linear relation between frequency and work capacity;
+we know this is flawed, but it is the best workable approximation.
+
+
+PELT (Per Entity Load Tracking)
+-------------------------------
+
+With PELT we track some metrics across the various scheduler entities, from
+individual tasks to task-group slices to CPU runqueues. As the basis for this
+we use an Exponentially Weighted Moving Average (EWMA), each period (1024us)
+is decayed such that y^32 = 0.5. That is, the most recent 32ms contribute
+half, while the rest of history contributes the other half.
+
+Specifically:
+
+  ewma_sum(u) := u_0 + u_1*y + u_2*y^2 + ...
+
+  ewma(u) = ewma_sum(u) / ewma_sum(1)
+
+Since this is essentially a progression of an infinite geometric series, the
+results are composable, that is ewma(A) + ewma(B) = ewma(A+B). This property
+is key, since it gives the ability to recompose the averages when tasks move
+around.
+
+Note that blocked tasks still contribute to the aggregates (task-group slices
+and CPU runqueues), which reflects their expected contribution when they
+resume running.
+
+Using this we track 2 key metrics: 'running' and 'runnable'. 'Running'
+reflects the time an entity spends on the CPU, while 'runnable' reflects the
+time an entity spends on the runqueue. When there is only a single task these
+two metrics are the same, but once there is contention for the CPU 'running'
+will decrease to reflect the fraction of time each task spends on the CPU,
+while 'runnable' will increase to reflect the amount of contention.
+
+For more detail see: kernel/sched/pelt.c
+
+
+Frequency- / CPU Invariance
+---------------------------
+
+Because consuming the CPU for 50% at 1GHz is not the same as consuming the CPU
+for 50% at 2GHz, nor is running 50% on a LITTLE CPU the same as running 50% on
+a big CPU, we allow architectures to scale the time delta with two ratios, one
+Dynamic Voltage and Frequency Scaling (DVFS) ratio and one microarch ratio.
+
+For simple DVFS architectures (where software is in full control) we trivially
+compute the ratio as:
+
+            f_cur
+  r_dvfs := -----
+            f_max
+
+For more dynamic systems where the hardware is in control of DVFS we use
+hardware counters (Intel APERF/MPERF, ARMv8.4-AMU) to provide us this ratio.
+For Intel specifically, we use:
+
+           APERF
+  f_cur := ----- * P0
+           MPERF
+
+             4C-turbo;	if available and turbo enabled
+  f_max := { 1C-turbo;	if turbo enabled
+             P0;	otherwise
+
+                    f_cur
+  r_dvfs := min( 1, ----- )
+                    f_max
+
+We pick 4C turbo over 1C turbo to make it slightly more sustainable.
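As a rough illustration of the APERF/MPERF ratio above (a sketch only, not part of this patch; the real code in arch/x86/kernel/smpboot.c additionally handles turbo detection, counter overflow and per-CPU state), r_dvfs could be computed along these lines:

/*
 * Illustrative sketch, assuming a SCHED_CAPACITY_SCALE fixed-point result
 * and ignoring counter wrap:  r_dvfs = min(1, f_cur / f_max), with
 * f_cur = (dAPERF / dMPERF) * P0.
 */
#include <linux/math64.h>
#include <linux/sched.h>

static u64 prev_aperf, prev_mperf;

static unsigned long r_dvfs(u64 aperf, u64 mperf, u64 base_freq, u64 max_freq)
{
	u64 acnt = aperf - prev_aperf;	/* cycles at the current frequency */
	u64 mcnt = mperf - prev_mperf;	/* cycles at the base (P0) frequency */
	u64 ratio;

	prev_aperf = aperf;
	prev_mperf = mperf;

	if (!mcnt || !max_freq)
		return SCHED_CAPACITY_SCALE;

	/* f_cur / f_max == (acnt / mcnt) * (P0 / f_max) */
	ratio = div64_u64(acnt * SCHED_CAPACITY_SCALE * base_freq,
			  mcnt * max_freq);

	return min_t(u64, ratio, SCHED_CAPACITY_SCALE);
}

The sketch only shows the arithmetic; in the kernel the deltas are kept per CPU and the result feeds the frequency-invariance scale factor used by PELT.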
+
+r_cpu is determined as the ratio of the highest performance level of the
+current CPU vs the highest performance level of any other CPU in the system.
+
+  r_tot = r_dvfs * r_cpu
+
+The result is that the above 'running' and 'runnable' metrics become invariant
+of DVFS and CPU type. IOW, we can transfer and compare them between CPUs.
+
+For more detail see:
+
+  - kernel/sched/pelt.h:update_rq_clock_pelt()
+  - arch/x86/kernel/smpboot.c:"APERF/MPERF frequency ratio computation."
+  - Documentation/scheduler/sched-capacity.rst:"1. CPU Capacity + 2. Task utilization"
+
+
+UTIL_EST / UTIL_EST_FASTUP
+--------------------------
+
+Because periodic tasks have their averages decayed while they sleep, even
+though when running their expected utilization will be the same, they suffer a
+(DVFS) ramp-up once they are running again.
+
+To alleviate this (a default enabled option) UTIL_EST drives an Infinite
+Impulse Response (IIR) EWMA with the 'running' value on dequeue -- when it is
+highest. A further default enabled option UTIL_EST_FASTUP modifies the IIR
+filter to instantly increase and only decay on decrease.
+
+A further runqueue-wide sum (of runnable tasks) is maintained of:
+
+  util_est := \Sum_t max( t_running, t_util_est_ewma )
+
+For more detail see: kernel/sched/fair.c:util_est_dequeue()
+
+
+UCLAMP
+------
+
+It is possible to set effective u_min and u_max clamps on each CFS or RT task;
+the runqueue keeps a max aggregate of these clamps for all running tasks.
+
+For more detail see: include/uapi/linux/sched/types.h
+
+
+Schedutil / DVFS
+----------------
+
+Every time the scheduler load tracking is updated (task wakeup, task
+migration, time progression) we call out to schedutil to update the hardware
+DVFS state.
+
+The basis is the CPU runqueue's 'running' metric, which per the above is
+the frequency-invariant utilization estimate of the CPU. From this we compute
+a desired frequency like:
+
+             max( running, util_est );	if UTIL_EST
+  u_cfs := { running;			otherwise
+
+               clamp( u_cfs + u_rt , u_min, u_max );	if UCLAMP_TASK
+  u_clamp := { u_cfs + u_rt;				otherwise
+
+  u := u_clamp + u_irq + u_dl;		[approx. see source for more detail]
+
+  f_des := min( f_max, 1.25 u * f_max )
+
+XXX IO-wait; when the update is due to a task wakeup from IO-completion we
+boost 'u' above.
+
+This frequency is then used to select a P-state/OPP or directly munged into a
+CPPC style request to the hardware.
+
+XXX: deadline tasks (Sporadic Task Model) allow us to calculate a hard f_min
+required to satisfy the workload.
+
+Because these callbacks are directly from the scheduler, the DVFS hardware
+interaction should be 'fast' and non-blocking. Schedutil supports
+rate-limiting DVFS requests for when hardware interaction is slow and
+expensive; this reduces effectiveness.
+
+For more information see: kernel/sched/cpufreq_schedutil.c
+
+
+NOTES
+-----
+
+ - In low-load scenarios, where DVFS is most relevant, the 'running' numbers
+   will closely reflect utilization.
+
+ - In saturated scenarios task movement will cause some transient dips;
+   suppose we have a CPU saturated with 4 tasks, then when we migrate a task
+   to an idle CPU, the old CPU will have a 'running' value of 0.75 while the
+   new CPU will gain 0.25. This is inevitable and time progression will
+   correct this. XXX do we still guarantee f_max due to no idle-time?
+
+ - Much of the above is about avoiding DVFS dips, and independent DVFS domains
+   having to re-learn / ramp-up when load shifts.
+
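To make the frequency-selection formula above concrete, here is a minimal sketch (not from this patch) of the final step, f_des := min(f_max, 1.25 u * f_max), in the fixed-point style schedutil uses; the full implementation, including the UCLAMP, RT/DL/IRQ and rate-limiting details, is kernel/sched/cpufreq_schedutil.c:

/*
 * Minimal sketch, assuming 'running', 'util_est' and 'max_cap' are in the
 * same capacity units (0..SCHED_CAPACITY_SCALE) and UTIL_EST is enabled.
 */
static unsigned long desired_freq(unsigned long running, unsigned long util_est,
				  unsigned long max_cap, unsigned long max_freq)
{
	/* u_cfs := max( running, util_est ) */
	unsigned long util = max(running, util_est);

	/* f_des := min( f_max, 1.25 * u * f_max );  1.25 == 1 + 1/4 */
	unsigned long freq = (max_freq + (max_freq >> 2)) * util / max_cap;

	return min(freq, max_freq);
}

The 25% headroom is what lets a CPU that is already fully utilized at the chosen frequency request the next-higher operating point instead of saturating.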
diff --git a/arch/Kconfig b/arch/Kconfig
index 87608c2fa027..4790a5f23d9f 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -1058,6 +1058,15 @@ config HAVE_STATIC_CALL_INLINE
 	bool
 	depends on HAVE_STATIC_CALL
 
+config HAVE_PREEMPT_DYNAMIC
+	bool
+	depends on HAVE_STATIC_CALL
+	depends on GENERIC_ENTRY
+	help
+	   Select this if the architecture supports boot-time preempt setting
+	   on top of static calls. It is strongly advised to support inline
+	   static calls to avoid any overhead.
+
 config ARCH_WANT_LD_ORPHAN_WARN
 	bool
 	help
diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c
index 9d06fffb1526..369206489895 100644
--- a/arch/powerpc/platforms/cell/spufs/sched.c
+++ b/arch/powerpc/platforms/cell/spufs/sched.c
@@ -72,7 +72,7 @@ static struct timer_list spuloadavg_timer;
 #define DEF_SPU_TIMESLICE	(100 * HZ / (1000 * SPUSCHED_TICK))
 
 #define SCALE_PRIO(x, prio) \
-	max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_SPU_TIMESLICE)
+	max(x * (MAX_PRIO - prio) / (NICE_WIDTH / 2), MIN_SPU_TIMESLICE)
 
 /*
  * scale user-nice values [ -20 ... 0 ... 19 ] to time slice values:
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 7b934a591df2..595193bc2d31 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -224,6 +224,7 @@ config X86
 	select HAVE_STACK_VALIDATION		if X86_64
 	select HAVE_STATIC_CALL
 	select HAVE_STATIC_CALL_INLINE		if HAVE_STACK_VALIDATION
+	select HAVE_PREEMPT_DYNAMIC
 	select HAVE_RSEQ
 	select HAVE_SYSCALL_TRACEPOINTS
 	select HAVE_UNSTABLE_SCHED_CLOCK
diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h
index 69485ca13665..f8cb8af4de5c 100644
--- a/arch/x86/include/asm/preempt.h
+++ b/arch/x86/include/asm/preempt.h
@@ -5,6 +5,7 @@
 #include <asm/rmwcc.h>
 #include <asm/percpu.h>
 #include <linux/thread_info.h>
+#include <linux/static_call_types.h>
 
 DECLARE_PER_CPU(int, __preempt_count);
 
@@ -103,16 +104,45 @@ static __always_inline bool should_resched(int preempt_offset)
 }
 
 #ifdef CONFIG_PREEMPTION
-  extern asmlinkage void preempt_schedule_thunk(void);
-# define __preempt_schedule() \
-	asm volatile ("call preempt_schedule_thunk" : ASM_CALL_CONSTRAINT)
-
-  extern asmlinkage void preempt_schedule(void);
-
-  extern asmlinkage void preempt_schedule_notrace_thunk(void);
-# define __preempt_schedule_notrace() \
-	asm volatile ("call preempt_schedule_notrace_thunk" : ASM_CALL_CONSTRAINT)
-
-  extern asmlinkage void preempt_schedule_notrace(void);
-#endif
+extern asmlinkage void preempt_schedule(void);
+extern asmlinkage void preempt_schedule_thunk(void);
+
+#define __preempt_schedule_func preempt_schedule_thunk
+
+extern asmlinkage void preempt_schedule_notrace(void);
+extern asmlinkage void preempt_schedule_notrace_thunk(void);
+
+#define __preempt_schedule_notrace_func preempt_schedule_notrace_thunk
+
+#ifdef CONFIG_PREEMPT_DYNAMIC
+
+DECLARE_STATIC_CALL(preempt_schedule, __preempt_schedule_func);
+
+#define __preempt_schedule() \
+do { \
+	__STATIC_CALL_MOD_ADDRESSABLE(preempt_schedule); \
+	asm volatile ("call " STATIC_CALL_TRAMP_STR(preempt_schedule) : ASM_CALL_CONSTRAINT); \
+} while (0)
+
+DECLARE_STATIC_CALL(preempt_schedule_notrace, __preempt_schedule_notrace_func);
+
+#define __preempt_schedule_notrace() \
+do { \
+	__STATIC_CALL_MOD_ADDRESSABLE(preempt_schedule_notrace); \
+	asm volatile ("call " STATIC_CALL_TRAMP_STR(preempt_schedule_notrace) : ASM_CALL_CONSTRAINT); \
+} while (0)
+
+#else /* PREEMPT_DYNAMIC */
+
+#define __preempt_schedule() \
+	asm volatile ("call preempt_schedule_thunk" : ASM_CALL_CONSTRAINT);
+
+#define __preempt_schedule_notrace() \
+	asm volatile ("call preempt_schedule_notrace_thunk" : ASM_CALL_CONSTRAINT);
+
+#endif /* PREEMPT_DYNAMIC */
+
+#endif /* PREEMPTION */
 
 #endif /* __ASM_PREEMPT_H */
diff --git a/arch/x86/include/asm/static_call.h b/arch/x86/include/asm/static_call.h
index c37f11999d0c..cbb67b6030f9 100644
--- a/arch/x86/include/asm/static_call.h
+++ b/arch/x86/include/asm/static_call.h
@@ -37,4 +37,11 @@
 #define ARCH_DEFINE_STATIC_CALL_NULL_TRAMP(name) \
 	__ARCH_DEFINE_STATIC_CALL_TRAMP(name, "ret; nop; nop; nop; nop")
 
+
+#define ARCH_ADD_TRAMP_KEY(name) \
+	asm(".pushsection .static_call_tramp_key, \"a\" \n" \
+	    ".long " STATIC_CALL_TRAMP_STR(name) " - . \n" \
+	    ".long " STATIC_CALL_KEY_STR(name) " - . \n" \
+	    ".popsection \n")
+
 #endif /* _ASM_STATIC_CALL_H */
diff --git a/arch/x86/kernel/static_call.c b/arch/x86/kernel/static_call.c
index ca9a380d9c0b..9442c4136c38 100644
--- a/arch/x86/kernel/static_call.c
+++ b/arch/x86/kernel/static_call.c
@@ -11,14 +11,26 @@ enum insn_type {
 	RET = 3,  /* tramp / site cond-tail-call */
 };
 
+/*
+ * data16 data16 xorq %rax, %rax - a single 5 byte instruction that clears %rax
+ * The REX.W cancels the effect of any data16.
+ */
+static const u8 xor5rax[] = { 0x66, 0x66, 0x48, 0x31, 0xc0 };
+
 static void __ref __static_call_transform(void *insn, enum insn_type type, void *func)
 {
+	const void *emulate = NULL;
 	int size = CALL_INSN_SIZE;
 	const void *code;
 
 	switch (type) {
 	case CALL:
 		code = text_gen_insn(CALL_INSN_OPCODE, insn, func);
+		if (func == &__static_call_return0) {
+			emulate = code;
+			code = &xor5rax;
+		}
+
 		break;
 
 	case NOP:
@@ -41,7 +53,7 @@ static void __ref __static_call_transform(void *insn, enum insn_type type, void
 	if (unlikely(system_state == SYSTEM_BOOTING))
 		return text_poke_early(insn, code, size);
 
-	text_poke_bp(insn, code, size, NULL);
+	text_poke_bp(insn, code, size, emulate);
 }
 
 static void __static_call_validate(void *insn, bool tail)
@@ -54,7 +66,8 @@ static void __static_call_validate(void *insn, bool tail)
 			return;
 	} else {
 		if (opcode == CALL_INSN_OPCODE ||
-		    !memcmp(insn, ideal_nops[NOP_ATOMIC5], 5))
+		    !memcmp(insn, ideal_nops[NOP_ATOMIC5], 5) ||
+		    !memcmp(insn, xor5rax, 5))
 			return;
 	}
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 1b404e4d7dd8..b967c1c774a1 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1782,6 +1782,7 @@ EXPORT_SYMBOL_GPL(kvm_emulate_wrmsr);
 
 bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu)
 {
+	xfer_to_guest_mode_prepare();
 	return vcpu->mode == EXITING_GUEST_MODE || kvm_request_pending(vcpu) ||
 		xfer_to_guest_mode_work_pending();
 }
diff --git a/drivers/thermal/cpufreq_cooling.c b/drivers/thermal/cpufreq_cooling.c
index 612f063c1cfc..f5af2571f9b7 100644
--- a/drivers/thermal/cpufreq_cooling.c
+++ b/drivers/thermal/cpufreq_cooling.c
@@ -76,7 +76,9 @@ struct cpufreq_cooling_device {
 	struct em_perf_domain *em;
 	struct cpufreq_policy *policy;
 	struct list_head node;
+#ifndef CONFIG_SMP
 	struct time_in_idle *idle_time;
+#endif
 	struct freq_qos_request qos_req;
 };
 
@@ -132,14 +134,25 @@ static u32 cpu_power_to_freq(struct cpufreq_cooling_device *cpufreq_cdev,
 }
 
 /**
- * get_load() - get load for a cpu since last updated
- * @cpufreq_cdev:	&struct cpufreq_cooling_device for this cpu
- * @cpu:	cpu number
- * @cpu_idx:	index of the cpu in time_in_idle*
+ * get_load() - get load for a cpu
+ * @cpufreq_cdev:	struct cpufreq_cooling_device for the cpu
+ * @cpu:	cpu number
+ * @cpu_idx:	index of the cpu in time_in_idle array
  *
 * Return: The average load of cpu @cpu in percentage since this
 * function was last called.
 */
+#ifdef CONFIG_SMP
+static u32 get_load(struct cpufreq_cooling_device *cpufreq_cdev, int cpu,
+		    int cpu_idx)
+{
+	unsigned long max = arch_scale_cpu_capacity(cpu);
+	unsigned long util;
+
+	util = sched_cpu_util(cpu, max);
+	return (util * 100) / max;
+}
+#else /* !CONFIG_SMP */
 static u32 get_load(struct cpufreq_cooling_device *cpufreq_cdev, int cpu,
 		    int cpu_idx)
 {
@@ -161,6 +174,7 @@ static u32 get_load(struct cpufreq_cooling_device *cpufreq_cdev, int cpu,
 
 	return load;
 }
+#endif /* CONFIG_SMP */
 
 /**
  * get_dynamic_power() - calculate the dynamic power
@@ -346,6 +360,36 @@ static inline bool em_is_sane(struct cpufreq_cooling_device *cpufreq_cdev,
 }
 #endif /* CONFIG_THERMAL_GOV_POWER_ALLOCATOR */
 
+#ifdef CONFIG_SMP
+static inline int allocate_idle_time(struct cpufreq_cooling_device *cpufreq_cdev)
+{
+	return 0;
+}
+
+static inline void free_idle_time(struct cpufreq_cooling_device *cpufreq_cdev)
+{
+}
+#else
+static int allocate_idle_time(struct cpufreq_cooling_device *cpufreq_cdev)
+{
+	unsigned int num_cpus = cpumask_weight(cpufreq_cdev->policy->related_cpus);
+
+	cpufreq_cdev->idle_time = kcalloc(num_cpus,
+					  sizeof(*cpufreq_cdev->idle_time),
+					  GFP_KERNEL);
+	if (!cpufreq_cdev->idle_time)
+		return -ENOMEM;
+
+	return 0;
+}
+
+static void free_idle_time(struct cpufreq_cooling_device *cpufreq_cdev)
+{
+	kfree(cpufreq_cdev->idle_time);
+	cpufreq_cdev->idle_time = NULL;
+}
+#endif /* CONFIG_SMP */
+
 static unsigned int get_state_freq(struct cpufreq_cooling_device *cpufreq_cdev,
 				   unsigned long state)
 {
@@ -485,7 +529,7 @@ __cpufreq_cooling_register(struct device_node *np,
 	struct thermal_cooling_device *cdev;
 	struct cpufreq_cooling_device *cpufreq_cdev;
 	char dev_name[THERMAL_NAME_LENGTH];
-	unsigned int i, num_cpus;
+	unsigned int i;
 	struct device *dev;
 	int ret;
 	struct thermal_cooling_device_ops *cooling_ops;
@@ -496,7 +540,6 @@ __cpufreq_cooling_register(struct device_node *np,
 		return ERR_PTR(-ENODEV);
 	}
 
-
 	if (IS_ERR_OR_NULL(policy)) {
 		pr_err("%s: cpufreq policy isn't valid: %p\n", __func__, policy);
 		return ERR_PTR(-EINVAL);
@@ -514,12 +557,10 @@ __cpufreq_cooling_register(struct device_node *np,
 		return ERR_PTR(-ENOMEM);
 
 	cpufreq_cdev->policy = policy;
-	num_cpus = cpumask_weight(policy->related_cpus);
-	cpufreq_cdev->idle_time = kcalloc(num_cpus,
-					  sizeof(*cpufreq_cdev->idle_time),
-					  GFP_KERNEL);
-	if (!cpufreq_cdev->idle_time) {
-		cdev = ERR_PTR(-ENOMEM);
+
+	ret = allocate_idle_time(cpufreq_cdev);
+	if (ret) {
+		cdev = ERR_PTR(ret);
 		goto free_cdev;
 	}
 
@@ -579,7 +620,7 @@ remove_qos_req:
 remove_ida:
 	ida_simple_remove(&cpufreq_ida, cpufreq_cdev->id);
 free_idle_time:
-	kfree(cpufreq_cdev->idle_time);
+	free_idle_time(cpufreq_cdev);
 free_cdev:
 	kfree(cpufreq_cdev);
 	return cdev;
@@ -672,7 +713,7 @@ void cpufreq_cooling_unregister(struct thermal_cooling_device *cdev)
 	thermal_cooling_device_unregister(cdev);
 	freq_qos_remove_request(&cpufreq_cdev->qos_req);
 	ida_simple_remove(&cpufreq_ida, cpufreq_cdev->id);
-	kfree(cpufreq_cdev->idle_time);
+	free_idle_time(cpufreq_cdev);
 	kfree(cpufreq_cdev);
 }
 EXPORT_SYMBOL_GPL(cpufreq_cooling_unregister);
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index 52dbd58f6810..a54e08d77789 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -403,7 +403,10 @@
 	. = ALIGN(8);					\
 	__start_static_call_sites = .;			\
 	KEEP(*(.static_call_sites))			\
-	__stop_static_call_sites = .;
+	__stop_static_call_sites = .;			\
+	__start_static_call_tramp_key = .;		\
+	KEEP(*(.static_call_tramp_key))			\
+	__stop_static_call_tramp_key = .;
 
 /*
  * Allow architectures to handle ro_after_init data on their
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 451c2d26a5db..4f2f79de083e 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -307,7 +307,7 @@ void css_task_iter_end(struct css_task_iter *it);
  * Inline functions.
  */
 
-static inline u64 cgroup_id(struct cgroup *cgrp)
+static inline u64 cgroup_id(const struct cgroup *cgrp)
 {
 	return cgrp->kn->id;
 }
@@ -701,7 +701,7 @@ void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen);
 struct cgroup_subsys_state;
 struct cgroup;
 
-static inline u64 cgroup_id(struct cgroup *cgrp) { return 1; }
+static inline u64 cgroup_id(const struct cgroup *cgrp) { return 1; }
 static inline void css_get(struct cgroup_subsys_state *css) {}
 static inline void css_put(struct cgroup_subsys_state *css) {}
 static inline int cgroup_attach_task_all(struct task_struct *from,
diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h
index a104b298019a..883acef895bc 100644
--- a/include/linux/entry-common.h
+++ b/include/linux/entry-common.h
@@ -2,6 +2,7 @@
 #ifndef __LINUX_ENTRYCOMMON_H
 #define __LINUX_ENTRYCOMMON_H
 
+#include <linux/static_call_types.h>
 #include <linux/tracehook.h>
 #include <linux/syscalls.h>
 #include <linux/seccomp.h>
@@ -454,6 +455,9 @@ irqentry_state_t noinstr irqentry_enter(struct pt_regs *regs);
  * Conditional reschedule with additional sanity checks.
  */
 void irqentry_exit_cond_resched(void);
+#ifdef CONFIG_PREEMPT_DYNAMIC
+DECLARE_STATIC_CALL(irqentry_exit_cond_resched, irqentry_exit_cond_resched);
+#endif
 
 /**
  * irqentry_exit - Handle return from exception that used irqentry_enter()
diff --git a/include/linux/entry-kvm.h b/include/linux/entry-kvm.h
index 9b93f8584ff7..8b2b1d68b954 100644
--- a/include/linux/entry-kvm.h
+++ b/include/linux/entry-kvm.h
@@ -47,6 +47,20 @@ static inline int arch_xfer_to_guest_mode_handle_work(struct kvm_vcpu *vcpu,
 int xfer_to_guest_mode_handle_work(struct kvm_vcpu *vcpu);
 
 /**
+ * xfer_to_guest_mode_prepare - Perform last minute preparation work that
+ *				needs to be handled while IRQs are disabled
+ *				upon entering the guest.
+ *
+ * Has to be invoked with interrupts disabled before the last call
+ * to xfer_to_guest_mode_work_pending().
+ */
+static inline void xfer_to_guest_mode_prepare(void)
+{
+	lockdep_assert_irqs_disabled();
+	rcu_nocb_flush_deferred_wakeup();
+}
+
+/**
  * __xfer_to_guest_mode_work_pending - Check if work is pending
  *
  * Returns: True if work pending, False otherwise.
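The DECLARE_STATIC_CALL() uses above (asm/preempt.h, entry-common.h) all build on the generic static_call API. For reference, a minimal sketch of that pattern with hypothetical names (nothing below is from this diff):

#include <linux/static_call.h>

/* 'resched_hook' and both callees are made-up names for illustration. */
static int hook_off(int cpu) { return 0; }
static int hook_on(int cpu)  { return cpu; }

DEFINE_STATIC_CALL(resched_hook, hook_off);

static int run_hook(int cpu)
{
	/* compiles to a direct call to the current target, no indirection */
	return static_call(resched_hook)(cpu);
}

static void set_hook(bool on)
{
	/* live-patches the trampoline (and, with inline static calls, the call sites) */
	static_call_update(resched_hook, on ? hook_on : hook_off);
}

Retargeting a call to __static_call_return0 is the case the static_call.c hunk earlier in this diff optimizes into a 5-byte xor of %rax.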
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index f7902d8c1048..5b7ed6dc99ac 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -15,7 +15,7 @@
 #include <linux/typecheck.h>
 #include <linux/printk.h>
 #include <linux/build_bug.h>
-
+#include <linux/static_call_types.h>
 #include <asm/byteorder.h>
 #include <uapi/linux/kernel.h>
 
@@ -81,11 +81,26 @@ struct pt_regs;
 struct user;
 
 #ifdef CONFIG_PREEMPT_VOLUNTARY
-extern int _cond_resched(void);
-# define might_resched() _cond_resched()
+
+extern int __cond_resched(void);
+# define might_resched() __cond_resched()
+
+#elif defined(CONFIG_PREEMPT_DYNAMIC)
+
+extern int __cond_resched(void);
+
+DECLARE_STATIC_CALL(might_resched, __cond_resched);
+
+static __always_inline void might_resched(void)
+{
+	static_call_mod(might_resched)();
+}
+
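The kernel.h hunk above is what turns might_resched() into a static call under CONFIG_PREEMPT_DYNAMIC. A simplified sketch of how the preempt= boot modes documented in the first hunk of this diff could retarget those calls; the real switch lives in kernel/sched/core.c and also covers preempt_schedule and the irqentry path, so names and the NULL/return-0 details may differ:

/*
 * Simplified sketch only -- not the code from this series. It assumes a
 * 'cond_resched' static call alongside 'might_resched', and uses
 * __static_call_return0 as the "patch the call to a NOP, return 0" target.
 */
static void apply_preempt_mode(const char *mode)
{
	if (!strcmp(mode, "none")) {
		/* only explicit cond_resched() points remain */
		static_call_update(cond_resched, __cond_resched);
		static_call_update(might_resched, (void *)&__static_call_return0);
	} else if (!strcmp(mode, "voluntary")) {
		/* might_sleep()/might_resched() become resched points too */
		static_call_update(cond_resched, __cond_resched);
		static_call_update(might_resched, __cond_resched);
	} else {	/* "full" */
		/* full preemption; the explicit resched points turn into NOPs */
		static_call_update(cond_resched, (void *)&__static_call_return0);
		static_call_update(might_resched, (void *)&__static_call_return0);
	}
}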