diff options
| author | Linus Torvalds <torvalds@linux-foundation.org> | 2016-05-16 14:47:16 -0700 |
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2016-05-16 14:47:16 -0700 |
| commit | 825a3b2605c3aa193e0075d0f9c72e33c17ab16a (patch) | |
| tree | e8665c4cc20076ae53165475839d36b4bc641cd3 | |
| parent | cf6ed9a6682d3f171cf9550d4bbe0ef31b768a7e (diff) | |
| parent | ef0491ea17f8019821c7e9c8e801184ecf17f85a (diff) | |
| download | linux-825a3b2605c3aa193e0075d0f9c72e33c17ab16a.tar.gz linux-825a3b2605c3aa193e0075d0f9c72e33c17ab16a.tar.bz2 linux-825a3b2605c3aa193e0075d0f9c72e33c17ab16a.zip | |
Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler updates from Ingo Molnar:
- massive CPU hotplug rework (Thomas Gleixner)
- improve migration fairness (Peter Zijlstra)
- CPU load calculation updates/cleanups (Yuyang Du)
- cpufreq updates (Steve Muckle)
- nohz optimizations (Frederic Weisbecker)
- switch_mm() micro-optimization on x86 (Andy Lutomirski)
- ... lots of other enhancements, fixes and cleanups.
* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (66 commits)
ARM: Hide finish_arch_post_lock_switch() from modules
sched/core: Provide a tsk_nr_cpus_allowed() helper
sched/core: Use tsk_cpus_allowed() instead of accessing ->cpus_allowed
sched/loadavg: Fix loadavg artifacts on fully idle and on fully loaded systems
sched/fair: Correct unit of load_above_capacity
sched/fair: Clean up scale confusion
sched/nohz: Fix affine unpinned timers mess
sched/fair: Fix fairness issue on migration
sched/core: Kill sched_class::task_waking to clean up the migration logic
sched/fair: Prepare to fix fairness problems on migration
sched/fair: Move record_wakee()
sched/core: Fix comment typo in wake_q_add()
sched/core: Remove unused variable
sched: Make hrtick_notifier an explicit call
sched/fair: Make ilb_notifier an explicit call
sched/hotplug: Make activate() the last hotplug step
sched/hotplug: Move migration CPU_DYING to sched_cpu_dying()
sched/migration: Move CPU_ONLINE into scheduler state
sched/migration: Move calc_load_migrate() into CPU_DYING
sched/migration: Move prepare transition to SCHED_STARTING state
...
31 files changed, 1329 insertions, 927 deletions
diff --git a/Documentation/trace/ftrace.txt b/Documentation/trace/ftrace.txt index f52f297cb406..9857606dd7b7 100644 --- a/Documentation/trace/ftrace.txt +++ b/Documentation/trace/ftrace.txt @@ -1562,12 +1562,12 @@ Doing the same with chrt -r 5 and function-trace set. <idle>-0 3dN.1 12us : menu_hrtimer_cancel <-tick_nohz_idle_exit <idle>-0 3dN.1 12us : ktime_get <-tick_nohz_idle_exit <idle>-0 3dN.1 12us : tick_do_update_jiffies64 <-tick_nohz_idle_exit - <idle>-0 3dN.1 13us : update_cpu_load_nohz <-tick_nohz_idle_exit - <idle>-0 3dN.1 13us : _raw_spin_lock <-update_cpu_load_nohz + <idle>-0 3dN.1 13us : cpu_load_update_nohz <-tick_nohz_idle_exit + <idle>-0 3dN.1 13us : _raw_spin_lock <-cpu_load_update_nohz <idle>-0 3dN.1 13us : add_preempt_count <-_raw_spin_lock - <idle>-0 3dN.2 13us : __update_cpu_load <-update_cpu_load_nohz - <idle>-0 3dN.2 14us : sched_avg_update <-__update_cpu_load - <idle>-0 3dN.2 14us : _raw_spin_unlock <-update_cpu_load_nohz + <idle>-0 3dN.2 13us : __cpu_load_update <-cpu_load_update_nohz + <idle>-0 3dN.2 14us : sched_avg_update <-__cpu_load_update + <idle>-0 3dN.2 14us : _raw_spin_unlock <-cpu_load_update_nohz <idle>-0 3dN.2 14us : sub_preempt_count <-_raw_spin_unlock <idle>-0 3dN.1 15us : calc_load_exit_idle <-tick_nohz_idle_exit <idle>-0 3dN.1 15us : touch_softlockup_watchdog <-tick_nohz_idle_exit diff --git a/arch/arm/include/asm/mmu_context.h b/arch/arm/include/asm/mmu_context.h index fa5b42d44985..3cc14dd8587c 100644 --- a/arch/arm/include/asm/mmu_context.h +++ b/arch/arm/include/asm/mmu_context.h @@ -15,6 +15,7 @@ #include <linux/compiler.h> #include <linux/sched.h> +#include <linux/preempt.h> #include <asm/cacheflush.h> #include <asm/cachetype.h> #include <asm/proc-fns.h> @@ -66,6 +67,7 @@ static inline void check_and_switch_context(struct mm_struct *mm, cpu_switch_mm(mm->pgd, mm); } +#ifndef MODULE #define finish_arch_post_lock_switch \ finish_arch_post_lock_switch static inline void finish_arch_post_lock_switch(void) @@ -87,6 +89,7 @@ static inline void finish_arch_post_lock_switch(void) preempt_enable_no_resched(); } } +#endif /* !MODULE */ #endif /* CONFIG_MMU */ diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index 8cac1eb41466..55c924b65f71 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -565,7 +565,7 @@ int __cpu_up(unsigned int cpu, struct task_struct *tidle) smp_ops->give_timebase(); /* Wait until cpu puts itself in the online & active maps */ - while (!cpu_online(cpu) || !cpu_active(cpu)) + while (!cpu_online(cpu)) cpu_relax(); return 0; diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c index 40a6b4f9c36c..7b89a7572100 100644 --- a/arch/s390/kernel/smp.c +++ b/arch/s390/kernel/smp.c @@ -832,7 +832,7 @@ int __cpu_up(unsigned int cpu, struct task_struct *tidle) pcpu_attach_task(pcpu, tidle); pcpu_start_fn(pcpu, smp_start_secondary, NULL); /* Wait until cpu puts itself in the online & active maps */ - while (!cpu_online(cpu) || !cpu_active(cpu)) + while (!cpu_online(cpu)) cpu_relax(); return 0; } diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 5e5e76a52f58..b7080bef9137 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -2183,7 +2183,7 @@ void arch_perf_update_userpage(struct perf_event *event, * cap_user_time_zero doesn't make sense when we're using a different * time base for the records. */ - if (event->clock == &local_clock) { + if (!event->attr.use_clockid) { userpg->cap_user_time_zero = 1; userpg->time_zero = data->cyc2ns_offset; } diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index 84280029cafd..396348196aa7 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -115,103 +115,12 @@ static inline void destroy_context(struct mm_struct *mm) destroy_context_ldt(mm); } -static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, - struct task_struct *tsk) -{ - unsigned cpu = smp_processor_id(); +extern void switch_mm(struct mm_struct *prev, struct mm_struct *next, + struct task_struct *tsk); - if (likely(prev != next)) { -#ifdef CONFIG_SMP - this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK); - this_cpu_write(cpu_tlbstate.active_mm, next); -#endif - cpumask_set_cpu(cpu, mm_cpumask(next)); - - /* - * Re-load page tables. - * - * This logic has an ordering constraint: - * - * CPU 0: Write to a PTE for 'next' - * CPU 0: load bit 1 in mm_cpumask. if nonzero, send IPI. - * CPU 1: set bit 1 in next's mm_cpumask - * CPU 1: load from the PTE that CPU 0 writes (implicit) - * - * We need to prevent an outcome in which CPU 1 observes - * the new PTE value and CPU 0 observes bit 1 clear in - * mm_cpumask. (If that occurs, then the IPI will never - * be sent, and CPU 0's TLB will contain a stale entry.) - * - * The bad outcome can occur if either CPU's load is - * reordered before that CPU's store, so both CPUs must - * execute full barriers to prevent this from happening. - * - * Thus, switch_mm needs a full barrier between the - * store to mm_cpumask and any operation that could load - * from next->pgd. TLB fills are special and can happen - * due to instruction fetches or for no reason at all, - * and neither LOCK nor MFENCE orders them. - * Fortunately, load_cr3() is serializing and gives the - * ordering guarantee we need. - * - */ - load_cr3(next->pgd); - - trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); - - /* Stop flush ipis for the previous mm */ - cpumask_clear_cpu(cpu, mm_cpumask(prev)); - - /* Load per-mm CR4 state */ - load_mm_cr4(next); - -#ifdef CONFIG_MODIFY_LDT_SYSCALL - /* - * Load the LDT, if the LDT is different. - * - * It's possible that prev->context.ldt doesn't match - * the LDT register. This can happen if leave_mm(prev) - * was called and then modify_ldt changed - * prev->context.ldt but suppressed an IPI to this CPU. - * In this case, prev->context.ldt != NULL, because we - * never set context.ldt to NULL while the mm still - * exists. That means that next->context.ldt != - * prev->context.ldt, because mms never share an LDT. - */ - if (unlikely(prev->context.ldt != next->context.ldt)) - load_mm_ldt(next); -#endif - } -#ifdef CONFIG_SMP - else { - this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK); - BUG_ON(this_cpu_read(cpu_tlbstate.active_mm) != next); - - if (!cpumask_test_cpu(cpu, mm_cpumask(next))) { - /* - * On established mms, the mm_cpumask is only changed - * from irq context, from ptep_clear_flush() while in - * lazy tlb mode, and here. Irqs are blocked during - * schedule, protecting us from simultaneous changes. - */ - cpumask_set_cpu(cpu, mm_cpumask(next)); - - /* - * We were in lazy tlb mode and leave_mm disabled - * tlb flush IPI delivery. We must reload CR3 - * to make sure to use no freed page tables. - * - * As above, load_cr3() is serializing and orders TLB - * fills with respect to the mm_cpumask write. - */ - load_cr3(next->pgd); - trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); - load_mm_cr4(next); - load_mm_ldt(next); - } - } -#endif -} +extern void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, + struct task_struct *tsk); +#define switch_mm_irqs_off switch_mm_irqs_off #define activate_mm(prev, next) \ do { \ diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index f98913258c63..62c0043a5fd5 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -2,7 +2,7 @@ KCOV_INSTRUMENT_tlb.o := n obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \ - pat.o pgtable.o physaddr.o gup.o setup_nx.o + pat.o pgtable.o physaddr.o gup.o setup_nx.o tlb.o # Make sure __phys_addr has no stackprotector nostackp := $(call cc-option, -fno-stack-protector) @@ -12,7 +12,6 @@ CFLAGS_setup_nx.o := $(nostackp) CFLAGS_fault.o := -I$(src)/../include/asm/trace obj-$(CONFIG_X86_PAT) += pat_rbtree.o -obj-$(CONFIG_SMP) += tlb.o obj-$(CONFIG_X86_32) += pgtable_32.o iomap_32.o diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index fe9b9f776361..5643fd0b1a7d 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -28,6 +28,8 @@ * Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi */ +#ifdef CONFIG_SMP + struct flush_tlb_info { struct mm_struct *flush_mm; unsigned long flush_start; @@ -57,6 +59,118 @@ void leave_mm(int cpu) } EXPORT_SYMBOL_GPL(leave_mm); +#endif /* CONFIG_SMP */ + +void switch_mm(struct mm_struct *prev, struct mm_struct *next, + struct task_struct *tsk) +{ + unsigned long flags; + + local_irq_save(flags); + switch_mm_irqs_off(prev, next, tsk); + local_irq_restore(flags); +} + +void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, + struct task_struct *tsk) +{ + unsigned cpu = smp_processor_id(); + + if (likely(prev != next)) { +#ifdef CONFIG_SMP + this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK); + this_cpu_write(cpu_tlbstate.active_mm, next); +#endif + cpumask_set_cpu(cpu, mm_cpumask(next)); + + /* + * Re-load page tables. + * + * This logic has an ordering constraint: + * + * CPU 0: Write to a PTE for 'next' + * CPU 0: load bit 1 in mm_cpumask. if nonzero, send IPI. + * CPU 1: set bit 1 in next's mm_cpumask + * CPU 1: load from the PTE that CPU 0 writes (implicit) + * + * We need to prevent an outcome in which CPU 1 observes + * the new PTE value and CPU 0 observes bit 1 clear in + * mm_cpumask. (If that occurs, then the IPI will never + * be sent, and CPU 0's TLB will contain a stale entry.) + * + * The bad outcome can occur if either CPU's load is + * reordered before that CPU's store, so both CPUs must + * execute full barriers to prevent this from happening. + * + * Thus, switch_mm needs a full barrier between the + * store to mm_cpumask and any operation that could load + * from next->pgd. TLB fills are special and can happen + * due to instruction fetches or for no reason at all, + * and neither LOCK nor MFENCE orders them. + * Fortunately, load_cr3() is serializing and gives the + * ordering guarantee we need. + * + */ + load_cr3(next->pgd); + + trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); + + /* Stop flush ipis for the previous mm */ + cpumask_clear_cpu(cpu, mm_cpumask(prev)); + + /* Load per-mm CR4 state */ + load_mm_cr4(next); + +#ifdef CONFIG_MODIFY_LDT_SYSCALL + /* + * Load the LDT, if the LDT is different. + * + * It's possible that prev->context.ldt doesn't match + * the LDT register. This can happen if leave_mm(prev) + * was called and then modify_ldt changed + * prev->context.ldt but suppressed an IPI to this CPU. + * In this case, prev->context.ldt != NULL, because we + * never set context.ldt to NULL while the mm still + * exists. That means that next->context.ldt != + * prev->context.ldt, because mms never share an LDT. + */ + if (unlikely(prev->context.ldt != next->context.ldt)) + load_mm_ldt(next); +#endif + } +#ifdef CONFIG_SMP + else { + this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK); + BUG_ON(this_cpu_read(cpu_tlbstate.active_mm) != next); + + if (!cpumask_test_cpu(cpu, mm_cpumask(next))) { + /* + * On established mms, the mm_cpumask is only changed + * from irq context, from ptep_clear_flush() while in + * lazy tlb mode, and here. Irqs are blocked during + * schedule, protecting us from simultaneous changes. + */ + cpumask_set_cpu(cpu, mm_cpumask(next)); + + /* + * We were in lazy tlb mode and leave_mm disabled + * tlb flush IPI delivery. We must reload CR3 + * to make sure to use no freed page tables. + * + * As above, load_cr3() is serializing and orders TLB + * fills with respect to the mm_cpumask write. + */ + load_cr3(next->pgd); + trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); + load_mm_cr4(next); + load_mm_ldt(next); + } + } +#endif +} + +#ifdef CONFIG_SMP + /* * The flush IPI assumes that a thread switch happens in this order: * [cpu0: the cpu that switches] @@ -353,3 +467,5 @@ static int __init create_tlb_single_page_flush_ceiling(void) return 0; } late_initcall(create_tlb_single_page_flush_ceiling); + +#endif /* CONFIG_SMP */ diff --git a/include/linux/cpu.h b/include/linux/cpu.h index f9b1fab4388a..21597dcac0e2 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -59,25 +59,7 @@ struct notifier_block; * CPU notifier priorities. */ enum { - /* - * SCHED_ACTIVE marks a cpu which is coming up active during - * CPU_ONLINE and CPU_DOWN_FAILED and must be the first - * notifier. CPUSET_ACTIVE adjusts cpuset according to - * cpu_active mask right after SCHED_ACTIVE. During - * CPU_DOWN_PREPARE, SCHED_INACTIVE and CPUSET_INACTIVE are - * ordered in the similar way. - * - * This ordering guarantees consistent cpu_active mask and - * migration behavior to all cpu notifiers. - */ - CPU_PRI_SCHED_ACTIVE = INT_MAX, - CPU_PRI_CPUSET_ACTIVE = INT_MAX - 1, - CPU_PRI_SCHED_INACTIVE = INT_MIN + 1, - CPU_PRI_CPUSET_INACTIVE = INT_MIN, - - /* migration should happen before other stuff but after perf */ CPU_PRI_PERF = 20, - CPU_PRI_MIGRATION = 10, /* bring up workqueues before normal notifiers and down after */ CPU_PRI_WORKQUEUE_UP = 5, diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 5d68e15e46b7..386374d19987 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -8,6 +8,7 @@ enum cpuhp_state { CPUHP_BRINGUP_CPU, CPUHP_AP_IDLE_DEAD, CPUHP_AP_OFFLINE, + CPUHP_AP_SCHED_STARTING, CPUHP_AP_NOTIFY_STARTING, CPUHP_AP_ONLINE, CPUHP_TEARDOWN_CPU, @@ -16,6 +17,7 @@ enum cpuhp_state { CPUHP_AP_NOTIFY_ONLINE, CPUHP_AP_ONLINE_DYN, CPUHP_AP_ONLINE_DYN_END = CPUHP_AP_ONLINE_DYN + 30, + CPUHP_AP_ACTIVE, CPUHP_ONLINE, }; diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index 40cee6b77a93..e828cf65d7df 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -743,12 +743,10 @@ set_cpu_present(unsigned int cpu, bool present) static inline void set_cpu_online(unsigned int cpu, bool online) { - if (online) { + if (online) cpumask_set_cpu(cpu, &__cpu_online_mask); - cpumask_set_cpu(cpu, &__cpu_active_mask); - } else { + else cpumask_clear_cpu(cpu, &__cpu_online_mask); - } } static inline void diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index f75222ea7f16..eabe0138eb06 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -356,8 +356,13 @@ extern void lockdep_set_current_reclaim_state(gfp_t gfp_mask); extern void lockdep_clear_current_reclaim_state(void); extern void lockdep_trace_alloc(gfp_t mask); -extern void lock_pin_lock(struct lockdep_map *lock); -extern void lock_unpin_lock(struct lockdep_map *lock); +struct pin_cookie { unsigned int val; }; + +#define NIL_COOKIE (struct pin_cookie){ .val = 0U, } + +extern struct pin_cookie lock_pin_lock(struct lockdep_map *lock); +extern void lock_repin_lock(struct lockdep_map *lock, struct pin_cookie); +extern void lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie); # define INIT_LOCKDEP .lockdep_recursion = 0, .lockdep_reclaim_gfp = 0, @@ -373,8 +378,9 @@ extern void lock_unpin_lock(struct lockdep_map *lock); #define lockdep_recursing(tsk) ((tsk)->lockdep_recursion) -#define lockdep_pin_lock(l) lock_pin_lock(&(l)->dep_map) -#define lockdep_unpin_lock(l) lock_unpin_lock(&(l)->dep_map) +#define lockdep_pin_lock(l) lock_pin_lock(&(l)->dep_map) +#define lockdep_repin_lock(l,c) lock_repin_lock(&(l)->dep_map, (c)) +#define lockdep_unpin_lock(l,c) lock_unpin_lock(&(l)->dep_map, (c)) #else /* !CONFIG_LOCKDEP */ @@ -427,8 +433,13 @@ struct lock_class_key { }; #define lockdep_recursing(tsk) (0) -#define lockdep_pin_lock(l) do { (void)(l); } while (0) -#define lockdep_unpin_lock(l) do { (void)(l); } while (0) +struct pin_cookie { }; + +#define NIL_COOKIE (struct pin_cookie){ } + +#define lockdep_pin_lock(l) ({ struct pin_cookie cookie; cookie; }) +#define lockdep_repin_lock(l, c) do { (void)(l); (void)(c); } while (0) +#define lockdep_unpin_lock(l, c) do { (void)(l); (void)(c); } while (0) #endif /* !LOCKDEP */ diff --git a/include/linux/mmu_context.h b/include/linux/mmu_context.h index 70fffeba7495..a4441784503b 100644 --- a/include/linux/mmu_context.h +++ b/include/linux/mmu_context.h @@ -1,9 +1,16 @@ #ifndef _LINUX_MMU_CONTEXT_H #define _LINUX_MMU_CONTEXT_H +#include <asm/mmu_context.h> + struct mm_struct; void use_mm(struct mm_struct *mm); void unuse_mm(struct mm_struct *mm); +/* Architectures that care about IRQ state in switch_mm can override this. */ +#ifndef switch_mm_irqs_off +# define switch_mm_irqs_off switch_mm +#endif + #endif diff --git a/include/linux/sched.h b/include/linux/sched.h index e8dfa6f0d843..6cc0df970f1a 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -177,9 +177,11 @@ extern void get_iowait_load(unsigned long *nr_waiters, unsigned long *load); extern void calc_global_load(unsigned long ticks); #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) -extern void update_cpu_load_nohz(int active); +extern void cpu_load_update_nohz_start(void); +extern void cpu_load_update_nohz_stop(void); #else -static inline void update_cpu_load_nohz(int active) { } +static inline void cpu_load_update_nohz_start(void) { } +static inline void cpu_load_update_nohz_stop(void) { } #endif extern void dump_cpu_task(int cpu); @@ -371,6 +373,15 @@ extern void cpu_init (void); extern void trap_init(void); extern void update_process_times(int user); extern void scheduler_tick(void); +extern int sched_cpu_starting(unsigned int cpu); +extern int sched_cpu_activate(unsigned int cpu); +extern int sched_cpu_deactivate(unsigned int cpu); + +#ifdef CONFIG_HOTPLUG_CPU +extern int sched_cpu_dying(unsigned int cpu); +#else +# define sched_cpu_dying NULL +#endif extern void sched_show_task(struct task_struct *p); @@ -934,9 +945,19 @@ enum cpu_idle_type { }; /* + * Integer metrics need fixed point arithmetic, e.g., sched/fair + * has a few: load, load_avg, util_avg, freq, and capacity. + * + * We define a basic fixed point arithmetic range, and then formalize + * all these metrics based on that basic range. + */ +# define SCHED_FIXEDPOINT_SHIFT 10 +# define SCHED_FIXEDPOINT_SCALE (1L << SCHED_FIXEDPOINT_SHIFT) + +/* * Increase resolution of cpu_capacity calculations */ -#define SCHED_CAPACITY_SHIFT 10 +#define SCHED_CAPACITY_SHIFT SCHED_FIXEDPOINT_SHIFT #define SCHED_CAPACITY_SCALE (1L << SCHED_CAPACITY_SHIFT) /* @@ -1198,18 +1219,56 @@ struct load_weight { }; /* - * The load_avg/util_avg accumulates an infinite geometric series. - * 1) load_avg factors frequency scaling into the amount of time that a - * sched_entity is runnable on a rq into its weight. For cfs_rq, it is the - * aggregated such weights of all runnable and blocked sched_entities. - * 2) util_avg factors frequency and cpu scaling into the amount of time - * that a sched_entity is running on a CPU, in the range [0..SCHED_LOAD_SCALE]. - * For cfs_rq, it is the aggregated such times of all runnable and + * The load_avg/util_avg accumulates an infinite geometric series + * (see __update_load_avg() in kernel/sched/fair.c). + * + * [load_avg definition] + * + * load_avg = runnable% * scale_load_down(load) + * + * where runnable% is the time ratio that a sched_entity is runnable. + * For cfs_rq, it is the aggregated load_avg of all runnable and * blocked sched_entities. - * The 64 bit load_sum can: - * 1) for cfs_rq, afford 4353082796 (=2^64/47742/88761) entities with - * the highest weight (=88761) always runnable, we should not overflow - * 2) for entity, support any load.weight always runnable + * + * load_avg may also take frequency scaling into account: + * + * load_avg = runnable% * scale_load_down(load) * freq% + * + * where freq% is the CPU frequency normalized to the highest frequency. + * + * [util_avg definition] + * + * util_avg = running% * SCHED_CAPACITY_SCALE + * + * where running% is the time ratio that a sched_entity is running on + * a CPU. For cfs_rq, it is the aggregated util_avg of all runnable + * and blocked sched_entities. + * + * util_avg may also factor frequency scaling and CPU capacity scaling: + * + * util_avg = running% * SCHED_CAPACITY_SCALE * freq% * capacity% + * + * where freq% is the same as above, and capacity% is the CPU capacity + * normalized to the greatest capacity (due to uarch differences, etc). + * + * N.B., the above ratios (runnable%, running%, freq%, and capacity%) + * themselves are in the range of [0, 1]. To do fixed point arithmetics, + * we therefore scale them to as large a range as necessary. This is for + * example reflected by util_avg's SCHED_CAPACITY_SCALE. + * + * [Overflow issue] + * + * The 64-bit load_sum can have 4353082796 (=2^64/47742/88761) entities + * with the highest load (=88761), always runnable on a single cfs_rq, + * and should not overflow as the number already hits PID_MAX_LIMIT. + * + * For all other cases (including 32-bit kernels), struct load_weight's + * weight will overflow first before we do, because: + * + * Max(load_avg) <= Max(load.weight) + * + * Then it is the load_weight's responsibility to consider overflow + * issues. */ struct sched_avg { u64 last_update_time, load_sum; @@ -1871,6 +1930,11 @@ extern int arch_task_struct_size __read_mostly; /* Future-safe accessor for struct task_struct's cpus_allowed. */ #define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed) +static inline int tsk_nr_cpus_allowed(struct task_struct *p) +{ + return p->nr_cpus_allowed; +} + #define TNF_MIGRATED 0x01 #define TNF_NO_GROUP 0x02 #define TNF_SHARED 0x04 @@ -2303,8 +2367,6 @@ extern unsigned long long notrace sched_clock(void); /* * See the comment in kernel/sched/clock.c */ -extern u64 cpu_clock(int cpu); -extern u64 local_clock(void); extern u64 running_clock(void); extern u64 sched_clock_cpu(int cpu); @@ -2323,6 +2385,16 @@ static inline void sched_clock_idle_sleep_event(void) static inline void sched_clock_idle_wakeup_event(u64 delta_ns) { } + +static inline u64 cpu_clock(int cpu) +{ + return sched_clock(); +} + +static inline u64 local_clock(void) +{ + return sched_clock(); +} #else /* * Architectures can set this to 1 if they have specified @@ -2337,6 +2409,26 @@ extern void clear_sched_clock_stable(void); extern void sched_clock_tick(void); extern void sched_clock_idle_sleep_event(void); extern void sched_clock_idle_wakeup_event(u64 delta_ns); + +/* + * As outlined in clock.c, provides a fast, high resolution, nanosecond + * time source that is monotonic per cpu argument and has bounded drift + * between cpus. + * + * ######################### BIG FAT WARNING ########################## + * # when comparing cpu_clock(i) to cpu_clock(j) for i != j, time can # + * # go backwards !! # + * #################################################################### + */ +static inline u64 cpu_clock(int cpu) +{ + return sched_clock_cpu(cpu); +} + +static inline u64 local_clock(void) +{ + return sched_clock_cpu(raw_smp_processor_id()); +} #endif #ifdef CONFIG_IRQ_TIME_ACCOUNTING diff --git a/kernel/cpu.c b/kernel/cpu.c index 3e3f6e49eabb..d948e44c471e 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -703,21 +703,6 @@ static int takedown_cpu(unsigned int cpu) struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); int err; - /* - * By now we've cleared cpu_active_mask, wait for all preempt-disabled - * and RCU users of this state to go away such that all new such users - * will observe it. - * - * For CONFIG_PREEMPT we have preemptible RCU and its sync_rcu() might - * not imply sync_sched(), so wait for both. - * - * Do sync before park smpboot threads to take care the rcu boost case. - */ - if (IS_ENABLED(CONFIG_PREEMPT)) - synchronize_rcu_mult(call_rcu, call_rcu_sched); - else - synchronize_rcu(); - /* Park the smpboot threads */ kthread_park(per_cpu_ptr(&cpuhp_state, cpu)->thread); smpboot_park_threads(cpu); @@ -923,8 +908,6 @@ void cpuhp_online_idle(enum cpuhp_state state) st->state = CPUHP_AP_ONLINE_IDLE; - /* The cpu is marked online, set it active now */ - set_cpu_active(cpu, true); /* Unpark the stopper thread and the hotplug thread of this cpu */ stop_machine_unpark(cpu); kthread_unpark(st->thread); @@ -1236,6 +1219,12 @@ static struct cpuhp_step cpuhp_ap_states[] = { .name = "ap:offline", .cant_stop = true, }, + /* First state is scheduler control. Interrupts are disabled */ + [CPUHP_AP_SCHED_STARTING] = { + .name = "sched:starting", + .startup = sched_cpu_starting, + .teardown = sched_cpu_dying, + }, /* |
