summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Documentation/trace/ftrace.txt10
-rw-r--r--arch/arm/include/asm/mmu_context.h3
-rw-r--r--arch/powerpc/kernel/smp.c2
-rw-r--r--arch/s390/kernel/smp.c2
-rw-r--r--arch/x86/events/core.c2
-rw-r--r--arch/x86/include/asm/mmu_context.h101
-rw-r--r--arch/x86/mm/Makefile3
-rw-r--r--arch/x86/mm/tlb.c116
-rw-r--r--include/linux/cpu.h18
-rw-r--r--include/linux/cpuhotplug.h2
-rw-r--r--include/linux/cpumask.h6
-rw-r--r--include/linux/lockdep.h23
-rw-r--r--include/linux/mmu_context.h7
-rw-r--r--include/linux/sched.h124
-rw-r--r--kernel/cpu.c32
-rw-r--r--kernel/locking/lockdep.c71
-rw-r--r--kernel/sched/clock.c48
-rw-r--r--kernel/sched/core.c749
-rw-r--r--kernel/sched/cpuacct.c147
-rw-r--r--kernel/sched/cpudeadline.c4
-rw-r--r--kernel/sched/cpupri.c4
-rw-r--r--kernel/sched/deadline.c55
-rw-r--r--kernel/sched/debug.c10
-rw-r--r--kernel/sched/fair.c513
-rw-r--r--kernel/sched/idle_task.c2
-rw-r--r--kernel/sched/loadavg.c11
-rw-r--r--kernel/sched/rt.c38
-rw-r--r--kernel/sched/sched.h140
-rw-r--r--kernel/sched/stop_task.c2
-rw-r--r--kernel/time/tick-sched.c9
-rw-r--r--mm/mmu_context.c2
31 files changed, 1329 insertions, 927 deletions
diff --git a/Documentation/trace/ftrace.txt b/Documentation/trace/ftrace.txt
index f52f297cb406..9857606dd7b7 100644
--- a/Documentation/trace/ftrace.txt
+++ b/Documentation/trace/ftrace.txt
@@ -1562,12 +1562,12 @@ Doing the same with chrt -r 5 and function-trace set.
<idle>-0 3dN.1 12us : menu_hrtimer_cancel <-tick_nohz_idle_exit
<idle>-0 3dN.1 12us : ktime_get <-tick_nohz_idle_exit
<idle>-0 3dN.1 12us : tick_do_update_jiffies64 <-tick_nohz_idle_exit
- <idle>-0 3dN.1 13us : update_cpu_load_nohz <-tick_nohz_idle_exit
- <idle>-0 3dN.1 13us : _raw_spin_lock <-update_cpu_load_nohz
+ <idle>-0 3dN.1 13us : cpu_load_update_nohz <-tick_nohz_idle_exit
+ <idle>-0 3dN.1 13us : _raw_spin_lock <-cpu_load_update_nohz
<idle>-0 3dN.1 13us : add_preempt_count <-_raw_spin_lock
- <idle>-0 3dN.2 13us : __update_cpu_load <-update_cpu_load_nohz
- <idle>-0 3dN.2 14us : sched_avg_update <-__update_cpu_load
- <idle>-0 3dN.2 14us : _raw_spin_unlock <-update_cpu_load_nohz
+ <idle>-0 3dN.2 13us : __cpu_load_update <-cpu_load_update_nohz
+ <idle>-0 3dN.2 14us : sched_avg_update <-__cpu_load_update
+ <idle>-0 3dN.2 14us : _raw_spin_unlock <-cpu_load_update_nohz
<idle>-0 3dN.2 14us : sub_preempt_count <-_raw_spin_unlock
<idle>-0 3dN.1 15us : calc_load_exit_idle <-tick_nohz_idle_exit
<idle>-0 3dN.1 15us : touch_softlockup_watchdog <-tick_nohz_idle_exit
diff --git a/arch/arm/include/asm/mmu_context.h b/arch/arm/include/asm/mmu_context.h
index fa5b42d44985..3cc14dd8587c 100644
--- a/arch/arm/include/asm/mmu_context.h
+++ b/arch/arm/include/asm/mmu_context.h
@@ -15,6 +15,7 @@
#include <linux/compiler.h>
#include <linux/sched.h>
+#include <linux/preempt.h>
#include <asm/cacheflush.h>
#include <asm/cachetype.h>
#include <asm/proc-fns.h>
@@ -66,6 +67,7 @@ static inline void check_and_switch_context(struct mm_struct *mm,
cpu_switch_mm(mm->pgd, mm);
}
+#ifndef MODULE
#define finish_arch_post_lock_switch \
finish_arch_post_lock_switch
static inline void finish_arch_post_lock_switch(void)
@@ -87,6 +89,7 @@ static inline void finish_arch_post_lock_switch(void)
preempt_enable_no_resched();
}
}
+#endif /* !MODULE */
#endif /* CONFIG_MMU */
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 8cac1eb41466..55c924b65f71 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -565,7 +565,7 @@ int __cpu_up(unsigned int cpu, struct task_struct *tidle)
smp_ops->give_timebase();
/* Wait until cpu puts itself in the online & active maps */
- while (!cpu_online(cpu) || !cpu_active(cpu))
+ while (!cpu_online(cpu))
cpu_relax();
return 0;
diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c
index 40a6b4f9c36c..7b89a7572100 100644
--- a/arch/s390/kernel/smp.c
+++ b/arch/s390/kernel/smp.c
@@ -832,7 +832,7 @@ int __cpu_up(unsigned int cpu, struct task_struct *tidle)
pcpu_attach_task(pcpu, tidle);
pcpu_start_fn(pcpu, smp_start_secondary, NULL);
/* Wait until cpu puts itself in the online & active maps */
- while (!cpu_online(cpu) || !cpu_active(cpu))
+ while (!cpu_online(cpu))
cpu_relax();
return 0;
}
diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 5e5e76a52f58..b7080bef9137 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -2183,7 +2183,7 @@ void arch_perf_update_userpage(struct perf_event *event,
* cap_user_time_zero doesn't make sense when we're using a different
* time base for the records.
*/
- if (event->clock == &local_clock) {
+ if (!event->attr.use_clockid) {
userpg->cap_user_time_zero = 1;
userpg->time_zero = data->cyc2ns_offset;
}
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 84280029cafd..396348196aa7 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -115,103 +115,12 @@ static inline void destroy_context(struct mm_struct *mm)
destroy_context_ldt(mm);
}
-static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
- struct task_struct *tsk)
-{
- unsigned cpu = smp_processor_id();
+extern void switch_mm(struct mm_struct *prev, struct mm_struct *next,
+ struct task_struct *tsk);
- if (likely(prev != next)) {
-#ifdef CONFIG_SMP
- this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK);
- this_cpu_write(cpu_tlbstate.active_mm, next);
-#endif
- cpumask_set_cpu(cpu, mm_cpumask(next));
-
- /*
- * Re-load page tables.
- *
- * This logic has an ordering constraint:
- *
- * CPU 0: Write to a PTE for 'next'
- * CPU 0: load bit 1 in mm_cpumask. if nonzero, send IPI.
- * CPU 1: set bit 1 in next's mm_cpumask
- * CPU 1: load from the PTE that CPU 0 writes (implicit)
- *
- * We need to prevent an outcome in which CPU 1 observes
- * the new PTE value and CPU 0 observes bit 1 clear in
- * mm_cpumask. (If that occurs, then the IPI will never
- * be sent, and CPU 0's TLB will contain a stale entry.)
- *
- * The bad outcome can occur if either CPU's load is
- * reordered before that CPU's store, so both CPUs must
- * execute full barriers to prevent this from happening.
- *
- * Thus, switch_mm needs a full barrier between the
- * store to mm_cpumask and any operation that could load
- * from next->pgd. TLB fills are special and can happen
- * due to instruction fetches or for no reason at all,
- * and neither LOCK nor MFENCE orders them.
- * Fortunately, load_cr3() is serializing and gives the
- * ordering guarantee we need.
- *
- */
- load_cr3(next->pgd);
-
- trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
-
- /* Stop flush ipis for the previous mm */
- cpumask_clear_cpu(cpu, mm_cpumask(prev));
-
- /* Load per-mm CR4 state */
- load_mm_cr4(next);
-
-#ifdef CONFIG_MODIFY_LDT_SYSCALL
- /*
- * Load the LDT, if the LDT is different.
- *
- * It's possible that prev->context.ldt doesn't match
- * the LDT register. This can happen if leave_mm(prev)
- * was called and then modify_ldt changed
- * prev->context.ldt but suppressed an IPI to this CPU.
- * In this case, prev->context.ldt != NULL, because we
- * never set context.ldt to NULL while the mm still
- * exists. That means that next->context.ldt !=
- * prev->context.ldt, because mms never share an LDT.
- */
- if (unlikely(prev->context.ldt != next->context.ldt))
- load_mm_ldt(next);
-#endif
- }
-#ifdef CONFIG_SMP
- else {
- this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK);
- BUG_ON(this_cpu_read(cpu_tlbstate.active_mm) != next);
-
- if (!cpumask_test_cpu(cpu, mm_cpumask(next))) {
- /*
- * On established mms, the mm_cpumask is only changed
- * from irq context, from ptep_clear_flush() while in
- * lazy tlb mode, and here. Irqs are blocked during
- * schedule, protecting us from simultaneous changes.
- */
- cpumask_set_cpu(cpu, mm_cpumask(next));
-
- /*
- * We were in lazy tlb mode and leave_mm disabled
- * tlb flush IPI delivery. We must reload CR3
- * to make sure to use no freed page tables.
- *
- * As above, load_cr3() is serializing and orders TLB
- * fills with respect to the mm_cpumask write.
- */
- load_cr3(next->pgd);
- trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
- load_mm_cr4(next);
- load_mm_ldt(next);
- }
- }
-#endif
-}
+extern void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
+ struct task_struct *tsk);
+#define switch_mm_irqs_off switch_mm_irqs_off
#define activate_mm(prev, next) \
do { \
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index f98913258c63..62c0043a5fd5 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -2,7 +2,7 @@
KCOV_INSTRUMENT_tlb.o := n
obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \
- pat.o pgtable.o physaddr.o gup.o setup_nx.o
+ pat.o pgtable.o physaddr.o gup.o setup_nx.o tlb.o
# Make sure __phys_addr has no stackprotector
nostackp := $(call cc-option, -fno-stack-protector)
@@ -12,7 +12,6 @@ CFLAGS_setup_nx.o := $(nostackp)
CFLAGS_fault.o := -I$(src)/../include/asm/trace
obj-$(CONFIG_X86_PAT) += pat_rbtree.o
-obj-$(CONFIG_SMP) += tlb.o
obj-$(CONFIG_X86_32) += pgtable_32.o iomap_32.o
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index fe9b9f776361..5643fd0b1a7d 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -28,6 +28,8 @@
* Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi
*/
+#ifdef CONFIG_SMP
+
struct flush_tlb_info {
struct mm_struct *flush_mm;
unsigned long flush_start;
@@ -57,6 +59,118 @@ void leave_mm(int cpu)
}
EXPORT_SYMBOL_GPL(leave_mm);
+#endif /* CONFIG_SMP */
+
+void switch_mm(struct mm_struct *prev, struct mm_struct *next,
+ struct task_struct *tsk)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ switch_mm_irqs_off(prev, next, tsk);
+ local_irq_restore(flags);
+}
+
+void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
+ struct task_struct *tsk)
+{
+ unsigned cpu = smp_processor_id();
+
+ if (likely(prev != next)) {
+#ifdef CONFIG_SMP
+ this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK);
+ this_cpu_write(cpu_tlbstate.active_mm, next);
+#endif
+ cpumask_set_cpu(cpu, mm_cpumask(next));
+
+ /*
+ * Re-load page tables.
+ *
+ * This logic has an ordering constraint:
+ *
+ * CPU 0: Write to a PTE for 'next'
+ * CPU 0: load bit 1 in mm_cpumask. if nonzero, send IPI.
+ * CPU 1: set bit 1 in next's mm_cpumask
+ * CPU 1: load from the PTE that CPU 0 writes (implicit)
+ *
+ * We need to prevent an outcome in which CPU 1 observes
+ * the new PTE value and CPU 0 observes bit 1 clear in
+ * mm_cpumask. (If that occurs, then the IPI will never
+ * be sent, and CPU 0's TLB will contain a stale entry.)
+ *
+ * The bad outcome can occur if either CPU's load is
+ * reordered before that CPU's store, so both CPUs must
+ * execute full barriers to prevent this from happening.
+ *
+ * Thus, switch_mm needs a full barrier between the
+ * store to mm_cpumask and any operation that could load
+ * from next->pgd. TLB fills are special and can happen
+ * due to instruction fetches or for no reason at all,
+ * and neither LOCK nor MFENCE orders them.
+ * Fortunately, load_cr3() is serializing and gives the
+ * ordering guarantee we need.
+ *
+ */
+ load_cr3(next->pgd);
+
+ trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
+
+ /* Stop flush ipis for the previous mm */
+ cpumask_clear_cpu(cpu, mm_cpumask(prev));
+
+ /* Load per-mm CR4 state */
+ load_mm_cr4(next);
+
+#ifdef CONFIG_MODIFY_LDT_SYSCALL
+ /*
+ * Load the LDT, if the LDT is different.
+ *
+ * It's possible that prev->context.ldt doesn't match
+ * the LDT register. This can happen if leave_mm(prev)
+ * was called and then modify_ldt changed
+ * prev->context.ldt but suppressed an IPI to this CPU.
+ * In this case, prev->context.ldt != NULL, because we
+ * never set context.ldt to NULL while the mm still
+ * exists. That means that next->context.ldt !=
+ * prev->context.ldt, because mms never share an LDT.
+ */
+ if (unlikely(prev->context.ldt != next->context.ldt))
+ load_mm_ldt(next);
+#endif
+ }
+#ifdef CONFIG_SMP
+ else {
+ this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK);
+ BUG_ON(this_cpu_read(cpu_tlbstate.active_mm) != next);
+
+ if (!cpumask_test_cpu(cpu, mm_cpumask(next))) {
+ /*
+ * On established mms, the mm_cpumask is only changed
+ * from irq context, from ptep_clear_flush() while in
+ * lazy tlb mode, and here. Irqs are blocked during
+ * schedule, protecting us from simultaneous changes.
+ */
+ cpumask_set_cpu(cpu, mm_cpumask(next));
+
+ /*
+ * We were in lazy tlb mode and leave_mm disabled
+ * tlb flush IPI delivery. We must reload CR3
+ * to make sure to use no freed page tables.
+ *
+ * As above, load_cr3() is serializing and orders TLB
+ * fills with respect to the mm_cpumask write.
+ */
+ load_cr3(next->pgd);
+ trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
+ load_mm_cr4(next);
+ load_mm_ldt(next);
+ }
+ }
+#endif
+}
+
+#ifdef CONFIG_SMP
+
/*
* The flush IPI assumes that a thread switch happens in this order:
* [cpu0: the cpu that switches]
@@ -353,3 +467,5 @@ static int __init create_tlb_single_page_flush_ceiling(void)
return 0;
}
late_initcall(create_tlb_single_page_flush_ceiling);
+
+#endif /* CONFIG_SMP */
diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index f9b1fab4388a..21597dcac0e2 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -59,25 +59,7 @@ struct notifier_block;
* CPU notifier priorities.
*/
enum {
- /*
- * SCHED_ACTIVE marks a cpu which is coming up active during
- * CPU_ONLINE and CPU_DOWN_FAILED and must be the first
- * notifier. CPUSET_ACTIVE adjusts cpuset according to
- * cpu_active mask right after SCHED_ACTIVE. During
- * CPU_DOWN_PREPARE, SCHED_INACTIVE and CPUSET_INACTIVE are
- * ordered in the similar way.
- *
- * This ordering guarantees consistent cpu_active mask and
- * migration behavior to all cpu notifiers.
- */
- CPU_PRI_SCHED_ACTIVE = INT_MAX,
- CPU_PRI_CPUSET_ACTIVE = INT_MAX - 1,
- CPU_PRI_SCHED_INACTIVE = INT_MIN + 1,
- CPU_PRI_CPUSET_INACTIVE = INT_MIN,
-
- /* migration should happen before other stuff but after perf */
CPU_PRI_PERF = 20,
- CPU_PRI_MIGRATION = 10,
/* bring up workqueues before normal notifiers and down after */
CPU_PRI_WORKQUEUE_UP = 5,
diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index 5d68e15e46b7..386374d19987 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -8,6 +8,7 @@ enum cpuhp_state {
CPUHP_BRINGUP_CPU,
CPUHP_AP_IDLE_DEAD,
CPUHP_AP_OFFLINE,
+ CPUHP_AP_SCHED_STARTING,
CPUHP_AP_NOTIFY_STARTING,
CPUHP_AP_ONLINE,
CPUHP_TEARDOWN_CPU,
@@ -16,6 +17,7 @@ enum cpuhp_state {
CPUHP_AP_NOTIFY_ONLINE,
CPUHP_AP_ONLINE_DYN,
CPUHP_AP_ONLINE_DYN_END = CPUHP_AP_ONLINE_DYN + 30,
+ CPUHP_AP_ACTIVE,
CPUHP_ONLINE,
};
diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index 40cee6b77a93..e828cf65d7df 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -743,12 +743,10 @@ set_cpu_present(unsigned int cpu, bool present)
static inline void
set_cpu_online(unsigned int cpu, bool online)
{
- if (online) {
+ if (online)
cpumask_set_cpu(cpu, &__cpu_online_mask);
- cpumask_set_cpu(cpu, &__cpu_active_mask);
- } else {
+ else
cpumask_clear_cpu(cpu, &__cpu_online_mask);
- }
}
static inline void
diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h
index f75222ea7f16..eabe0138eb06 100644
--- a/include/linux/lockdep.h
+++ b/include/linux/lockdep.h
@@ -356,8 +356,13 @@ extern void lockdep_set_current_reclaim_state(gfp_t gfp_mask);
extern void lockdep_clear_current_reclaim_state(void);
extern void lockdep_trace_alloc(gfp_t mask);
-extern void lock_pin_lock(struct lockdep_map *lock);
-extern void lock_unpin_lock(struct lockdep_map *lock);
+struct pin_cookie { unsigned int val; };
+
+#define NIL_COOKIE (struct pin_cookie){ .val = 0U, }
+
+extern struct pin_cookie lock_pin_lock(struct lockdep_map *lock);
+extern void lock_repin_lock(struct lockdep_map *lock, struct pin_cookie);
+extern void lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie);
# define INIT_LOCKDEP .lockdep_recursion = 0, .lockdep_reclaim_gfp = 0,
@@ -373,8 +378,9 @@ extern void lock_unpin_lock(struct lockdep_map *lock);
#define lockdep_recursing(tsk) ((tsk)->lockdep_recursion)
-#define lockdep_pin_lock(l) lock_pin_lock(&(l)->dep_map)
-#define lockdep_unpin_lock(l) lock_unpin_lock(&(l)->dep_map)
+#define lockdep_pin_lock(l) lock_pin_lock(&(l)->dep_map)
+#define lockdep_repin_lock(l,c) lock_repin_lock(&(l)->dep_map, (c))
+#define lockdep_unpin_lock(l,c) lock_unpin_lock(&(l)->dep_map, (c))
#else /* !CONFIG_LOCKDEP */
@@ -427,8 +433,13 @@ struct lock_class_key { };
#define lockdep_recursing(tsk) (0)
-#define lockdep_pin_lock(l) do { (void)(l); } while (0)
-#define lockdep_unpin_lock(l) do { (void)(l); } while (0)
+struct pin_cookie { };
+
+#define NIL_COOKIE (struct pin_cookie){ }
+
+#define lockdep_pin_lock(l) ({ struct pin_cookie cookie; cookie; })
+#define lockdep_repin_lock(l, c) do { (void)(l); (void)(c); } while (0)
+#define lockdep_unpin_lock(l, c) do { (void)(l); (void)(c); } while (0)
#endif /* !LOCKDEP */
diff --git a/include/linux/mmu_context.h b/include/linux/mmu_context.h
index 70fffeba7495..a4441784503b 100644
--- a/include/linux/mmu_context.h
+++ b/include/linux/mmu_context.h
@@ -1,9 +1,16 @@
#ifndef _LINUX_MMU_CONTEXT_H
#define _LINUX_MMU_CONTEXT_H
+#include <asm/mmu_context.h>
+
struct mm_struct;
void use_mm(struct mm_struct *mm);
void unuse_mm(struct mm_struct *mm);
+/* Architectures that care about IRQ state in switch_mm can override this. */
+#ifndef switch_mm_irqs_off
+# define switch_mm_irqs_off switch_mm
+#endif
+
#endif
diff --git a/include/linux/sched.h b/include/linux/sched.h
index e8dfa6f0d843..6cc0df970f1a 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -177,9 +177,11 @@ extern void get_iowait_load(unsigned long *nr_waiters, unsigned long *load);
extern void calc_global_load(unsigned long ticks);
#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
-extern void update_cpu_load_nohz(int active);
+extern void cpu_load_update_nohz_start(void);
+extern void cpu_load_update_nohz_stop(void);
#else
-static inline void update_cpu_load_nohz(int active) { }
+static inline void cpu_load_update_nohz_start(void) { }
+static inline void cpu_load_update_nohz_stop(void) { }
#endif
extern void dump_cpu_task(int cpu);
@@ -371,6 +373,15 @@ extern void cpu_init (void);
extern void trap_init(void);
extern void update_process_times(int user);
extern void scheduler_tick(void);
+extern int sched_cpu_starting(unsigned int cpu);
+extern int sched_cpu_activate(unsigned int cpu);
+extern int sched_cpu_deactivate(unsigned int cpu);
+
+#ifdef CONFIG_HOTPLUG_CPU
+extern int sched_cpu_dying(unsigned int cpu);
+#else
+# define sched_cpu_dying NULL
+#endif
extern void sched_show_task(struct task_struct *p);
@@ -934,9 +945,19 @@ enum cpu_idle_type {
};
/*
+ * Integer metrics need fixed point arithmetic, e.g., sched/fair
+ * has a few: load, load_avg, util_avg, freq, and capacity.
+ *
+ * We define a basic fixed point arithmetic range, and then formalize
+ * all these metrics based on that basic range.
+ */
+# define SCHED_FIXEDPOINT_SHIFT 10
+# define SCHED_FIXEDPOINT_SCALE (1L << SCHED_FIXEDPOINT_SHIFT)
+
+/*
* Increase resolution of cpu_capacity calculations
*/
-#define SCHED_CAPACITY_SHIFT 10
+#define SCHED_CAPACITY_SHIFT SCHED_FIXEDPOINT_SHIFT
#define SCHED_CAPACITY_SCALE (1L << SCHED_CAPACITY_SHIFT)
/*
@@ -1198,18 +1219,56 @@ struct load_weight {
};
/*
- * The load_avg/util_avg accumulates an infinite geometric series.
- * 1) load_avg factors frequency scaling into the amount of time that a
- * sched_entity is runnable on a rq into its weight. For cfs_rq, it is the
- * aggregated such weights of all runnable and blocked sched_entities.
- * 2) util_avg factors frequency and cpu scaling into the amount of time
- * that a sched_entity is running on a CPU, in the range [0..SCHED_LOAD_SCALE].
- * For cfs_rq, it is the aggregated such times of all runnable and
+ * The load_avg/util_avg accumulates an infinite geometric series
+ * (see __update_load_avg() in kernel/sched/fair.c).
+ *
+ * [load_avg definition]
+ *
+ * load_avg = runnable% * scale_load_down(load)
+ *
+ * where runnable% is the time ratio that a sched_entity is runnable.
+ * For cfs_rq, it is the aggregated load_avg of all runnable and
* blocked sched_entities.
- * The 64 bit load_sum can:
- * 1) for cfs_rq, afford 4353082796 (=2^64/47742/88761) entities with
- * the highest weight (=88761) always runnable, we should not overflow
- * 2) for entity, support any load.weight always runnable
+ *
+ * load_avg may also take frequency scaling into account:
+ *
+ * load_avg = runnable% * scale_load_down(load) * freq%
+ *
+ * where freq% is the CPU frequency normalized to the highest frequency.
+ *
+ * [util_avg definition]
+ *
+ * util_avg = running% * SCHED_CAPACITY_SCALE
+ *
+ * where running% is the time ratio that a sched_entity is running on
+ * a CPU. For cfs_rq, it is the aggregated util_avg of all runnable
+ * and blocked sched_entities.
+ *
+ * util_avg may also factor frequency scaling and CPU capacity scaling:
+ *
+ * util_avg = running% * SCHED_CAPACITY_SCALE * freq% * capacity%
+ *
+ * where freq% is the same as above, and capacity% is the CPU capacity
+ * normalized to the greatest capacity (due to uarch differences, etc).
+ *
+ * N.B., the above ratios (runnable%, running%, freq%, and capacity%)
+ * themselves are in the range of [0, 1]. To do fixed point arithmetics,
+ * we therefore scale them to as large a range as necessary. This is for
+ * example reflected by util_avg's SCHED_CAPACITY_SCALE.
+ *
+ * [Overflow issue]
+ *
+ * The 64-bit load_sum can have 4353082796 (=2^64/47742/88761) entities
+ * with the highest load (=88761), always runnable on a single cfs_rq,
+ * and should not overflow as the number already hits PID_MAX_LIMIT.
+ *
+ * For all other cases (including 32-bit kernels), struct load_weight's
+ * weight will overflow first before we do, because:
+ *
+ * Max(load_avg) <= Max(load.weight)
+ *
+ * Then it is the load_weight's responsibility to consider overflow
+ * issues.
*/
struct sched_avg {
u64 last_update_time, load_sum;
@@ -1871,6 +1930,11 @@ extern int arch_task_struct_size __read_mostly;
/* Future-safe accessor for struct task_struct's cpus_allowed. */
#define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed)
+static inline int tsk_nr_cpus_allowed(struct task_struct *p)
+{
+ return p->nr_cpus_allowed;
+}
+
#define TNF_MIGRATED 0x01
#define TNF_NO_GROUP 0x02
#define TNF_SHARED 0x04
@@ -2303,8 +2367,6 @@ extern unsigned long long notrace sched_clock(void);
/*
* See the comment in kernel/sched/clock.c
*/
-extern u64 cpu_clock(int cpu);
-extern u64 local_clock(void);
extern u64 running_clock(void);
extern u64 sched_clock_cpu(int cpu);
@@ -2323,6 +2385,16 @@ static inline void sched_clock_idle_sleep_event(void)
static inline void sched_clock_idle_wakeup_event(u64 delta_ns)
{
}
+
+static inline u64 cpu_clock(int cpu)
+{
+ return sched_clock();
+}
+
+static inline u64 local_clock(void)
+{
+ return sched_clock();
+}
#else
/*
* Architectures can set this to 1 if they have specified
@@ -2337,6 +2409,26 @@ extern void clear_sched_clock_stable(void);
extern void sched_clock_tick(void);
extern void sched_clock_idle_sleep_event(void);
extern void sched_clock_idle_wakeup_event(u64 delta_ns);
+
+/*
+ * As outlined in clock.c, provides a fast, high resolution, nanosecond
+ * time source that is monotonic per cpu argument and has bounded drift
+ * between cpus.
+ *
+ * ######################### BIG FAT WARNING ##########################
+ * # when comparing cpu_clock(i) to cpu_clock(j) for i != j, time can #
+ * # go backwards !! #
+ * ####################################################################
+ */
+static inline u64 cpu_clock(int cpu)
+{
+ return sched_clock_cpu(cpu);
+}
+
+static inline u64 local_clock(void)
+{
+ return sched_clock_cpu(raw_smp_processor_id());
+}
#endif
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 3e3f6e49eabb..d948e44c471e 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -703,21 +703,6 @@ static int takedown_cpu(unsigned int cpu)
struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
int err;
- /*
- * By now we've cleared cpu_active_mask, wait for all preempt-disabled
- * and RCU users of this state to go away such that all new such users
- * will observe it.
- *
- * For CONFIG_PREEMPT we have preemptible RCU and its sync_rcu() might
- * not imply sync_sched(), so wait for both.
- *
- * Do sync before park smpboot threads to take care the rcu boost case.
- */
- if (IS_ENABLED(CONFIG_PREEMPT))
- synchronize_rcu_mult(call_rcu, call_rcu_sched);
- else
- synchronize_rcu();
-
/* Park the smpboot threads */
kthread_park(per_cpu_ptr(&cpuhp_state, cpu)->thread);
smpboot_park_threads(cpu);
@@ -923,8 +908,6 @@ void cpuhp_online_idle(enum cpuhp_state state)
st->state = CPUHP_AP_ONLINE_IDLE;
- /* The cpu is marked online, set it active now */
- set_cpu_active(cpu, true);
/* Unpark the stopper thread and the hotplug thread of this cpu */
stop_machine_unpark(cpu);
kthread_unpark(st->thread);
@@ -1236,6 +1219,12 @@ static struct cpuhp_step cpuhp_ap_states[] = {
.name = "ap:offline",
.cant_stop = true,
},
+ /* First state is scheduler control. Interrupts are disabled */
+ [CPUHP_AP_SCHED_STARTING] = {
+ .name = "sched:starting",
+ .startup = sched_cpu_starting,
+ .teardown = sched_cpu_dying,
+ },
/*
* Low level startup/teardown notifiers. Run with interrupts
* disabled. Will be removed once the notifiers are converted to
@@ -1274,6 +1263,15 @@ static struct cpuhp_step cpuhp_ap_states[] = {
* The dynamically registered state space is here
*/
+#ifdef CONFIG_SMP
+ /* Last state is scheduler control setting the cpu active */
+ [CPUHP_AP_ACTIVE] = {
+ .name = "sched:active",
+ .startup = sched_cpu_activate,
+ .teardown = sched_cpu_deactivate,
+ },
+#endif
+
/* CPU is fully up and running. */
[CPUHP_ONLINE] = {
.name = "online",
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 874d53eaf389..81f1a7107c0e 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -45,6 +45,7 @@
#include <linux/bitops.h>
#include <linux/gfp.h>
#include <linux/kmemcheck.h>
+#include <linux/random.h>
#include <asm/sections.h>
@@ -3585,7 +3586,35 @@ static int __lock_is_held(struct lockdep_map *lock)
return 0;
}
-static void __lock_pin_lock(struct lockdep_map *lock)
+static struct pin_cookie __lock_pin_lock(struct lockdep_map *lock)
+{
+ struct pin_cookie cookie = NIL_COOKIE;
+ struct task_struct *curr = current;
+ int i;
+
+ if (unlikely(!debug_locks))
+ return cookie;
+
+ for (i = 0; i < curr->lockdep_depth; i++) {
+ struct held_lock *hlock = curr->held_locks + i;
+
+ if (match_held_lock(hlock, lock)) {
+ /*
+ * Grab 16bits of randomness; this is sufficient to not
+ * be guessable and still allows some pin nesting in
+ * our u32 pin_count.
+ */
+ cookie.val = 1 + (prandom_u32() >> 16);
+ hlock->pin_count += cookie.val;
+ return cookie;
+ }
+ }
+
+ WARN(1, "pinning an unheld lock\n");
+ return cookie;
+}
+
+static void __lock_repin_lock(struct lockdep_map *lock, struct pin_cookie cookie)
{
struct task_struct *curr = current;
int i;
@@ -3597,7 +3626,7 @@ static void __lock_pin_lock(struct lockdep_map *lock)<