diff options
| -rw-r--r-- | Documentation/scheduler/sched-deadline.txt | 18 | ||||
| -rw-r--r-- | arch/ia64/kernel/mca.c | 10 | ||||
| -rw-r--r-- | arch/x86/kernel/smpboot.c | 46 | ||||
| -rw-r--r-- | include/linux/kernel.h | 9 | ||||
| -rw-r--r-- | include/linux/sched.h | 30 | ||||
| -rw-r--r-- | include/linux/u64_stats_sync.h | 45 | ||||
| -rw-r--r-- | include/linux/wait.h | 17 | ||||
| -rw-r--r-- | kernel/exit.c | 26 | ||||
| -rw-r--r-- | kernel/sched/core.c | 322 | ||||
| -rw-r--r-- | kernel/sched/cpudeadline.c | 153 | ||||
| -rw-r--r-- | kernel/sched/cpudeadline.h | 3 | ||||
| -rw-r--r-- | kernel/sched/cputime.c | 87 | ||||
| -rw-r--r-- | kernel/sched/deadline.c | 78 | ||||
| -rw-r--r-- | kernel/sched/debug.c | 103 | ||||
| -rw-r--r-- | kernel/sched/fair.c | 752 | ||||
| -rw-r--r-- | kernel/sched/idle_task.c | 4 | ||||
| -rw-r--r-- | kernel/sched/sched.h | 92 | ||||
| -rw-r--r-- | kernel/sched/stats.h | 24 | ||||
| -rw-r--r-- | kernel/sched/wait.c | 113 | ||||
| -rw-r--r-- | kernel/smpboot.c | 2 | ||||
| -rw-r--r-- | kernel/stop_machine.c | 5 | ||||
| -rw-r--r-- | mm/huge_memory.c | 2 | ||||
| -rw-r--r-- | mm/memory.c | 2 | ||||
| -rw-r--r-- | tools/objtool/builtin-check.c | 1 |
24 files changed, 1198 insertions, 746 deletions
diff --git a/Documentation/scheduler/sched-deadline.txt b/Documentation/scheduler/sched-deadline.txt index 53a2fe1ae8b8..8e37b0ba2c9d 100644 --- a/Documentation/scheduler/sched-deadline.txt +++ b/Documentation/scheduler/sched-deadline.txt @@ -16,6 +16,7 @@ CONTENTS 4.1 System-wide settings 4.2 Task interface 4.3 Default behavior + 4.4 Behavior of sched_yield() 5. Tasks CPU affinity 5.1 SCHED_DEADLINE and cpusets HOWTO 6. Future plans @@ -426,6 +427,23 @@ CONTENTS Finally, notice that in order not to jeopardize the admission control a -deadline task cannot fork. + +4.4 Behavior of sched_yield() +----------------------------- + + When a SCHED_DEADLINE task calls sched_yield(), it gives up its + remaining runtime and is immediately throttled, until the next + period, when its runtime will be replenished (a special flag + dl_yielded is set and used to handle correctly throttling and runtime + replenishment after a call to sched_yield()). + + This behavior of sched_yield() allows the task to wake-up exactly at + the beginning of the next period. Also, this may be useful in the + future with bandwidth reclaiming mechanisms, where sched_yield() will + make the leftoever runtime available for reclamation by other + SCHED_DEADLINE tasks. + + 5. Tasks CPU affinity ===================== diff --git a/arch/ia64/kernel/mca.c b/arch/ia64/kernel/mca.c index eb9220cde76c..d47616c8b885 100644 --- a/arch/ia64/kernel/mca.c +++ b/arch/ia64/kernel/mca.c @@ -986,7 +986,7 @@ ia64_mca_modify_original_stack(struct pt_regs *regs, int cpu = smp_processor_id(); previous_current = curr_task(cpu); - set_curr_task(cpu, current); + ia64_set_curr_task(cpu, current); if ((p = strchr(current->comm, ' '))) *p = '\0'; @@ -1360,14 +1360,14 @@ ia64_mca_handler(struct pt_regs *regs, struct switch_stack *sw, cpumask_clear_cpu(i, &mca_cpu); /* wake next cpu */ while (monarch_cpu != -1) cpu_relax(); /* spin until last cpu leaves */ - set_curr_task(cpu, previous_current); + ia64_set_curr_task(cpu, previous_current); ia64_mc_info.imi_rendez_checkin[cpu] = IA64_MCA_RENDEZ_CHECKIN_NOTDONE; return; } } } - set_curr_task(cpu, previous_current); + ia64_set_curr_task(cpu, previous_current); ia64_mc_info.imi_rendez_checkin[cpu] = IA64_MCA_RENDEZ_CHECKIN_NOTDONE; monarch_cpu = -1; /* This frees the slaves and previous monarchs */ } @@ -1729,7 +1729,7 @@ ia64_init_handler(struct pt_regs *regs, struct switch_stack *sw, NOTIFY_INIT(DIE_INIT_SLAVE_LEAVE, regs, (long)&nd, 1); mprintk("Slave on cpu %d returning to normal service.\n", cpu); - set_curr_task(cpu, previous_current); + ia64_set_curr_task(cpu, previous_current); ia64_mc_info.imi_rendez_checkin[cpu] = IA64_MCA_RENDEZ_CHECKIN_NOTDONE; atomic_dec(&slaves); return; @@ -1756,7 +1756,7 @@ ia64_init_handler(struct pt_regs *regs, struct switch_stack *sw, mprintk("\nINIT dump complete. Monarch on cpu %d returning to normal service.\n", cpu); atomic_dec(&monarchs); - set_curr_task(cpu, previous_current); + ia64_set_curr_task(cpu, previous_current); monarch_cpu = -1; return; } diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 4296beb8fdd3..7137ec4eea9a 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -471,7 +471,7 @@ static bool match_die(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) return false; } -static struct sched_domain_topology_level numa_inside_package_topology[] = { +static struct sched_domain_topology_level x86_numa_in_package_topology[] = { #ifdef CONFIG_SCHED_SMT { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) }, #endif @@ -480,22 +480,23 @@ static struct sched_domain_topology_level numa_inside_package_topology[] = { #endif { NULL, }, }; + +static struct sched_domain_topology_level x86_topology[] = { +#ifdef CONFIG_SCHED_SMT + { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) }, +#endif +#ifdef CONFIG_SCHED_MC + { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) }, +#endif + { cpu_cpu_mask, SD_INIT_NAME(DIE) }, + { NULL, }, +}; + /* - * set_sched_topology() sets the topology internal to a CPU. The - * NUMA topologies are layered on top of it to build the full - * system topology. - * - * If NUMA nodes are observed to occur within a CPU package, this - * function should be called. It forces the sched domain code to - * only use the SMT level for the CPU portion of the topology. - * This essentially falls back to relying on NUMA information - * from the SRAT table to describe the entire system topology - * (except for hyperthreads). + * Set if a package/die has multiple NUMA nodes inside. + * AMD Magny-Cours and Intel Cluster-on-Die have this. */ -static void primarily_use_numa_for_topology(void) -{ - set_sched_topology(numa_inside_package_topology); -} +static bool x86_has_numa_in_package; void set_cpu_sibling_map(int cpu) { @@ -558,7 +559,7 @@ void set_cpu_sibling_map(int cpu) c->booted_cores = cpu_data(i).booted_cores; } if (match_die(c, o) && !topology_same_node(c, o)) - primarily_use_numa_for_topology(); + x86_has_numa_in_package = true; } threads = cpumask_weight(topology_sibling_cpumask(cpu)); @@ -1304,6 +1305,16 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) zalloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL); zalloc_cpumask_var(&per_cpu(cpu_llc_shared_map, i), GFP_KERNEL); } + + /* + * Set 'default' x86 topology, this matches default_topology() in that + * it has NUMA nodes as a topology level. See also + * native_smp_cpus_done(). + * + * Must be done before set_cpus_sibling_map() is ran. + */ + set_sched_topology(x86_topology); + set_cpu_sibling_map(0); switch (smp_sanity_check(max_cpus)) { @@ -1370,6 +1381,9 @@ void __init native_smp_cpus_done(unsigned int max_cpus) { pr_debug("Boot done\n"); + if (x86_has_numa_in_package) + set_sched_topology(x86_numa_in_package_topology); + nmi_selftest(); impress_friends(); setup_ioapic_dest(); diff --git a/include/linux/kernel.h b/include/linux/kernel.h index d96a6118d26a..74fd6f05bc5b 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -259,17 +259,14 @@ static inline void might_fault(void) { } extern struct atomic_notifier_head panic_notifier_list; extern long (*panic_blink)(int state); __printf(1, 2) -void panic(const char *fmt, ...) - __noreturn __cold; +void panic(const char *fmt, ...) __noreturn __cold; void nmi_panic(struct pt_regs *regs, const char *msg); extern void oops_enter(void); extern void oops_exit(void); void print_oops_end_marker(void); extern int oops_may_print(void); -void do_exit(long error_code) - __noreturn; -void complete_and_exit(struct completion *, long) - __noreturn; +void do_exit(long error_code) __noreturn; +void complete_and_exit(struct completion *, long) __noreturn; /* Internal, do not use. */ int __must_check _kstrtoul(const char *s, unsigned int base, unsigned long *res); diff --git a/include/linux/sched.h b/include/linux/sched.h index 98fe95fea30c..f76d75fc9eaf 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -448,6 +448,8 @@ static inline void io_schedule(void) io_schedule_timeout(MAX_SCHEDULE_TIMEOUT); } +void __noreturn do_task_dead(void); + struct nsproxy; struct user_namespace; @@ -1022,7 +1024,8 @@ extern void wake_up_q(struct wake_q_head *head); #define SD_BALANCE_FORK 0x0008 /* Balance on fork, clone */ #define SD_BALANCE_WAKE 0x0010 /* Balance on wakeup */ #define SD_WAKE_AFFINE 0x0020 /* Wake task to waking CPU */ -#define SD_SHARE_CPUCAPACITY 0x0080 /* Domain members share cpu power */ +#define SD_ASYM_CPUCAPACITY 0x0040 /* Groups have different max cpu capacities */ +#define SD_SHARE_CPUCAPACITY 0x0080 /* Domain members share cpu capacity */ #define SD_SHARE_POWERDOMAIN 0x0100 /* Domain members share power domain */ #define SD_SHARE_PKG_RESOURCES 0x0200 /* Domain members share cpu pkg resources */ #define SD_SERIALIZE 0x0400 /* Only a single load balancing instance */ @@ -1064,6 +1067,12 @@ extern int sched_domain_level_max; struct sched_group; +struct sched_domain_shared { + atomic_t ref; + atomic_t nr_busy_cpus; + int has_idle_cores; +}; + struct sched_domain { /* These fields must be setup */ struct sched_domain *parent; /* top domain must be null terminated */ @@ -1094,6 +1103,8 @@ struct sched_domain { u64 max_newidle_lb_cost; unsigned long next_decay_max_lb_cost; + u64 avg_scan_cost; /* select_idle_sibling */ + #ifdef CONFIG_SCHEDSTATS /* load_balance() stats */ unsigned int lb_count[CPU_MAX_IDLE_TYPES]; @@ -1132,6 +1143,7 @@ struct sched_domain { void *private; /* used during construction */ struct rcu_head rcu; /* used during destruction */ }; + struct sched_domain_shared *shared; unsigned int span_weight; /* @@ -1165,6 +1177,7 @@ typedef int (*sched_domain_flags_f)(void); struct sd_data { struct sched_domain **__percpu sd; + struct sched_domain_shared **__percpu sds; struct sched_group **__percpu sg; struct sched_group_capacity **__percpu sgc; }; @@ -2568,7 +2581,7 @@ static inline bool is_idle_task(const struct task_struct *p) return p->pid == 0; } extern struct task_struct *curr_task(int cpu); -extern void set_curr_task(int cpu, struct task_struct *p); +extern void ia64_set_curr_task(int cpu, struct task_struct *p); void yield(void); @@ -3206,7 +3219,11 @@ static inline int signal_pending_state(long state, struct task_struct *p) * cond_resched_lock() will drop the spinlock before scheduling, * cond_resched_softirq() will enable bhs before scheduling. */ +#ifndef CONFIG_PREEMPT extern int _cond_resched(void); +#else +static inline int _cond_resched(void) { return 0; } +#endif #define cond_resched() ({ \ ___might_sleep(__FILE__, __LINE__, 0); \ @@ -3236,6 +3253,15 @@ static inline void cond_resched_rcu(void) #endif } +static inline unsigned long get_preempt_disable_ip(struct task_struct *p) +{ +#ifdef CONFIG_DEBUG_PREEMPT + return p->preempt_disable_ip; +#else + return 0; +#endif +} + /* * Does a critical section need to be broken due to another * task waiting?: (technically does not depend on CONFIG_PREEMPT, diff --git a/include/linux/u64_stats_sync.h b/include/linux/u64_stats_sync.h index d3a2bb712af3..650f3dd6b800 100644 --- a/include/linux/u64_stats_sync.h +++ b/include/linux/u64_stats_sync.h @@ -103,31 +103,42 @@ static inline void u64_stats_update_end_raw(struct u64_stats_sync *syncp) #endif } -static inline unsigned int u64_stats_fetch_begin(const struct u64_stats_sync *syncp) +static inline unsigned int __u64_stats_fetch_begin(const struct u64_stats_sync *syncp) { #if BITS_PER_LONG==32 && defined(CONFIG_SMP) return read_seqcount_begin(&syncp->seq); #else -#if BITS_PER_LONG==32 - preempt_disable(); -#endif return 0; #endif } -static inline bool u64_stats_fetch_retry(const struct u64_stats_sync *syncp, +static inline unsigned int u64_stats_fetch_begin(const struct u64_stats_sync *syncp) +{ +#if BITS_PER_LONG==32 && !defined(CONFIG_SMP) + preempt_disable(); +#endif + return __u64_stats_fetch_begin(syncp); +} + +static inline bool __u64_stats_fetch_retry(const struct u64_stats_sync *syncp, unsigned int start) { #if BITS_PER_LONG==32 && defined(CONFIG_SMP) return read_seqcount_retry(&syncp->seq, start); #else -#if BITS_PER_LONG==32 - preempt_enable(); -#endif return false; #endif } +static inline bool u64_stats_fetch_retry(const struct u64_stats_sync *syncp, + unsigned int start) +{ +#if BITS_PER_LONG==32 && !defined(CONFIG_SMP) + preempt_enable(); +#endif + return __u64_stats_fetch_retry(syncp, start); +} + /* * In case irq handlers can update u64 counters, readers can use following helpers * - SMP 32bit arches use seqcount protection, irq safe. @@ -136,27 +147,19 @@ static inline bool u64_stats_fetch_retry(const struct u64_stats_sync *syncp, */ static inline unsigned int u64_stats_fetch_begin_irq(const struct u64_stats_sync *syncp) { -#if BITS_PER_LONG==32 && defined(CONFIG_SMP) - return read_seqcount_begin(&syncp->seq); -#else -#if BITS_PER_LONG==32 +#if BITS_PER_LONG==32 && !defined(CONFIG_SMP) local_irq_disable(); #endif - return 0; -#endif + return __u64_stats_fetch_begin(syncp); } static inline bool u64_stats_fetch_retry_irq(const struct u64_stats_sync *syncp, - unsigned int start) + unsigned int start) { -#if BITS_PER_LONG==32 && defined(CONFIG_SMP) - return read_seqcount_retry(&syncp->seq, start); -#else -#if BITS_PER_LONG==32 +#if BITS_PER_LONG==32 && !defined(CONFIG_SMP) local_irq_enable(); #endif - return false; -#endif + return __u64_stats_fetch_retry(syncp, start); } #endif /* _LINUX_U64_STATS_SYNC_H */ diff --git a/include/linux/wait.h b/include/linux/wait.h index c3ff74d764fa..2408e8d5c05c 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -248,6 +248,8 @@ wait_queue_head_t *bit_waitqueue(void *, int); (!__builtin_constant_p(state) || \ state == TASK_INTERRUPTIBLE || state == TASK_KILLABLE) \ +extern void init_wait_entry(wait_queue_t *__wait, int flags); + /* * The below macro ___wait_event() has an explicit shadow of the __ret * variable when used from the wait_event_*() macros. @@ -266,12 +268,7 @@ wait_queue_head_t *bit_waitqueue(void *, int); wait_queue_t __wait; \ long __ret = ret; /* explicit shadow */ \ \ - INIT_LIST_HEAD(&__wait.task_list); \ - if (exclusive) \ - __wait.flags = WQ_FLAG_EXCLUSIVE; \ - else \ - __wait.flags = 0; \ - \ + init_wait_entry(&__wait, exclusive ? WQ_FLAG_EXCLUSIVE : 0); \ for (;;) { \ long __int = prepare_to_wait_event(&wq, &__wait, state);\ \ @@ -280,12 +277,7 @@ wait_queue_head_t *bit_waitqueue(void *, int); \ if (___wait_is_interruptible(state) && __int) { \ __ret = __int; \ - if (exclusive) { \ - abort_exclusive_wait(&wq, &__wait, \ - state, NULL); \ - goto __out; \ - } \ - break; \ + goto __out; \ } \ \ cmd; \ @@ -989,7 +981,6 @@ void prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state); void prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state); long prepare_to_wait_event(wait_queue_head_t *q, wait_queue_t *wait, int state); void finish_wait(wait_queue_head_t *q, wait_queue_t *wait); -void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait, unsigned int mode, void *key); long wait_woken(wait_queue_t *wait, unsigned mode, long timeout); int woken_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key); int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key); diff --git a/kernel/exit.c b/kernel/exit.c index 091a78be3b09..1e1d913914c0 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -725,7 +725,7 @@ static void check_stack_usage(void) static inline void check_stack_usage(void) {} #endif -void do_exit(long code) +void __noreturn do_exit(long code) { struct task_struct *tsk = current; int group_dead; @@ -882,29 +882,7 @@ void do_exit(long code) exit_rcu(); TASKS_RCU(__srcu_read_unlock(&tasks_rcu_exit_srcu, tasks_rcu_i)); - /* - * The setting of TASK_RUNNING by try_to_wake_up() may be delayed - * when the following two conditions become true. - * - There is race condition of mmap_sem (It is acquired by - * exit_mm()), and - * - SMI occurs before setting TASK_RUNINNG. - * (or hypervisor of virtual machine switches to other guest) - * As a result, we may become TASK_RUNNING after becoming TASK_DEAD - * - * To avoid it, we have to wait for releasing tsk->pi_lock which - * is held by try_to_wake_up() - */ - smp_mb(); - raw_spin_unlock_wait(&tsk->pi_lock); - - /* causes final put_task_struct in finish_task_switch(). */ - tsk->state = TASK_DEAD; - tsk->flags |= PF_NOFREEZE; /* tell freezer to ignore us */ - schedule(); - BUG(); - /* Avoid "noreturn function does return". */ - for (;;) - cpu_relax(); /* For when BUG is null */ + do_task_dead(); } EXPORT_SYMBOL_GPL(do_exit); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 0a6a13c21c5a..fac6492f0b98 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1070,8 +1070,12 @@ static int migration_cpu_stop(void *data) * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because * we're holding p->pi_lock. */ - if (task_rq(p) == rq && task_on_rq_queued(p)) - rq = __migrate_task(rq, p, arg->dest_cpu); + if (task_rq(p) == rq) { + if (task_on_rq_queued(p)) + rq = __migrate_task(rq, p, arg->dest_cpu); + else + p->wake_cpu = arg->dest_cpu; + } raw_spin_unlock(&rq->lock); raw_spin_unlock(&p->pi_lock); @@ -1112,10 +1116,10 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) p->sched_class->set_cpus_allowed(p, new_mask); - if (running) - p->sched_class->set_curr_task(rq); if (queued) enqueue_task(rq, p, ENQUEUE_RESTORE); + if (running) + set_curr_task(rq, p); } /* @@ -1272,7 +1276,7 @@ static void __migrate_swap_task(struct task_struct *p, int cpu) /* * Task isn't running anymore; make it appear like we migrated * it before it went to sleep. This means on wakeup we make the - * previous cpu our targer instead of where it really is. + * previous cpu our target instead of where it really is. */ p->wake_cpu = cpu; } @@ -1636,23 +1640,25 @@ static inline int __set_cpus_allowed_ptr(struct task_struct *p, static void ttwu_stat(struct task_struct *p, int cpu, int wake_flags) { -#ifdef CONFIG_SCHEDSTATS - struct rq *rq = this_rq(); + struct rq *rq; -#ifdef CONFIG_SMP - int this_cpu = smp_processor_id(); + if (!schedstat_enabled()) + return; + + rq = this_rq(); - if (cpu == this_cpu) { - schedstat_inc(rq, ttwu_local); - schedstat_inc(p, se.statistics.nr_wakeups_local); +#ifdef CONFIG_SMP + if (cpu == rq->cpu) { + schedstat_inc(rq->ttwu_local); + schedstat_inc(p->se.statistics.nr_wakeups_local); } else { struct sched_domain *sd; - schedstat_inc(p, se.statistics.nr_wakeups_remote); + schedstat_inc(p->se.statistics.nr_wakeups_remote); rcu_read_lock(); - for_each_domain(this_cpu, sd) { + for_each_domain(rq->cpu, sd) { if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { - schedstat_inc(sd, ttwu_wake_remote); + schedstat_inc(sd->ttwu_wake_remote); break; } } @@ -1660,17 +1666,14 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags) } if (wake_flags & WF_MIGRATED) - schedstat_inc(p, se.statistics.nr_wakeups_migrate); - + schedstat_inc(p->se.statistics.nr_wakeups_migrate); #endif /* CONFIG_SMP */ - schedstat_inc(rq, ttwu_count); - schedstat_inc(p, se.statistics.nr_wakeups); + schedstat_inc(rq->ttwu_count); + schedstat_inc(p->se.statistics.nr_wakeups); if (wake_flags & WF_SYNC) - schedstat_inc(p, se.statistics.nr_wakeups_sync); - -#endif /* CONFIG_SCHEDSTATS */ + schedstat_inc(p->se.statistics.nr_wakeups_sync); } static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags) @@ -2091,8 +2094,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) ttwu_queue(p, cpu, wake_flags); stat: - if (schedstat_enabled()) - ttwu_stat(p, cpu, wake_flags); + ttwu_stat(p, cpu, wake_flags); out: raw_spin_unlock_irqrestore(&p->pi_lock, flags); @@ -2102,6 +2104,7 @@ out: /** * try_to_wake_up_local - try to wake up a local task with rq lock held * @p: the thread to be awakened + * @cookie: context's cookie for pinning * * Put @p on the run-queue if it's not already there. The caller must * ensure that this_rq() is locked, @p is bound to this_rq() and not @@ -2140,8 +2143,7 @@ static void try_to_wake_up_local(struct task_struct *p, struct pin_cookie cookie ttwu_activate(rq, p, ENQUEUE_WAKEUP); ttwu_do_wakeup(rq, p, 0, cookie); - if (schedstat_enabled()) - ttwu_stat(p, smp_processor_id(), 0); + ttwu_stat(p, smp_processor_id(), 0); out: raw_spin_unlock(&p->pi_lock); } @@ -3199,6 +3201,9 @@ static inline void preempt_latency_stop(int val) { } */ static noinline void __schedule_bug(struct task_struct *prev) { + /* Save this before calling printk(), since that will clobber it */ + unsigned long preempt_disable_ip = get_preempt_disable_ip(current); + if (oops_in_progress) return; @@ -3209,13 +3214,12 @@ static noinline void __schedule_bug(struct task_struct *prev) print_modules(); if (irqs_disabled()) print_irqtrace_events(prev); -#ifdef CONFIG_DEBUG_PREEMPT - if (in_atomic_preempt_off()) { + if (IS_ENABLED(CONFIG_DEBUG_PREEMPT) + && in_atomic_preempt_off()) { pr_err("Preemption disabled at:"); - print_ip_sym(current->preempt_disable_ip); + print_ip_sym(preempt_disable_ip); pr_cont("\n"); } -#endif if (panic_on_warn) panic("scheduling while atomic\n"); @@ -3241,7 +3245,7 @@ static inline void schedule_debug(struct task_struct *prev) profile_hit(SCHED_PROFILING, __builtin_return_address(0)); - schedstat_inc(this_rq(), sched_count); + schedstat_inc(this_rq()->sched_count); } /* @@ -3334,17 +3338,6 @@ static void __sched notrace __schedule(bool preempt) rq = cpu_rq(cpu); prev = rq->curr; - /* - * do_exit() calls schedule() with preemption disabled as an exception; - * however we must fix that up, otherwise the next task will see an - * inconsistent (higher) preempt count. - * - * It also avoids the below schedule_debug() test from complaining - * about this. - */ - if (unlikely(prev->state == TASK_DEAD)) - preempt_enable_no_resched_notrace(); - schedule_debug(prev); if (sched_feat(HRTICK)) @@ -3412,6 +3405,33 @@ static void __sched notrace __schedule(bool preempt) } STACK_FRAME_NON_STANDARD(__schedule); /* switch_to() */ +void __noreturn do_task_dead(void) +{ + /* + * The setting of TASK_RUNNING by try_to_wake_up() may be delayed + * when the following two conditions become true. + * - There is race condition of mmap_sem (It is acquired by + * exit_mm()), and + * - SMI occurs before setting TASK_RUNINNG. + * (or hypervisor of virtual machine switches to other guest) + * As a result, we may become TASK_RUNNING after becoming TASK_DEAD + * + * To avoid it, we have to wait for releasing tsk->pi_lock which + * is held by try_to_wake_up() + */ + smp_mb(); + raw_spin_unlock_wait(¤t->pi_lock); + + /* causes final put_task_struct in finish_task_switch(). */ + __set_current_state(TASK_DEAD); + current->flags |= PF_NOFREEZE; /* tell freezer to ignore us */ + __schedule(false); + BUG(); + /* Avoid "noreturn function does return". */ + for (;;) + cpu_relax(); /* For when BUG is null */ +} + static inline void sched_submit_work(struct task_struct *tsk) { if (!tsk->state || tsk_is_pi_blocked(tsk)) @@ -3694,10 +3714,10 @@ void rt_mutex_setprio(struct task_struct *p, int prio) p->prio = prio; - if (running) - p->sched_class->set_curr_task(rq); if (queued) enqueue_task(rq, p, queue_flag); + if (running) + set_curr_task(rq, p); check_class_changed(rq, p, prev_class, oldprio); out_unlock: @@ -3711,7 +3731,8 @@ out_unlock: void set_user_nice(struct task_struct *p, long nice) { - int old_prio, delta, queued; + bool queued, running; + int old_prio, delta; struct rq_flags rf; struct rq *rq; @@ -3733,8 +3754,11 @@ void set_user_nice(struct task_struct *p, long nice) goto out_unlock; } queued = task_on_rq_queued(p); + running = task_current(rq, p); if (queued) dequeue_task(rq, p, DEQUEUE_SAVE); + if (running) + put_prev_task(rq, p); p->static_prio = NICE_TO_PRIO(nice); set_load_weight(p); @@ -3751,6 +3775,8 @@ void set_user_nice(struct task_struct *p, long nice) if (delta < 0 || (delta > 0 && task_running(rq, p))) resched_curr(rq); } + if (running) + set_curr_task(rq, p); out_unlock: task_rq_unlock(rq, p, &rf); } @@ -4250,8 +4276,6 @@ change: prev_class = p->sched_class; __setscheduler(rq, p, attr, pi); - if (running) - p->sched_class->set_curr_task(rq); if (queued) { /* * We enqueue to tail when the priority of a task is @@ -4262,6 +4286,8 @@ change: enqueue_task(rq, p, queue_flags); } + if (running) + set_curr_task(rq, p); check_class_changed(rq, p, prev_class, oldprio); preempt_disable(); /* avoid rq from going away on us */ @@ -4853,7 +4879,7 @@ SYSCALL_DEFINE0(sched_yield) { struct rq *rq = this_rq_lock(); - schedstat_inc(rq, yld_count); + schedstat_inc(rq->yld_count); current->sched_class->yield_task(rq); /* @@ -4870,6 +4896,7 @@ SYSCALL_DEFINE0(sched_yield) return 0; } +#ifndef CONFIG_PREEMPT int __sched _cond_resched(void) { if (should_resched(0)) { @@ -4879,6 +4906,7 @@ int __sched _cond_resched(void) return 0; } EXPORT_SYMBOL(_cond_resched); +#endif /* * __cond_resched_lock() - if a reschedule is pending, drop the given lock, @@ -5004,7 +5032,7 @@ again: yielded = curr->sched_class->yield_to_task(rq, p, preempt); if (yielded) { - schedstat_inc(rq, yld_count); + schedstat_inc(rq->yld_count); /* * Make p's CPU reschedule; pick_next_entity takes care of * fairness. @@ -5424,10 +5452,10 @@ void sched_setnuma(struct task_struct *p, int nid) p->numa_preferred_nid = nid; - if (running) - p->sched_class->set_curr_task(rq); if (queued) enqueue_task(rq, p, ENQUEUE_RESTORE); + if (running) + set_curr_task(rq, p); task_rq_unlock(rq, p, &rf); } #endif /* CONFIG_NUMA_BALANCING */ @@ -5724,6 +5752,8 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) } } #else /* !CONFIG_SCHED_DEBUG */ + +# define sched_debug_enabled 0 # define sched_domain_debug(sd, cpu) do { } while (0) static inline bool sched_debug(void) { @@ -5742,6 +5772,7 @@ static int sd_degenerate(struct sched_domain *sd) SD_BALANCE_FORK | SD_BALANCE_EXEC | SD_SHARE_CPUCAPACITY | + SD_ASYM_CPUCAPACITY | SD_SHARE_PKG_RESOURCES | SD_SHARE_POWERDOMAIN)) { if (sd->groups != sd->groups->next) @@ -5772,6 +5803,7 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) SD_BALANCE_NEWIDLE | SD_BALANCE_FORK | SD_BALANCE_EXEC | + SD_ASYM_CPUCAPACITY | SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES | SD_PREFER_SIBLING | @@ -5916,10 +5948,8 @@ static void free_sched_groups(struct sched_group *sg, int free_sgc) } while (sg != first); } -static void free_sched_domain(struct rcu_head *rcu) +static void destroy_sched_domain(struct sched_domain *sd) { - struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu); - /* * If its an overlapping domain it has private groups, iterate and * nuke them all. @@ -5930,18 +5960,26 @@ static void free_sched_domain(struct rcu_head *rcu) kfree(sd->groups->sgc); kfree(sd->groups); } + if (sd->shared && atomic_dec_and_test(&sd->shared->ref)) + kfree(sd->shared); kfree(sd); } -static void destroy_sched_domain(struct sched_domain *sd, int cpu) +static void destroy_sched_domains_rcu(struct rcu_head *rcu) { - call_rcu(&sd->rcu, free_sched_domain); + struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu); + + while (sd) { + struct sched_domain *parent = sd->parent; + destroy_sched_domain(sd); + sd = parent; + } } -static void destroy_sched_domains(struct sched_domain *sd, int cpu) +static void destroy_sched_domains(struct sched_domain *sd) { - for (; sd; sd = sd->parent) - destroy_sched_domain(sd, cpu); + if (sd) + call_rcu(&sd->rcu, destroy_sched_domains_rcu); } /* @@ -5956,14 +5994,14 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu) DEFINE_PER_CPU(struct sched_domain *, sd_llc); DEFINE_PER_CPU(int, sd_llc_size); DEFINE_PER_CPU(int, sd_llc_id); +DEFINE_PER_CPU(struct sched_domain_shared *, sd_llc_shared); DEFINE_PER_CPU(struct sched_domain *, sd_numa); -DEFINE_PER_CPU(struct sched_domain *, sd_busy); DEFINE_PER_CPU(struct sched_domain *, sd_asym); static void update_top_cache_domain(int cpu) { + struct sched_domain_shared *sds = NULL; struct sched_domain *sd; - struct sched_domain *busy_sd = NULL; int id = cpu; int size = 1; @@ -5971,13 +6009,13 @@ static void update_top_cache_domain(int cpu) if (sd) { id = cpumask_first(sched_domain_span(sd)); size = cpumask_weight(sched_domain_span(sd)); - busy_sd = sd->parent; /* sd_busy */ + sds = sd->shared; } - rcu_assign_pointer(per_cpu(sd_busy, cpu), busy_sd); rcu_assign_pointer(per_cpu(sd_llc, cpu), sd); per_cpu(sd_llc_size, cpu) = size; per_cpu(sd_llc_id, cpu) = id; + rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds); sd = lowest_flag_domain(cpu, SD_NUMA); |
