-rw-r--r--  fs/proc/array.c                |   4
-rw-r--r--  include/linux/cpuhotplug.h     |   1
-rw-r--r--  include/linux/cpumask.h        |   6
-rw-r--r--  include/linux/preempt.h        |  69
-rw-r--r--  include/linux/sched.h          |   5
-rw-r--r--  include/linux/sched/hotplug.h  |   2
-rw-r--r--  include/linux/stop_machine.h   |   5
-rw-r--r--  kernel/cpu.c                   |   9
-rw-r--r--  kernel/sched/core.c            | 920
-rw-r--r--  kernel/sched/cpudeadline.c     |   4
-rw-r--r--  kernel/sched/cpupri.c          |   4
-rw-r--r--  kernel/sched/deadline.c        |  46
-rw-r--r--  kernel/sched/rt.c              |  75
-rw-r--r--  kernel/sched/sched.h           |  59
-rw-r--r--  kernel/stop_machine.c          |  27
-rw-r--r--  kernel/workqueue.c             |   4
-rw-r--r--  lib/cpumask.c                  |  18
-rw-r--r--  lib/dump_stack.c               |   2
-rw-r--r--  lib/smp_processor_id.c         |   5
19 files changed, 1038 insertions, 227 deletions
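
Before the diff itself, a minimal usage sketch of the migrate_disable()/migrate_enable() pair this series wires up for CONFIG_SMP && CONFIG_PREEMPT_RT (see the include/linux/preempt.h and kernel/sched/core.c hunks below). This is illustration only, not part of the patch; the function name example_poke_this_cpu() is made up. The point is simply that the CPU number stays stable across the region while the task remains preemptible, so per-CPU data shared with other tasks still needs its own per-CPU locking.

    /*
     * Illustration only -- not part of the patch. Assumes the PREEMPT_RT
     * migrate_disable()/migrate_enable() implementation added below.
     */
    #include <linux/preempt.h>
    #include <linux/smp.h>
    #include <linux/printk.h>

    static void example_poke_this_cpu(void)
    {
    	int cpu;

    	migrate_disable();		/* pin the task to its current CPU */
    	cpu = smp_processor_id();	/* stable until migrate_enable() */

    	/*
    	 * The task can still be preempted here (that is the point on
    	 * PREEMPT_RT), it just cannot migrate; per-CPU data shared with
    	 * other tasks on this CPU still needs a per-CPU lock.
    	 */
    	pr_debug("pinned on CPU%d\n", cpu);

    	migrate_enable();		/* migration allowed again */
    }

Nesting is allowed; only the outermost migrate_enable() lifts the restriction, which is what the migration_disabled counter in the kernel/sched/core.c hunk below implements.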
diff --git a/fs/proc/array.c b/fs/proc/array.c index 65ec2029fa80..7052441be967 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -382,9 +382,9 @@ static inline void task_context_switch_counts(struct seq_file *m, static void task_cpus_allowed(struct seq_file *m, struct task_struct *task) { seq_printf(m, "Cpus_allowed:\t%*pb\n", - cpumask_pr_args(task->cpus_ptr)); + cpumask_pr_args(&task->cpus_mask)); seq_printf(m, "Cpus_allowed_list:\t%*pbl\n", - cpumask_pr_args(task->cpus_ptr)); + cpumask_pr_args(&task->cpus_mask)); } static inline void task_core_dumping(struct seq_file *m, struct mm_struct *mm) diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index bc56287a1ed1..0042ef362511 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -152,6 +152,7 @@ enum cpuhp_state { CPUHP_AP_ONLINE, CPUHP_TEARDOWN_CPU, CPUHP_AP_ONLINE_IDLE, + CPUHP_AP_SCHED_WAIT_EMPTY, CPUHP_AP_SMPBOOT_THREADS, CPUHP_AP_X86_VDSO_VMA_ONLINE, CPUHP_AP_IRQ_AFFINITY_ONLINE, diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index f0d895d6ac39..383684e30f12 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -199,6 +199,11 @@ static inline int cpumask_any_and_distribute(const struct cpumask *src1p, return cpumask_next_and(-1, src1p, src2p); } +static inline int cpumask_any_distribute(const struct cpumask *srcp) +{ + return cpumask_first(srcp); +} + #define for_each_cpu(cpu, mask) \ for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask) #define for_each_cpu_not(cpu, mask) \ @@ -252,6 +257,7 @@ int cpumask_any_but(const struct cpumask *mask, unsigned int cpu); unsigned int cpumask_local_spread(unsigned int i, int node); int cpumask_any_and_distribute(const struct cpumask *src1p, const struct cpumask *src2p); +int cpumask_any_distribute(const struct cpumask *srcp); /** * for_each_cpu - iterate over every cpu in a mask diff --git a/include/linux/preempt.h b/include/linux/preempt.h index 7d9c1c0e149c..8b43922e65df 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -322,6 +322,73 @@ static inline void preempt_notifier_init(struct preempt_notifier *notifier, #endif +#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT) + +/* + * Migrate-Disable and why it is undesired. + * + * When a preempted task becomes elegible to run under the ideal model (IOW it + * becomes one of the M highest priority tasks), it might still have to wait + * for the preemptee's migrate_disable() section to complete. Thereby suffering + * a reduction in bandwidth in the exact duration of the migrate_disable() + * section. + * + * Per this argument, the change from preempt_disable() to migrate_disable() + * gets us: + * + * - a higher priority tasks gains reduced wake-up latency; with preempt_disable() + * it would have had to wait for the lower priority task. + * + * - a lower priority tasks; which under preempt_disable() could've instantly + * migrated away when another CPU becomes available, is now constrained + * by the ability to push the higher priority task away, which might itself be + * in a migrate_disable() section, reducing it's available bandwidth. + * + * IOW it trades latency / moves the interference term, but it stays in the + * system, and as long as it remains unbounded, the system is not fully + * deterministic. + * + * + * The reason we have it anyway. + * + * PREEMPT_RT breaks a number of assumptions traditionally held. By forcing a + * number of primitives into becoming preemptible, they would also allow + * migration. 
This turns out to break a bunch of per-cpu usage. To this end, + * all these primitives employ migirate_disable() to restore this implicit + * assumption. + * + * This is a 'temporary' work-around at best. The correct solution is getting + * rid of the above assumptions and reworking the code to employ explicit + * per-cpu locking or short preempt-disable regions. + * + * The end goal must be to get rid of migrate_disable(), alternatively we need + * a schedulability theory that does not depend on abritrary migration. + * + * + * Notes on the implementation. + * + * The implementation is particularly tricky since existing code patterns + * dictate neither migrate_disable() nor migrate_enable() is allowed to block. + * This means that it cannot use cpus_read_lock() to serialize against hotplug, + * nor can it easily migrate itself into a pending affinity mask change on + * migrate_enable(). + * + * + * Note: even non-work-conserving schedulers like semi-partitioned depends on + * migration, so migrate_disable() is not only a problem for + * work-conserving schedulers. + * + */ +extern void migrate_disable(void); +extern void migrate_enable(void); + +#elif defined(CONFIG_PREEMPT_RT) + +static inline void migrate_disable(void) { } +static inline void migrate_enable(void) { } + +#else /* !CONFIG_PREEMPT_RT */ + /** * migrate_disable - Prevent migration of the current task * @@ -352,4 +419,6 @@ static __always_inline void migrate_enable(void) preempt_enable(); } +#endif /* CONFIG_SMP && CONFIG_PREEMPT_RT */ + #endif /* __LINUX_PREEMPT_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 063cd120b459..3af9d52fe093 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -714,6 +714,11 @@ struct task_struct { int nr_cpus_allowed; const cpumask_t *cpus_ptr; cpumask_t cpus_mask; + void *migration_pending; +#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT) + unsigned short migration_disabled; +#endif + unsigned short migration_flags; #ifdef CONFIG_PREEMPT_RCU int rcu_read_lock_nesting; diff --git a/include/linux/sched/hotplug.h b/include/linux/sched/hotplug.h index 9a62ffdd296f..412cdaba33eb 100644 --- a/include/linux/sched/hotplug.h +++ b/include/linux/sched/hotplug.h @@ -11,8 +11,10 @@ extern int sched_cpu_activate(unsigned int cpu); extern int sched_cpu_deactivate(unsigned int cpu); #ifdef CONFIG_HOTPLUG_CPU +extern int sched_cpu_wait_empty(unsigned int cpu); extern int sched_cpu_dying(unsigned int cpu); #else +# define sched_cpu_wait_empty NULL # define sched_cpu_dying NULL #endif diff --git a/include/linux/stop_machine.h b/include/linux/stop_machine.h index 76d8b09384a7..30577c3aecf8 100644 --- a/include/linux/stop_machine.h +++ b/include/linux/stop_machine.h @@ -24,6 +24,7 @@ typedef int (*cpu_stop_fn_t)(void *arg); struct cpu_stop_work { struct list_head list; /* cpu_stopper->works */ cpu_stop_fn_t fn; + unsigned long caller; void *arg; struct cpu_stop_done *done; }; @@ -36,6 +37,8 @@ void stop_machine_park(int cpu); void stop_machine_unpark(int cpu); void stop_machine_yield(const struct cpumask *cpumask); +extern void print_stop_info(const char *log_lvl, struct task_struct *task); + #else /* CONFIG_SMP */ #include <linux/workqueue.h> @@ -80,6 +83,8 @@ static inline bool stop_one_cpu_nowait(unsigned int cpu, return false; } +static inline void print_stop_info(const char *log_lvl, struct task_struct *task) { } + #endif /* CONFIG_SMP */ /* diff --git a/kernel/cpu.c b/kernel/cpu.c index 6ff2578ecf17..fa535eaa4826 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ 
-1602,7 +1602,7 @@ static struct cpuhp_step cpuhp_hp_states[] = { .name = "ap:online", }, /* - * Handled on controll processor until the plugged processor manages + * Handled on control processor until the plugged processor manages * this itself. */ [CPUHP_TEARDOWN_CPU] = { @@ -1611,6 +1611,13 @@ static struct cpuhp_step cpuhp_hp_states[] = { .teardown.single = takedown_cpu, .cant_stop = true, }, + + [CPUHP_AP_SCHED_WAIT_EMPTY] = { + .name = "sched:waitempty", + .startup.single = NULL, + .teardown.single = sched_cpu_wait_empty, + }, + /* Handle smpboot threads park/unpark */ [CPUHP_AP_SMPBOOT_THREADS] = { .name = "smpboot/threads:online", diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 6f533bb7d3b9..622f343413a6 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1696,6 +1696,80 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) #ifdef CONFIG_SMP +#ifdef CONFIG_PREEMPT_RT + +static void +__do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask, u32 flags); + +static int __set_cpus_allowed_ptr(struct task_struct *p, + const struct cpumask *new_mask, + u32 flags); + +static void migrate_disable_switch(struct rq *rq, struct task_struct *p) +{ + if (likely(!p->migration_disabled)) + return; + + if (p->cpus_ptr != &p->cpus_mask) + return; + + /* + * Violates locking rules! see comment in __do_set_cpus_allowed(). + */ + __do_set_cpus_allowed(p, cpumask_of(rq->cpu), SCA_MIGRATE_DISABLE); +} + +void migrate_disable(void) +{ + struct task_struct *p = current; + + if (p->migration_disabled) { + p->migration_disabled++; + return; + } + + preempt_disable(); + this_rq()->nr_pinned++; + p->migration_disabled = 1; + preempt_enable(); +} +EXPORT_SYMBOL_GPL(migrate_disable); + +void migrate_enable(void) +{ + struct task_struct *p = current; + + if (p->migration_disabled > 1) { + p->migration_disabled--; + return; + } + + /* + * Ensure stop_task runs either before or after this, and that + * __set_cpus_allowed_ptr(SCA_MIGRATE_ENABLE) doesn't schedule(). + */ + preempt_disable(); + if (p->cpus_ptr != &p->cpus_mask) + __set_cpus_allowed_ptr(p, &p->cpus_mask, SCA_MIGRATE_ENABLE); + /* + * Mustn't clear migration_disabled() until cpus_ptr points back at the + * regular cpus_mask, otherwise things that race (eg. + * select_fallback_rq) get confused. + */ + barrier(); + p->migration_disabled = 0; + this_rq()->nr_pinned--; + preempt_enable(); +} +EXPORT_SYMBOL_GPL(migrate_enable); + +static inline bool rq_has_pinned_tasks(struct rq *rq) +{ + return rq->nr_pinned; +} + +#endif + /* * Per-CPU kthreads are allowed to run on !active && online CPUs, see * __set_cpus_allowed_ptr() and select_fallback_rq(). 
@@ -1705,7 +1779,7 @@ static inline bool is_cpu_allowed(struct task_struct *p, int cpu) if (!cpumask_test_cpu(cpu, p->cpus_ptr)) return false; - if (is_per_cpu_kthread(p)) + if (is_per_cpu_kthread(p) || is_migration_disabled(p)) return cpu_online(cpu); return cpu_active(cpu); @@ -1750,8 +1824,16 @@ static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf, } struct migration_arg { - struct task_struct *task; - int dest_cpu; + struct task_struct *task; + int dest_cpu; + struct set_affinity_pending *pending; +}; + +struct set_affinity_pending { + refcount_t refs; + struct completion done; + struct cpu_stop_work stop_work; + struct migration_arg arg; }; /* @@ -1783,16 +1865,19 @@ static struct rq *__migrate_task(struct rq *rq, struct rq_flags *rf, */ static int migration_cpu_stop(void *data) { + struct set_affinity_pending *pending; struct migration_arg *arg = data; struct task_struct *p = arg->task; + int dest_cpu = arg->dest_cpu; struct rq *rq = this_rq(); + bool complete = false; struct rq_flags rf; /* * The original target CPU might have gone down and we might * be on another CPU but it doesn't matter. */ - local_irq_disable(); + local_irq_save(rf.flags); /* * We need to explicitly wake pending tasks before running * __migrate_task() such that we will not miss enforcing cpus_ptr @@ -1802,21 +1887,126 @@ static int migration_cpu_stop(void *data) raw_spin_lock(&p->pi_lock); rq_lock(rq, &rf); + + pending = p->migration_pending; /* * If task_rq(p) != rq, it cannot be migrated here, because we're * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because * we're holding p->pi_lock. */ if (task_rq(p) == rq) { + if (is_migration_disabled(p)) + goto out; + + if (pending) { + p->migration_pending = NULL; + complete = true; + } + + /* migrate_enable() -- we must not race against SCA */ + if (dest_cpu < 0) { + /* + * When this was migrate_enable() but we no longer + * have a @pending, a concurrent SCA 'fixed' things + * and we should be valid again. Nothing to do. + */ + if (!pending) { + WARN_ON_ONCE(!is_cpu_allowed(p, cpu_of(rq))); + goto out; + } + + dest_cpu = cpumask_any_distribute(&p->cpus_mask); + } + if (task_on_rq_queued(p)) - rq = __migrate_task(rq, &rf, p, arg->dest_cpu); + rq = __migrate_task(rq, &rf, p, dest_cpu); else - p->wake_cpu = arg->dest_cpu; + p->wake_cpu = dest_cpu; + + } else if (dest_cpu < 0) { + /* + * This happens when we get migrated between migrate_enable()'s + * preempt_enable() and scheduling the stopper task. At that + * point we're a regular task again and not current anymore. + * + * A !PREEMPT kernel has a giant hole here, which makes it far + * more likely. + */ + + /* + * When this was migrate_enable() but we no longer have an + * @pending, a concurrent SCA 'fixed' things and we should be + * valid again. Nothing to do. + */ + if (!pending) { + WARN_ON_ONCE(!is_cpu_allowed(p, cpu_of(rq))); + goto out; + } + + /* + * When migrate_enable() hits a rq mis-match we can't reliably + * determine is_migration_disabled() and so have to chase after + * it. 
+ */ + task_rq_unlock(rq, p, &rf); + stop_one_cpu_nowait(task_cpu(p), migration_cpu_stop, + &pending->arg, &pending->stop_work); + return 0; } - rq_unlock(rq, &rf); - raw_spin_unlock(&p->pi_lock); +out: + task_rq_unlock(rq, p, &rf); + + if (complete) + complete_all(&pending->done); + + /* For pending->{arg,stop_work} */ + pending = arg->pending; + if (pending && refcount_dec_and_test(&pending->refs)) + wake_up_var(&pending->refs); - local_irq_enable(); + return 0; +} + +int push_cpu_stop(void *arg) +{ + struct rq *lowest_rq = NULL, *rq = this_rq(); + struct task_struct *p = arg; + + raw_spin_lock_irq(&p->pi_lock); + raw_spin_lock(&rq->lock); + + if (task_rq(p) != rq) + goto out_unlock; + + if (is_migration_disabled(p)) { + p->migration_flags |= MDF_PUSH; + goto out_unlock; + } + + p->migration_flags &= ~MDF_PUSH; + + if (p->sched_class->find_lock_rq) + lowest_rq = p->sched_class->find_lock_rq(p, rq); + + if (!lowest_rq) + goto out_unlock; + + // XXX validate p is still the highest prio task + if (task_rq(p) == rq) { + deactivate_task(rq, p, 0); + set_task_cpu(p, lowest_rq->cpu); + activate_task(lowest_rq, p, 0); + resched_curr(lowest_rq); + } + + double_unlock_balance(rq, lowest_rq); + +out_unlock: + rq->push_busy = false; + raw_spin_unlock(&rq->lock); + raw_spin_unlock_irq(&p->pi_lock); + + put_task_struct(p); return 0; } @@ -1824,18 +2014,39 @@ static int migration_cpu_stop(void *data) * sched_class::set_cpus_allowed must do the below, but is not required to * actually call this function. */ -void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask) +void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask, u32 flags) { + if (flags & (SCA_MIGRATE_ENABLE | SCA_MIGRATE_DISABLE)) { + p->cpus_ptr = new_mask; + return; + } + cpumask_copy(&p->cpus_mask, new_mask); p->nr_cpus_allowed = cpumask_weight(new_mask); } -void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) +static void +__do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask, u32 flags) { struct rq *rq = task_rq(p); bool queued, running; - lockdep_assert_held(&p->pi_lock); + /* + * This here violates the locking rules for affinity, since we're only + * supposed to change these variables while holding both rq->lock and + * p->pi_lock. + * + * HOWEVER, it magically works, because ttwu() is the only code that + * accesses these variables under p->pi_lock and only does so after + * smp_cond_load_acquire(&p->on_cpu, !VAL), and we're in __schedule() + * before finish_task(). + * + * XXX do further audits, this smells like something putrid. + */ + if (flags & SCA_MIGRATE_DISABLE) + SCHED_WARN_ON(!p->on_cpu); + else + lockdep_assert_held(&p->pi_lock); queued = task_on_rq_queued(p); running = task_current(rq, p); @@ -1851,7 +2062,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) if (running) put_prev_task(rq, p); - p->sched_class->set_cpus_allowed(p, new_mask); + p->sched_class->set_cpus_allowed(p, new_mask, flags); if (queued) enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK); @@ -1859,6 +2070,208 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) set_next_task(rq, p); } +void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) +{ + __do_set_cpus_allowed(p, new_mask, 0); +} + +/* + * This function is wildly self concurrent; here be dragons. 
+ * + * + * When given a valid mask, __set_cpus_allowed_ptr() must block until the + * designated task is enqueued on an allowed CPU. If that task is currently + * running, we have to kick it out using the CPU stopper. + * + * Migrate-Disable comes along and tramples all over our nice sandcastle. + * Consider: + * + * Initial conditions: P0->cpus_mask = [0, 1] + * + * P0@CPU0 P1 + * + * migrate_disable(); + * <preempted> + * set_cpus_allowed_ptr(P0, [1]); + * + * P1 *cannot* return from this set_cpus_allowed_ptr() call until P0 executes + * its outermost migrate_enable() (i.e. it exits its Migrate-Disable region). + * This means we need the following scheme: + * + * P0@CPU0 P1 + * + * migrate_disable(); + * <preempted> + * set_cpus_allowed_ptr(P0, [1]); + * <blocks> + * <resumes> + * migrate_enable(); + * __set_cpus_allowed_ptr(); + * <wakes local stopper> + * `--> <woken on migration completion> + * + * Now the fun stuff: there may be several P1-like tasks, i.e. multiple + * concurrent set_cpus_allowed_ptr(P0, [*]) calls. CPU affinity changes of any + * task p are serialized by p->pi_lock, which we can leverage: the one that + * should come into effect at the end of the Migrate-Disable region is the last + * one. This means we only need to track a single cpumask (i.e. p->cpus_mask), + * but we still need to properly signal those waiting tasks at the appropriate + * moment. + * + * This is implemented using struct set_affinity_pending. The first + * __set_cpus_allowed_ptr() caller within a given Migrate-Disable region will + * setup an instance of that struct and install it on the targeted task_struct. + * Any and all further callers will reuse that instance. Those then wait for + * a completion signaled at the tail of the CPU stopper callback (1), triggered + * on the end of the Migrate-Disable region (i.e. outermost migrate_enable()). + * + * + * (1) In the cases covered above. There is one more where the completion is + * signaled within affine_move_task() itself: when a subsequent affinity request + * cancels the need for an active migration. Consider: + * + * Initial conditions: P0->cpus_mask = [0, 1] + * + * P0@CPU0 P1 P2 + * + * migrate_disable(); + * <preempted> + * set_cpus_allowed_ptr(P0, [1]); + * <blocks> + * set_cpus_allowed_ptr(P0, [0, 1]); + * <signal completion> + * <awakes> + * + * Note that the above is safe vs a concurrent migrate_enable(), as any + * pending affinity completion is preceded by an uninstallation of + * p->migration_pending done with p->pi_lock held. + */ +static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flags *rf, + int dest_cpu, unsigned int flags) +{ + struct set_affinity_pending my_pending = { }, *pending = NULL; + struct migration_arg arg = { + .task = p, + .dest_cpu = dest_cpu, + }; + bool complete = false; + + /* Can the task run on the task's current CPU? 
If so, we're done */ + if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask)) { + struct task_struct *push_task = NULL; + + if ((flags & SCA_MIGRATE_ENABLE) && + (p->migration_flags & MDF_PUSH) && !rq->push_busy) { + rq->push_busy = true; + push_task = get_task_struct(p); + } + + pending = p->migration_pending; + if (pending) { + refcount_inc(&pending->refs); + p->migration_pending = NULL; + complete = true; + } + task_rq_unlock(rq, p, rf); + + if (push_task) { + stop_one_cpu_nowait(rq->cpu, push_cpu_stop, + p, &rq->push_work); + } + + if (complete) + goto do_complete; + + return 0; + } + + if (!(flags & SCA_MIGRATE_ENABLE)) { + /* serialized by p->pi_lock */ + if (!p->migration_pending) { + /* Install the request */ + refcount_set(&my_pending.refs, 1); + init_completion(&my_pending.done); + p->migration_pending = &my_pending; + } else { + pending = p->migration_pending; + refcount_inc(&pending->refs); + } + } + pending = p->migration_pending; + /* + * - !MIGRATE_ENABLE: + * we'll have installed a pending if there wasn't one already. + * + * - MIGRATE_ENABLE: + * we're here because the current CPU isn't matching anymore, + * the only way that can happen is because of a concurrent + * set_cpus_allowed_ptr() call, which should then still be + * pending completion. + * + * Either way, we really should have a @pending here. + */ + if (WARN_ON_ONCE(!pending)) { + task_rq_unlock(rq, p, rf); + return -EINVAL; + } + + if (flags & SCA_MIGRATE_ENABLE) { + + refcount_inc(&pending->refs); /* pending->{arg,stop_work} */ + p->migration_flags &= ~MDF_PUSH; + task_rq_unlock(rq, p, rf); + + pending->arg = (struct migration_arg) { + .task = p, + .dest_cpu = -1, + .pending = pending, + }; + + stop_one_cpu_nowait(cpu_of(rq), migration_cpu_stop, + &pending->arg, &pending->stop_work); + + return 0; + } + + if (task_running(rq, p) || p->state == TASK_WAKING) { + /* + * Lessen races (and headaches) by delegating + * is_migration_disabled(p) checks to the stopper, which will + * run on the same CPU as said p. + */ + task_rq_unlock(rq, p, rf); + stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); + + } else { + + if (!is_migration_disabled(p)) { + if (task_on_rq_queued(p)) + rq = move_queued_task(rq, rf, p, dest_cpu); + + p->migration_pending = NULL; + complete = true; + } + task_rq_unlock(rq, p, rf); + +do_complete: + if (complete) + complete_all(&pending->done); + } + + wait_for_completion(&pending->done); + + if (refcount_dec_and_test(&pending->refs)) + wake_up_var(&pending->refs); + + /* + * Block the original owner of &pending until all subsequent callers + * have seen the completion and decremented the refcount + */ + wait_var_event(&my_pending.refs, !refcount_read(&my_pending.refs)); + + return 0; +} + /* * Change a given task's CPU affinity. Migrate the thread to a * proper CPU and schedule it away if the CPU it's executing on @@ -1869,7 +2282,8 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) * call is not atomic; no spinlocks may be held. 
*/ static int __set_cpus_allowed_ptr(struct task_struct *p, - const struct cpumask *new_mask, bool check) + const struct cpumask *new_mask, + u32 flags) { const struct cpumask *cpu_valid_mask = cpu_active_mask; unsigned int dest_cpu; @@ -1880,9 +2294,14 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, rq = task_rq_lock(p, &rf); update_rq_clock(rq); - if (p->flags & PF_KTHREAD) { + if (p->flags & PF_KTHREAD || is_migration_disabled(p)) { /* - * Kernel threads are allowed on online && !active CPUs + * Kernel threads are allowed on online && !active CPUs. + * + * Specifically, migration_disabled() tasks must not fail the + * cpumask_any_and_distribute() pick below, esp. so on + * SCA_MIGRATE_ENABLE, otherwise we'll not call + * set_cpus_allowed_common() and actually reset p->cpus_ptr. */ cpu_valid_mask = cpu_online_mask; } @@ -1891,13 +2310,22 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, * Must re-check here, to close a race against __kthread_bind(), * sched_setaffinity() is not guaranteed to observe the flag. */ - if (check && (p->flags & PF_NO_SETAFFINITY)) { + if ((flags & SCA_CHECK) && (p->flags & PF_NO_SETAFFINITY)) { ret = -EINVAL; goto out; } - if (cpumask_equal(&p->cpus_mask, new_mask)) - goto out; + if (!(flags & SCA_MIGRATE_ENABLE)) { + if (cpumask_equal(&p->cpus_mask, new_mask)) + goto out; + + if (WARN_ON_ONCE(p == current && + is_migration_disabled(p) && + !cpumask_test_cpu(task_cpu(p), new_mask))) { + ret = -EBUSY; + goto out; + } + } /* * Picking a ~random cpu helps in cases where we are changing affinity @@ -1910,7 +2338,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, goto out; } - do_set_cpus_allowed(p, new_mask); + __do_set_cpus_allowed(p, new_mask, flags); if (p->flags & PF_KTHREAD) { /* @@ -1922,23 +2350,8 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, p->nr_cpus_allowed != 1); } - /* Can the task run on the task's current CPU? If so, we're done */ - if (cpumask_test_cpu(task_cpu(p), new_mask)) - goto out; + return affine_move_task(rq, p, &rf, dest_cpu, flags); - if (task_running(rq, p) || p->state == TASK_WAKING) { - struct migration_arg arg = { p, dest_cpu }; - /* Need help from migration thread: drop lock and wait. */ - task_rq_unlock(rq, p, &rf); - stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); - return 0; - } else if (task_on_rq_queued(p)) { - /* - * OK, since we're going to drop the lock immediately - * afterwards anyway. - */ - rq = move_queued_task(rq, &rf, p, dest_cpu); - } out: task_rq_unlock(rq, p, &rf); @@ -1947,7 +2360,7 @@ out: int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) { - return __set_cpus_allowed_ptr(p, new_mask, false); + return __set_cpus_allowed_ptr(p, new_mask, 0); } EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); @@ -1988,6 +2401,8 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) * Clearly, migrating tasks to offline CPUs is a fairly daft thing. */ WARN_ON_ONCE(!cpu_online(new_cpu)); + + WARN_ON_ONCE(is_migration_disabled(p)); #endif trace_sched_migrate_task(p, new_cpu); @@ -2318,6 +2733,12 @@ static int select_fallback_rq(int cpu, struct task_struct *p) } fallthrough; case possible: + /* + * XXX When called from select_task_rq() we only + * hold p->pi_lock and again violate locking order. + * + * More yuck to audit. 
+ */ do_set_cpus_allowed(p, cpu_possible_mask); state = fail; break; @@ -2352,7 +2773,7 @@ int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) { lockdep_assert_held(&p->pi_lock); - if (p->nr_cpus_allowed > 1) + if (p->nr_cpus_allowed > 1 && !is_migration_disabled(p)) cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags); else cpu = cpumask_any(p->cpus_ptr); @@ -2375,6 +2796,7 @@ int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) void sched_set_stop_task(int cpu, struct task_struct *stop) { + static struct lock_class_key stop_pi_lock; struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 }; struct task_struct *old_stop = cpu_rq(cpu)->stop; @@ -2390,6 +2812,20 @@ void sched_set_stop_task(int cpu, struct task_struct *stop) sched_setscheduler_nocheck(stop, SCHED_FIFO, ¶m); stop->sched_class = &stop_sched_class; + + /* + * The PI code calls rt_mutex_setprio() with ->pi_lock held to + * adjust the effective priority of a task. As a result, + * rt_mutex_setprio() can trigger (RT) balancing operations, + * which can then trigger wakeups of the stop thread to push + * around the current task. + * + * The stop task itself will never be part of the PI-chain, it + * never blocks, therefore that ->pi_lock recursion is safe. + * Tell lockdep about this by placing the stop->pi_lock in its + * own class. + */ + lockdep_set_class(&stop->pi_lock, &stop_pi_lock); } cpu_rq(cpu)->stop = stop; @@ -2406,13 +2842,25 @@ void sched_set_stop_task(int cpu, struct task_struct *stop) #else static inline int __set_cpus_allowed_ptr(struct task_struct *p, - const struct cpumask *new_mask, bool check) + const struct cpumask *new_mask, + u32 flags) { return set_cpus_allowed_ptr(p, new_mask); } #endif /* CONFIG_SMP */ +#if !defined(CONFIG_SMP) || !defined(CONFIG_PREEMPT_RT) + +static inline void migrate_disable_switch(struct rq *rq, struct task_struct *p) { } + +static inline bool rq_has_pinned_tasks(struct rq *rq) +{ + return false; +} + +#endif + static void ttwu_stat(struct task_struct *p, int cpu, int wake_flags) { @@ -3098,6 +3546,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) init_numa_balancing(clone_flags, p); #ifdef CONFIG_SMP p->wake_entry.u_flags = CSD_TYPE_TTWU; + p->migration_pending = NULL; #endif } @@ -3485,6 +3934,90 @@ static inline void finish_task(struct task_struct *prev) #endif } +#ifdef CONFIG_SMP + +static void do_balance_callbacks(struct rq *rq, struct callback_head *head) +{ + void (*func)(struct rq *rq); + struct callback_head *next; + + lockdep_assert_held(&rq->lock); |