diff options
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/bpf/stackmap.c | 2 | ||||
-rw-r--r-- | kernel/cgroup/cpuset.c | 33 | ||||
-rw-r--r-- | kernel/cpu.c | 9 | ||||
-rw-r--r-- | kernel/debug/debug_core.c | 6 | ||||
-rw-r--r-- | kernel/exit.c | 16 | ||||
-rw-r--r-- | kernel/irq_work.c | 45 | ||||
-rw-r--r-- | kernel/kthread.c | 21 | ||||
-rw-r--r-- | kernel/printk/printk.c | 6 | ||||
-rw-r--r-- | kernel/rcu/tree.c | 3 | ||||
-rw-r--r-- | kernel/sched/core.c | 1104 | ||||
-rw-r--r-- | kernel/sched/cpudeadline.c | 4 | ||||
-rw-r--r-- | kernel/sched/cpufreq_schedutil.c | 9 | ||||
-rw-r--r-- | kernel/sched/cpupri.c | 52 | ||||
-rw-r--r-- | kernel/sched/cpupri.h | 8 | ||||
-rw-r--r-- | kernel/sched/deadline.c | 101 | ||||
-rw-r--r-- | kernel/sched/fair.c | 121 | ||||
-rw-r--r-- | kernel/sched/idle.c | 7 | ||||
-rw-r--r-- | kernel/sched/membarrier.c | 147 | ||||
-rw-r--r-- | kernel/sched/rt.c | 99 | ||||
-rw-r--r-- | kernel/sched/sched.h | 150 | ||||
-rw-r--r-- | kernel/sched/stop_task.c | 5 | ||||
-rw-r--r-- | kernel/sched/topology.c | 61 | ||||
-rw-r--r-- | kernel/smp.c | 52 | ||||
-rw-r--r-- | kernel/stop_machine.c | 27 | ||||
-rw-r--r-- | kernel/time/tick-sched.c | 6 | ||||
-rw-r--r-- | kernel/trace/bpf_trace.c | 2 | ||||
-rw-r--r-- | kernel/workqueue.c | 4 |
27 files changed, 1607 insertions, 493 deletions
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 06065fa27124..599041cd0c8a 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -298,7 +298,7 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs, if (irqs_disabled()) { if (!IS_ENABLED(CONFIG_PREEMPT_RT)) { work = this_cpu_ptr(&up_read_work); - if (atomic_read(&work->irq_work.flags) & IRQ_WORK_BUSY) { + if (irq_work_is_busy(&work->irq_work)) { /* cannot queue more up_read, fallback */ irq_work_busy = true; } diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 57b5b5d0a5fd..53c70c470a38 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -983,25 +983,48 @@ partition_and_rebuild_sched_domains(int ndoms_new, cpumask_var_t doms_new[], */ static void rebuild_sched_domains_locked(void) { + struct cgroup_subsys_state *pos_css; struct sched_domain_attr *attr; cpumask_var_t *doms; + struct cpuset *cs; int ndoms; lockdep_assert_cpus_held(); percpu_rwsem_assert_held(&cpuset_rwsem); /* - * We have raced with CPU hotplug. Don't do anything to avoid + * If we have raced with CPU hotplug, return early to avoid * passing doms with offlined cpu to partition_sched_domains(). - * Anyways, hotplug work item will rebuild sched domains. + * Anyways, cpuset_hotplug_workfn() will rebuild sched domains. + * + * With no CPUs in any subpartitions, top_cpuset's effective CPUs + * should be the same as the active CPUs, so checking only top_cpuset + * is enough to detect racing CPU offlines. */ if (!top_cpuset.nr_subparts_cpus && !cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask)) return; - if (top_cpuset.nr_subparts_cpus && - !cpumask_subset(top_cpuset.effective_cpus, cpu_active_mask)) - return; + /* + * With subpartition CPUs, however, the effective CPUs of a partition + * root should be only a subset of the active CPUs. Since a CPU in any + * partition root could be offlined, all must be checked. + */ + if (top_cpuset.nr_subparts_cpus) { + rcu_read_lock(); + cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) { + if (!is_partition_root(cs)) { + pos_css = css_rightmost_descendant(pos_css); + continue; + } + if (!cpumask_subset(cs->effective_cpus, + cpu_active_mask)) { + rcu_read_unlock(); + return; + } + } + rcu_read_unlock(); + } /* Generate domain masks and attrs */ ndoms = generate_sched_domains(&doms, &attr); diff --git a/kernel/cpu.c b/kernel/cpu.c index 2b8d7a5db383..4e11e91010e1 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -1606,7 +1606,7 @@ static struct cpuhp_step cpuhp_hp_states[] = { .name = "ap:online", }, /* - * Handled on controll processor until the plugged processor manages + * Handled on control processor until the plugged processor manages * this itself. */ [CPUHP_TEARDOWN_CPU] = { @@ -1615,6 +1615,13 @@ static struct cpuhp_step cpuhp_hp_states[] = { .teardown.single = takedown_cpu, .cant_stop = true, }, + + [CPUHP_AP_SCHED_WAIT_EMPTY] = { + .name = "sched:waitempty", + .startup.single = NULL, + .teardown.single = sched_cpu_wait_empty, + }, + /* Handle smpboot threads park/unpark */ [CPUHP_AP_SMPBOOT_THREADS] = { .name = "smpboot/threads:online", diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 1e75a8923a8d..af6e8b4fb359 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -225,8 +225,6 @@ NOKPROBE_SYMBOL(kgdb_skipexception); * Default (weak) implementation for kgdb_roundup_cpus */ -static DEFINE_PER_CPU(call_single_data_t, kgdb_roundup_csd); - void __weak kgdb_call_nmi_hook(void *ignored) { /* @@ -241,6 +239,9 @@ void __weak kgdb_call_nmi_hook(void *ignored) } NOKPROBE_SYMBOL(kgdb_call_nmi_hook); +static DEFINE_PER_CPU(call_single_data_t, kgdb_roundup_csd) = + CSD_INIT(kgdb_call_nmi_hook, NULL); + void __weak kgdb_roundup_cpus(void) { call_single_data_t *csd; @@ -267,7 +268,6 @@ void __weak kgdb_roundup_cpus(void) continue; kgdb_info[cpu].rounding_up = true; - csd->func = kgdb_call_nmi_hook; ret = smp_call_function_single_async(cpu, csd); if (ret) kgdb_info[cpu].rounding_up = false; diff --git a/kernel/exit.c b/kernel/exit.c index 1f236ed375f8..3594291a8542 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -478,10 +478,24 @@ static void exit_mm(void) BUG_ON(mm != current->active_mm); /* more a memory barrier than a real lock */ task_lock(current); + /* + * When a thread stops operating on an address space, the loop + * in membarrier_private_expedited() may not observe that + * tsk->mm, and the loop in membarrier_global_expedited() may + * not observe a MEMBARRIER_STATE_GLOBAL_EXPEDITED + * rq->membarrier_state, so those would not issue an IPI. + * Membarrier requires a memory barrier after accessing + * user-space memory, before clearing tsk->mm or the + * rq->membarrier_state. + */ + smp_mb__after_spinlock(); + local_irq_disable(); current->mm = NULL; - mmap_read_unlock(mm); + membarrier_update_current_mm(NULL); enter_lazy_tlb(mm, current); + local_irq_enable(); task_unlock(current); + mmap_read_unlock(mm); mm_update_next_owner(mm); mmput(mm); if (test_thread_flag(TIF_MEMDIE)) diff --git a/kernel/irq_work.c b/kernel/irq_work.c index eca83965b631..e8da1e71583a 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c @@ -31,10 +31,10 @@ static bool irq_work_claim(struct irq_work *work) { int oflags; - oflags = atomic_fetch_or(IRQ_WORK_CLAIMED | CSD_TYPE_IRQ_WORK, &work->flags); + oflags = atomic_fetch_or(IRQ_WORK_CLAIMED | CSD_TYPE_IRQ_WORK, &work->node.a_flags); /* * If the work is already pending, no need to raise the IPI. - * The pairing atomic_fetch_andnot() in irq_work_run() makes sure + * The pairing smp_mb() in irq_work_single() makes sure * everything we did before is visible. */ if (oflags & IRQ_WORK_PENDING) @@ -53,12 +53,12 @@ void __weak arch_irq_work_raise(void) static void __irq_work_queue_local(struct irq_work *work) { /* If the work is "lazy", handle it from next tick if any */ - if (atomic_read(&work->flags) & IRQ_WORK_LAZY) { - if (llist_add(&work->llnode, this_cpu_ptr(&lazy_list)) && + if (atomic_read(&work->node.a_flags) & IRQ_WORK_LAZY) { + if (llist_add(&work->node.llist, this_cpu_ptr(&lazy_list)) && tick_nohz_tick_stopped()) arch_irq_work_raise(); } else { - if (llist_add(&work->llnode, this_cpu_ptr(&raised_list))) + if (llist_add(&work->node.llist, this_cpu_ptr(&raised_list))) arch_irq_work_raise(); } } @@ -102,7 +102,7 @@ bool irq_work_queue_on(struct irq_work *work, int cpu) if (cpu != smp_processor_id()) { /* Arch remote IPI send/receive backend aren't NMI safe */ WARN_ON_ONCE(in_nmi()); - __smp_call_single_queue(cpu, &work->llnode); + __smp_call_single_queue(cpu, &work->node.llist); } else { __irq_work_queue_local(work); } @@ -136,23 +136,28 @@ void irq_work_single(void *arg) int flags; /* - * Clear the PENDING bit, after this point the @work - * can be re-used. - * Make it immediately visible so that other CPUs trying - * to claim that work don't rely on us to handle their data - * while we are in the middle of the func. + * Clear the PENDING bit, after this point the @work can be re-used. + * The PENDING bit acts as a lock, and we own it, so we can clear it + * without atomic ops. */ - flags = atomic_fetch_andnot(IRQ_WORK_PENDING, &work->flags); + flags = atomic_read(&work->node.a_flags); + flags &= ~IRQ_WORK_PENDING; + atomic_set(&work->node.a_flags, flags); + + /* + * See irq_work_claim(). + */ + smp_mb(); - lockdep_irq_work_enter(work); + lockdep_irq_work_enter(flags); work->func(work); - lockdep_irq_work_exit(work); + lockdep_irq_work_exit(flags); + /* - * Clear the BUSY bit and return to the free state if - * no-one else claimed it meanwhile. + * Clear the BUSY bit, if set, and return to the free state if no-one + * else claimed it meanwhile. */ - flags &= ~IRQ_WORK_PENDING; - (void)atomic_cmpxchg(&work->flags, flags, flags & ~IRQ_WORK_BUSY); + (void)atomic_cmpxchg(&work->node.a_flags, flags, flags & ~IRQ_WORK_BUSY); } static void irq_work_run_list(struct llist_head *list) @@ -166,7 +171,7 @@ static void irq_work_run_list(struct llist_head *list) return; llnode = llist_del_all(list); - llist_for_each_entry_safe(work, tmp, llnode, llnode) + llist_for_each_entry_safe(work, tmp, llnode, node.llist) irq_work_single(work); } @@ -198,7 +203,7 @@ void irq_work_sync(struct irq_work *work) { lockdep_assert_irqs_enabled(); - while (atomic_read(&work->flags) & IRQ_WORK_BUSY) + while (irq_work_is_busy(work)) cpu_relax(); } EXPORT_SYMBOL_GPL(irq_work_sync); diff --git a/kernel/kthread.c b/kernel/kthread.c index 933a625621b8..e6aa66551241 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -1249,6 +1249,7 @@ void kthread_use_mm(struct mm_struct *mm) tsk->active_mm = mm; } tsk->mm = mm; + membarrier_update_current_mm(mm); switch_mm_irqs_off(active_mm, mm, tsk); local_irq_enable(); task_unlock(tsk); @@ -1256,8 +1257,19 @@ void kthread_use_mm(struct mm_struct *mm) finish_arch_post_lock_switch(); #endif + /* + * When a kthread starts operating on an address space, the loop + * in membarrier_{private,global}_expedited() may not observe + * that tsk->mm, and not issue an IPI. Membarrier requires a + * memory barrier after storing to tsk->mm, before accessing + * user-space memory. A full memory barrier for membarrier + * {PRIVATE,GLOBAL}_EXPEDITED is implicitly provided by + * mmdrop(), or explicitly with smp_mb(). + */ if (active_mm != mm) mmdrop(active_mm); + else + smp_mb(); to_kthread(tsk)->oldfs = force_uaccess_begin(); } @@ -1277,9 +1289,18 @@ void kthread_unuse_mm(struct mm_struct *mm) force_uaccess_end(to_kthread(tsk)->oldfs); task_lock(tsk); + /* + * When a kthread stops operating on an address space, the loop + * in membarrier_{private,global}_expedited() may not observe + * that tsk->mm, and not issue an IPI. Membarrier requires a + * memory barrier after accessing user-space memory, before + * clearing tsk->mm. + */ + smp_mb__after_spinlock(); sync_mm_rss(mm); local_irq_disable(); tsk->mm = NULL; + membarrier_update_current_mm(NULL); /* active_mm is still 'mm' */ enter_lazy_tlb(mm, tsk); local_irq_enable(); diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index bc1e3b5a97bd..28713dda3f6b 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -3025,10 +3025,8 @@ static void wake_up_klogd_work_func(struct irq_work *irq_work) wake_up_interruptible(&log_wait); } -static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) = { - .func = wake_up_klogd_work_func, - .flags = ATOMIC_INIT(IRQ_WORK_LAZY), -}; +static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) = + IRQ_WORK_INIT_LAZY(wake_up_klogd_work_func); void wake_up_klogd(void) { diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index b7124c119c0b..40e5e3dd253e 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1322,8 +1322,6 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) if (IS_ENABLED(CONFIG_IRQ_WORK) && !rdp->rcu_iw_pending && rdp->rcu_iw_gp_seq != rnp->gp_seq && (rnp->ffmask & rdp->grpmask)) { - init_irq_work(&rdp->rcu_iw, rcu_iw_handler); - atomic_set(&rdp->rcu_iw.flags, IRQ_WORK_HARD_IRQ); rdp->rcu_iw_pending = true; rdp->rcu_iw_gp_seq = rnp->gp_seq; irq_work_queue_on(&rdp->rcu_iw, rdp->cpu); @@ -4023,6 +4021,7 @@ int rcutree_prepare_cpu(unsigned int cpu) rdp->cpu_no_qs.b.norm = true; rdp->core_needs_qs = false; rdp->rcu_iw_pending = false; + rdp->rcu_iw = IRQ_WORK_INIT_HARD(rcu_iw_handler); rdp->rcu_iw_gp_seq = rdp->gp_seq - 1; trace_rcu_grace_period(rcu_state.name, rdp->gp_seq, TPS("cpuonl")); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index e0948cbb1d70..3636b80222ac 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -97,7 +97,7 @@ int sysctl_sched_rt_runtime = 950000; * * Normal scheduling state is serialized by rq->lock. __schedule() takes the * local CPU's rq->lock, it optionally removes the task from the runqueue and - * always looks at the local rq data structures to find the most elegible task + * always looks at the local rq data structures to find the most eligible task * to run next. * * Task enqueue is also under rq->lock, possibly taken from another CPU. @@ -320,14 +320,6 @@ void update_rq_clock(struct rq *rq) update_rq_clock_task(rq, delta); } -static inline void -rq_csd_init(struct rq *rq, call_single_data_t *csd, smp_call_func_t func) -{ - csd->flags = 0; - csd->func = func; - csd->info = rq; -} - #ifdef CONFIG_SCHED_HRTICK /* * Use HR-timers to deliver accurate preemption points. @@ -428,7 +420,7 @@ void hrtick_start(struct rq *rq, u64 delay) static void hrtick_rq_init(struct rq *rq) { #ifdef CONFIG_SMP - rq_csd_init(rq, &rq->hrtick_csd, __hrtick_start); + INIT_CSD(&rq->hrtick_csd, __hrtick_start, rq); #endif hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD); rq->hrtick_timer.function = hrtick; @@ -518,7 +510,7 @@ static bool __wake_q_add(struct wake_q_head *head, struct task_struct *task) /* * Atomically grab the task, if ->wake_q is !nil already it means - * its already queued (either by us or someone else) and will get the + * it's already queued (either by us or someone else) and will get the * wakeup due to that. * * In order to ensure that a pending wakeup will observe our pending @@ -769,7 +761,7 @@ bool sched_can_stop_tick(struct rq *rq) return false; /* - * If there are more than one RR tasks, we need the tick to effect the + * If there are more than one RR tasks, we need the tick to affect the * actual RR behaviour. */ if (rq->rt.rr_nr_running) { @@ -1187,14 +1179,14 @@ static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p, * accounting was performed at enqueue time and we can just return * here. * - * Need to be careful of the following enqeueue/dequeue ordering + * Need to be careful of the following enqueue/dequeue ordering * problem too * * enqueue(taskA) * // sched_uclamp_used gets enabled * enqueue(taskB) * dequeue(taskA) - * // Must not decrement bukcet->tasks here + * // Must not decrement bucket->tasks here * dequeue(taskB) * * where we could end up with stale data in uc_se and @@ -1413,17 +1405,24 @@ done: static int uclamp_validate(struct task_struct *p, const struct sched_attr *attr) { - unsigned int lower_bound = p->uclamp_req[UCLAMP_MIN].value; - unsigned int upper_bound = p->uclamp_req[UCLAMP_MAX].value; + int util_min = p->uclamp_req[UCLAMP_MIN].value; + int util_max = p->uclamp_req[UCLAMP_MAX].value; - if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN) - lower_bound = attr->sched_util_min; - if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX) - upper_bound = attr->sched_util_max; + if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN) { + util_min = attr->sched_util_min; - if (lower_bound > upper_bound) - return -EINVAL; - if (upper_bound > SCHED_CAPACITY_SCALE) + if (util_min + 1 > SCHED_CAPACITY_SCALE + 1) + return -EINVAL; + } + + if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX) { + util_max = attr->sched_util_max; + + if (util_max + 1 > SCHED_CAPACITY_SCALE + 1) + return -EINVAL; + } + + if (util_min != -1 && util_max != -1 && util_min > util_max) return -EINVAL; /* @@ -1438,20 +1437,41 @@ static int uclamp_validate(struct task_struct *p, return 0; } +static bool uclamp_reset(const struct sched_attr *attr, + enum uclamp_id clamp_id, + struct uclamp_se *uc_se) +{ + /* Reset on sched class change for a non user-defined clamp value. */ + if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)) && + !uc_se->user_defined) + return true; + + /* Reset on sched_util_{min,max} == -1. */ + if (clamp_id == UCLAMP_MIN && + attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN && + attr->sched_util_min == -1) { + return true; + } + + if (clamp_id == UCLAMP_MAX && + attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX && + attr->sched_util_max == -1) { + return true; + } + + return false; +} + static void __setscheduler_uclamp(struct task_struct *p, const struct sched_attr *attr) { enum uclamp_id clamp_id; - /* - * On scheduling class change, reset to default clamps for tasks - * without a task-specific value. - */ for_each_clamp_id(clamp_id) { struct uclamp_se *uc_se = &p->uclamp_req[clamp_id]; + unsigned int value; - /* Keep using defined clamps across class changes */ - if (uc_se->user_defined) + if (!uclamp_reset(attr, clamp_id, uc_se)) continue; /* @@ -1459,21 +1479,25 @@ static void __setscheduler_uclamp(struct task_struct *p, * at runtime. */ if (unlikely(rt_task(p) && clamp_id == UCLAMP_MIN)) - __uclamp_update_util_min_rt_default(p); + value = sysctl_sched_uclamp_util_min_rt_default; else - uclamp_se_set(uc_se, uclamp_none(clamp_id), false); + value = uclamp_none(clamp_id); + + uclamp_se_set(uc_se, value, false); } if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP))) return; - if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN) { + if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN && + attr->sched_util_min != -1) { uclamp_se_set(&p->uclamp_req[UCLAMP_MIN], attr->sched_util_min, true); } - if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX) { + if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX && + attr->sched_util_max != -1) { uclamp_se_set(&p->uclamp_req[UCLAMP_MAX], attr->sched_util_max, true); } @@ -1696,6 +1720,76 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) #ifdef CONFIG_SMP +static void +__do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask, u32 flags); + +static int __set_cpus_allowed_ptr(struct task_struct *p, + const struct cpumask *new_mask, + u32 flags); + +static void migrate_disable_switch(struct rq *rq, struct task_struct *p) +{ + if (likely(!p->migration_disabled)) + return; + + if (p->cpus_ptr != &p->cpus_mask) + return; + + /* + * Violates locking rules! see comment in __do_set_cpus_allowed(). + */ + __do_set_cpus_allowed(p, cpumask_of(rq->cpu), SCA_MIGRATE_DISABLE); +} + +void migrate_disable(void) +{ + struct task_struct *p = current; + + if (p->migration_disabled) { + p->migration_disabled++; + return; + } + + preempt_disable(); + this_rq()->nr_pinned++; + p->migration_disabled = 1; + preempt_enable(); +} +EXPORT_SYMBOL_GPL(migrate_disable); + +void migrate_enable(void) +{ + struct task_struct *p = current; + + if (p->migration_disabled > 1) { + p->migration_disabled--; + return; + } + + /* + * Ensure stop_task runs either before or after this, and that + * __set_cpus_allowed_ptr(SCA_MIGRATE_ENABLE) doesn't schedule(). + */ + preempt_disable(); + if (p->cpus_ptr != &p->cpus_mask) + __set_cpus_allowed_ptr(p, &p->cpus_mask, SCA_MIGRATE_ENABLE); + /* + * Mustn't clear migration_disabled() until cpus_ptr points back at the + * regular cpus_mask, otherwise things that race (eg. + * select_fallback_rq) get confused. + */ + barrier(); + p->migration_disabled = 0; + this_rq()->nr_pinned--; + preempt_enable(); +} +EXPORT_SYMBOL_GPL(migrate_enable); + +static inline bool rq_has_pinned_tasks(struct rq *rq) +{ + return rq->nr_pinned; +} + /* * Per-CPU kthreads are allowed to run on !active && online CPUs, see * __set_cpus_allowed_ptr() and select_fallback_rq(). @@ -1705,7 +1799,7 @@ static inline bool is_cpu_allowed(struct task_struct *p, int cpu) if (!cpumask_test_cpu(cpu, p->cpus_ptr)) return false; - if (is_per_cpu_kthread(p)) + if (is_per_cpu_kthread(p) || is_migration_disabled(p)) return cpu_online(cpu); return cpu_active(cpu); @@ -1750,8 +1844,16 @@ static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf, } struct migration_arg { - struct task_struct *task; - int dest_cpu; + struct task_struct *task; + int dest_cpu; + struct set_affinity_pending *pending; +}; + +struct set_affinity_pending { + refcount_t refs; + struct completion done; + struct cpu_stop_work stop_work; + struct migration_arg arg; }; /* @@ -1783,16 +1885,19 @@ static struct rq *__migrate_task(struct rq *rq, struct rq_flags *rf, */ static int migration_cpu_stop(void *data) { + struct set_affinity_pending *pending; struct migration_arg *arg = data; struct task_struct *p = arg->task; + int dest_cpu = arg->dest_cpu; struct rq *rq = this_rq(); + bool complete = false; struct rq_flags rf; /* * The original target CPU might have gone down and we might * be on another CPU but it doesn't matter. */ - local_irq_disable(); + local_irq_save(rf.flags); /* * We need to explicitly wake pending tasks before running * __migrate_task() such that we will not miss enforcing cpus_ptr @@ -1802,21 +1907,137 @@ static int migration_cpu_stop(void *data) raw_spin_lock(&p->pi_lock); rq_lock(rq, &rf); + + pending = p->migration_pending; /* * If task_rq(p) != rq, it cannot be migrated here, because we're * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because * we're holding p->pi_lock. */ if (task_rq(p) == rq) { + if (is_migration_disabled(p)) + goto out; + + if (pending) { + p->migration_pending = NULL; + complete = true; + } + + /* migrate_enable() -- we must not race against SCA */ + if (dest_cpu < 0) { + /* + * When this was migrate_enable() but we no longer + * have a @pending, a concurrent SCA 'fixed' things + * and we should be valid again. Nothing to do. + */ + if (!pending) { + WARN_ON_ONCE(!cpumask_test_cpu(task_cpu(p), &p->cpus_mask)); + goto out; + } + + dest_cpu = cpumask_any_distribute(&p->cpus_mask); + } + if (task_on_rq_queued(p)) - rq = __migrate_task(rq, &rf, p, arg->dest_cpu); + rq = __migrate_task(rq, &rf, p, dest_cpu); else - p->wake_cpu = arg->dest_cpu; + p->wake_cpu = dest_cpu; + + } else if (dest_cpu < 0 || pending) { + /* + * This happens when we get migrated between migrate_enable()'s + * preempt_enable() and scheduling the stopper task. At that + * point we're a regular task again and not current anymore. + * + * A !PREEMPT kernel has a giant hole here, which makes it far + * more likely. + */ + + /* + * The task moved before the stopper got to run. We're holding + * ->pi_lock, so the allowed mask is stable - if it got + * somewhere allowed, we're done. + */ + if (pending && cpumask_test_cpu(task_cpu(p), p->cpus_ptr)) { + p->migration_pending = NULL; + complete = true; + goto out; + } + + /* + * When this was migrate_enable() but we no longer have an + * @pending, a concurrent SCA 'fixed' things and we should be + * valid again. Nothing to do. + */ + if (!pending) { + WARN_ON_ONCE(!cpumask_test_cpu(task_cpu(p), &p->cpus_mask)); + goto out; + } + + /* + * When migrate_enable() hits a rq mis-match we can't reliably + * determine is_migration_disabled() and so have to chase after + * it. + */ + task_rq_unlock(rq, p, &rf); + stop_one_cpu_nowait(task_cpu(p), migration_cpu_stop, + &pending->arg, &pending->stop_work); + return 0; } - rq_unlock(rq, &rf); - raw_spin_unlock(&p->pi_lock); +out: + task_rq_unlock(rq, p, &rf); + + if (complete) + complete_all(&pending->done); + + /* For pending->{arg,stop_work} */ + pending = arg->pending; + if (pending && refcount_dec_and_test(&pending->refs)) + wake_up_var(&pending->refs); - local_irq_enable(); + return 0; +} + +int push_cpu_stop(void *arg) +{ + struct rq *lowest_rq = NULL, *rq = this_rq(); + struct task_struct *p = arg; + + raw_spin_lock_irq(&p->pi_lock); + raw_spin_lock(&rq->lock); + + if (task_rq(p) != rq) + goto out_unlock; + + if (is_migration_disabled(p)) { + p->migration_flags |= MDF_PUSH; + goto out_unlock; + } + + p->migration_flags &= ~MDF_PUSH; + + if (p->sched_class->find_lock_rq) + lowest_rq = p->sched_class->find_lock_rq(p, rq); + + if (!lowest_rq) + goto out_unlock; + + // XXX validate p is still the highest prio task + if (task_rq(p) == rq) { + deactivate_task(rq, p, 0); + set_task_cpu(p, lowest_rq->cpu); + activate_task(lowest_rq, p, 0); + resched_curr(lowest_rq); + } + + double_unlock_balance(rq, lowest_rq); + +out_unlock: + rq->push_busy = false; + raw_spin_unlock(&rq->lock); + raw_spin_unlock_irq(&p->pi_lock); + + put_task_struct(p); return 0; } @@ -1824,18 +2045,39 @@ static int migration_cpu_stop(void *data) * sched_class::set_cpus_allowed must do the below, but is not required to * actually call this function. */ -void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask) +void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask, u32 flags) { + if (flags & (SCA_MIGRATE_ENABLE | SCA_MIGRATE_DISABLE)) { + p->cpus_ptr = new_mask; + return; + } + cpumask_copy(&p->cpus_mask, new_mask); p->nr_cpus_allowed = cpumask_weight(new_mask); } -void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) +static void +__do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask, u32 flags) { struct rq *rq = task_rq(p); bool queued, running; - lockdep_assert_held(&p->pi_lock); + /* + * This here violates the locking rules for affinity, since we're only + * supposed to change these variables while holding both rq->lock and + * p->pi_lock. + * + * HOWEVER, it magically works, because ttwu() is the only code that + * accesses these variables under p->pi_lock and only does so after + * smp_cond_load_acquire(&p->on_cpu, !VAL), and we're in __schedule() + * before finish_task(). + * + * XXX do further audits, this smells like something putrid. + */ + if (flags & SCA_MIGRATE_DISABLE) + SCHED_WARN_ON(!p->on_cpu); + else + lockdep_assert_held(&p->pi_lock); queued = task_on_rq_queued(p); running = task_current(rq, p); @@ -1851,7 +2093,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) if (running) put_prev_task(rq, p); - p->s |