summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/bpf/stackmap.c2
-rw-r--r--kernel/cgroup/cpuset.c33
-rw-r--r--kernel/cpu.c9
-rw-r--r--kernel/debug/debug_core.c6
-rw-r--r--kernel/exit.c16
-rw-r--r--kernel/irq_work.c45
-rw-r--r--kernel/kthread.c21
-rw-r--r--kernel/printk/printk.c6
-rw-r--r--kernel/rcu/tree.c3
-rw-r--r--kernel/sched/core.c1104
-rw-r--r--kernel/sched/cpudeadline.c4
-rw-r--r--kernel/sched/cpufreq_schedutil.c9
-rw-r--r--kernel/sched/cpupri.c52
-rw-r--r--kernel/sched/cpupri.h8
-rw-r--r--kernel/sched/deadline.c101
-rw-r--r--kernel/sched/fair.c121
-rw-r--r--kernel/sched/idle.c7
-rw-r--r--kernel/sched/membarrier.c147
-rw-r--r--kernel/sched/rt.c99
-rw-r--r--kernel/sched/sched.h150
-rw-r--r--kernel/sched/stop_task.c5
-rw-r--r--kernel/sched/topology.c61
-rw-r--r--kernel/smp.c52
-rw-r--r--kernel/stop_machine.c27
-rw-r--r--kernel/time/tick-sched.c6
-rw-r--r--kernel/trace/bpf_trace.c2
-rw-r--r--kernel/workqueue.c4
27 files changed, 1607 insertions, 493 deletions
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 06065fa27124..599041cd0c8a 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -298,7 +298,7 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
if (irqs_disabled()) {
if (!IS_ENABLED(CONFIG_PREEMPT_RT)) {
work = this_cpu_ptr(&up_read_work);
- if (atomic_read(&work->irq_work.flags) & IRQ_WORK_BUSY) {
+ if (irq_work_is_busy(&work->irq_work)) {
/* cannot queue more up_read, fallback */
irq_work_busy = true;
}
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index 57b5b5d0a5fd..53c70c470a38 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -983,25 +983,48 @@ partition_and_rebuild_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
*/
static void rebuild_sched_domains_locked(void)
{
+ struct cgroup_subsys_state *pos_css;
struct sched_domain_attr *attr;
cpumask_var_t *doms;
+ struct cpuset *cs;
int ndoms;
lockdep_assert_cpus_held();
percpu_rwsem_assert_held(&cpuset_rwsem);
/*
- * We have raced with CPU hotplug. Don't do anything to avoid
+ * If we have raced with CPU hotplug, return early to avoid
* passing doms with offlined cpu to partition_sched_domains().
- * Anyways, hotplug work item will rebuild sched domains.
+ * Anyways, cpuset_hotplug_workfn() will rebuild sched domains.
+ *
+ * With no CPUs in any subpartitions, top_cpuset's effective CPUs
+ * should be the same as the active CPUs, so checking only top_cpuset
+ * is enough to detect racing CPU offlines.
*/
if (!top_cpuset.nr_subparts_cpus &&
!cpumask_equal(top_cpuset.effective_cpus, cpu_active_mask))
return;
- if (top_cpuset.nr_subparts_cpus &&
- !cpumask_subset(top_cpuset.effective_cpus, cpu_active_mask))
- return;
+ /*
+ * With subpartition CPUs, however, the effective CPUs of a partition
+ * root should be only a subset of the active CPUs. Since a CPU in any
+ * partition root could be offlined, all must be checked.
+ */
+ if (top_cpuset.nr_subparts_cpus) {
+ rcu_read_lock();
+ cpuset_for_each_descendant_pre(cs, pos_css, &top_cpuset) {
+ if (!is_partition_root(cs)) {
+ pos_css = css_rightmost_descendant(pos_css);
+ continue;
+ }
+ if (!cpumask_subset(cs->effective_cpus,
+ cpu_active_mask)) {
+ rcu_read_unlock();
+ return;
+ }
+ }
+ rcu_read_unlock();
+ }
/* Generate domain masks and attrs */
ndoms = generate_sched_domains(&doms, &attr);
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 2b8d7a5db383..4e11e91010e1 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -1606,7 +1606,7 @@ static struct cpuhp_step cpuhp_hp_states[] = {
.name = "ap:online",
},
/*
- * Handled on controll processor until the plugged processor manages
+ * Handled on control processor until the plugged processor manages
* this itself.
*/
[CPUHP_TEARDOWN_CPU] = {
@@ -1615,6 +1615,13 @@ static struct cpuhp_step cpuhp_hp_states[] = {
.teardown.single = takedown_cpu,
.cant_stop = true,
},
+
+ [CPUHP_AP_SCHED_WAIT_EMPTY] = {
+ .name = "sched:waitempty",
+ .startup.single = NULL,
+ .teardown.single = sched_cpu_wait_empty,
+ },
+
/* Handle smpboot threads park/unpark */
[CPUHP_AP_SMPBOOT_THREADS] = {
.name = "smpboot/threads:online",
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index 1e75a8923a8d..af6e8b4fb359 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -225,8 +225,6 @@ NOKPROBE_SYMBOL(kgdb_skipexception);
* Default (weak) implementation for kgdb_roundup_cpus
*/
-static DEFINE_PER_CPU(call_single_data_t, kgdb_roundup_csd);
-
void __weak kgdb_call_nmi_hook(void *ignored)
{
/*
@@ -241,6 +239,9 @@ void __weak kgdb_call_nmi_hook(void *ignored)
}
NOKPROBE_SYMBOL(kgdb_call_nmi_hook);
+static DEFINE_PER_CPU(call_single_data_t, kgdb_roundup_csd) =
+ CSD_INIT(kgdb_call_nmi_hook, NULL);
+
void __weak kgdb_roundup_cpus(void)
{
call_single_data_t *csd;
@@ -267,7 +268,6 @@ void __weak kgdb_roundup_cpus(void)
continue;
kgdb_info[cpu].rounding_up = true;
- csd->func = kgdb_call_nmi_hook;
ret = smp_call_function_single_async(cpu, csd);
if (ret)
kgdb_info[cpu].rounding_up = false;
diff --git a/kernel/exit.c b/kernel/exit.c
index 1f236ed375f8..3594291a8542 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -478,10 +478,24 @@ static void exit_mm(void)
BUG_ON(mm != current->active_mm);
/* more a memory barrier than a real lock */
task_lock(current);
+ /*
+ * When a thread stops operating on an address space, the loop
+ * in membarrier_private_expedited() may not observe that
+ * tsk->mm, and the loop in membarrier_global_expedited() may
+ * not observe a MEMBARRIER_STATE_GLOBAL_EXPEDITED
+ * rq->membarrier_state, so those would not issue an IPI.
+ * Membarrier requires a memory barrier after accessing
+ * user-space memory, before clearing tsk->mm or the
+ * rq->membarrier_state.
+ */
+ smp_mb__after_spinlock();
+ local_irq_disable();
current->mm = NULL;
- mmap_read_unlock(mm);
+ membarrier_update_current_mm(NULL);
enter_lazy_tlb(mm, current);
+ local_irq_enable();
task_unlock(current);
+ mmap_read_unlock(mm);
mm_update_next_owner(mm);
mmput(mm);
if (test_thread_flag(TIF_MEMDIE))
diff --git a/kernel/irq_work.c b/kernel/irq_work.c
index eca83965b631..e8da1e71583a 100644
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -31,10 +31,10 @@ static bool irq_work_claim(struct irq_work *work)
{
int oflags;
- oflags = atomic_fetch_or(IRQ_WORK_CLAIMED | CSD_TYPE_IRQ_WORK, &work->flags);
+ oflags = atomic_fetch_or(IRQ_WORK_CLAIMED | CSD_TYPE_IRQ_WORK, &work->node.a_flags);
/*
* If the work is already pending, no need to raise the IPI.
- * The pairing atomic_fetch_andnot() in irq_work_run() makes sure
+ * The pairing smp_mb() in irq_work_single() makes sure
* everything we did before is visible.
*/
if (oflags & IRQ_WORK_PENDING)
@@ -53,12 +53,12 @@ void __weak arch_irq_work_raise(void)
static void __irq_work_queue_local(struct irq_work *work)
{
/* If the work is "lazy", handle it from next tick if any */
- if (atomic_read(&work->flags) & IRQ_WORK_LAZY) {
- if (llist_add(&work->llnode, this_cpu_ptr(&lazy_list)) &&
+ if (atomic_read(&work->node.a_flags) & IRQ_WORK_LAZY) {
+ if (llist_add(&work->node.llist, this_cpu_ptr(&lazy_list)) &&
tick_nohz_tick_stopped())
arch_irq_work_raise();
} else {
- if (llist_add(&work->llnode, this_cpu_ptr(&raised_list)))
+ if (llist_add(&work->node.llist, this_cpu_ptr(&raised_list)))
arch_irq_work_raise();
}
}
@@ -102,7 +102,7 @@ bool irq_work_queue_on(struct irq_work *work, int cpu)
if (cpu != smp_processor_id()) {
/* Arch remote IPI send/receive backend aren't NMI safe */
WARN_ON_ONCE(in_nmi());
- __smp_call_single_queue(cpu, &work->llnode);
+ __smp_call_single_queue(cpu, &work->node.llist);
} else {
__irq_work_queue_local(work);
}
@@ -136,23 +136,28 @@ void irq_work_single(void *arg)
int flags;
/*
- * Clear the PENDING bit, after this point the @work
- * can be re-used.
- * Make it immediately visible so that other CPUs trying
- * to claim that work don't rely on us to handle their data
- * while we are in the middle of the func.
+ * Clear the PENDING bit, after this point the @work can be re-used.
+ * The PENDING bit acts as a lock, and we own it, so we can clear it
+ * without atomic ops.
*/
- flags = atomic_fetch_andnot(IRQ_WORK_PENDING, &work->flags);
+ flags = atomic_read(&work->node.a_flags);
+ flags &= ~IRQ_WORK_PENDING;
+ atomic_set(&work->node.a_flags, flags);
+
+ /*
+ * See irq_work_claim().
+ */
+ smp_mb();
- lockdep_irq_work_enter(work);
+ lockdep_irq_work_enter(flags);
work->func(work);
- lockdep_irq_work_exit(work);
+ lockdep_irq_work_exit(flags);
+
/*
- * Clear the BUSY bit and return to the free state if
- * no-one else claimed it meanwhile.
+ * Clear the BUSY bit, if set, and return to the free state if no-one
+ * else claimed it meanwhile.
*/
- flags &= ~IRQ_WORK_PENDING;
- (void)atomic_cmpxchg(&work->flags, flags, flags & ~IRQ_WORK_BUSY);
+ (void)atomic_cmpxchg(&work->node.a_flags, flags, flags & ~IRQ_WORK_BUSY);
}
static void irq_work_run_list(struct llist_head *list)
@@ -166,7 +171,7 @@ static void irq_work_run_list(struct llist_head *list)
return;
llnode = llist_del_all(list);
- llist_for_each_entry_safe(work, tmp, llnode, llnode)
+ llist_for_each_entry_safe(work, tmp, llnode, node.llist)
irq_work_single(work);
}
@@ -198,7 +203,7 @@ void irq_work_sync(struct irq_work *work)
{
lockdep_assert_irqs_enabled();
- while (atomic_read(&work->flags) & IRQ_WORK_BUSY)
+ while (irq_work_is_busy(work))
cpu_relax();
}
EXPORT_SYMBOL_GPL(irq_work_sync);
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 933a625621b8..e6aa66551241 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -1249,6 +1249,7 @@ void kthread_use_mm(struct mm_struct *mm)
tsk->active_mm = mm;
}
tsk->mm = mm;
+ membarrier_update_current_mm(mm);
switch_mm_irqs_off(active_mm, mm, tsk);
local_irq_enable();
task_unlock(tsk);
@@ -1256,8 +1257,19 @@ void kthread_use_mm(struct mm_struct *mm)
finish_arch_post_lock_switch();
#endif
+ /*
+ * When a kthread starts operating on an address space, the loop
+ * in membarrier_{private,global}_expedited() may not observe
+ * that tsk->mm, and not issue an IPI. Membarrier requires a
+ * memory barrier after storing to tsk->mm, before accessing
+ * user-space memory. A full memory barrier for membarrier
+ * {PRIVATE,GLOBAL}_EXPEDITED is implicitly provided by
+ * mmdrop(), or explicitly with smp_mb().
+ */
if (active_mm != mm)
mmdrop(active_mm);
+ else
+ smp_mb();
to_kthread(tsk)->oldfs = force_uaccess_begin();
}
@@ -1277,9 +1289,18 @@ void kthread_unuse_mm(struct mm_struct *mm)
force_uaccess_end(to_kthread(tsk)->oldfs);
task_lock(tsk);
+ /*
+ * When a kthread stops operating on an address space, the loop
+ * in membarrier_{private,global}_expedited() may not observe
+ * that tsk->mm, and not issue an IPI. Membarrier requires a
+ * memory barrier after accessing user-space memory, before
+ * clearing tsk->mm.
+ */
+ smp_mb__after_spinlock();
sync_mm_rss(mm);
local_irq_disable();
tsk->mm = NULL;
+ membarrier_update_current_mm(NULL);
/* active_mm is still 'mm' */
enter_lazy_tlb(mm, tsk);
local_irq_enable();
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index bc1e3b5a97bd..28713dda3f6b 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -3025,10 +3025,8 @@ static void wake_up_klogd_work_func(struct irq_work *irq_work)
wake_up_interruptible(&log_wait);
}
-static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) = {
- .func = wake_up_klogd_work_func,
- .flags = ATOMIC_INIT(IRQ_WORK_LAZY),
-};
+static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) =
+ IRQ_WORK_INIT_LAZY(wake_up_klogd_work_func);
void wake_up_klogd(void)
{
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index b7124c119c0b..40e5e3dd253e 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -1322,8 +1322,6 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
if (IS_ENABLED(CONFIG_IRQ_WORK) &&
!rdp->rcu_iw_pending && rdp->rcu_iw_gp_seq != rnp->gp_seq &&
(rnp->ffmask & rdp->grpmask)) {
- init_irq_work(&rdp->rcu_iw, rcu_iw_handler);
- atomic_set(&rdp->rcu_iw.flags, IRQ_WORK_HARD_IRQ);
rdp->rcu_iw_pending = true;
rdp->rcu_iw_gp_seq = rnp->gp_seq;
irq_work_queue_on(&rdp->rcu_iw, rdp->cpu);
@@ -4023,6 +4021,7 @@ int rcutree_prepare_cpu(unsigned int cpu)
rdp->cpu_no_qs.b.norm = true;
rdp->core_needs_qs = false;
rdp->rcu_iw_pending = false;
+ rdp->rcu_iw = IRQ_WORK_INIT_HARD(rcu_iw_handler);
rdp->rcu_iw_gp_seq = rdp->gp_seq - 1;
trace_rcu_grace_period(rcu_state.name, rdp->gp_seq, TPS("cpuonl"));
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index e0948cbb1d70..3636b80222ac 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -97,7 +97,7 @@ int sysctl_sched_rt_runtime = 950000;
*
* Normal scheduling state is serialized by rq->lock. __schedule() takes the
* local CPU's rq->lock, it optionally removes the task from the runqueue and
- * always looks at the local rq data structures to find the most elegible task
+ * always looks at the local rq data structures to find the most eligible task
* to run next.
*
* Task enqueue is also under rq->lock, possibly taken from another CPU.
@@ -320,14 +320,6 @@ void update_rq_clock(struct rq *rq)
update_rq_clock_task(rq, delta);
}
-static inline void
-rq_csd_init(struct rq *rq, call_single_data_t *csd, smp_call_func_t func)
-{
- csd->flags = 0;
- csd->func = func;
- csd->info = rq;
-}
-
#ifdef CONFIG_SCHED_HRTICK
/*
* Use HR-timers to deliver accurate preemption points.
@@ -428,7 +420,7 @@ void hrtick_start(struct rq *rq, u64 delay)
static void hrtick_rq_init(struct rq *rq)
{
#ifdef CONFIG_SMP
- rq_csd_init(rq, &rq->hrtick_csd, __hrtick_start);
+ INIT_CSD(&rq->hrtick_csd, __hrtick_start, rq);
#endif
hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
rq->hrtick_timer.function = hrtick;
@@ -518,7 +510,7 @@ static bool __wake_q_add(struct wake_q_head *head, struct task_struct *task)
/*
* Atomically grab the task, if ->wake_q is !nil already it means
- * its already queued (either by us or someone else) and will get the
+ * it's already queued (either by us or someone else) and will get the
* wakeup due to that.
*
* In order to ensure that a pending wakeup will observe our pending
@@ -769,7 +761,7 @@ bool sched_can_stop_tick(struct rq *rq)
return false;
/*
- * If there are more than one RR tasks, we need the tick to effect the
+ * If there are more than one RR tasks, we need the tick to affect the
* actual RR behaviour.
*/
if (rq->rt.rr_nr_running) {
@@ -1187,14 +1179,14 @@ static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p,
* accounting was performed at enqueue time and we can just return
* here.
*
- * Need to be careful of the following enqeueue/dequeue ordering
+ * Need to be careful of the following enqueue/dequeue ordering
* problem too
*
* enqueue(taskA)
* // sched_uclamp_used gets enabled
* enqueue(taskB)
* dequeue(taskA)
- * // Must not decrement bukcet->tasks here
+ * // Must not decrement bucket->tasks here
* dequeue(taskB)
*
* where we could end up with stale data in uc_se and
@@ -1413,17 +1405,24 @@ done:
static int uclamp_validate(struct task_struct *p,
const struct sched_attr *attr)
{
- unsigned int lower_bound = p->uclamp_req[UCLAMP_MIN].value;
- unsigned int upper_bound = p->uclamp_req[UCLAMP_MAX].value;
+ int util_min = p->uclamp_req[UCLAMP_MIN].value;
+ int util_max = p->uclamp_req[UCLAMP_MAX].value;
- if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN)
- lower_bound = attr->sched_util_min;
- if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX)
- upper_bound = attr->sched_util_max;
+ if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN) {
+ util_min = attr->sched_util_min;
- if (lower_bound > upper_bound)
- return -EINVAL;
- if (upper_bound > SCHED_CAPACITY_SCALE)
+ if (util_min + 1 > SCHED_CAPACITY_SCALE + 1)
+ return -EINVAL;
+ }
+
+ if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX) {
+ util_max = attr->sched_util_max;
+
+ if (util_max + 1 > SCHED_CAPACITY_SCALE + 1)
+ return -EINVAL;
+ }
+
+ if (util_min != -1 && util_max != -1 && util_min > util_max)
return -EINVAL;
/*
@@ -1438,20 +1437,41 @@ static int uclamp_validate(struct task_struct *p,
return 0;
}
+static bool uclamp_reset(const struct sched_attr *attr,
+ enum uclamp_id clamp_id,
+ struct uclamp_se *uc_se)
+{
+ /* Reset on sched class change for a non user-defined clamp value. */
+ if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)) &&
+ !uc_se->user_defined)
+ return true;
+
+ /* Reset on sched_util_{min,max} == -1. */
+ if (clamp_id == UCLAMP_MIN &&
+ attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN &&
+ attr->sched_util_min == -1) {
+ return true;
+ }
+
+ if (clamp_id == UCLAMP_MAX &&
+ attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX &&
+ attr->sched_util_max == -1) {
+ return true;
+ }
+
+ return false;
+}
+
static void __setscheduler_uclamp(struct task_struct *p,
const struct sched_attr *attr)
{
enum uclamp_id clamp_id;
- /*
- * On scheduling class change, reset to default clamps for tasks
- * without a task-specific value.
- */
for_each_clamp_id(clamp_id) {
struct uclamp_se *uc_se = &p->uclamp_req[clamp_id];
+ unsigned int value;
- /* Keep using defined clamps across class changes */
- if (uc_se->user_defined)
+ if (!uclamp_reset(attr, clamp_id, uc_se))
continue;
/*
@@ -1459,21 +1479,25 @@ static void __setscheduler_uclamp(struct task_struct *p,
* at runtime.
*/
if (unlikely(rt_task(p) && clamp_id == UCLAMP_MIN))
- __uclamp_update_util_min_rt_default(p);
+ value = sysctl_sched_uclamp_util_min_rt_default;
else
- uclamp_se_set(uc_se, uclamp_none(clamp_id), false);
+ value = uclamp_none(clamp_id);
+
+ uclamp_se_set(uc_se, value, false);
}
if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)))
return;
- if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN) {
+ if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN &&
+ attr->sched_util_min != -1) {
uclamp_se_set(&p->uclamp_req[UCLAMP_MIN],
attr->sched_util_min, true);
}
- if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX) {
+ if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX &&
+ attr->sched_util_max != -1) {
uclamp_se_set(&p->uclamp_req[UCLAMP_MAX],
attr->sched_util_max, true);
}
@@ -1696,6 +1720,76 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
#ifdef CONFIG_SMP
+static void
+__do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask, u32 flags);
+
+static int __set_cpus_allowed_ptr(struct task_struct *p,
+ const struct cpumask *new_mask,
+ u32 flags);
+
+static void migrate_disable_switch(struct rq *rq, struct task_struct *p)
+{
+ if (likely(!p->migration_disabled))
+ return;
+
+ if (p->cpus_ptr != &p->cpus_mask)
+ return;
+
+ /*
+ * Violates locking rules! see comment in __do_set_cpus_allowed().
+ */
+ __do_set_cpus_allowed(p, cpumask_of(rq->cpu), SCA_MIGRATE_DISABLE);
+}
+
+void migrate_disable(void)
+{
+ struct task_struct *p = current;
+
+ if (p->migration_disabled) {
+ p->migration_disabled++;
+ return;
+ }
+
+ preempt_disable();
+ this_rq()->nr_pinned++;
+ p->migration_disabled = 1;
+ preempt_enable();
+}
+EXPORT_SYMBOL_GPL(migrate_disable);
+
+void migrate_enable(void)
+{
+ struct task_struct *p = current;
+
+ if (p->migration_disabled > 1) {
+ p->migration_disabled--;
+ return;
+ }
+
+ /*
+ * Ensure stop_task runs either before or after this, and that
+ * __set_cpus_allowed_ptr(SCA_MIGRATE_ENABLE) doesn't schedule().
+ */
+ preempt_disable();
+ if (p->cpus_ptr != &p->cpus_mask)
+ __set_cpus_allowed_ptr(p, &p->cpus_mask, SCA_MIGRATE_ENABLE);
+ /*
+ * Mustn't clear migration_disabled() until cpus_ptr points back at the
+ * regular cpus_mask, otherwise things that race (eg.
+ * select_fallback_rq) get confused.
+ */
+ barrier();
+ p->migration_disabled = 0;
+ this_rq()->nr_pinned--;
+ preempt_enable();
+}
+EXPORT_SYMBOL_GPL(migrate_enable);
+
+static inline bool rq_has_pinned_tasks(struct rq *rq)
+{
+ return rq->nr_pinned;
+}
+
/*
* Per-CPU kthreads are allowed to run on !active && online CPUs, see
* __set_cpus_allowed_ptr() and select_fallback_rq().
@@ -1705,7 +1799,7 @@ static inline bool is_cpu_allowed(struct task_struct *p, int cpu)
if (!cpumask_test_cpu(cpu, p->cpus_ptr))
return false;
- if (is_per_cpu_kthread(p))
+ if (is_per_cpu_kthread(p) || is_migration_disabled(p))
return cpu_online(cpu);
return cpu_active(cpu);
@@ -1750,8 +1844,16 @@ static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf,
}
struct migration_arg {
- struct task_struct *task;
- int dest_cpu;
+ struct task_struct *task;
+ int dest_cpu;
+ struct set_affinity_pending *pending;
+};
+
+struct set_affinity_pending {
+ refcount_t refs;
+ struct completion done;
+ struct cpu_stop_work stop_work;
+ struct migration_arg arg;
};
/*
@@ -1783,16 +1885,19 @@ static struct rq *__migrate_task(struct rq *rq, struct rq_flags *rf,
*/
static int migration_cpu_stop(void *data)
{
+ struct set_affinity_pending *pending;
struct migration_arg *arg = data;
struct task_struct *p = arg->task;
+ int dest_cpu = arg->dest_cpu;
struct rq *rq = this_rq();
+ bool complete = false;
struct rq_flags rf;
/*
* The original target CPU might have gone down and we might
* be on another CPU but it doesn't matter.
*/
- local_irq_disable();
+ local_irq_save(rf.flags);
/*
* We need to explicitly wake pending tasks before running
* __migrate_task() such that we will not miss enforcing cpus_ptr
@@ -1802,21 +1907,137 @@ static int migration_cpu_stop(void *data)
raw_spin_lock(&p->pi_lock);
rq_lock(rq, &rf);
+
+ pending = p->migration_pending;
/*
* If task_rq(p) != rq, it cannot be migrated here, because we're
* holding rq->lock, if p->on_rq == 0 it cannot get enqueued because
* we're holding p->pi_lock.
*/
if (task_rq(p) == rq) {
+ if (is_migration_disabled(p))
+ goto out;
+
+ if (pending) {
+ p->migration_pending = NULL;
+ complete = true;
+ }
+
+ /* migrate_enable() -- we must not race against SCA */
+ if (dest_cpu < 0) {
+ /*
+ * When this was migrate_enable() but we no longer
+ * have a @pending, a concurrent SCA 'fixed' things
+ * and we should be valid again. Nothing to do.
+ */
+ if (!pending) {
+ WARN_ON_ONCE(!cpumask_test_cpu(task_cpu(p), &p->cpus_mask));
+ goto out;
+ }
+
+ dest_cpu = cpumask_any_distribute(&p->cpus_mask);
+ }
+
if (task_on_rq_queued(p))
- rq = __migrate_task(rq, &rf, p, arg->dest_cpu);
+ rq = __migrate_task(rq, &rf, p, dest_cpu);
else
- p->wake_cpu = arg->dest_cpu;
+ p->wake_cpu = dest_cpu;
+
+ } else if (dest_cpu < 0 || pending) {
+ /*
+ * This happens when we get migrated between migrate_enable()'s
+ * preempt_enable() and scheduling the stopper task. At that
+ * point we're a regular task again and not current anymore.
+ *
+ * A !PREEMPT kernel has a giant hole here, which makes it far
+ * more likely.
+ */
+
+ /*
+ * The task moved before the stopper got to run. We're holding
+ * ->pi_lock, so the allowed mask is stable - if it got
+ * somewhere allowed, we're done.
+ */
+ if (pending && cpumask_test_cpu(task_cpu(p), p->cpus_ptr)) {
+ p->migration_pending = NULL;
+ complete = true;
+ goto out;
+ }
+
+ /*
+ * When this was migrate_enable() but we no longer have an
+ * @pending, a concurrent SCA 'fixed' things and we should be
+ * valid again. Nothing to do.
+ */
+ if (!pending) {
+ WARN_ON_ONCE(!cpumask_test_cpu(task_cpu(p), &p->cpus_mask));
+ goto out;
+ }
+
+ /*
+ * When migrate_enable() hits a rq mis-match we can't reliably
+ * determine is_migration_disabled() and so have to chase after
+ * it.
+ */
+ task_rq_unlock(rq, p, &rf);
+ stop_one_cpu_nowait(task_cpu(p), migration_cpu_stop,
+ &pending->arg, &pending->stop_work);
+ return 0;
}
- rq_unlock(rq, &rf);
- raw_spin_unlock(&p->pi_lock);
+out:
+ task_rq_unlock(rq, p, &rf);
+
+ if (complete)
+ complete_all(&pending->done);
+
+ /* For pending->{arg,stop_work} */
+ pending = arg->pending;
+ if (pending && refcount_dec_and_test(&pending->refs))
+ wake_up_var(&pending->refs);
- local_irq_enable();
+ return 0;
+}
+
+int push_cpu_stop(void *arg)
+{
+ struct rq *lowest_rq = NULL, *rq = this_rq();
+ struct task_struct *p = arg;
+
+ raw_spin_lock_irq(&p->pi_lock);
+ raw_spin_lock(&rq->lock);
+
+ if (task_rq(p) != rq)
+ goto out_unlock;
+
+ if (is_migration_disabled(p)) {
+ p->migration_flags |= MDF_PUSH;
+ goto out_unlock;
+ }
+
+ p->migration_flags &= ~MDF_PUSH;
+
+ if (p->sched_class->find_lock_rq)
+ lowest_rq = p->sched_class->find_lock_rq(p, rq);
+
+ if (!lowest_rq)
+ goto out_unlock;
+
+ // XXX validate p is still the highest prio task
+ if (task_rq(p) == rq) {
+ deactivate_task(rq, p, 0);
+ set_task_cpu(p, lowest_rq->cpu);
+ activate_task(lowest_rq, p, 0);
+ resched_curr(lowest_rq);
+ }
+
+ double_unlock_balance(rq, lowest_rq);
+
+out_unlock:
+ rq->push_busy = false;
+ raw_spin_unlock(&rq->lock);
+ raw_spin_unlock_irq(&p->pi_lock);
+
+ put_task_struct(p);
return 0;
}
@@ -1824,18 +2045,39 @@ static int migration_cpu_stop(void *data)
* sched_class::set_cpus_allowed must do the below, but is not required to
* actually call this function.
*/
-void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask)
+void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask, u32 flags)
{
+ if (flags & (SCA_MIGRATE_ENABLE | SCA_MIGRATE_DISABLE)) {
+ p->cpus_ptr = new_mask;
+ return;
+ }
+
cpumask_copy(&p->cpus_mask, new_mask);
p->nr_cpus_allowed = cpumask_weight(new_mask);
}
-void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
+static void
+__do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask, u32 flags)
{
struct rq *rq = task_rq(p);
bool queued, running;
- lockdep_assert_held(&p->pi_lock);
+ /*
+ * This here violates the locking rules for affinity, since we're only
+ * supposed to change these variables while holding both rq->lock and
+ * p->pi_lock.
+ *
+ * HOWEVER, it magically works, because ttwu() is the only code that
+ * accesses these variables under p->pi_lock and only does so after
+ * smp_cond_load_acquire(&p->on_cpu, !VAL), and we're in __schedule()
+ * before finish_task().
+ *
+ * XXX do further audits, this smells like something putrid.
+ */
+ if (flags & SCA_MIGRATE_DISABLE)
+ SCHED_WARN_ON(!p->on_cpu);
+ else
+ lockdep_assert_held(&p->pi_lock);
queued = task_on_rq_queued(p);
running = task_current(rq, p);
@@ -1851,7 +2093,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
if (running)
put_prev_task(rq, p);
- p->s