summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/cgroup/cgroup.c2
-rw-r--r--kernel/fork.c9
-rw-r--r--kernel/livepatch/core.c1
-rw-r--r--kernel/livepatch/transition.c122
-rw-r--r--kernel/sched/clock.c3
-rw-r--r--kernel/sched/core.c669
-rw-r--r--kernel/sched/deadline.c11
-rw-r--r--kernel/sched/fair.c22
-rw-r--r--kernel/sched/psi.c473
-rw-r--r--kernel/sched/rt.c23
-rw-r--r--kernel/sched/sched.h243
-rw-r--r--kernel/sched/topology.c4
12 files changed, 1266 insertions, 316 deletions
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index ae518166d70a..f2a77b1b45a2 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -3771,7 +3771,7 @@ static ssize_t pressure_write(struct kernfs_open_file *of, char *buf,
}
psi = cgroup_psi(cgrp);
- new = psi_trigger_create(psi, buf, res);
+ new = psi_trigger_create(psi, buf, res, of->file);
if (IS_ERR(new)) {
cgroup_put(cgrp);
return PTR_ERR(new);
diff --git a/kernel/fork.c b/kernel/fork.c
index 4342200d5e2b..eccb35a85216 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -924,6 +924,7 @@ void __mmdrop(struct mm_struct *mm)
check_mm(mm);
put_user_ns(mm->user_ns);
mm_pasid_drop(mm);
+ mm_destroy_cid(mm);
for (i = 0; i < NR_MM_COUNTERS; i++)
percpu_counter_destroy(&mm->rss_stat[i]);
@@ -1188,7 +1189,9 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
#ifdef CONFIG_SCHED_MM_CID
tsk->mm_cid = -1;
+ tsk->last_mm_cid = -1;
tsk->mm_cid_active = 0;
+ tsk->migrate_from_cpu = -1;
#endif
return tsk;
@@ -1296,18 +1299,22 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
if (init_new_context(p, mm))
goto fail_nocontext;
+ if (mm_alloc_cid(mm))
+ goto fail_cid;
+
for (i = 0; i < NR_MM_COUNTERS; i++)
if (percpu_counter_init(&mm->rss_stat[i], 0, GFP_KERNEL_ACCOUNT))
goto fail_pcpu;
mm->user_ns = get_user_ns(user_ns);
lru_gen_init_mm(mm);
- mm_init_cid(mm);
return mm;
fail_pcpu:
while (i > 0)
percpu_counter_destroy(&mm->rss_stat[--i]);
+ mm_destroy_cid(mm);
+fail_cid:
destroy_context(mm);
fail_nocontext:
mm_free_pgd(mm);
diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c
index fc851455740c..61328328c474 100644
--- a/kernel/livepatch/core.c
+++ b/kernel/livepatch/core.c
@@ -33,6 +33,7 @@
*
* - klp_ftrace_handler()
* - klp_update_patch_state()
+ * - __klp_sched_try_switch()
*/
DEFINE_MUTEX(klp_mutex);
diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c
index f1b25ec581e0..e9fd83a02228 100644
--- a/kernel/livepatch/transition.c
+++ b/kernel/livepatch/transition.c
@@ -9,11 +9,14 @@
#include <linux/cpu.h>
#include <linux/stacktrace.h>
+#include <linux/static_call.h>
#include "core.h"
#include "patch.h"
#include "transition.h"
#define MAX_STACK_ENTRIES 100
+DEFINE_PER_CPU(unsigned long[MAX_STACK_ENTRIES], klp_stack_entries);
+
#define STACK_ERR_BUF_SIZE 128
#define SIGNALS_TIMEOUT 15
@@ -25,6 +28,25 @@ static int klp_target_state = KLP_UNDEFINED;
static unsigned int klp_signals_cnt;
/*
+ * When a livepatch is in progress, enable klp stack checking in
+ * cond_resched(). This helps CPU-bound kthreads get patched.
+ */
+#if defined(CONFIG_PREEMPT_DYNAMIC) && defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
+
+#define klp_cond_resched_enable() sched_dynamic_klp_enable()
+#define klp_cond_resched_disable() sched_dynamic_klp_disable()
+
+#else /* !CONFIG_PREEMPT_DYNAMIC || !CONFIG_HAVE_PREEMPT_DYNAMIC_CALL */
+
+DEFINE_STATIC_KEY_FALSE(klp_sched_try_switch_key);
+EXPORT_SYMBOL(klp_sched_try_switch_key);
+
+#define klp_cond_resched_enable() static_branch_enable(&klp_sched_try_switch_key)
+#define klp_cond_resched_disable() static_branch_disable(&klp_sched_try_switch_key)
+
+#endif /* CONFIG_PREEMPT_DYNAMIC && CONFIG_HAVE_PREEMPT_DYNAMIC_CALL */
+
+/*
* This work can be performed periodically to finish patching or unpatching any
* "straggler" tasks which failed to transition in the first attempt.
*/
@@ -172,8 +194,8 @@ void klp_update_patch_state(struct task_struct *task)
* barrier (smp_rmb) for two cases:
*
* 1) Enforce the order of the TIF_PATCH_PENDING read and the
- * klp_target_state read. The corresponding write barrier is in
- * klp_init_transition().
+ * klp_target_state read. The corresponding write barriers are in
+ * klp_init_transition() and klp_reverse_transition().
*
* 2) Enforce the order of the TIF_PATCH_PENDING read and a future read
* of func->transition, if klp_ftrace_handler() is called later on
@@ -240,12 +262,15 @@ static int klp_check_stack_func(struct klp_func *func, unsigned long *entries,
*/
static int klp_check_stack(struct task_struct *task, const char **oldname)
{
- static unsigned long entries[MAX_STACK_ENTRIES];
+ unsigned long *entries = this_cpu_ptr(klp_stack_entries);
struct klp_object *obj;
struct klp_func *func;
int ret, nr_entries;
- ret = stack_trace_save_tsk_reliable(task, entries, ARRAY_SIZE(entries));
+ /* Protect 'klp_stack_entries' */
+ lockdep_assert_preemption_disabled();
+
+ ret = stack_trace_save_tsk_reliable(task, entries, MAX_STACK_ENTRIES);
if (ret < 0)
return -EINVAL;
nr_entries = ret;
@@ -307,7 +332,11 @@ static bool klp_try_switch_task(struct task_struct *task)
* functions. If all goes well, switch the task to the target patch
* state.
*/
- ret = task_call_func(task, klp_check_and_switch_task, &old_name);
+ if (task == current)
+ ret = klp_check_and_switch_task(current, &old_name);
+ else
+ ret = task_call_func(task, klp_check_and_switch_task, &old_name);
+
switch (ret) {
case 0: /* success */
break;
@@ -334,6 +363,44 @@ static bool klp_try_switch_task(struct task_struct *task)
return !ret;
}
+void __klp_sched_try_switch(void)
+{
+ if (likely(!klp_patch_pending(current)))
+ return;
+
+ /*
+ * This function is called from cond_resched() which is called in many
+ * places throughout the kernel. Using the klp_mutex here might
+ * deadlock.
+ *
+ * Instead, disable preemption to prevent racing with other callers of
+ * klp_try_switch_task(). Thanks to task_call_func() they won't be
+ * able to switch this task while it's running.
+ */
+ preempt_disable();
+
+ /*
+ * Make sure current didn't get patched between the above check and
+ * preempt_disable().
+ */
+ if (unlikely(!klp_patch_pending(current)))
+ goto out;
+
+ /*
+ * Enforce the order of the TIF_PATCH_PENDING read above and the
+ * klp_target_state read in klp_try_switch_task(). The corresponding
+ * write barriers are in klp_init_transition() and
+ * klp_reverse_transition().
+ */
+ smp_rmb();
+
+ klp_try_switch_task(current);
+
+out:
+ preempt_enable();
+}
+EXPORT_SYMBOL(__klp_sched_try_switch);
+
/*
* Sends a fake signal to all non-kthread tasks with TIF_PATCH_PENDING set.
* Kthreads with TIF_PATCH_PENDING set are woken up.
@@ -440,7 +507,8 @@ void klp_try_complete_transition(void)
return;
}
- /* we're done, now cleanup the data structures */
+ /* Done! Now cleanup the data structures. */
+ klp_cond_resched_disable();
patch = klp_transition_patch;
klp_complete_transition();
@@ -492,6 +560,8 @@ void klp_start_transition(void)
set_tsk_thread_flag(task, TIF_PATCH_PENDING);
}
+ klp_cond_resched_enable();
+
klp_signals_cnt = 0;
}
@@ -547,8 +617,9 @@ void klp_init_transition(struct klp_patch *patch, int state)
* see a func in transition with a task->patch_state of KLP_UNDEFINED.
*
* Also enforce the order of the klp_target_state write and future
- * TIF_PATCH_PENDING writes to ensure klp_update_patch_state() doesn't
- * set a task->patch_state to KLP_UNDEFINED.
+ * TIF_PATCH_PENDING writes to ensure klp_update_patch_state() and
+ * __klp_sched_try_switch() don't set a task->patch_state to
+ * KLP_UNDEFINED.
*/
smp_wmb();
@@ -584,14 +655,10 @@ void klp_reverse_transition(void)
klp_target_state == KLP_PATCHED ? "patching to unpatching" :
"unpatching to patching");
- klp_transition_patch->enabled = !klp_transition_patch->enabled;
-
- klp_target_state = !klp_target_state;
-
/*
* Clear all TIF_PATCH_PENDING flags to prevent races caused by
- * klp_update_patch_state() running in parallel with
- * klp_start_transition().
+ * klp_update_patch_state() or __klp_sched_try_switch() running in
+ * parallel with the reverse transition.
*/
read_lock(&tasklist_lock);
for_each_process_thread(g, task)
@@ -601,9 +668,28 @@ void klp_reverse_transition(void)
for_each_possible_cpu(cpu)
clear_tsk_thread_flag(idle_task(cpu), TIF_PATCH_PENDING);
- /* Let any remaining calls to klp_update_patch_state() complete */
+ /*
+ * Make sure all existing invocations of klp_update_patch_state() and
+ * __klp_sched_try_switch() see the cleared TIF_PATCH_PENDING before
+ * starting the reverse transition.
+ */
klp_synchronize_transition();
+ /*
+ * All patching has stopped, now re-initialize the global variables to
+ * prepare for the reverse transition.
+ */
+ klp_transition_patch->enabled = !klp_transition_patch->enabled;
+ klp_target_state = !klp_target_state;
+
+ /*
+ * Enforce the order of the klp_target_state write and the
+ * TIF_PATCH_PENDING writes in klp_start_transition() to ensure
+ * klp_update_patch_state() and __klp_sched_try_switch() don't set
+ * task->patch_state to the wrong value.
+ */
+ smp_wmb();
+
klp_start_transition();
}
@@ -617,9 +703,9 @@ void klp_copy_process(struct task_struct *child)
* the task flag up to date with the parent here.
*
* The operation is serialized against all klp_*_transition()
- * operations by the tasklist_lock. The only exception is
- * klp_update_patch_state(current), but we cannot race with
- * that because we are current.
+ * operations by the tasklist_lock. The only exceptions are
+ * klp_update_patch_state(current) and __klp_sched_try_switch(), but we
+ * cannot race with them because we are current.
*/
if (test_tsk_thread_flag(current, TIF_PATCH_PENDING))
set_tsk_thread_flag(child, TIF_PATCH_PENDING);
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index 5732fa75ebab..b5cc2b53464d 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -300,6 +300,9 @@ noinstr u64 local_clock(void)
if (static_branch_likely(&__sched_clock_stable))
return sched_clock() + __sched_clock_offset;
+ if (!static_branch_likely(&sched_clock_running))
+ return sched_clock();
+
preempt_disable_notrace();
clock = sched_clock_local(this_scd());
preempt_enable_notrace();
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 8d2b6742d02c..54c75af24899 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -261,36 +261,51 @@ void sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags)
resched_curr(rq);
}
-/*
- * Find left-most (aka, highest priority) task matching @cookie.
- */
-static struct task_struct *sched_core_find(struct rq *rq, unsigned long cookie)
+static int sched_task_is_throttled(struct task_struct *p, int cpu)
{
- struct rb_node *node;
-
- node = rb_find_first((void *)cookie, &rq->core_tree, rb_sched_core_cmp);
- /*
- * The idle task always matches any cookie!
- */
- if (!node)
- return idle_sched_class.pick_task(rq);
+ if (p->sched_class->task_is_throttled)
+ return p->sched_class->task_is_throttled(p, cpu);
- return __node_2_sc(node);
+ return 0;
}
static struct task_struct *sched_core_next(struct task_struct *p, unsigned long cookie)
{
struct rb_node *node = &p->core_node;
+ int cpu = task_cpu(p);
- node = rb_next(node);
+ do {
+ node = rb_next(node);
+ if (!node)
+ return NULL;
+
+ p = __node_2_sc(node);
+ if (p->core_cookie != cookie)
+ return NULL;
+
+ } while (sched_task_is_throttled(p, cpu));
+
+ return p;
+}
+
+/*
+ * Find left-most (aka, highest priority) and unthrottled task matching @cookie.
+ * If no suitable task is found, NULL will be returned.
+ */
+static struct task_struct *sched_core_find(struct rq *rq, unsigned long cookie)
+{
+ struct task_struct *p;
+ struct rb_node *node;
+
+ node = rb_find_first((void *)cookie, &rq->core_tree, rb_sched_core_cmp);
if (!node)
return NULL;
- p = container_of(node, struct task_struct, core_node);
- if (p->core_cookie != cookie)
- return NULL;
+ p = __node_2_sc(node);
+ if (!sched_task_is_throttled(p, rq->cpu))
+ return p;
- return p;
+ return sched_core_next(p, cookie);
}
/*
@@ -2087,6 +2102,8 @@ void activate_task(struct rq *rq, struct task_struct *p, int flags)
{
if (task_on_rq_migrating(p))
flags |= ENQUEUE_MIGRATED;
+ if (flags & ENQUEUE_MIGRATED)
+ sched_mm_cid_migrate_to(rq, p);
enqueue_task(rq, p, flags);
@@ -3196,6 +3213,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
p->sched_class->migrate_task_rq(p, new_cpu);
p->se.nr_migrations++;
rseq_migrate(p);
+ sched_mm_cid_migrate_from(p);
perf_event_task_migrate(p);
}
@@ -4469,6 +4487,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
p->wake_entry.u_flags = CSD_TYPE_TTWU;
p->migration_pending = NULL;
#endif
+ init_sched_mm_cid(p);
}
DEFINE_STATIC_KEY_FALSE(sched_numa_balancing);
@@ -5115,7 +5134,6 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev,
sched_info_switch(rq, prev, next);
perf_event_task_sched_out(prev, next);
rseq_preempt(prev);
- switch_mm_cid(prev, next);
fire_sched_out_preempt_notifiers(prev, next);
kmap_local_sched_out();
prepare_task(next);
@@ -5272,6 +5290,9 @@ context_switch(struct rq *rq, struct task_struct *prev,
*
* kernel -> user switch + mmdrop_lazy_tlb() active
* user -> user switch
+ *
+ * switch_mm_cid() needs to be updated if the barriers provided
+ * by context_switch() are modified.
*/
if (!next->mm) { // to kernel
enter_lazy_tlb(prev->active_mm, next);
@@ -5301,6 +5322,9 @@ context_switch(struct rq *rq, struct task_struct *prev,
}
}
+ /* switch_mm_cid() requires the memory barriers above. */
+ switch_mm_cid(rq, prev, next);
+
rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
prepare_lock_switch(rq, next, rf);
@@ -5589,6 +5613,7 @@ void scheduler_tick(void)
resched_latency = cpu_resched_latency(rq);
calc_global_load_tick(rq);
sched_core_tick(rq);
+ task_tick_mm_cid(rq, curr);
rq_unlock(rq, &rf);
@@ -6241,7 +6266,7 @@ static bool try_steal_cookie(int this, int that)
goto unlock;
p = sched_core_find(src, cookie);
- if (p == src->idle)
+ if (!p)
goto unlock;
do {
@@ -6253,6 +6278,13 @@ static bool try_steal_cookie(int this, int that)
if (p->core_occupation > dst->idle->core_occupation)
goto next;
+ /*
+ * sched_core_find() and sched_core_next() will ensure that task @p
+ * is not throttled now, we also need to check whether the runqueue
+ * of the destination CPU is being throttled.
+ */
+ if (sched_task_is_throttled(p, this))
+ goto next;
deactivate_task(src, p, 0);
set_task_cpu(p, this);
@@ -8508,6 +8540,7 @@ EXPORT_STATIC_CALL_TRAMP(might_resched);
static DEFINE_STATIC_KEY_FALSE(sk_dynamic_cond_resched);
int __sched dynamic_cond_resched(void)
{
+ klp_sched_try_switch();
if (!static_branch_unlikely(&sk_dynamic_cond_resched))
return 0;
return __cond_resched();
@@ -8656,13 +8689,17 @@ int sched_dynamic_mode(const char *str)
#error "Unsupported PREEMPT_DYNAMIC mechanism"
#endif
-void sched_dynamic_update(int mode)
+static DEFINE_MUTEX(sched_dynamic_mutex);
+static bool klp_override;
+
+static void __sched_dynamic_update(int mode)
{
/*
* Avoid {NONE,VOLUNTARY} -> FULL transitions from ever ending up in
* the ZERO state, which is invalid.
*/
- preempt_dynamic_enable(cond_resched);
+ if (!klp_override)
+ preempt_dynamic_enable(cond_resched);
preempt_dynamic_enable(might_resched);
preempt_dynamic_enable(preempt_schedule);
preempt_dynamic_enable(preempt_schedule_notrace);
@@ -8670,36 +8707,79 @@ void sched_dynamic_update(int mode)
switch (mode) {
case preempt_dynamic_none:
- preempt_dynamic_enable(cond_resched);
+ if (!klp_override)
+ preempt_dynamic_enable(cond_resched);
preempt_dynamic_disable(might_resched);
preempt_dynamic_disable(preempt_schedule);
preempt_dynamic_disable(preempt_schedule_notrace);
preempt_dynamic_disable(irqentry_exit_cond_resched);
- pr_info("Dynamic Preempt: none\n");
+ if (mode != preempt_dynamic_mode)
+ pr_info("Dynamic Preempt: none\n");
break;
case preempt_dynamic_voluntary:
- preempt_dynamic_enable(cond_resched);
+ if (!klp_override)
+ preempt_dynamic_enable(cond_resched);
preempt_dynamic_enable(might_resched);
preempt_dynamic_disable(preempt_schedule);
preempt_dynamic_disable(preempt_schedule_notrace);
preempt_dynamic_disable(irqentry_exit_cond_resched);
- pr_info("Dynamic Preempt: voluntary\n");
+ if (mode != preempt_dynamic_mode)
+ pr_info("Dynamic Preempt: voluntary\n");
break;
case preempt_dynamic_full:
- preempt_dynamic_disable(cond_resched);
+ if (!klp_override)
+ preempt_dynamic_disable(cond_resched);
preempt_dynamic_disable(might_resched);
preempt_dynamic_enable(preempt_schedule);
preempt_dynamic_enable(preempt_schedule_notrace);
preempt_dynamic_enable(irqentry_exit_cond_resched);
- pr_info("Dynamic Preempt: full\n");
+ if (mode != preempt_dynamic_mode)
+ pr_info("Dynamic Preempt: full\n");
break;
}
preempt_dynamic_mode = mode;
}
+void sched_dynamic_update(int mode)
+{
+ mutex_lock(&sched_dynamic_mutex);
+ __sched_dynamic_update(mode);
+ mutex_unlock(&sched_dynamic_mutex);
+}
+
+#ifdef CONFIG_HAVE_PREEMPT_DYNAMIC_CALL
+
+static int klp_cond_resched(void)
+{
+ __klp_sched_try_switch();
+ return __cond_resched();
+}
+
+void sched_dynamic_klp_enable(void)
+{
+ mutex_lock(&sched_dynamic_mutex);
+
+ klp_override = true;
+ static_call_update(cond_resched, klp_cond_resched);
+
+ mutex_unlock(&sched_dynamic_mutex);
+}
+
+void sched_dynamic_klp_disable(void)
+{
+ mutex_lock(&sched_dynamic_mutex);
+
+ klp_override = false;
+ __sched_dynamic_update(preempt_dynamic_mode);
+
+ mutex_unlock(&sched_dynamic_mutex);
+}
+
+#endif /* CONFIG_HAVE_PREEMPT_DYNAMIC_CALL */
+
static int __init setup_preempt_mode(char *str)
{
int mode = sched_dynamic_mode(str);
@@ -10334,7 +10414,7 @@ void sched_release_group(struct task_group *tg)
spin_unlock_irqrestore(&task_group_lock, flags);
}
-static void sched_change_group(struct task_struct *tsk)
+static struct task_group *sched_get_task_group(struct task_struct *tsk)
{
struct task_group *tg;
@@ -10346,7 +10426,13 @@ static void sched_change_group(struct task_struct *tsk)
tg = container_of(task_css_check(tsk, cpu_cgrp_id, true),
struct task_group, css);
tg = autogroup_task_group(tsk, tg);
- tsk->sched_task_group = tg;
+
+ return tg;
+}
+
+static void sched_change_group(struct task_struct *tsk, struct task_group *group)
+{
+ tsk->sched_task_group = group;
#ifdef CONFIG_FAIR_GROUP_SCHED
if (tsk->sched_class->task_change_group)
@@ -10367,10 +10453,19 @@ void sched_move_task(struct task_struct *tsk)
{
int queued, running, queue_flags =
DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
+ struct task_group *group;
struct rq_flags rf;
struct rq *rq;
rq = task_rq_lock(tsk, &rf);
+ /*
+ * Esp. with SCHED_AUTOGROUP enabled it is possible to get superfluous
+ * group changes.
+ */
+ group = sched_get_task_group(tsk);
+ if (group == tsk->sched_task_group)
+ goto unlock;
+
update_rq_clock(rq);
running = task_current(rq, tsk);
@@ -10381,7 +10476,7 @@ void sched_move_task(struct task_struct *tsk)
if (running)
put_prev_task(rq, tsk);
- sched_change_group(tsk);
+ sched_change_group(tsk, group);
if (queued)
enqueue_task(rq, tsk, queue_flags);
@@ -10395,6 +10490,7 @@ void sched_move_task(struct task_struct *tsk)
resched_curr(rq);
}
+unlock:
task_rq_unlock(rq, tsk, &rf);
}
@@ -11385,45 +11481,524 @@ void call_trace_sched_update_nr_running(struct rq *rq, int count)
}
#ifdef CONFIG_SCHED_MM_CID
-void sched_mm_cid_exit_signals(struct task_struct *t)
+
+/**
+ * @cid_lock: Guarantee forward-progress of cid allocation.
+ *
+ * Concurrency ID allocation within a bitmap is mostly lock-free. The cid_lock
+ * is only used when contention is detected by the lock-free allocation so
+ * forward progress can be guaranteed.
+ */
+DEFINE_RAW_SPINLOCK(cid_lock);
+
+/**
+ * @use_cid_lock: Select cid allocation behavior: lock-free vs spinlock.
+ *
+ * When @use_cid_lock is 0, the cid allocation is lock-free. When contention is
+ * detected, it is set to 1 to ensure that all newly coming allocations are
+ * serialized by @cid_lock until the allocation which detected contention
+ * completes and sets @use_cid_lock back to 0. This guarantees forward progress
+ * of a cid allocation.
+ */
+int use_cid_lock;
+
+/*
+ * mm_cid remote-clear implements a lock-free algorithm to clear per-mm/cpu cid
+ * concurrently with respect to the execution of the source runqueue context
+ * switch.
+ *
+ * There is one basic properties we want to guarantee here:
+ *
+ * (1) Remote-clear should _never_ mark a per-cpu cid UNSET when it is actively
+ * used by a task. That would lead to concurrent allocation of the cid and
+ * userspace corruption.
+ *
+ * Provide this guarantee by introducing a Dekker memory ordering to guarantee
+ * that a pair of loads observe at least one of a pair of stores, which can be
+ * shown as:
+ *
+ * X = Y = 0
+ *
+ * w[X]=1 w[Y]=1
+ * MB MB
+ * r[Y]=y r[X]=x
+ *
+ * Which guarantees that x==0 && y==0 is impossible. But rather than using
+ * values 0 and 1, this algorithm cares about specific state transitions of the
+ * runqueue current task (as updated by the scheduler context switch), and the
+ * per-mm/cpu cid value.
+ *
+ * Let's introduce task (Y) which has task->mm == mm and task (N) which has
+ * task->mm != mm for the rest of the discussion. There are two scheduler state
+ * transitions on context switch we care about:
+ *
+ * (TSA) Store to rq->curr with transition from (N) to (Y)
+ *
+ * (TSB) Store to rq->curr with transition from (Y) to (N)
+ *
+ * On the remote-clear side, there is one transition we care about:
+ *
+ * (TMA) cmpxchg to *pcpu_cid to set the LAZY flag
+ *
+ * There is also a transition to UNSET state which can be performed from all
+ * sides (scheduler, remote-clear). It is always performed with a cmpxchg which
+ * guarantees that only a single thread will succeed:
+ *
+ * (TMB) cmpxchg to *pcpu_cid to mark UNSET
+ *
+ * Just to be clear, what we do _not_ want to happen is a transition to UNSET
+ * when a thread is actively using the cid (property (1)).
+ *
+ * Let's looks at the relevant combinations of TSA/TSB, and TMA transitions.
+ *
+ * Scenario A) (TSA)+(TMA) (from next task perspective)
+ *
+ * CPU0 CPU1
+ *
+ * Context switch CS-1 Remote-clear
+ * - store to rq->curr: (N)->(Y) (TSA) - cmpxchg to *pcpu_id to LAZY (TMA)
+ * (implied barrier after cmpxchg)
+ * - switch_mm_cid()
+ * - memory barrier (see switch_mm_cid()
+ * comment explaining how this barrier
+ * is combined with other scheduler
+ * barriers)
+ * - mm_cid_get (next)
+ * - READ_ONCE(*pcpu_cid) - rcu_dereference(src_rq->curr)
+ *
+ * This Dekker ensures that either task (Y) is observed by the
+ * rcu_dereference() or the LAZY flag is observed by READ_ONCE(), or both are
+ * observed.
+ *
+ * If task (Y) store is observed by rcu_dereference(), it means that there is
+ * still an active task on the cpu. Remote-clear will therefore not transition
+ * to UNSET, which fulfills property (1).
+ *
+ * If task (Y) is not observed, but the lazy flag is observed by READ_ONCE(),
+ * it will move its state to UNSET, which clears the percpu cid perhaps
+ * uselessly (which is not an issue for correctness). Because task (Y) is not
+ * observed, CPU1 can move ahead to set the state to UNSET. Because moving
+ * state to UNSET is done with a cmpxchg expecting that the old state has the
+ * LAZY flag set, only one thread will successfully UNSET.
+ *
+ * If both states (LAZY flag and task (Y)) are observed, the thread on CPU0
+ * will observe the LAZY flag and transition to UNSET (perhaps uselessly), and
+ * CPU1 will observe task (Y) and do nothing more, which is fine.
+ *
+ * What we are effectively preventing with this Dekker is a scenario where
+ * neither LAZY flag nor store (Y) are observed, which would fail property (1)
+ * because this would UNSET a cid which is actively used.
+ */
+
+void sched_mm_cid_migrate_from(struct task_struct *t)
+{
+ t->migrate_from_cpu = task_cpu(t);
+}
+
+static
+int __sched_mm_cid_migrate_from_fetch_cid(struct rq *src_rq,
+ struct task_struct *t,
+ struct mm_cid *src_pcpu_cid)
{
struct mm_struct *mm = t->mm;
- unsigned long flags;
+ struct task_struct *src_task;
+ int src_cid, last_mm_cid;
+
+ if (!mm)
+ return -1;
+
+ last_mm_cid = t->last_mm_cid;
+ /*
+ * If the migrated task has no last cid, or if the current
+ * task on src rq uses the cid, it means the source cid does not need
+ * to be moved to the destination cpu.
+ */
+ if (last_mm_cid == -1)
+ return -1;
+ src_cid = READ_ONCE(src_pcpu_cid->cid);
+ if (!mm_cid_is_valid(src_cid) || last_mm_cid != src_cid)
+ return -1;
+
+ /*
+ * If we observe an active task using the mm on this rq, it means we
+ * are not the last task to be migrated from this cpu for this mm, so
+ * there is no need to move src_cid to the destination cpu.
+ */
+ rcu_read_lock();
+ src_task = rcu_dereference(src_rq->curr);
+ if (READ_ONCE(src_task->mm_cid_active) && src_task->mm == mm) {
+ rcu_read_unlock();
+ t->last_mm_cid = -1;
+ return -1;
+ }
+ rcu_read_unlock();
+
+ return src_cid;
+}
+
+static
+int __sched_mm_cid_migrate_from_try_steal_cid(struct rq *src_rq,
+ struct task_struct *t,
+ struct mm_cid *src_pcpu_cid,
+ int src_cid)
+{
+ struct task_struct *src_task;
+ struct mm_struct *mm = t->mm;
+ int lazy_cid;
+
+ if (src_cid == -1)
+ return -1;
+
+ /*
+ * Attempt to clear the source cpu cid to move it to the destination
+ * cpu.
+ */
+ lazy_cid = mm_cid_set_lazy_put(src_cid);
+ if (!try_cmpxchg(&src_pcpu_cid->cid, &src_cid, lazy_cid))
+ return -1;
+
+ /*
+ * The implicit barrier after cmpxchg per-mm/cpu cid before loading
+ * rq->curr->mm matches the scheduler barrier in context_switch()
+ * between store to rq->curr and load of prev and next task's
+ * per-mm/cpu cid.
+ *
+ * The implicit barrier after cmpxchg per-mm/cpu cid before loading
+ * rq->curr->mm_cid_active matches the barrier in
+ * sched_mm_cid_exit_signals(), sched_mm_cid_before_execve(), and
+ * sched_mm_cid_after_execve() between store to t->mm_cid_active and
+ * load of per-mm/cpu cid.
+ */
+
+ /*
+ * If we observe an active task using the mm on this rq after setting
+ * the lazy-put flag, this task will be responsible for transitioning
+ * from lazy-put flag set to MM_CID_UNSET.
+ */
+ rcu_read_lock();
+ src_task = rcu_dereference(src_rq->curr);
+ if (READ_ONCE(src_task->mm_cid_active) && src_task->mm == mm) {
+ rcu_read_unlock();
+ /*
+ * We observed an active task for this mm, there is therefore
+ * no point in moving this cid to the destination cpu.
+ */
+ t->last_mm_cid = -1;
+ return -1;
+ }
+ rcu_read_unlock();
+
+ /*
+ * The src_cid is unused, so it can be unset.
+ */
+ if (!try_cmpxchg(&src_pcpu_cid->cid, &lazy_cid, MM_CID_UNSET))
+ return -1;
+ return src_cid;
+}
+
+/*
+ * Migration to dst cpu. Called with dst_rq lock held.
+ * Interrupts are disabled, which keeps the window of cid ownership without the
+ * source rq lock held small.
+ */
+void sched_mm_cid_migrate_to(struct rq *dst_rq, struct task_struct *t)
+{
+ struct mm_cid *src_pcpu_cid, *dst_pcpu_cid;
+ struct mm_struct *mm = t->mm;
+ int src_cid, dst_cid, src_cpu;
+ struct rq *src_rq;
+
+ lockdep_assert_rq_held(dst_rq);
if (!mm)
return;
+ src_cpu = t->migrate_from_cpu;
+ if (src_cpu == -1) {
+ t->last_mm_cid = -1;
+ return;
+ }
+ /*
+ * Move the src cid if the dst cid is unset. This keeps id
+ * allocation closest to 0 in cases where few threads migrate around
+ * many cpus.
+ *
+ * If destination cid is already set, we may have to just clear
+ * the src cid to ensure compactness in frequent migrations
+ * scenarios.
+ *
+ * It is not useful to clear the src cid when the number of threads is
+ * greater or equal to the number of allowed cpus, because user-space
+ * can expect that the number of allowed cids can reach the number of
+ * allowed cpus.
+ */
+ dst_pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu_of(dst_rq));
+ dst_cid = READ_ONCE(dst_pcpu_cid->cid);
+ if (!mm_cid_is_unset(dst_cid) &&
+ atomic_read(&mm->mm_users) >= t->nr_cpus_allowed)
+ return;
+ src_pcpu_cid = per_cpu_ptr(mm->pcpu_cid, src_cpu);
+ src_rq = cpu_rq(src_cpu);
+ src_cid = __sched_mm_cid_migrate_from_fetch_cid(src_rq, t, src_pcpu_cid);
+ if (src_cid == -1)
+ return;
+ src_cid = __sched_mm_cid_migrate_from_try_steal_cid(src_rq, t, src_pcpu_cid,
+ src_cid);
+ if (src_cid == -1)
+ return;
+ if (!mm_cid_is_unset(dst_cid)) {
+ __mm_cid_put(mm, src_cid);
+ return;
+ }
+ /* Move src_cid to dst cpu. */
+ mm_cid_snapshot_time(dst_rq, mm);
+ WRITE_ONCE(dst_pcpu_cid->cid, src_cid);
+}
+
+static void sched_mm_cid_remote_clear(struct mm_struct *mm, struct mm_cid *pcpu_cid,
+ int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+ struct task_struct *t;
+ unsigned long flags;
+ int cid, lazy_cid;
+
+ cid = READ_ONCE(pcpu_cid->cid);
+ if (!mm_cid_is_valid(cid))
+ return;
+
+ /*
+ * Clear the cpu cid if it is set to keep cid allocation compact. If
+ * there happens to be other tasks left on the source cpu using this
+ * mm, the next task using this mm will reallocate its cid on context
+ * switch.
+ */
+ lazy_cid = mm_cid_set_lazy_put(cid);
+ if (!try_cmpxchg(&pcpu_cid->cid, &cid, lazy_cid))
+ return;
+
+ /*
+ * The implicit barrier after cmpxchg per-mm/cpu cid before loading
+ * rq->curr->mm matches the scheduler barrier in context_switch()
+ * between store to rq->curr and load of prev and next task's
+ * per-mm/cpu cid.
+ *
+ * The implicit barrier after cmpxchg per-mm/cpu cid before loading
+ * rq->curr->mm_cid_active matches the barrier in
+ * sched_mm_cid_exit_signals(), sched_mm_cid_before_execve(), and
+ * sched_mm_cid_after_execve() between store to t->mm_cid_active and
+ * load of per-mm/cpu cid.
+ */
+
+ /*
+ * If we observe an active task using the mm on this rq after setting
+ * the lazy-put flag, that task will be responsible for transitioning
+ * from lazy-put flag set to MM_CID_UNSET.
+ */
+ rcu_read_lock();
+ t = rcu_dereference(rq->curr);
+ if (READ_ONCE(t->mm_cid_active) && t->mm == mm) {
+ rcu_read_unlock();
+ return;
+ }
+ rcu_read_unlock();
+
+ /*
+ * The cid is unused, so it can be unset.
+ * Disable interrupts to keep the window of cid ownership without rq
+ * lock small.
+ */
local_irq_save(flags);
- mm_cid_put(mm, t->mm_cid);
- t->mm_cid = -1;
- t->mm_cid_active = 0;
+ if (try_cmpxchg(&pcpu_cid->cid, &lazy_cid, MM_CID_UNSET))
+ __mm_cid_put(mm, cid);
local_irq_restore(flags);
}
+static void sched_mm_cid_remote_clear_old(struct mm_struct *mm, int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+ struct mm_cid *pcpu_cid;
+ struct task_struct *curr;
+ u64 rq_clock;
+
+ /*
+ * rq->clock load is racy on 32-bit but one spurious clear once in a
+ * while is irrelevant.
+ */
+ rq_clock = READ_ONCE(rq->clock);
+ pcpu_cid = per_cpu_ptr(mm->pcpu_cid, cpu);
+
+ /*
+ * In order to take care of infrequently scheduled tasks, bump the time
+ * snapshot associated with this cid if an active task using the mm is
+ * observed on this rq.
+ */
+ rcu_read_lock();
+ curr = rcu_dereference(rq->curr);
+ if (READ_ONCE(curr->mm