summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Documentation/accounting/psi.rst4
-rw-r--r--drivers/vhost/vhost.c3
-rw-r--r--include/linux/livepatch.h1
-rw-r--r--include/linux/livepatch_sched.h29
-rw-r--r--include/linux/mm_types.h82
-rw-r--r--include/linux/psi.h2
-rw-r--r--include/linux/psi_types.h43
-rw-r--r--include/linux/sched.h23
-rw-r--r--include/linux/sched/mm.h5
-rw-r--r--kernel/cgroup/cgroup.c2
-rw-r--r--kernel/fork.c9
-rw-r--r--kernel/livepatch/core.c1
-rw-r--r--kernel/livepatch/transition.c122
-rw-r--r--kernel/sched/clock.c3
-rw-r--r--kernel/sched/core.c669
-rw-r--r--kernel/sched/deadline.c11
-rw-r--r--kernel/sched/fair.c22
-rw-r--r--kernel/sched/psi.c473
-rw-r--r--kernel/sched/rt.c23
-rw-r--r--kernel/sched/sched.h243
-rw-r--r--kernel/sched/topology.c4
21 files changed, 1424 insertions, 350 deletions
diff --git a/Documentation/accounting/psi.rst b/Documentation/accounting/psi.rst
index 5e40b3f437f9..df6062eb3abb 100644
--- a/Documentation/accounting/psi.rst
+++ b/Documentation/accounting/psi.rst
@@ -105,6 +105,10 @@ prevent overly frequent polling. Max limit is chosen as a high enough number
after which monitors are most likely not needed and psi averages can be used
instead.
+Unprivileged users can also create monitors, with the only limitation that the
+window size must be a multiple of 2s, in order to prevent excessive resource
+usage.
+
When activated, psi monitor stays active for at least the duration of one
tracking window to avoid repeated activations/deactivations when system is
bouncing in and out of the stall state.
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 10bf35a3db6e..a92af08e7864 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -361,8 +361,7 @@ static int vhost_worker(void *data)
kcov_remote_start_common(worker->kcov_handle);
work->fn(work);
kcov_remote_stop();
- if (need_resched())
- schedule();
+ cond_resched();
}
}
diff --git a/include/linux/livepatch.h b/include/linux/livepatch.h
index 293e29960c6e..9b9b38e89563 100644
--- a/include/linux/livepatch.h
+++ b/include/linux/livepatch.h
@@ -13,6 +13,7 @@
#include <linux/ftrace.h>
#include <linux/completion.h>
#include <linux/list.h>
+#include <linux/livepatch_sched.h>
#if IS_ENABLED(CONFIG_LIVEPATCH)
diff --git a/include/linux/livepatch_sched.h b/include/linux/livepatch_sched.h
new file mode 100644
index 000000000000..013794fb5da0
--- /dev/null
+++ b/include/linux/livepatch_sched.h
@@ -0,0 +1,29 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#ifndef _LINUX_LIVEPATCH_SCHED_H_
+#define _LINUX_LIVEPATCH_SCHED_H_
+
+#include <linux/jump_label.h>
+#include <linux/static_call_types.h>
+
+#ifdef CONFIG_LIVEPATCH
+
+void __klp_sched_try_switch(void);
+
+#if !defined(CONFIG_PREEMPT_DYNAMIC) || !defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
+
+DECLARE_STATIC_KEY_FALSE(klp_sched_try_switch_key);
+
+static __always_inline void klp_sched_try_switch(void)
+{
+ if (static_branch_unlikely(&klp_sched_try_switch_key))
+ __klp_sched_try_switch();
+}
+
+#endif /* !CONFIG_PREEMPT_DYNAMIC || !CONFIG_HAVE_PREEMPT_DYNAMIC_CALL */
+
+#else /* !CONFIG_LIVEPATCH */
+static inline void klp_sched_try_switch(void) {}
+static inline void __klp_sched_try_switch(void) {}
+#endif /* CONFIG_LIVEPATCH */
+
+#endif /* _LINUX_LIVEPATCH_SCHED_H_ */
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 3fc9e680f174..306a3d1a0fa6 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -573,6 +573,13 @@ struct vm_area_struct {
struct vm_userfaultfd_ctx vm_userfaultfd_ctx;
} __randomize_layout;
+#ifdef CONFIG_SCHED_MM_CID
+struct mm_cid {
+ u64 time;
+ int cid;
+};
+#endif
+
struct kioctx_table;
struct mm_struct {
struct {
@@ -623,15 +630,19 @@ struct mm_struct {
atomic_t mm_count;
#ifdef CONFIG_SCHED_MM_CID
/**
- * @cid_lock: Protect cid bitmap updates vs lookups.
+ * @pcpu_cid: Per-cpu current cid.
*
- * Prevent situations where updates to the cid bitmap happen
- * concurrently with lookups. Those can lead to situations
- * where a lookup cannot find a free bit simply because it was
- * unlucky enough to load, non-atomically, bitmap words as they
- * were being concurrently updated by the updaters.
+ * Keep track of the currently allocated mm_cid for each cpu.
+ * The per-cpu mm_cid values are serialized by their respective
+ * runqueue locks.
*/
- raw_spinlock_t cid_lock;
+ struct mm_cid __percpu *pcpu_cid;
+ /*
+ * @mm_cid_next_scan: Next mm_cid scan (in jiffies).
+ *
+ * When the next mm_cid scan is due (in jiffies).
+ */
+ unsigned long mm_cid_next_scan;
#endif
#ifdef CONFIG_MMU
atomic_long_t pgtables_bytes; /* size of all page tables */
@@ -899,6 +910,37 @@ static inline void vma_iter_init(struct vma_iterator *vmi,
}
#ifdef CONFIG_SCHED_MM_CID
+
+enum mm_cid_state {
+ MM_CID_UNSET = -1U, /* Unset state has lazy_put flag set. */
+ MM_CID_LAZY_PUT = (1U << 31),
+};
+
+static inline bool mm_cid_is_unset(int cid)
+{
+ return cid == MM_CID_UNSET;
+}
+
+static inline bool mm_cid_is_lazy_put(int cid)
+{
+ return !mm_cid_is_unset(cid) && (cid & MM_CID_LAZY_PUT);
+}
+
+static inline bool mm_cid_is_valid(int cid)
+{
+ return !(cid & MM_CID_LAZY_PUT);
+}
+
+static inline int mm_cid_set_lazy_put(int cid)
+{
+ return cid | MM_CID_LAZY_PUT;
+}
+
+static inline int mm_cid_clear_lazy_put(int cid)
+{
+ return cid & ~MM_CID_LAZY_PUT;
+}
+
/* Accessor for struct mm_struct's cidmask. */
static inline cpumask_t *mm_cidmask(struct mm_struct *mm)
{
@@ -912,16 +954,40 @@ static inline cpumask_t *mm_cidmask(struct mm_struct *mm)
static inline void mm_init_cid(struct mm_struct *mm)
{
- raw_spin_lock_init(&mm->cid_lock);
+ int i;
+
+ for_each_possible_cpu(i) {
+ struct mm_cid *pcpu_cid = per_cpu_ptr(mm->pcpu_cid, i);
+
+ pcpu_cid->cid = MM_CID_UNSET;
+ pcpu_cid->time = 0;
+ }
cpumask_clear(mm_cidmask(mm));
}
+static inline int mm_alloc_cid(struct mm_struct *mm)
+{
+ mm->pcpu_cid = alloc_percpu(struct mm_cid);
+ if (!mm->pcpu_cid)
+ return -ENOMEM;
+ mm_init_cid(mm);
+ return 0;
+}
+
+static inline void mm_destroy_cid(struct mm_struct *mm)
+{
+ free_percpu(mm->pcpu_cid);
+ mm->pcpu_cid = NULL;
+}
+
static inline unsigned int mm_cid_size(void)
{
return cpumask_size();
}
#else /* CONFIG_SCHED_MM_CID */
static inline void mm_init_cid(struct mm_struct *mm) { }
+static inline int mm_alloc_cid(struct mm_struct *mm) { return 0; }
+static inline void mm_destroy_cid(struct mm_struct *mm) { }
static inline unsigned int mm_cid_size(void)
{
return 0;
diff --git a/include/linux/psi.h b/include/linux/psi.h
index b029a847def1..ab26200c2803 100644
--- a/include/linux/psi.h
+++ b/include/linux/psi.h
@@ -24,7 +24,7 @@ void psi_memstall_leave(unsigned long *flags);
int psi_show(struct seq_file *s, struct psi_group *group, enum psi_res res);
struct psi_trigger *psi_trigger_create(struct psi_group *group,
- char *buf, enum psi_res res);
+ char *buf, enum psi_res res, struct file *file);
void psi_trigger_destroy(struct psi_trigger *t);
__poll_t psi_trigger_poll(void **trigger_ptr, struct file *file,
diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h
index 1e0a0d7ace3a..040c089581c6 100644
--- a/include/linux/psi_types.h
+++ b/include/linux/psi_types.h
@@ -151,6 +151,9 @@ struct psi_trigger {
/* Deferred event(s) from previous ratelimit window */
bool pending_event;
+
+ /* Trigger type - PSI_AVGS for unprivileged, PSI_POLL for RT */
+ enum psi_aggregators aggregator;
};
struct psi_group {
@@ -171,30 +174,34 @@ struct psi_group {
/* Aggregator work control */
struct delayed_work avgs_work;
+ /* Unprivileged triggers against N*PSI_FREQ windows */
+ struct list_head avg_triggers;
+ u32 avg_nr_triggers[NR_PSI_STATES - 1];
+
/* Total stall times and sampled pressure averages */
u64 total[NR_PSI_AGGREGATORS][NR_PSI_STATES - 1];
unsigned long avg[NR_PSI_STATES - 1][3];
- /* Monitor work control */
- struct task_struct __rcu *poll_task;
- struct timer_list poll_timer;
- wait_queue_head_t poll_wait;
- atomic_t poll_wakeup;
- atomic_t poll_scheduled;
+ /* Monitor RT polling work control */
+ struct task_struct __rcu *rtpoll_task;
+ struct timer_list rtpoll_timer;
+ wait_queue_head_t rtpoll_wait;
+ atomic_t rtpoll_wakeup;
+ atomic_t rtpoll_scheduled;
/* Protects data used by the monitor */
- struct mutex trigger_lock;
-
- /* Configured polling triggers */
- struct list_head triggers;
- u32 nr_triggers[NR_PSI_STATES - 1];
- u32 poll_states;
- u64 poll_min_period;
-
- /* Total stall times at the start of monitor activation */
- u64 polling_total[NR_PSI_STATES - 1];
- u64 polling_next_update;
- u64 polling_until;
+ struct mutex rtpoll_trigger_lock;
+
+ /* Configured RT polling triggers */
+ struct list_head rtpoll_triggers;
+ u32 rtpoll_nr_triggers[NR_PSI_STATES - 1];
+ u32 rtpoll_states;
+ u64 rtpoll_min_period;
+
+ /* Total stall times at the start of RT polling monitor activation */
+ u64 rtpoll_total[NR_PSI_STATES - 1];
+ u64 rtpoll_next_update;
+ u64 rtpoll_until;
};
#else /* CONFIG_PSI */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 3f5395ae86bc..dc4ad4c58fae 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -36,6 +36,7 @@
#include <linux/seqlock.h>
#include <linux/kcsan.h>
#include <linux/rv.h>
+#include <linux/livepatch_sched.h>
#include <asm/kmap_size.h>
/* task_struct member predeclarations (sorted alphabetically): */
@@ -1313,7 +1314,10 @@ struct task_struct {
#ifdef CONFIG_SCHED_MM_CID
int mm_cid; /* Current cid in mm */
+ int last_mm_cid; /* Most recent cid in mm */
+ int migrate_from_cpu;
int mm_cid_active; /* Whether cid bitmap is active */
+ struct callback_head cid_work;
#endif
struct tlbflush_unmap_batch tlb_ubc;
@@ -2067,6 +2071,9 @@ extern int __cond_resched(void);
#if defined(CONFIG_PREEMPT_DYNAMIC) && defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
+void sched_dynamic_klp_enable(void);
+void sched_dynamic_klp_disable(void);
+
DECLARE_STATIC_CALL(cond_resched, __cond_resched);
static __always_inline int _cond_resched(void)
@@ -2075,6 +2082,7 @@ static __always_inline int _cond_resched(void)
}
#elif defined(CONFIG_PREEMPT_DYNAMIC) && defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
+
extern int dynamic_cond_resched(void);
static __always_inline int _cond_resched(void)
@@ -2082,20 +2090,25 @@ static __always_inline int _cond_resched(void)
return dynamic_cond_resched();
}
-#else
+#else /* !CONFIG_PREEMPTION */
static inline int _cond_resched(void)
{
+ klp_sched_try_switch();
return __cond_resched();
}
-#endif /* CONFIG_PREEMPT_DYNAMIC */
+#endif /* PREEMPT_DYNAMIC && CONFIG_HAVE_PREEMPT_DYNAMIC_CALL */
-#else
+#else /* CONFIG_PREEMPTION && !CONFIG_PREEMPT_DYNAMIC */
-static inline int _cond_resched(void) { return 0; }
+static inline int _cond_resched(void)
+{
+ klp_sched_try_switch();
+ return 0;
+}
-#endif /* !defined(CONFIG_PREEMPTION) || defined(CONFIG_PREEMPT_DYNAMIC) */
+#endif /* !CONFIG_PREEMPTION || CONFIG_PREEMPT_DYNAMIC */
#define cond_resched() ({ \
__might_resched(__FILE__, __LINE__, 0); \
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index af12fcb11005..b114fbe3a93b 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -37,6 +37,11 @@ static inline void mmgrab(struct mm_struct *mm)
atomic_inc(&mm->mm_count);
}
+static inline void smp_mb__after_mmgrab(void)
+{
+ smp_mb__after_atomic();
+}
+
extern void __mmdrop(struct mm_struct *mm);
static inline void mmdrop(struct mm_struct *mm)
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index ae518166d70a..f2a77b1b45a2 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -3771,7 +3771,7 @@ static ssize_t pressure_write(struct kernfs_open_file *of, char *buf,
}
psi = cgroup_psi(cgrp);
- new = psi_trigger_create(psi, buf, res);
+ new = psi_trigger_create(psi, buf, res, of->file);
if (IS_ERR(new)) {
cgroup_put(cgrp);
return PTR_ERR(new);
diff --git a/kernel/fork.c b/kernel/fork.c
index 4342200d5e2b..eccb35a85216 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -924,6 +924,7 @@ void __mmdrop(struct mm_struct *mm)
check_mm(mm);
put_user_ns(mm->user_ns);
mm_pasid_drop(mm);
+ mm_destroy_cid(mm);
for (i = 0; i < NR_MM_COUNTERS; i++)
percpu_counter_destroy(&mm->rss_stat[i]);
@@ -1188,7 +1189,9 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
#ifdef CONFIG_SCHED_MM_CID
tsk->mm_cid = -1;
+ tsk->last_mm_cid = -1;
tsk->mm_cid_active = 0;
+ tsk->migrate_from_cpu = -1;
#endif
return tsk;
@@ -1296,18 +1299,22 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
if (init_new_context(p, mm))
goto fail_nocontext;
+ if (mm_alloc_cid(mm))
+ goto fail_cid;
+
for (i = 0; i < NR_MM_COUNTERS; i++)
if (percpu_counter_init(&mm->rss_stat[i], 0, GFP_KERNEL_ACCOUNT))
goto fail_pcpu;
mm->user_ns = get_user_ns(user_ns);
lru_gen_init_mm(mm);
- mm_init_cid(mm);
return mm;
fail_pcpu:
while (i > 0)
percpu_counter_destroy(&mm->rss_stat[--i]);
+ mm_destroy_cid(mm);
+fail_cid:
destroy_context(mm);
fail_nocontext:
mm_free_pgd(mm);
diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c
index fc851455740c..61328328c474 100644
--- a/kernel/livepatch/core.c
+++ b/kernel/livepatch/core.c
@@ -33,6 +33,7 @@
*
* - klp_ftrace_handler()
* - klp_update_patch_state()
+ * - __klp_sched_try_switch()
*/
DEFINE_MUTEX(klp_mutex);
diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c
index f1b25ec581e0..e9fd83a02228 100644
--- a/kernel/livepatch/transition.c
+++ b/kernel/livepatch/transition.c
@@ -9,11 +9,14 @@
#include <linux/cpu.h>
#include <linux/stacktrace.h>
+#include <linux/static_call.h>
#include "core.h"
#include "patch.h"
#include "transition.h"
#define MAX_STACK_ENTRIES 100
+DEFINE_PER_CPU(unsigned long[MAX_STACK_ENTRIES], klp_stack_entries);
+
#define STACK_ERR_BUF_SIZE 128
#define SIGNALS_TIMEOUT 15
@@ -25,6 +28,25 @@ static int klp_target_state = KLP_UNDEFINED;
static unsigned int klp_signals_cnt;
/*
+ * When a livepatch is in progress, enable klp stack checking in
+ * cond_resched(). This helps CPU-bound kthreads get patched.
+ */
+#if defined(CONFIG_PREEMPT_DYNAMIC) && defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL)
+
+#define klp_cond_resched_enable() sched_dynamic_klp_enable()
+#define klp_cond_resched_disable() sched_dynamic_klp_disable()
+
+#else /* !CONFIG_PREEMPT_DYNAMIC || !CONFIG_HAVE_PREEMPT_DYNAMIC_CALL */
+
+DEFINE_STATIC_KEY_FALSE(klp_sched_try_switch_key);
+EXPORT_SYMBOL(klp_sched_try_switch_key);
+
+#define klp_cond_resched_enable() static_branch_enable(&klp_sched_try_switch_key)
+#define klp_cond_resched_disable() static_branch_disable(&klp_sched_try_switch_key)
+
+#endif /* CONFIG_PREEMPT_DYNAMIC && CONFIG_HAVE_PREEMPT_DYNAMIC_CALL */
+
+/*
* This work can be performed periodically to finish patching or unpatching any
* "straggler" tasks which failed to transition in the first attempt.
*/
@@ -172,8 +194,8 @@ void klp_update_patch_state(struct task_struct *task)
* barrier (smp_rmb) for two cases:
*
* 1) Enforce the order of the TIF_PATCH_PENDING read and the
- * klp_target_state read. The corresponding write barrier is in
- * klp_init_transition().
+ * klp_target_state read. The corresponding write barriers are in
+ * klp_init_transition() and klp_reverse_transition().
*
* 2) Enforce the order of the TIF_PATCH_PENDING read and a future read
* of func->transition, if klp_ftrace_handler() is called later on
@@ -240,12 +262,15 @@ static int klp_check_stack_func(struct klp_func *func, unsigned long *entries,
*/
static int klp_check_stack(struct task_struct *task, const char **oldname)
{
- static unsigned long entries[MAX_STACK_ENTRIES];
+ unsigned long *entries = this_cpu_ptr(klp_stack_entries);
struct klp_object *obj;
struct klp_func *func;
int ret, nr_entries;
- ret = stack_trace_save_tsk_reliable(task, entries, ARRAY_SIZE(entries));
+ /* Protect 'klp_stack_entries' */
+ lockdep_assert_preemption_disabled();
+
+ ret = stack_trace_save_tsk_reliable(task, entries, MAX_STACK_ENTRIES);
if (ret < 0)
return -EINVAL;
nr_entries = ret;
@@ -307,7 +332,11 @@ static bool klp_try_switch_task(struct task_struct *task)
* functions. If all goes well, switch the task to the target patch
* state.
*/
- ret = task_call_func(task, klp_check_and_switch_task, &old_name);
+ if (task == current)
+ ret = klp_check_and_switch_task(current, &old_name);
+ else
+ ret = task_call_func(task, klp_check_and_switch_task, &old_name);
+
switch (ret) {
case 0: /* success */
break;
@@ -334,6 +363,44 @@ static bool klp_try_switch_task(struct task_struct *task)
return !ret;
}
+void __klp_sched_try_switch(void)
+{
+ if (likely(!klp_patch_pending(current)))
+ return;
+
+ /*
+ * This function is called from cond_resched() which is called in many
+ * places throughout the kernel. Using the klp_mutex here might
+ * deadlock.
+ *
+ * Instead, disable preemption to prevent racing with other callers of
+ * klp_try_switch_task(). Thanks to task_call_func() they won't be
+ * able to switch this task while it's running.
+ */
+ preempt_disable();
+
+ /*
+ * Make sure current didn't get patched between the above check and
+ * preempt_disable().
+ */
+ if (unlikely(!klp_patch_pending(current)))
+ goto out;
+
+ /*
+ * Enforce the order of the TIF_PATCH_PENDING read above and the
+ * klp_target_state read in klp_try_switch_task(). The corresponding
+ * write barriers are in klp_init_transition() and
+ * klp_reverse_transition().
+ */
+ smp_rmb();
+
+ klp_try_switch_task(current);
+
+out:
+ preempt_enable();
+}
+EXPORT_SYMBOL(__klp_sched_try_switch);
+
/*
* Sends a fake signal to all non-kthread tasks with TIF_PATCH_PENDING set.
* Kthreads with TIF_PATCH_PENDING set are woken up.
@@ -440,7 +507,8 @@ void klp_try_complete_transition(void)
return;
}
- /* we're done, now cleanup the data structures */
+ /* Done! Now cleanup the data structures. */
+ klp_cond_resched_disable();
patch = klp_transition_patch;
klp_complete_transition();
@@ -492,6 +560,8 @@ void klp_start_transition(void)
set_tsk_thread_flag(task, TIF_PATCH_PENDING);
}
+ klp_cond_resched_enable();
+
klp_signals_cnt = 0;
}
@@ -547,8 +617,9 @@ void klp_init_transition(struct klp_patch *patch, int state)
* see a func in transition with a task->patch_state of KLP_UNDEFINED.
*
* Also enforce the order of the klp_target_state write and future
- * TIF_PATCH_PENDING writes to ensure klp_update_patch_state() doesn't
- * set a task->patch_state to KLP_UNDEFINED.
+ * TIF_PATCH_PENDING writes to ensure klp_update_patch_state() and
+ * __klp_sched_try_switch() don't set a task->patch_state to
+ * KLP_UNDEFINED.
*/
smp_wmb();
@@ -584,14 +655,10 @@ void klp_reverse_transition(void)
klp_target_state == KLP_PATCHED ? "patching to unpatching" :
"unpatching to patching");
- klp_transition_patch->enabled = !klp_transition_patch->enabled;
-
- klp_target_state = !klp_target_state;
-
/*
* Clear all TIF_PATCH_PENDING flags to prevent races caused by
- * klp_update_patch_state() running in parallel with
- * klp_start_transition().
+ * klp_update_patch_state() or __klp_sched_try_switch() running in
+ * parallel with the reverse transition.
*/
read_lock(&tasklist_lock);
for_each_process_thread(g, task)
@@ -601,9 +668,28 @@ void klp_reverse_transition(void)
for_each_possible_cpu(cpu)
clear_tsk_thread_flag(idle_task(cpu), TIF_PATCH_PENDING);
- /* Let any remaining calls to klp_update_patch_state() complete */
+ /*
+ * Make sure all existing invocations of klp_update_patch_state() and
+ * __klp_sched_try_switch() see the cleared TIF_PATCH_PENDING before
+ * starting the reverse transition.
+ */
klp_synchronize_transition();
+ /*
+ * All patching has stopped, now re-initialize the global variables to
+ * prepare for the reverse transition.
+ */
+ klp_transition_patch->enabled = !klp_transition_patch->enabled;
+ klp_target_state = !klp_target_state;
+
+ /*
+ * Enforce the order of the klp_target_state write and the
+ * TIF_PATCH_PENDING writes in klp_start_transition() to ensure
+ * klp_update_patch_state() and __klp_sched_try_switch() don't set
+ * task->patch_state to the wrong value.
+ */
+ smp_wmb();
+
klp_start_transition();
}
@@ -617,9 +703,9 @@ void klp_copy_process(struct task_struct *child)
* the task flag up to date with the parent here.
*
* The operation is serialized against all klp_*_transition()
- * operations by the tasklist_lock. The only exception is
- * klp_update_patch_state(current), but we cannot race with
- * that because we are current.
+ * operations by the tasklist_lock. The only exceptions are
+ * klp_update_patch_state(current) and __klp_sched_try_switch(), but we
+ * cannot race with them because we are current.
*/
if (test_tsk_thread_flag(current, TIF_PATCH_PENDING))
set_tsk_thread_flag(child, TIF_PATCH_PENDING);
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index 5732fa75ebab..b5cc2b53464d 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -300,6 +300,9 @@ noinstr u64 local_clock(void)
if (static_branch_likely(&__sched_clock_stable))
return sched_clock() + __sched_clock_offset;
+ if (!static_branch_likely(&sched_clock_running))
+ return sched_clock();
+
preempt_disable_notrace();
clock = sched_clock_local(this_scd());
preempt_enable_notrace();
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 8d2b6742d02c..54c75af24899 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -261,36 +261,51 @@ void sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags)
resched_curr(rq);
}
-/*
- * Find left-most (aka, highest priority) task matching @cookie.
- */
-static struct task_struct *sched_core_find(struct rq *rq, unsigned long cookie)
+static int sched_task_is_throttled(struct task_struct *p, int cpu)
{
- struct rb_node *node;
-
- node = rb_find_first((void *)cookie, &rq->core_tree, rb_sched_core_cmp);
- /*
- * The idle task always matches any cookie!
- */
- if (!node)
- return idle_sched_class.pick_task(rq);
+ if (p->sched_class->task_is_throttled)
+ return p->sched_class->task_is_throttled(p, cpu);
- return __node_2_sc(node);
+ return 0;
}
static struct task_struct *sched_core_next(struct task_struct *p, unsigned long cookie)
{
struct rb_node *node = &p->core_node;
+ int cpu = task_cpu(p);
- node = rb_next(node);
+ do {
+ node = rb_next(node);
+ if (!node)
+ return NULL;
+
+ p = __node_2_sc(node);
+ if (p->core_cookie != cookie)
+ return NULL;
+
+ } while (sched_task_is_throttled(p, cpu));
+
+ return p;
+}
+
+/*
+ * Find left-most (aka, highest priority) and unthrottled task matching @cookie.
+ * If no suitable task is found, NULL will be returned.
+ */
+static struct task_struct *sched_core_find(struct rq *rq, unsigned long cookie)
+{
+ struct task_struct *p;
+ struct rb_node *node;
+
+ node = rb_find_first((void *)cookie, &rq->core_tree, rb_sched_core_cmp);
if (!node)
return NULL;
- p = container_of(node, struct task_struct, core_node);
- if (p->core_cookie != cookie)
- return NULL;
+ p = __node_2_sc(node);
+ if (!sched_task_is_throttled(p, rq->cpu))
+ return p;
- return p;
+ return sched_core_next(p, cookie);
}
/*
@@ -2087,6 +2102,8 @@ void activate_task(struct rq *rq, struct task_struct *p, int flags)
{
if (task_on_rq_migrating(p))
flags |= ENQUEUE_MIGRATED;
+ if (flags & ENQUEUE_MIGRATED)
+ sched_mm_cid_migrate_to(rq, p);
enqueue_task(rq, p, flags);
@@ -3196,6 +3213,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
p->sched_class->migrate_task_rq(p, new_cpu);
p->se.nr_migrations++;
rseq_migrate(p);
+ sched_mm_cid_migrate_from(p);
perf_event_task_migrate(p);
}
@@ -4469,6 +4487,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
p->wake_entry.u_flags = CSD_TYPE_TTWU;
p->migration_pending = NULL;
#endif
+ init_sched_mm_cid(p);
}
DEFINE_STATIC_KEY_FALSE(sched_numa_balancing);
@@ -5115,7 +5134,6 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev,
sched_info_switch(rq, prev, next);
perf_event_task_sched_out(prev, next);
rseq_preempt(prev);
- switch_mm_cid(prev, next);
fire_sched_out_preempt_notifiers(prev, next);
kmap_local_sched_out();
prepare_task(next);
@@ -5272,6 +5290,9 @@ context_switch(struct rq *rq, struct task_struct *prev,
*
* kernel -> user switch + mmdrop_lazy_tlb() active
* user -> user switch
+ *
+ * switch_mm_cid() needs to be updated if the barriers provided
+ * by context_switch() are modified.
*/
if (!next->mm) { // to kernel
enter_lazy_tlb(prev->active_mm, next);
@@ -5301,6 +5322,9 @@ context_switch(struct rq *rq, struct task_struct *prev,
}
}
+ /* switch_mm_cid() requires the memory barriers above. */
+ switch_mm_cid(rq, prev, next);
+
rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
prepare_lock_switch(rq, next, rf);
@@ -5589,6 +5613,7 @@ void scheduler_tick(void)
resched_latency = cpu_resched_latency(rq);
calc_global_load_tick(rq);
sched_core_tick(rq);
+ task_tick_mm_cid(rq, curr);
rq_unlock(rq, &rf);
@@ -6241,7 +6266,7 @@ static bool try_steal_cookie(int this, int that)
goto unlock;
p = sched_core_find(src, cookie);
- if (p == src->idle)
+ if (!p)
goto unlock;
do {
@@ -6253,6 +6278,13 @@ static bool try_steal_cookie(int this, int that)
if (p->core_occupation > dst->idle->core_occupation)
goto next;
+ /*
+ * sched_core_find() and sched_core_next() will ensure that task @p
+ * is not throttled now, we also need to check whether the runqueue
+ * of the destination CPU is being throttled.
+ */
+ if (sched_task_is_throttled(p, this))
+ goto next;
deactivate_task(src, p, 0);
set_task_cpu(p, this);
@@ -8508,6 +8540,7 @@ EXPORT_STATIC_CALL_TRAMP(might_resched);
static DEFINE_STATIC_KEY_FALSE(sk_dynamic_cond_resched);
int __sched dynamic_cond_resched(void)
{
+ klp_sched_try_switch();
if (!static_branch_unlikely(&sk_dynamic_cond_resched))
return 0;
return __cond_resched();
@@ -8656,13 +8689,17 @@ int sched_dynamic_mode(const char *str)
#error "Unsupported PREEMPT_DYNAMIC mechanism"
#endif
-void sched_dynamic_update(int mode)
+static DEFINE_MUTEX(sched_dynamic_mutex);
+static bool klp_override;
+
+static void __sched_dynamic_update(int mode)
{
/*
* Avoid {NONE,VOLUNTARY} -> FULL transitions from ever ending up in
* the ZERO state, which is invalid.
*/
- preempt_dynamic_enable(cond_resched);
+ if (!klp_override)
+ preempt_dynamic_enable(cond_resched);
preempt_dynamic_enable(might_resched);
preempt_dynamic_enable(preempt_schedule);
preempt_dynamic_enable(preempt_schedule_notrace);
@@ -8670,36 +8707,79 @@ void sched_dynamic_update(int mode)
switch (mode) {
case preempt_dynamic_none:
- preempt_dynamic_enable(cond_resched);
+ if (!klp_override)
+ preempt_dynamic_enable(cond_resched);
preempt_dynamic_disable(might_resched);
preempt_dynamic_disable(preempt_schedule);
preempt_dynamic_disable(preempt_schedule_notrace);
preempt_dynamic_disable(irqentry_exit_cond_resched);
- pr_info("Dynamic Preempt: none\n");
+ if (mode != preempt_dynamic_mode)
+ pr_info("Dynamic Preempt: none\n");
break;
case preempt_dynamic_voluntary:
- preempt_dynamic_enable(cond_resched);
+ if (!klp_override)
+ preempt_dynamic_enable(cond_resched);
preempt_dynamic_enable(might_resched);
preempt_dynamic_disable(preempt_schedule);
preempt_dynamic_disable(preempt_schedule_notrace);
preempt_dynamic_disable(irqentry_exit_cond_resched);
- pr_info("Dynamic Preempt: voluntary\n");
+ if (mode != preempt_dynamic_mode)
+ pr_info("Dynamic Preempt: voluntary\n");
break;
case preempt_dynamic_full:
- preempt_dynamic_disable(cond_resched);
+ if (!klp_override)
+ preempt_dynamic_disable(cond_resched);
preempt_dynamic_disable(might_resched);
preempt_dynamic_enable(preempt_schedule);
preempt_dynamic_enable(preempt_schedule_notrace);
preempt_dynamic_enable(irqentry_exit_cond_resched);
- pr_info("Dynamic Preempt: full\n");
+ if (mode != preempt_dynamic_mode)
+ pr_info("Dynamic Preempt: full\n");
break;
}
preempt_dynamic_mode = mode;
}
+void sched_dynamic_update(int mode)
+{
+ mutex_lock(&sched_dynamic_mutex);
+ __sched_dynamic_update(mode);
+ mutex_unlock(&sched_dynamic_mutex);
+}
+
+#ifdef CONFIG_HAVE_PREEMPT_DYNAMIC_CALL
+
+static int klp_cond_resched(void)
+{
+ __klp_sched_try_switch();
+ return __cond_resched();
+}
+
+void sched_dynamic_klp_enable(void)
+{
+ mutex_lock(&sched_dynamic_mutex);
+
+ klp_override = true;
+ static_call_update(cond_resched, klp_cond_resched);
+
+ mutex_unlock(&sched_dynamic_mutex);
+}
+
+void sched_dynamic_klp_disable(void)
+{
+ mutex_lock(&sched_dynamic_mutex);
+
+ klp_override = false;
+ __sched_dynamic_upda