author      Linus Torvalds <torvalds@linux-foundation.org>  2024-09-21 09:44:57 -0700
committer   Linus Torvalds <torvalds@linux-foundation.org>  2024-09-21 09:44:57 -0700
commit      88264981f2082248e892a706b2c5004650faac54 (patch)
tree        1a137af36a73b441220ed768745b48d34f58baa8 /kernel
parent      440b65232829fad69947b8de983c13a525cc8871 (diff)
parent      902d67a2d40f5b0815f4f627b26d91f96cc51fb3 (diff)
download    linux-88264981f2082248e892a706b2c5004650faac54.tar.gz
            linux-88264981f2082248e892a706b2c5004650faac54.tar.bz2
            linux-88264981f2082248e892a706b2c5004650faac54.zip
Merge tag 'sched_ext-for-6.12' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext
Pull sched_ext support from Tejun Heo:
 "This implements a new scheduler class called 'ext_sched_class', or
  sched_ext, which allows scheduling policies to be implemented as BPF
  programs. The goals of this are:

   - Ease of experimentation and exploration: Enabling rapid iteration
     of new scheduling policies.

   - Customization: Building application-specific schedulers which
     implement policies that are not applicable to general-purpose
     schedulers.

   - Rapid scheduler deployments: Non-disruptive swap outs of
     scheduling policies in production environments"

See individual commits for more documentation, but also the cover
letter for the latest series:

Link: https://lore.kernel.org/all/20240618212056.2833381-1-tj@kernel.org/

* tag 'sched_ext-for-6.12' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext: (110 commits)
  sched: Move update_other_load_avgs() to kernel/sched/pelt.c
  sched_ext: Don't trigger ops.quiescent/runnable() on migrations
  sched_ext: Synchronize bypass state changes with rq lock
  scx_qmap: Implement highpri boosting
  sched_ext: Implement scx_bpf_dispatch[_vtime]_from_dsq()
  sched_ext: Compact struct bpf_iter_scx_dsq_kern
  sched_ext: Replace consume_local_task() with move_local_task_to_local_dsq()
  sched_ext: Move consume_local_task() upward
  sched_ext: Move sanity check and dsq_mod_nr() into task_unlink_from_dsq()
  sched_ext: Reorder args for consume_local/remote_task()
  sched_ext: Restructure dispatch_to_local_dsq()
  sched_ext: Fix processs_ddsp_deferred_locals() by unifying DTL_INVALID handling
  sched_ext: Make find_dsq_for_dispatch() handle SCX_DSQ_LOCAL_ON
  sched_ext: Refactor consume_remote_task()
  sched_ext: Rename scx_kfunc_set_sleepable to unlocked and relocate
  sched_ext: Add missing static to scx_dump_data
  sched_ext: Add missing static to scx_has_op[]
  sched_ext: Temporarily work around pick_task_scx() being called without balance_scx()
  sched_ext: Add a cgroup scheduler which uses flattened hierarchy
  sched_ext: Add cgroup support
  ...
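For readers unfamiliar with the mechanism: a sched_ext policy is an ordinary BPF
struct_ops program. The sketch below is modeled on the minimal example in
Documentation/scheduler/sched-ext.rst (referenced by the Kconfig help further
down) rather than taken from this merge; it assumes the 6.12-era kfunc names
scx_bpf_select_cpu_dfl() and scx_bpf_dispatch(), the SCX_DSQ_LOCAL /
SCX_DSQ_GLOBAL / SCX_SLICE_DFL constants, and the BPF_STRUCT_OPS() helper from
tools/sched_ext/include/scx/common.bpf.h:

  /* minimal_fifo.bpf.c - illustrative sketch only, not part of this merge */
  #include <scx/common.bpf.h>

  char _license[] SEC("license") = "GPL";

  s32 BPF_STRUCT_OPS(fifo_select_cpu, struct task_struct *p, s32 prev_cpu,
                     u64 wake_flags)
  {
          bool direct = false;    /* must be initialized for the verifier */
          s32 cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &direct);

          /*
           * If the default CPU picker found an idle CPU, dispatch the task
           * straight to that CPU's local DSQ; ops.enqueue() is then skipped
           * for this wakeup.
           */
          if (direct)
                  scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0);
          return cpu;
  }

  void BPF_STRUCT_OPS(fifo_enqueue, struct task_struct *p, u64 enq_flags)
  {
          /* Everything else goes onto the shared global DSQ in FIFO order. */
          scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags);
  }

  SEC(".struct_ops.link")
  struct sched_ext_ops fifo_ops = {
          .select_cpu     = (void *)fifo_select_cpu,
          .enqueue        = (void *)fifo_enqueue,
          .name           = "minimal_fifo",
  };

Without an explicit ops.dispatch() callback the core falls back to consuming
the global DSQ when a CPU's local DSQ runs dry; attaching the struct_ops map
enables the policy, and detaching it (or an error, or SysRq-S) hands tasks back
to the built-in classes.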
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Kconfig.preempt              27
-rw-r--r--  kernel/fork.c                       17
-rw-r--r--  kernel/sched/build_policy.c         11
-rw-r--r--  kernel/sched/core.c                281
-rw-r--r--  kernel/sched/cpufreq_schedutil.c    50
-rw-r--r--  kernel/sched/debug.c                 3
-rw-r--r--  kernel/sched/ext.c                7173
-rw-r--r--  kernel/sched/ext.h                  91
-rw-r--r--  kernel/sched/fair.c                 23
-rw-r--r--  kernel/sched/idle.c                  2
-rw-r--r--  kernel/sched/pelt.c                 20
-rw-r--r--  kernel/sched/pelt.h                  1
-rw-r--r--  kernel/sched/sched.h               190
-rw-r--r--  kernel/sched/syscalls.c              7
14 files changed, 7774 insertions, 122 deletions
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index c2f1fd95a821..fe782cd77388 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -133,4 +133,29 @@ config SCHED_CORE
which is the likely usage by Linux distributions, there should
be no measurable impact on performance.
-
+config SCHED_CLASS_EXT
+ bool "Extensible Scheduling Class"
+ depends on BPF_SYSCALL && BPF_JIT && DEBUG_INFO_BTF
+ select STACKTRACE if STACKTRACE_SUPPORT
+ help
+ This option enables a new scheduler class sched_ext (SCX), which
+ allows scheduling policies to be implemented as BPF programs to
+ achieve the following:
+
+ - Ease of experimentation and exploration: Enabling rapid
+ iteration of new scheduling policies.
+ - Customization: Building application-specific schedulers which
+ implement policies that are not applicable to general-purpose
+ schedulers.
+ - Rapid scheduler deployments: Non-disruptive swap outs of
+ scheduling policies in production environments.
+
+ sched_ext leverages BPF struct_ops feature to define a structure
+ which exports function callbacks and flags to BPF programs that
+ wish to implement scheduling policies. The struct_ops structure
+ exported by sched_ext is struct sched_ext_ops, and is conceptually
+ similar to struct sched_class.
+
+ For more information:
+ Documentation/scheduler/sched-ext.rst
+ https://github.com/sched-ext/scx
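As a side note (not part of the patch), the "depends on" line in the hunk above
translates into a kernel config fragment along these lines; only
CONFIG_SCHED_CLASS_EXT itself is new, the rest are existing BPF/BTF options:

  # illustrative .config fragment for building the new class
  CONFIG_BPF_SYSCALL=y
  CONFIG_BPF_JIT=y
  CONFIG_DEBUG_INFO_BTF=y
  CONFIG_SCHED_CLASS_EXT=y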
diff --git a/kernel/fork.c b/kernel/fork.c
index 5ac41db8aa69..cbdaca45d0c1 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -23,6 +23,7 @@
#include <linux/sched/task.h>
#include <linux/sched/task_stack.h>
#include <linux/sched/cputime.h>
+#include <linux/sched/ext.h>
#include <linux/seq_file.h>
#include <linux/rtmutex.h>
#include <linux/init.h>
@@ -969,6 +970,7 @@ void __put_task_struct(struct task_struct *tsk)
WARN_ON(refcount_read(&tsk->usage));
WARN_ON(tsk == current);
+ sched_ext_free(tsk);
io_uring_free(tsk);
cgroup_free(tsk);
task_numa_free(tsk, true);
@@ -2346,7 +2348,7 @@ __latent_entropy struct task_struct *copy_process(
retval = perf_event_init_task(p, clone_flags);
if (retval)
- goto bad_fork_cleanup_policy;
+ goto bad_fork_sched_cancel_fork;
retval = audit_alloc(p);
if (retval)
goto bad_fork_cleanup_perf;
@@ -2479,7 +2481,9 @@ __latent_entropy struct task_struct *copy_process(
* cgroup specific, it unconditionally needs to place the task on a
* runqueue.
*/
- sched_cgroup_fork(p, args);
+ retval = sched_cgroup_fork(p, args);
+ if (retval)
+ goto bad_fork_cancel_cgroup;
/*
* From this point on we must avoid any synchronous user-space
@@ -2525,13 +2529,13 @@ __latent_entropy struct task_struct *copy_process(
/* Don't start children in a dying pid namespace */
if (unlikely(!(ns_of_pid(pid)->pid_allocated & PIDNS_ADDING))) {
retval = -ENOMEM;
- goto bad_fork_cancel_cgroup;
+ goto bad_fork_core_free;
}
/* Let kill terminate clone/fork in the middle */
if (fatal_signal_pending(current)) {
retval = -EINTR;
- goto bad_fork_cancel_cgroup;
+ goto bad_fork_core_free;
}
/* No more failure paths after this point. */
@@ -2605,10 +2609,11 @@ __latent_entropy struct task_struct *copy_process(
return p;
-bad_fork_cancel_cgroup:
+bad_fork_core_free:
sched_core_free(p);
spin_unlock(&current->sighand->siglock);
write_unlock_irq(&tasklist_lock);
+bad_fork_cancel_cgroup:
cgroup_cancel_fork(p, args);
bad_fork_put_pidfd:
if (clone_flags & CLONE_PIDFD) {
@@ -2647,6 +2652,8 @@ bad_fork_cleanup_audit:
audit_free(p);
bad_fork_cleanup_perf:
perf_event_free_task(p);
+bad_fork_sched_cancel_fork:
+ sched_cancel_fork(p);
bad_fork_cleanup_policy:
lockdep_free_task(p);
#ifdef CONFIG_NUMA
diff --git a/kernel/sched/build_policy.c b/kernel/sched/build_policy.c
index 39c315182b35..fae1f5c921eb 100644
--- a/kernel/sched/build_policy.c
+++ b/kernel/sched/build_policy.c
@@ -16,18 +16,25 @@
#include <linux/sched/clock.h>
#include <linux/sched/cputime.h>
#include <linux/sched/hotplug.h>
+#include <linux/sched/isolation.h>
#include <linux/sched/posix-timers.h>
#include <linux/sched/rt.h>
#include <linux/cpuidle.h>
#include <linux/jiffies.h>
+#include <linux/kobject.h>
#include <linux/livepatch.h>
+#include <linux/pm.h>
#include <linux/psi.h>
+#include <linux/rhashtable.h>
+#include <linux/seq_buf.h>
#include <linux/seqlock_api.h>
#include <linux/slab.h>
#include <linux/suspend.h>
#include <linux/tsacct_kern.h>
#include <linux/vtime.h>
+#include <linux/sysrq.h>
+#include <linux/percpu-rwsem.h>
#include <uapi/linux/sched/types.h>
@@ -52,4 +59,8 @@
#include "cputime.c"
#include "deadline.c"
+#ifdef CONFIG_SCHED_CLASS_EXT
+# include "ext.c"
+#endif
+
#include "syscalls.c"
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index c1a08035ea43..b6cc1cf499d6 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -172,7 +172,10 @@ static inline int __task_prio(const struct task_struct *p)
if (p->sched_class == &idle_sched_class)
return MAX_RT_PRIO + NICE_WIDTH; /* 140 */
- return MAX_RT_PRIO + MAX_NICE; /* 120, squash fair */
+ if (task_on_scx(p))
+ return MAX_RT_PRIO + MAX_NICE + 1; /* 120, squash ext */
+
+ return MAX_RT_PRIO + MAX_NICE; /* 119, squash fair */
}
/*
@@ -217,6 +220,11 @@ static inline bool prio_less(const struct task_struct *a,
if (pa == MAX_RT_PRIO + MAX_NICE) /* fair */
return cfs_prio_less(a, b, in_fi);
+#ifdef CONFIG_SCHED_CLASS_EXT
+ if (pa == MAX_RT_PRIO + MAX_NICE + 1) /* ext */
+ return scx_prio_less(a, b, in_fi);
+#endif
+
return false;
}
@@ -1280,11 +1288,14 @@ bool sched_can_stop_tick(struct rq *rq)
return true;
/*
- * If there are no DL,RR/FIFO tasks, there must only be CFS tasks left;
- * if there's more than one we need the tick for involuntary
- * preemption.
+ * If there are no DL,RR/FIFO tasks, there must only be CFS or SCX tasks
+ * left. For CFS, if there's more than one we need the tick for
+ * involuntary preemption. For SCX, ask.
*/
- if (rq->nr_running > 1)
+ if (scx_enabled() && !scx_can_stop_tick(rq))
+ return false;
+
+ if (rq->cfs.nr_running > 1)
return false;
/*
@@ -1366,8 +1377,8 @@ void set_load_weight(struct task_struct *p, bool update_load)
* SCHED_OTHER tasks have to update their load when changing their
* weight
*/
- if (update_load && p->sched_class == &fair_sched_class)
- reweight_task(p, &lw);
+ if (update_load && p->sched_class->reweight_task)
+ p->sched_class->reweight_task(task_rq(p), p, &lw);
else
p->se.load = lw;
}
@@ -2087,6 +2098,17 @@ inline int task_curr(const struct task_struct *p)
}
/*
+ * ->switching_to() is called with the pi_lock and rq_lock held and must not
+ * mess with locking.
+ */
+void check_class_changing(struct rq *rq, struct task_struct *p,
+ const struct sched_class *prev_class)
+{
+ if (prev_class != p->sched_class && p->sched_class->switching_to)
+ p->sched_class->switching_to(rq, p);
+}
+
+/*
* switched_from, switched_to and prio_changed must _NOT_ drop rq->lock,
* use the balance_callback list if you want balancing.
*
@@ -2356,7 +2378,7 @@ static inline bool rq_has_pinned_tasks(struct rq *rq)
static inline bool is_cpu_allowed(struct task_struct *p, int cpu)
{
/* When not in the task's cpumask, no point in looking further. */
- if (!cpumask_test_cpu(cpu, p->cpus_ptr))
+ if (!task_allowed_on_cpu(p, cpu))
return false;
/* migrate_disabled() must be allowed to finish. */
@@ -2365,7 +2387,7 @@ static inline bool is_cpu_allowed(struct task_struct *p, int cpu)
/* Non kernel threads are not allowed during either online or offline. */
if (!(p->flags & PF_KTHREAD))
- return cpu_active(cpu) && task_cpu_possible(cpu, p);
+ return cpu_active(cpu);
/* KTHREAD_IS_PER_CPU is always allowed. */
if (kthread_is_per_cpu(p))
@@ -3843,6 +3865,15 @@ bool cpus_share_resources(int this_cpu, int that_cpu)
static inline bool ttwu_queue_cond(struct task_struct *p, int cpu)
{
/*
+ * The BPF scheduler may depend on select_task_rq() being invoked during
+ * wakeups. In addition, @p may end up executing on a different CPU
+ * regardless of what happens in the wakeup path making the ttwu_queue
+ * optimization less meaningful. Skip if on SCX.
+ */
+ if (task_on_scx(p))
+ return false;
+
+ /*
* Do not complicate things with the async wake_list while the CPU is
* in hotplug state.
*/
@@ -4416,6 +4447,10 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
p->rt.on_rq = 0;
p->rt.on_list = 0;
+#ifdef CONFIG_SCHED_CLASS_EXT
+ init_scx_entity(&p->scx);
+#endif
+
#ifdef CONFIG_PREEMPT_NOTIFIERS
INIT_HLIST_HEAD(&p->preempt_notifiers);
#endif
@@ -4658,10 +4693,18 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
if (dl_prio(p->prio))
return -EAGAIN;
- else if (rt_prio(p->prio))
+
+ scx_pre_fork(p);
+
+ if (rt_prio(p->prio)) {
p->sched_class = &rt_sched_class;
- else
+#ifdef CONFIG_SCHED_CLASS_EXT
+ } else if (task_should_scx(p)) {
+ p->sched_class = &ext_sched_class;
+#endif
+ } else {
p->sched_class = &fair_sched_class;
+ }
init_entity_runnable_average(&p->se);
@@ -4681,7 +4724,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
return 0;
}
-void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs)
+int sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs)
{
unsigned long flags;
@@ -4708,11 +4751,19 @@ void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs)
if (p->sched_class->task_fork)
p->sched_class->task_fork(p);
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+
+ return scx_fork(p);
+}
+
+void sched_cancel_fork(struct task_struct *p)
+{
+ scx_cancel_fork(p);
}
void sched_post_fork(struct task_struct *p)
{
uclamp_post_fork(p);
+ scx_post_fork(p);
}
unsigned long to_ratio(u64 period, u64 runtime)
@@ -5545,6 +5596,7 @@ void sched_tick(void)
calc_global_load_tick(rq);
sched_core_tick(rq);
task_tick_mm_cid(rq, curr);
+ scx_tick(rq);
rq_unlock(rq, &rf);
@@ -5557,8 +5609,10 @@ void sched_tick(void)
wq_worker_tick(curr);
#ifdef CONFIG_SMP
- rq->idle_balance = idle_cpu(cpu);
- sched_balance_trigger(rq);
+ if (!scx_switched_all()) {
+ rq->idle_balance = idle_cpu(cpu);
+ sched_balance_trigger(rq);
+ }
#endif
}
@@ -5848,8 +5902,19 @@ static inline void schedule_debug(struct task_struct *prev, bool preempt)
static void prev_balance(struct rq *rq, struct task_struct *prev,
struct rq_flags *rf)
{
-#ifdef CONFIG_SMP
+ const struct sched_class *start_class = prev->sched_class;
const struct sched_class *class;
+
+#ifdef CONFIG_SCHED_CLASS_EXT
+ /*
+ * SCX requires a balance() call before every pick_next_task() including
+ * when waking up from SCHED_IDLE. If @start_class is below SCX, start
+ * from SCX instead.
+ */
+ if (scx_enabled() && sched_class_above(&ext_sched_class, start_class))
+ start_class = &ext_sched_class;
+#endif
+
/*
* We must do the balancing pass before put_prev_task(), such
* that when we release the rq->lock the task is in the same
@@ -5858,11 +5923,10 @@ static void prev_balance(struct rq *rq, struct task_struct *prev,
* We can terminate the balance pass as soon as we know there is
* a runnable task of @class priority or higher.
*/
- for_class_range(class, prev->sched_class, &idle_sched_class) {
- if (class->balance(rq, prev, rf))
+ for_active_class_range(class, start_class, &idle_sched_class) {
+ if (class->balance && class->balance(rq, prev, rf))
break;
}
-#endif
}
/*
@@ -5876,6 +5940,9 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
rq->dl_server = NULL;
+ if (scx_enabled())
+ goto restart;
+
/*
* Optimization: we know that if all tasks are in the fair class we can
* call that function directly, but only if the @prev task wasn't of a
@@ -5901,7 +5968,7 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
restart:
prev_balance(rq, prev, rf);
- for_each_class(class) {
+ for_each_active_class(class) {
if (class->pick_next_task) {
p = class->pick_next_task(rq, prev);
if (p)
@@ -5944,7 +6011,7 @@ static inline struct task_struct *pick_task(struct rq *rq)
rq->dl_server = NULL;
- for_each_class(class) {
+ for_each_active_class(class) {
p = class->pick_task(rq);
if (p)
return p;
@@ -6948,6 +7015,10 @@ void __setscheduler_prio(struct task_struct *p, int prio)
p->sched_class = &dl_sched_class;
else if (rt_prio(prio))
p->sched_class = &rt_sched_class;
+#ifdef CONFIG_SCHED_CLASS_EXT
+ else if (task_should_scx(p))
+ p->sched_class = &ext_sched_class;
+#endif
else
p->sched_class = &fair_sched_class;
@@ -7093,6 +7164,7 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
}
__setscheduler_prio(p, prio);
+ check_class_changing(rq, p, prev_class);
if (queued)
enqueue_task(rq, p, queue_flag);
@@ -7505,6 +7577,7 @@ void sched_show_task(struct task_struct *p)
print_worker_info(KERN_INFO, p);
print_stop_info(KERN_INFO, p);
+ print_scx_info(KERN_INFO, p);
show_stack(p, NULL, KERN_INFO);
put_task_stack(p);
}
@@ -8033,6 +8106,8 @@ int sched_cpu_activate(unsigned int cpu)
cpuset_cpu_active();
}
+ scx_rq_activate(rq);
+
/*
* Put the rq online, if not already. This happens:
*
@@ -8082,6 +8157,8 @@ int sched_cpu_deactivate(unsigned int cpu)
sched_set_rq_offline(rq, cpu);
+ scx_rq_deactivate(rq);
+
/*
* When going down, decrement the number of cores with SMT present.
*/
@@ -8266,11 +8343,15 @@ void __init sched_init(void)
int i;
/* Make sure the linker didn't screw up */
- BUG_ON(&idle_sched_class != &fair_sched_class + 1 ||
- &fair_sched_class != &rt_sched_class + 1 ||
- &rt_sched_class != &dl_sched_class + 1);
#ifdef CONFIG_SMP
- BUG_ON(&dl_sched_class != &stop_sched_class + 1);
+ BUG_ON(!sched_class_above(&stop_sched_class, &dl_sched_class));
+#endif
+ BUG_ON(!sched_class_above(&dl_sched_class, &rt_sched_class));
+ BUG_ON(!sched_class_above(&rt_sched_class, &fair_sched_class));
+ BUG_ON(!sched_class_above(&fair_sched_class, &idle_sched_class));
+#ifdef CONFIG_SCHED_CLASS_EXT
+ BUG_ON(!sched_class_above(&fair_sched_class, &ext_sched_class));
+ BUG_ON(!sched_class_above(&ext_sched_class, &idle_sched_class));
#endif
wait_bit_init();
@@ -8294,6 +8375,9 @@ void __init sched_init(void)
root_task_group.shares = ROOT_TASK_GROUP_LOAD;
init_cfs_bandwidth(&root_task_group.cfs_bandwidth, NULL);
#endif /* CONFIG_FAIR_GROUP_SCHED */
+#ifdef CONFIG_EXT_GROUP_SCHED
+ root_task_group.scx_weight = CGROUP_WEIGHT_DFL;
+#endif /* CONFIG_EXT_GROUP_SCHED */
#ifdef CONFIG_RT_GROUP_SCHED
root_task_group.rt_se = (struct sched_rt_entity **)ptr;
ptr += nr_cpu_ids * sizeof(void **);
@@ -8445,6 +8529,7 @@ void __init sched_init(void)
balance_push_set(smp_processor_id(), false);
#endif
init_sched_fair_class();
+ init_sched_ext_class();
psi_init();
@@ -8730,6 +8815,7 @@ struct task_group *sched_create_group(struct task_group *parent)
if (!alloc_rt_sched_group(tg, parent))
goto err;
+ scx_group_set_weight(tg, CGROUP_WEIGHT_DFL);
alloc_uclamp_sched_group(tg, parent);
return tg;
@@ -8857,6 +8943,7 @@ void sched_move_task(struct task_struct *tsk)
put_prev_task(rq, tsk);
sched_change_group(tsk, group);
+ scx_move_task(tsk);
if (queued)
enqueue_task(rq, tsk, queue_flags);
@@ -8871,11 +8958,6 @@ void sched_move_task(struct task_struct *tsk)
}
}
-static inline struct task_group *css_tg(struct cgroup_subsys_state *css)
-{
- return css ? container_of(css, struct task_group, css) : NULL;
-}
-
static struct cgroup_subsys_state *
cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
{
@@ -8899,6 +8981,11 @@ static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)
{
struct task_group *tg = css_tg(css);
struct task_group *parent = css_tg(css->parent);
+ int ret;
+
+ ret = scx_tg_online(tg);
+ if (ret)
+ return ret;
if (parent)
sched_online_group(tg, parent);
@@ -8913,6 +9000,13 @@ static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)
return 0;
}
+static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css)
+{
+ struct task_group *tg = css_tg(css);
+
+ scx_tg_offline(tg);
+}
+
static void cpu_cgroup_css_released(struct cgroup_subsys_state *css)
{
struct task_group *tg = css_tg(css);
@@ -8930,9 +9024,9 @@ static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
sched_unregister_group(tg);
}
-#ifdef CONFIG_RT_GROUP_SCHED
static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)
{
+#ifdef CONFIG_RT_GROUP_SCHED
struct task_struct *task;
struct cgroup_subsys_state *css;
@@ -8940,9 +9034,9 @@ static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)
if (!sched_rt_can_attach(css_tg(css), task))
return -EINVAL;
}
- return 0;
-}
#endif
+ return scx_cgroup_can_attach(tset);
+}
static void cpu_cgroup_attach(struct cgroup_taskset *tset)
{
@@ -8951,6 +9045,13 @@ static void cpu_cgroup_attach(struct cgroup_taskset *tset)
cgroup_taskset_for_each(task, css, tset)
sched_move_task(task);
+
+ scx_cgroup_finish_attach();
+}
+
+static void cpu_cgroup_cancel_attach(struct cgroup_taskset *tset)
+{
+ scx_cgroup_cancel_attach(tset);
}
#ifdef CONFIG_UCLAMP_TASK_GROUP
@@ -9127,22 +9228,36 @@ static int cpu_uclamp_max_show(struct seq_file *sf, void *v)
}
#endif /* CONFIG_UCLAMP_TASK_GROUP */
+#ifdef CONFIG_GROUP_SCHED_WEIGHT
+static unsigned long tg_weight(struct task_group *tg)
+{
#ifdef CONFIG_FAIR_GROUP_SCHED
+ return scale_load_down(tg->shares);
+#else
+ return sched_weight_from_cgroup(tg->scx_weight);
+#endif
+}
+
static int cpu_shares_write_u64(struct cgroup_subsys_state *css,
struct cftype *cftype, u64 shareval)
{
+ int ret;
+
if (shareval > scale_load_down(ULONG_MAX))
shareval = MAX_SHARES;
- return sched_group_set_shares(css_tg(css), scale_load(shareval));
+ ret = sched_group_set_shares(css_tg(css), scale_load(shareval));
+ if (!ret)
+ scx_group_set_weight(css_tg(css),
+ sched_weight_to_cgroup(shareval));
+ return ret;
}
static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css,
struct cftype *cft)
{
- struct task_group *tg = css_tg(css);
-
- return (u64) scale_load_down(tg->shares);
+ return tg_weight(css_tg(css));
}
+#endif /* CONFIG_GROUP_SCHED_WEIGHT */
#ifdef CONFIG_CFS_BANDWIDTH
static DEFINE_MUTEX(cfs_constraints_mutex);
@@ -9488,7 +9603,6 @@ static int cpu_cfs_local_stat_show(struct seq_file *sf, void *v)
return 0;
}
#endif /* CONFIG_CFS_BANDWIDTH */
-#endif /* CONFIG_FAIR_GROUP_SCHED */
#ifdef CONFIG_RT_GROUP_SCHED
static int cpu_rt_runtime_write(struct cgroup_subsys_state *css,
@@ -9516,7 +9630,7 @@ static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css,
}
#endif /* CONFIG_RT_GROUP_SCHED */
-#ifdef CONFIG_FAIR_GROUP_SCHED
+#ifdef CONFIG_GROUP_SCHED_WEIGHT
static s64 cpu_idle_read_s64(struct cgroup_subsys_state *css,
struct cftype *cft)
{
@@ -9526,12 +9640,17 @@ static s64 cpu_idle_read_s64(struct cgroup_subsys_state *css,
static int cpu_idle_write_s64(struct cgroup_subsys_state *css,
struct cftype *cft, s64 idle)
{
- return sched_group_set_idle(css_tg(css), idle);
+ int ret;
+
+ ret = sched_group_set_idle(css_tg(css), idle);
+ if (!ret)
+ scx_group_set_idle(css_tg(css), idle);
+ return ret;
}
#endif
static struct cftype cpu_legacy_files[] = {
-#ifdef CONFIG_FAIR_GROUP_SCHED
+#ifdef CONFIG_GROUP_SCHED_WEIGHT
{
.name = "shares",
.read_u64 = cpu_shares_read_u64,
@@ -9641,38 +9760,35 @@ static int cpu_local_stat_show(struct seq_file *sf,
return 0;
}
-#ifdef CONFIG_FAIR_GROUP_SCHED
+#ifdef CONFIG_GROUP_SCHED_WEIGHT
+
static u64 cpu_weight_read_u64(struct cgroup_subsys_state *css,
struct cftype *cft)
{
- struct task_group *tg = css_tg(css);
- u64 weight = scale_load_down(tg->shares);
-
- return DIV_ROUND_CLOSEST_ULL(weight * CGROUP_WEIGHT_DFL, 1024);
+ return sched_weight_to_cgroup(tg_weight(css_tg(css)));
}
static int cpu_weight_write_u64(struct cgroup_subsys_state *css,
- struct cftype *cft, u64 weight)
+ struct cftype *cft, u64 cgrp_weight)
{
- /*
- * cgroup weight knobs should use the common MIN, DFL and MAX
- * values which are 1, 100 and 10000 respectively. While it loses
- * a bit of range on both ends, it maps pretty well onto the shares
- * value used by scheduler and the round-trip conversions preserve
- * the original value over the entire range.
- */
- if (weight < CGROUP_WEIGHT_MIN || weight > CGROUP_WEIGHT_MAX)
+ unsigned long weight;
+ int ret;
+
+ if (cgrp_weight < CGROUP_WEIGHT_MIN || cgrp_weight > CGROUP_WEIGHT_MAX)
return -ERANGE;
- weight = DIV_ROUND_CLOSEST_ULL(weight * 1024, CGROUP_WEIGHT_DFL);
+ weight = sched_weight_from_cgroup(cgrp_weight);
- return sched_group_set_shares(css_tg(css), scale_load(weight));
+ ret = sched_group_set_shares(css_tg(css), scale_load(weight));
+ if (!ret)
+ scx_group_set_weight(css_tg(css), cgrp_weight);
+ return ret;
}
static s64 cpu_weight_nice_read_s64(struct cgroup_subsys_state *css,
struct cftype *cft)
{
- unsigned long weight = scale_load_down(css_tg(css)->shares);
+ unsigned long weight = tg_weight(css_tg(css));
int last_delta = INT_MAX;
int prio, delta;
@@ -9691,7 +9807,7 @@ static int cpu_weight_nice_write_s64(struct cgroup_subsys_state *css,
struct cftype *cft, s64 nice)
{
unsigned long weight;
- int idx;
+ int idx, ret;
if (nice < MIN_NICE || nice > MAX_NICE)
return -ERANGE;
@@ -9700,9 +9816,13 @@ static int cpu_weight_nice_write_s64(struct cgroup_subsys_state *css,
idx = array_index_nospec(idx, 40);
weight = sched_prio_to_weight[idx];
- return sched_group_set_shares(css_tg(css), scale_load(weight));
+ ret = sched_group_set_shares(css_tg(css), scale_load(weight));
+ if (!ret)
+ scx_group_set_weight(css_tg(css),
+ sched_weight_to_cgroup(weight));
+ return ret;
}
-#endif
+#endif /* CONFIG_GROUP_SCHED_WEIGHT */
static void __maybe_unused cpu_period_quota_print(struct seq_file *sf,
long period, long quota)
@@ -9762,7 +9882,7 @@ static ssize_t cpu_max_write(struct kernfs_open_file *of,
#endif
static struct cftype cpu_files[] = {
-#ifdef CONFIG_FAIR_GROUP_SCHED
+#ifdef CONFIG_GROUP_SCHED_WEIGHT
{
.name = "weight",
.flags = CFTYPE_NOT_ON_ROOT,
@@ -9816,14 +9936,14 @@ static struct cftype cpu_files[] = {
struct cgroup_subsys cpu_cgrp_subsys = {
.css_alloc = cpu_cgroup_css_alloc,
.css_online = cpu_cgroup_css_online,
+ .css_offline = cpu_cgroup_css_offline,
.css_released = cpu_cgroup_css_released,
.css_free = cpu_cgroup_css_free,
.css_extra_stat_show = cpu_extra_stat_show,
.css_local_stat_show = cpu_local_stat_show,
-#ifdef CONFIG_RT_GROUP_SCHED
.can_attach = cpu_cgroup_can_attach,
-#endif
.attach = cpu_cgroup_attach,
+ .cancel_attach = cpu_cgroup_cancel_attach,
.legacy_cftypes = cpu_legacy_files,
.dfl_cftypes = cpu_files,
.early_init = true,
@@ -10413,3 +10533,38 @@ void sched_mm_cid_fork(struct task_struct *t)
t->mm_cid_active = 1;
}
#endif
+
+#ifdef CONFIG_SCHED_CLASS_EXT
+void sched_deq_and_put_task(struct task_struct *p, int queue_flags,
+ struct sched_enq_and_set_ctx *ctx)
+{
+ struct rq *rq = task_rq(p);
+
+ lockdep_assert_rq_held(rq);
+
+ *ctx = (struct sched_enq_and_set_ctx){
+ .p = p,
+ .queue_flags = queue_flags,
+ .queued = task_on_rq_queued(p),
+ .running = task_current(rq, p),
+ };
+
+ update_rq_clock(rq);
+ if (ctx->queued)
+ dequeue_task(rq, p, queue_flags | DEQUEUE_NOCLOCK);
+ if (ctx->running)
+ put_prev_task(rq, p);
+}
+
+void sched_enq_and_set_task(struct sched_enq_and_set_ctx *ctx)
+{
+ struct rq *rq = task_rq(ctx->p);
+
+ lockdep_assert_rq_held(rq);
+
+ if (ctx->queued)
+ enqueue_task(rq, ctx->p, ctx->queue_flags | ENQUEUE_NOCLOCK);
+ if (ctx->running)
+ set_next_task(rq, ctx->p);
+}
+#endif /* CONFIG_SCHED_CLASS_EXT */
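The two helpers added at the end of core.c exist so that ext.c can take a task
off its runqueue, switch its scheduling class, and put it back in one guarded
sequence. A rough sketch of such a caller follows (hypothetical function name;
the real users live in kernel/sched/ext.c and also go through
__setscheduler_prio()/check_class_changing()):

  static void switch_task_to_scx(struct rq *rq, struct task_struct *p)
  {
          struct sched_enq_and_set_ctx ctx;

          lockdep_assert_rq_held(rq);

          /* dequeue and, if running, put the task in its current class */
          sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx);

          p->sched_class = &ext_sched_class;  /* class change while off the rq */

          /* re-enqueue and, if it was running, set it as next in the new class */
          sched_enq_and_set_task(&ctx);
  }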
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 43111a515a28..c6ba15388ea7 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -197,8 +197,10 @@ unsigned long sugov_effective_cpu_perf(int cpu, unsigned long actual,
static void sugov_get_util(struct sugov_cpu *sg_cpu, unsigned long boost)
{
- unsigned long min, max, util = cpu_util_cfs_boost(sg_cpu->cpu);
+ unsigned long min, max, util = scx_cpuperf_target(sg_cpu->cpu);
+ if (!scx_switched_all())
+ util += cpu_util_cfs_boost(sg_cpu->cpu);
util = effective_cpu_util(sg_cpu->cpu, util, &min, &max);
util = max(util, boost);
sg_cpu->bw_min = min;
@@ -325,16 +327,35 @@ static unsigned long sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time,
}
#ifdef CONFIG_NO_HZ_COMMON
-static bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu)
+static bool sugov_hold_freq(struct sugov_cpu *sg_cpu)
{
- unsigned long idle_calls = tick_nohz_get_idle_calls_cpu(sg_cpu->cpu);
- bool ret = idle_calls == sg_cpu->saved_idle_calls;
+ unsigned long idle_calls;
+ bool ret;
+
+ /*
+ * The heuristics in this function is for the fair class. For SCX, the
+ * performance target comes directly from the BPF scheduler. Let's just
+ * follow it.
+ */
+ if (scx_switched_all())
+ return false;
+
+ /* if capped by uclamp_max, always update to be in compliance */
+ if (uclamp_rq_is_capped(cpu_rq(sg_cpu->cpu)))
+ return false;
+
+ /*
+ * Maintain the frequency if the CPU has not been idle recently, as
+ * reduction is likely to be premature.
+ */
+ idle_calls = tick_nohz_get_idle_calls_cpu(sg_cpu->cpu);
+ ret = idle_calls == sg_cpu->saved_idle_calls;
sg_cpu->saved_idle_calls = idle_calls;
return ret;
}
#else
-static inline bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { return false; }
+static inline bool sugov_hold_freq(struct sugov_cpu *sg_cpu) { return false; }
#endif /* CONFIG_NO_HZ_COMMON */
/*
@@ -382,14 +403,8 @@ static void sugov_update_single_freq(struct update_util_data *hook, u64 time,
return;
next_f = get_next_freq(sg_policy, sg_cpu->util, max_cap);
- /*
- * Do not reduce the frequency if the CPU has not been idle
- * recently, as the reduction is likely to be premature then.
- *
- * Except when the rq is capped by uclamp_max.
- */
- if (!uclamp_rq_is_capped(cpu_rq(sg_cpu->cpu)) &&
- sugov_cpu_is_busy(sg_cpu) && next_f < sg_policy->next_freq &&
+
+ if (sugov_hold_freq(sg_cpu) && next_f < sg_policy->next_freq &&
!sg_policy->need_freq_update) {
next_f = sg_policy->next_freq;
@@ -436,14 +451,7 @@ static void sugov_update_single_perf(struct update_util_data *hook, u64 time,
if (!sugov_update_single_common(sg_cpu, time, max_cap, flags))
return;
- /*
- * Do not reduce the target performance level if the CPU has not been
- * idle recently, as the reduction is likely to be premature then.
- *
- * Except when the rq is capped by uclamp_max.
- */
- if (!uclamp_rq_is_capped(cpu_rq(sg_cpu->cpu)) &&
- sugov_cpu_is_busy(sg_cpu) && sg_cpu->util < prev_util)
+ if (sugov_hold_freq(sg_cpu) && sg_cpu->util < prev_util)
sg_cpu->util = prev_util;
cpufreq_driver_adjust_perf(sg_cpu->cpu, sg_cpu->bw_min,
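For the schedutil change above, scx_cpuperf_target() reports the per-CPU
performance target last requested by the BPF scheduler. The fragment below
sketches the BPF-side counterpart, assuming the scx_bpf_cpuperf_set() and
scx_bpf_task_cpu() kfuncs and the SCX_CPUPERF_ONE scale introduced by the same
series; it is illustrative, not part of this merge:

  /* ops.tick() sketch: ask schedutil for ~50% of the CPU's max performance */
  void BPF_STRUCT_OPS(fifo_tick, struct task_struct *p)
  {
          scx_bpf_cpuperf_set(scx_bpf_task_cpu(p), SCX_CPUPERF_ONE / 2);
  }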
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index de1dc5264b3f..f4035c7a0fa1 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -1264,6 +1264,9 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,
P(dl.runtime);
P(dl.deadline);
}
+#ifdef CONFIG_SCHED_CLASS_EXT
+ __PS("ext.enabled", task_on_scx(p));
+#endif
#undef PN_SCHEDSTAT
#undef P_SCHEDSTAT
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
new file mode 100644
index 000000000000..9ee5a9a261cc
--- /dev/null
+++ b/kernel/sched/ext.c
@@ -0,0 +1,7173 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * BPF extensible scheduler