diff options
| author | Tejun Heo <tj@kernel.org> | 2024-06-18 10:09:21 -1000 |
|---|---|---|
| committer | Tejun Heo <tj@kernel.org> | 2024-06-18 10:09:21 -1000 |
| commit | 06e51be3d5e7a07aea5c9012773df8d5de01db6c (patch) | |
| tree | d88254f8c89035e6c7449248354ec50171649226 /kernel/sched/ext.c | |
| parent | 7b0888b7cc1924b74ce660e02f6211df8dd46e7b (diff) | |
| download | linux-06e51be3d5e7a07aea5c9012773df8d5de01db6c.tar.gz linux-06e51be3d5e7a07aea5c9012773df8d5de01db6c.tar.bz2 linux-06e51be3d5e7a07aea5c9012773df8d5de01db6c.zip | |
sched_ext: Add vtime-ordered priority queue to dispatch_q's
Currently, a dsq is always a FIFO. A task which is dispatched earlier gets
consumed or executed earlier. While this is sufficient when dsq's are used
for simple staging areas for tasks which are ready to execute, it'd make
dsq's a lot more useful if they can implement custom ordering.
This patch adds a vtime-ordered priority queue to dsq's. When the BPF
scheduler dispatches a task with the new scx_bpf_dispatch_vtime() helper, it
can specify the vtime tha the task should be inserted at and the task is
inserted into the priority queue in the dsq which is ordered according to
time_before64() comparison of the vtime values.
A DSQ can either be a FIFO or priority queue and automatically switches
between the two depending on whether scx_bpf_dispatch() or
scx_bpf_dispatch_vtime() is used. Using the wrong variant while the DSQ
already has the other type queued is not allowed and triggers an ops error.
Built-in DSQs must always be FIFOs.
This makes it very easy for the BPF schedulers to implement proper vtime
based scheduling within each dsq very easy and efficient at a negligible
cost in terms of code complexity and overhead.
scx_simple and scx_example_flatcg are updated to default to weighted
vtime scheduling (the latter within each cgroup). FIFO scheduling can be
selected with -f option.
v4: - As allowing mixing priority queue and FIFO on the same DSQ sometimes
led to unexpected starvations, DSQs now error out if both modes are
used at the same time and the built-in DSQs are no longer allowed to
be priority queues.
- Explicit type struct scx_dsq_node added to contain fields needed to be
linked on DSQs. This will be used to implement stateful iterator.
- Tasks are now always linked on dsq->list whether the DSQ is in FIFO or
PRIQ mode. This confines PRIQ related complexities to the enqueue and
dequeue paths. Other paths only need to look at dsq->list. This will
also ease implementing BPF iterator.
- Print p->scx.dsq_flags in debug dump.
v3: - SCX_TASK_DSQ_ON_PRIQ flag is moved from p->scx.flags into its own
p->scx.dsq_flags. The flag is protected with the dsq lock unlike other
flags in p->scx.flags. This led to flag corruption in some cases.
- Add comments explaining the interaction between using consumption of
p->scx.slice to determine vtime progress and yielding.
v2: - p->scx.dsq_vtime was not initialized on load or across cgroup
migrations leading to some tasks being stalled for extended period of
time depending on how saturated the machine is. Fixed.
Signed-off-by: Tejun Heo <tj@kernel.org>
Reviewed-by: David Vernet <dvernet@meta.com>
Diffstat (limited to 'kernel/sched/ext.c')
| -rw-r--r-- | kernel/sched/ext.c | 156 |
1 files changed, 138 insertions, 18 deletions
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 1feb690be9d8..f186c576e7d9 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -638,6 +638,7 @@ enum scx_enq_flags { __SCX_ENQ_INTERNAL_MASK = 0xffLLU << 56, SCX_ENQ_CLEAR_OPSS = 1LLU << 56, + SCX_ENQ_DSQ_PRIQ = 1LLU << 57, }; enum scx_deq_flags { @@ -1351,6 +1352,17 @@ static void update_curr_scx(struct rq *rq) } } +static bool scx_dsq_priq_less(struct rb_node *node_a, + const struct rb_node *node_b) +{ + const struct task_struct *a = + container_of(node_a, struct task_struct, scx.dsq_node.priq); + const struct task_struct *b = + container_of(node_b, struct task_struct, scx.dsq_node.priq); + + return time_before64(a->scx.dsq_vtime, b->scx.dsq_vtime); +} + static void dsq_mod_nr(struct scx_dispatch_q *dsq, s32 delta) { /* scx_bpf_dsq_nr_queued() reads ->nr without locking, use WRITE_ONCE() */ @@ -1362,7 +1374,9 @@ static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p, { bool is_local = dsq->id == SCX_DSQ_LOCAL; - WARN_ON_ONCE(p->scx.dsq || !list_empty(&p->scx.dsq_node)); + WARN_ON_ONCE(p->scx.dsq || !list_empty(&p->scx.dsq_node.list)); + WARN_ON_ONCE((p->scx.dsq_node.flags & SCX_TASK_DSQ_ON_PRIQ) || + !RB_EMPTY_NODE(&p->scx.dsq_node.priq)); if (!is_local) { raw_spin_lock(&dsq->lock); @@ -1375,10 +1389,59 @@ static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p, } } - if (enq_flags & (SCX_ENQ_HEAD | SCX_ENQ_PREEMPT)) - list_add(&p->scx.dsq_node, &dsq->list); - else - list_add_tail(&p->scx.dsq_node, &dsq->list); + if (unlikely((dsq->id & SCX_DSQ_FLAG_BUILTIN) && + (enq_flags & SCX_ENQ_DSQ_PRIQ))) { + /* + * SCX_DSQ_LOCAL and SCX_DSQ_GLOBAL DSQs always consume from + * their FIFO queues. To avoid confusion and accidentally + * starving vtime-dispatched tasks by FIFO-dispatched tasks, we + * disallow any internal DSQ from doing vtime ordering of + * tasks. + */ + scx_ops_error("cannot use vtime ordering for built-in DSQs"); + enq_flags &= ~SCX_ENQ_DSQ_PRIQ; + } + + if (enq_flags & SCX_ENQ_DSQ_PRIQ) { + struct rb_node *rbp; + + /* + * A PRIQ DSQ shouldn't be using FIFO enqueueing. As tasks are + * linked to both the rbtree and list on PRIQs, this can only be + * tested easily when adding the first task. + */ + if (unlikely(RB_EMPTY_ROOT(&dsq->priq) && + !list_empty(&dsq->list))) + scx_ops_error("DSQ ID 0x%016llx already had FIFO-enqueued tasks", + dsq->id); + + p->scx.dsq_node.flags |= SCX_TASK_DSQ_ON_PRIQ; + rb_add(&p->scx.dsq_node.priq, &dsq->priq, scx_dsq_priq_less); + + /* + * Find the previous task and insert after it on the list so + * that @dsq->list is vtime ordered. + */ + rbp = rb_prev(&p->scx.dsq_node.priq); + if (rbp) { + struct task_struct *prev = + container_of(rbp, struct task_struct, + scx.dsq_node.priq); + list_add(&p->scx.dsq_node.list, &prev->scx.dsq_node.list); + } else { + list_add(&p->scx.dsq_node.list, &dsq->list); + } + } else { + /* a FIFO DSQ shouldn't be using PRIQ enqueuing */ + if (unlikely(!RB_EMPTY_ROOT(&dsq->priq))) + scx_ops_error("DSQ ID 0x%016llx already had PRIQ-enqueued tasks", + dsq->id); + + if (enq_flags & (SCX_ENQ_HEAD | SCX_ENQ_PREEMPT)) + list_add(&p->scx.dsq_node.list, &dsq->list); + else + list_add_tail(&p->scx.dsq_node.list, &dsq->list); + } dsq_mod_nr(dsq, 1); p->scx.dsq = dsq; @@ -1417,13 +1480,30 @@ static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p, } } +static void task_unlink_from_dsq(struct task_struct *p, + struct scx_dispatch_q *dsq) +{ + if (p->scx.dsq_node.flags & SCX_TASK_DSQ_ON_PRIQ) { + rb_erase(&p->scx.dsq_node.priq, &dsq->priq); + RB_CLEAR_NODE(&p->scx.dsq_node.priq); + p->scx.dsq_node.flags &= ~SCX_TASK_DSQ_ON_PRIQ; + } + + list_del_init(&p->scx.dsq_node.list); +} + +static bool task_linked_on_dsq(struct task_struct *p) +{ + return !list_empty(&p->scx.dsq_node.list); +} + static void dispatch_dequeue(struct rq *rq, struct task_struct *p) { struct scx_dispatch_q *dsq = p->scx.dsq; bool is_local = dsq == &rq->scx.local_dsq; if (!dsq) { - WARN_ON_ONCE(!list_empty(&p->scx.dsq_node)); + WARN_ON_ONCE(task_linked_on_dsq(p)); /* * When dispatching directly from the BPF scheduler to a local * DSQ, the task isn't associated with any DSQ but @@ -1444,8 +1524,8 @@ static void dispatch_dequeue(struct rq *rq, struct task_struct *p) */ if (p->scx.holding_cpu < 0) { /* @p must still be on @dsq, dequeue */ - WARN_ON_ONCE(list_empty(&p->scx.dsq_node)); - list_del_init(&p->scx.dsq_node); + WARN_ON_ONCE(!task_linked_on_dsq(p)); + task_unlink_from_dsq(p, dsq); dsq_mod_nr(dsq, -1); } else { /* @@ -1454,7 +1534,7 @@ static void dispatch_dequeue(struct rq *rq, struct task_struct *p) * holding_cpu which tells dispatch_to_local_dsq() that it lost * the race. */ - WARN_ON_ONCE(!list_empty(&p->scx.dsq_node)); + WARN_ON_ONCE(task_linked_on_dsq(p)); p->scx.holding_cpu = -1; } p->scx.dsq = NULL; @@ -1949,7 +2029,8 @@ static void consume_local_task(struct rq *rq, struct scx_dispatch_q *dsq, /* @dsq is locked and @p is on this rq */ WARN_ON_ONCE(p->scx.holding_cpu >= 0); - list_move_tail(&p->scx.dsq_node, &rq->scx.local_dsq.list); + task_unlink_from_dsq(p, dsq); + list_add_tail(&p->scx.dsq_node.list, &rq->scx.local_dsq.list); dsq_mod_nr(dsq, -1); dsq_mod_nr(&rq->scx.local_dsq, 1); p->scx.dsq = &rq->scx.local_dsq; @@ -1992,7 +2073,7 @@ static bool consume_remote_task(struct rq *rq, struct rq_flags *rf, * move_task_to_local_dsq(). */ WARN_ON_ONCE(p->scx.holding_cpu >= 0); - list_del_init(&p->scx.dsq_node); + task_unlink_from_dsq(p, dsq); dsq_mod_nr(dsq, -1); p->scx.holding_cpu = raw_smp_processor_id(); raw_spin_unlock(&dsq->lock); @@ -2024,7 +2105,7 @@ retry: raw_spin_lock(&dsq->lock); - list_for_each_entry(p, &dsq->list, scx.dsq_node) { + list_for_each_entry(p, &dsq->list, scx.dsq_node.list) { struct rq *task_rq = task_rq(p); if (rq == task_rq) { @@ -2543,7 +2624,7 @@ static void put_prev_task_scx(struct rq *rq, struct task_struct *p) static struct task_struct *first_local_task(struct rq *rq) { return list_first_entry_or_null(&rq->scx.local_dsq.list, - struct task_struct, scx.dsq_node); + struct task_struct, scx.dsq_node.list); } static struct task_struct *pick_next_task_scx(struct rq *rq) @@ -3225,7 +3306,8 @@ void init_scx_entity(struct sched_ext_entity *scx) */ memset(scx, 0, offsetof(struct sched_ext_entity, tasks_node)); - INIT_LIST_HEAD(&scx->dsq_node); + INIT_LIST_HEAD(&scx->dsq_node.list); + RB_CLEAR_NODE(&scx->dsq_node.priq); scx->sticky_cpu = -1; scx->holding_cpu = -1; INIT_LIST_HEAD(&scx->runnable_node); @@ -4070,12 +4152,13 @@ static void scx_dump_task(struct seq_buf *s, struct scx_dump_ctx *dctx, dump_line(s, " %c%c %s[%d] %+ldms", marker, task_state_to_char(p), p->comm, p->pid, jiffies_delta_msecs(p->scx.runnable_at, dctx->at_jiffies)); - dump_line(s, " scx_state/flags=%u/0x%x ops_state/qseq=%lu/%lu", + dump_line(s, " scx_state/flags=%u/0x%x dsq_flags=0x%x ops_state/qseq=%lu/%lu", scx_get_task_state(p), p->scx.flags & ~SCX_TASK_STATE_MASK, - ops_state & SCX_OPSS_STATE_MASK, + p->scx.dsq_node.flags, ops_state & SCX_OPSS_STATE_MASK, ops_state >> SCX_OPSS_QSEQ_SHIFT); - dump_line(s, " sticky/holding_cpu=%d/%d dsq_id=%s", - p->scx.sticky_cpu, p->scx.holding_cpu, dsq_id_buf); + dump_line(s, " sticky/holding_cpu=%d/%d dsq_id=%s dsq_vtime=%llu", + p->scx.sticky_cpu, p->scx.holding_cpu, dsq_id_buf, + p->scx.dsq_vtime); dump_line(s, " cpus=%*pb", cpumask_pr_args(p->cpus_ptr)); if (SCX_HAS_OP(dump_task)) { @@ -4663,6 +4746,9 @@ static int bpf_scx_btf_struct_access(struct bpf_verifier_log *log, if (off >= offsetof(struct task_struct, scx.slice) && off + size <= offsetofend(struct task_struct, scx.slice)) return SCALAR_VALUE; + if (off >= offsetof(struct task_struct, scx.dsq_vtime) && + off + size <= offsetofend(struct task_struct, scx.dsq_vtime)) + return SCALAR_VALUE; if (off >= offsetof(struct task_struct, scx.disallow) && off + size <= offsetofend(struct task_struct, scx.disallow)) return SCALAR_VALUE; @@ -5298,10 +5384,44 @@ __bpf_kfunc void scx_bpf_dispatch(struct task_struct *p, u64 dsq_id, u64 slice, scx_dispatch_commit(p, dsq_id, enq_flags); } +/** + * scx_bpf_dispatch_vtime - Dispatch a task into the vtime priority queue of a DSQ + * @p: task_struct to dispatch + * @dsq_id: DSQ to dispatch to + * @slice: duration @p can run for in nsecs + * @vtime: @p's ordering inside the vtime-sorted queue of the target DSQ + * @enq_flags: SCX_ENQ_* + * + * Dispatch @p into the vtime priority queue of the DSQ identified by @dsq_id. + * Tasks queued into the priority queue are ordered by @vtime and always + * consumed after the tasks in the FIFO queue. All other aspects are identical + * to scx_bpf_dispatch(). + * + * @vtime ordering is according to time_before64() which considers wrapping. A + * numerically larger vtime may indicate an earlier position in the ordering and + * vice-versa. + */ +__bpf_kfunc void scx_bpf_dispatch_vtime(struct task_struct *p, u64 dsq_id, + u64 slice, u64 vtime, u64 enq_flags) +{ + if (!scx_dispatch_preamble(p, enq_flags)) + return; + + if (slice) + p->scx.slice = slice; + else + p->scx.slice = p->scx.slice ?: 1; + + p->scx.dsq_vtime = vtime; + + scx_dispatch_commit(p, dsq_id, enq_flags | SCX_ENQ_DSQ_PRIQ); +} + __bpf_kfunc_end_defs(); BTF_KFUNCS_START(scx_kfunc_ids_enqueue_dispatch) BTF_ID_FLAGS(func, scx_bpf_dispatch, KF_RCU) +BTF_ID_FLAGS(func, scx_bpf_dispatch_vtime, KF_RCU) BTF_KFUNCS_END(scx_kfunc_ids_enqueue_dispatch) static const struct btf_kfunc_id_set scx_kfunc_set_enqueue_dispatch = { |
