From 20bdedafd2f63e0ba70991127f9b5c0826ebdb32 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Fri, 30 Jun 2023 21:28:53 +0900 Subject: workqueue: Warn attempt to flush system-wide workqueues. Based on commit c4f135d643823a86 ("workqueue: Wrap flush_workqueue() using a macro"), all in-tree users stopped flushing system-wide workqueues. Therefore, start emitting runtime message so that all out-of-tree users will understand that they need to update their code. Signed-off-by: Tetsuo Handa Signed-off-by: Tejun Heo --- kernel/workqueue.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'kernel/workqueue.c') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 02a8f402eeb5..f8891552fdd6 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -6571,10 +6571,9 @@ void __init workqueue_init(void) wq_watchdog_init(); } -/* - * Despite the naming, this is a no-op function which is here only for avoiding - * link error. Since compile-time warning may fail to catch, we will need to - * emit run-time warning from __flush_workqueue(). - */ -void __warn_flushing_systemwide_wq(void) { } +void __warn_flushing_systemwide_wq(void) +{ + pr_warn("WARNING: Flushing system-wide workqueues will be prohibited in near future.\n"); + dump_stack(); +} EXPORT_SYMBOL(__warn_flushing_systemwide_wq); -- cgit v1.2.3 From ace3c5499e61ef7c0433a7a297227a9bdde54a55 Mon Sep 17 00:00:00 2001 From: tiozhang Date: Thu, 29 Jun 2023 11:50:50 +0800 Subject: workqueue: add cmdline parameter `workqueue.unbound_cpus` to further constrain wq_unbound_cpumask at boot time Motivation of doing this is to better improve boot times for devices when we want to prevent our workqueue works from running on some specific CPUs, e,g, some CPUs are busy with interrupts. Signed-off-by: tiozhang Signed-off-by: Tejun Heo --- kernel/workqueue.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'kernel/workqueue.c') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index f8891552fdd6..83f8993af57c 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -368,6 +368,9 @@ static bool workqueue_freezing; /* PL: have wqs started freezing? */ /* PL&A: allowable cpus for unbound wqs and work items */ static cpumask_var_t wq_unbound_cpumask; +/* for further constrain wq_unbound_cpumask by cmdline parameter*/ +static struct cpumask wq_cmdline_cpumask __initdata; + /* CPU where unbound work was last round robin scheduled from this CPU */ static DEFINE_PER_CPU(int, wq_rr_cpu_last); @@ -6455,6 +6458,9 @@ void __init workqueue_init_early(void) cpumask_copy(wq_unbound_cpumask, housekeeping_cpumask(HK_TYPE_WQ)); cpumask_and(wq_unbound_cpumask, wq_unbound_cpumask, housekeeping_cpumask(HK_TYPE_DOMAIN)); + if (!cpumask_empty(&wq_cmdline_cpumask)) + cpumask_and(wq_unbound_cpumask, wq_unbound_cpumask, &wq_cmdline_cpumask); + pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC); /* initialize CPU pools */ @@ -6577,3 +6583,14 @@ void __warn_flushing_systemwide_wq(void) dump_stack(); } EXPORT_SYMBOL(__warn_flushing_systemwide_wq); + +static int __init workqueue_unbound_cpus_setup(char *str) +{ + if (cpulist_parse(str, &wq_cmdline_cpumask) < 0) { + cpumask_clear(&wq_cmdline_cpumask); + pr_warn("workqueue.unbound_cpus: incorrect CPU range, using default\n"); + } + + return 1; +} +__setup("workqueue.unbound_cpus=", workqueue_unbound_cpus_setup); -- cgit v1.2.3 From 9680540c0c56a1f75a2d6aab31bf38aa429aa9d9 Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Fri, 4 Aug 2023 11:22:15 +0800 Subject: workqueue: use LIST_HEAD to initialize cull_list Use LIST_HEAD() to initialize cull_list instead of open-coding it. Signed-off-by: Yang Yingliang Reviewed-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'kernel/workqueue.c') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 83f8993af57c..a90f1e642548 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2304,9 +2304,8 @@ static void idle_worker_timeout(struct timer_list *t) static void idle_cull_fn(struct work_struct *work) { struct worker_pool *pool = container_of(work, struct worker_pool, idle_cull_work); - struct list_head cull_list; + LIST_HEAD(cull_list); - INIT_LIST_HEAD(&cull_list); /* * Grabbing wq_pool_attach_mutex here ensures an already-running worker * cannot proceed beyong worker_detach_from_pool() in its self-destruct @@ -3872,10 +3871,8 @@ static void rcu_free_pool(struct rcu_head *rcu) static void put_unbound_pool(struct worker_pool *pool) { DECLARE_COMPLETION_ONSTACK(detach_completion); - struct list_head cull_list; struct worker *worker; - - INIT_LIST_HEAD(&cull_list); + LIST_HEAD(cull_list); lockdep_assert_held(&wq_pool_mutex); -- cgit v1.2.3 From bc8b50c2dfac946c1beed782c1823e52cf55a352 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 7 Aug 2023 15:57:22 -1000 Subject: workqueue: Drop the special locking rule for worker->flags and worker_pool->flags worker->flags used to be accessed from scheduler hooks without grabbing pool->lock for concurrency management. This is no longer true since 6d25be5782e4 ("sched/core, workqueues: Distangle worker accounting from rq lock"). Also, it's unclear why worker_pool->flags was using the "X" rule. All relevant users are accessing it under the pool lock. Let's drop the special "X" rule and use the "L" rule for these flag fields instead. While at it, replace the CONTEXT comment with lockdep_assert_held(). This allows worker_set/clr_flags() to be used from context which isn't the worker itself. This will be used later to implement assinging work items to workers before waking them up so that workqueue can have better control over which worker executes which work item on which CPU. The only actual changes are sanity checks. There shouldn't be any visible behavior changes. Signed-off-by: Tejun Heo --- kernel/workqueue.c | 17 +++-------------- 1 file changed, 3 insertions(+), 14 deletions(-) (limited to 'kernel/workqueue.c') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index ae975a7c9f69..598009ad97d1 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -122,11 +122,6 @@ enum { * * L: pool->lock protected. Access with pool->lock held. * - * X: During normal operation, modification requires pool->lock and should - * be done only from local cpu. Either disabling preemption on local - * cpu or grabbing pool->lock is enough for read access. If - * POOL_DISASSOCIATED is set, it's identical to L. - * * K: Only modified by worker while holding pool->lock. Can be safely read by * self, while holding pool->lock or from IRQ context if %current is the * kworker. @@ -160,7 +155,7 @@ struct worker_pool { int cpu; /* I: the associated cpu */ int node; /* I: the associated node ID */ int id; /* I: pool ID */ - unsigned int flags; /* X: flags */ + unsigned int flags; /* L: flags */ unsigned long watchdog_ts; /* L: watchdog timestamp */ bool cpu_stall; /* WD: stalled cpu bound pool */ @@ -910,15 +905,12 @@ static void wake_up_worker(struct worker_pool *pool) * @flags: flags to set * * Set @flags in @worker->flags and adjust nr_running accordingly. - * - * CONTEXT: - * raw_spin_lock_irq(pool->lock) */ static inline void worker_set_flags(struct worker *worker, unsigned int flags) { struct worker_pool *pool = worker->pool; - WARN_ON_ONCE(worker->task != current); + lockdep_assert_held(&pool->lock); /* If transitioning into NOT_RUNNING, adjust nr_running. */ if ((flags & WORKER_NOT_RUNNING) && @@ -935,16 +927,13 @@ static inline void worker_set_flags(struct worker *worker, unsigned int flags) * @flags: flags to clear * * Clear @flags in @worker->flags and adjust nr_running accordingly. - * - * CONTEXT: - * raw_spin_lock_irq(pool->lock) */ static inline void worker_clr_flags(struct worker *worker, unsigned int flags) { struct worker_pool *pool = worker->pool; unsigned int oflags = worker->flags; - WARN_ON_ONCE(worker->task != current); + lockdep_assert_held(&pool->lock); worker->flags &= ~flags; -- cgit v1.2.3 From c0ab017d43f4c4147f7ecf3ca3cb872a416e17c7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 7 Aug 2023 15:57:22 -1000 Subject: workqueue: Cleanups around process_scheduled_works() * Drop the trivial optimization in worker_thread() where it bypasses calling process_scheduled_works() if the first work item isn't linked. This is a mostly pointless micro optimization and gets in the way of improving the work processing path. * Consolidate pool->watchdog_ts updates in the two callers into process_scheduled_works(). Signed-off-by: Tejun Heo --- kernel/workqueue.c | 29 +++++++++++------------------ 1 file changed, 11 insertions(+), 18 deletions(-) (limited to 'kernel/workqueue.c') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 598009ad97d1..20722de9937a 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2652,9 +2652,15 @@ __acquires(&pool->lock) */ static void process_scheduled_works(struct worker *worker) { - while (!list_empty(&worker->scheduled)) { - struct work_struct *work = list_first_entry(&worker->scheduled, - struct work_struct, entry); + struct work_struct *work; + bool first = true; + + while ((work = list_first_entry_or_null(&worker->scheduled, + struct work_struct, entry))) { + if (first) { + worker->pool->watchdog_ts = jiffies; + first = false; + } process_one_work(worker, work); } } @@ -2735,17 +2741,8 @@ recheck: list_first_entry(&pool->worklist, struct work_struct, entry); - pool->watchdog_ts = jiffies; - - if (likely(!(*work_data_bits(work) & WORK_STRUCT_LINKED))) { - /* optimization path, not strictly necessary */ - process_one_work(worker, work); - if (unlikely(!list_empty(&worker->scheduled))) - process_scheduled_works(worker); - } else { - move_linked_works(work, &worker->scheduled, NULL); - process_scheduled_works(worker); - } + move_linked_works(work, &worker->scheduled, NULL); + process_scheduled_works(worker); } while (keep_working(pool)); worker_set_flags(worker, WORKER_PREP); @@ -2820,7 +2817,6 @@ repeat: struct pool_workqueue, mayday_node); struct worker_pool *pool = pwq->pool; struct work_struct *work, *n; - bool first = true; __set_current_state(TASK_RUNNING); list_del_init(&pwq->mayday_node); @@ -2838,12 +2834,9 @@ repeat: WARN_ON_ONCE(!list_empty(scheduled)); list_for_each_entry_safe(work, n, &pool->worklist, entry) { if (get_work_pwq(work) == pwq) { - if (first) - pool->watchdog_ts = jiffies; move_linked_works(work, scheduled, &n); pwq->stats[PWQ_STAT_RESCUED]++; } - first = false; } if (!list_empty(scheduled)) { -- cgit v1.2.3 From fe089f87cccb066e8ad20f49ddf05e95adc1fa8d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 7 Aug 2023 15:57:22 -1000 Subject: workqueue: Not all work insertion needs to wake up a worker insert_work() always tried to wake up a worker; however, the only time it needs to try to wake up a worker is when a new active work item is queued. When a work item goes on the inactive list or queueing a flush work item, there's no reason to try to wake up a worker. This patch moves the worker wakeup logic out of insert_work() and places it in the active new work item queueing path in __queue_work(). While at it: * __queue_work() is dereferencing pwq->pool repeatedly. Add local variable pool. * Every caller of insert_work() calls debug_work_activate(). Consolidate the invocations into insert_work(). * In __queue_work() pool->watchdog_ts update is relocated slightly. This is to better accommodate future changes. This makes wakeups more precise and will help the planned change to assign work items to workers before waking them up. No behavior changes intended. v2: WARN_ON_ONCE(pool != last_pool) added in __queue_work() to clarify as suggested by Lai. Signed-off-by: Tejun Heo Cc: Lai Jiangshan --- kernel/workqueue.c | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) (limited to 'kernel/workqueue.c') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 20722de9937a..1332bd545b92 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1542,7 +1542,7 @@ fail: static void insert_work(struct pool_workqueue *pwq, struct work_struct *work, struct list_head *head, unsigned int extra_flags) { - struct worker_pool *pool = pwq->pool; + debug_work_activate(work); /* record the work call stack in order to print it in KASAN reports */ kasan_record_aux_stack_noalloc(work); @@ -1551,9 +1551,6 @@ static void insert_work(struct pool_workqueue *pwq, struct work_struct *work, set_work_pwq(work, pwq, extra_flags); list_add_tail(&work->entry, head); get_pwq(pwq); - - if (__need_more_worker(pool)) - wake_up_worker(pool); } /* @@ -1607,8 +1604,7 @@ static void __queue_work(int cpu, struct workqueue_struct *wq, struct work_struct *work) { struct pool_workqueue *pwq; - struct worker_pool *last_pool; - struct list_head *worklist; + struct worker_pool *last_pool, *pool; unsigned int work_flags; unsigned int req_cpu = cpu; @@ -1642,13 +1638,15 @@ retry: pwq = per_cpu_ptr(wq->cpu_pwqs, cpu); } + pool = pwq->pool; + /* * If @work was previously on a different pool, it might still be * running there, in which case the work needs to be queued on that * pool to guarantee non-reentrancy. */ last_pool = get_work_pool(work); - if (last_pool && last_pool != pwq->pool) { + if (last_pool && last_pool != pool) { struct worker *worker; raw_spin_lock(&last_pool->lock); @@ -1657,13 +1655,15 @@ retry: if (worker && worker->current_pwq->wq == wq) { pwq = worker->current_pwq; + pool = pwq->pool; + WARN_ON_ONCE(pool != last_pool); } else { /* meh... not running there, queue here */ raw_spin_unlock(&last_pool->lock); - raw_spin_lock(&pwq->pool->lock); + raw_spin_lock(&pool->lock); } } else { - raw_spin_lock(&pwq->pool->lock); + raw_spin_lock(&pool->lock); } /* @@ -1676,7 +1676,7 @@ retry: */ if (unlikely(!pwq->refcnt)) { if (wq->flags & WQ_UNBOUND) { - raw_spin_unlock(&pwq->pool->lock); + raw_spin_unlock(&pool->lock); cpu_relax(); goto retry; } @@ -1695,21 +1695,22 @@ retry: work_flags = work_color_to_flags(pwq->work_color); if (likely(pwq->nr_active < pwq->max_active)) { + if (list_empty(&pool->worklist)) + pool->watchdog_ts = jiffies; + trace_workqueue_activate_work(work); pwq->nr_active++; - worklist = &pwq->pool->worklist; - if (list_empty(worklist)) - pwq->pool->watchdog_ts = jiffies; + insert_work(pwq, work, &pool->worklist, work_flags); + + if (__need_more_worker(pool)) + wake_up_worker(pool); } else { work_flags |= WORK_STRUCT_INACTIVE; - worklist = &pwq->inactive_works; + insert_work(pwq, work, &pwq->inactive_works, work_flags); } - debug_work_activate(work); - insert_work(pwq, work, worklist, work_flags); - out: - raw_spin_unlock(&pwq->pool->lock); + raw_spin_unlock(&pool->lock); rcu_read_unlock(); } @@ -3012,7 +3013,6 @@ static void insert_wq_barrier(struct pool_workqueue *pwq, pwq->nr_in_flight[work_color]++; work_flags |= work_color_to_flags(work_color); - debug_work_activate(&barr->work); insert_work(pwq, &barr->work, head, work_flags); } -- cgit v1.2.3 From ee1ceef72754427e5167743108c52f826fa4ca5b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 7 Aug 2023 15:57:23 -1000 Subject: workqueue: Rename wq->cpu_pwqs to wq->cpu_pwq wq->cpu_pwqs is a percpu variable carraying one pointer to a pool_workqueue. The field name being plural is unusual and confusing. Rename it to singular. This patch doesn't cause any functional changes. Signed-off-by: Tejun Heo --- kernel/workqueue.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel/workqueue.c') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 1332bd545b92..ea94ad63386e 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -321,7 +321,7 @@ struct workqueue_struct { /* hot fields used during command issue, aligned to cacheline */ unsigned int flags ____cacheline_aligned; /* WQ: WQ_* flags */ - struct pool_workqueue __percpu *cpu_pwqs; /* I: per-cpu pwqs */ + struct pool_workqueue __percpu *cpu_pwq; /* I: per-cpu pwqs */ struct pool_workqueue __rcu *numa_pwq_tbl[]; /* PWR: unbound pwqs indexed by node */ }; @@ -1635,7 +1635,7 @@ retry: } else { if (req_cpu == WORK_CPU_UNBOUND) cpu = raw_smp_processor_id(); - pwq = per_cpu_ptr(wq->cpu_pwqs, cpu); + pwq = per_cpu_ptr(wq->cpu_pwq, cpu); } pool = pwq->pool; @@ -3826,7 +3826,7 @@ static void rcu_free_wq(struct rcu_head *rcu) wq_free_lockdep(wq); if (!(wq->flags & WQ_UNBOUND)) - free_percpu(wq->cpu_pwqs); + free_percpu(wq->cpu_pwq); else free_workqueue_attrs(wq->unbound_attrs); @@ -4518,13 +4518,13 @@ static int alloc_and_link_pwqs(struct workqueue_struct *wq) int cpu, ret; if (!(wq->flags & WQ_UNBOUND)) { - wq->cpu_pwqs = alloc_percpu(struct pool_workqueue); - if (!wq->cpu_pwqs) + wq->cpu_pwq = alloc_percpu(struct pool_workqueue); + if (!wq->cpu_pwq) return -ENOMEM; for_each_possible_cpu(cpu) { struct pool_workqueue *pwq = - per_cpu_ptr(wq->cpu_pwqs, cpu); + per_cpu_ptr(wq->cpu_pwq, cpu); struct worker_pool *cpu_pools = per_cpu(cpu_worker_pools, cpu); @@ -4905,7 +4905,7 @@ bool workqueue_congested(int cpu, struct workqueue_struct *wq) cpu = smp_processor_id(); if (!(wq->flags & WQ_UNBOUND)) - pwq = per_cpu_ptr(wq->cpu_pwqs, cpu); + pwq = per_cpu_ptr(wq->cpu_pwq, cpu); else pwq = unbound_pwq_by_node(wq, cpu_to_node(cpu)); -- cgit v1.2.3 From 797e8345cbb0d2913300ee9838eb74cce19485cf Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 7 Aug 2023 15:57:23 -1000 Subject: workqueue: Relocate worker and work management functions Collect first_idle_worker(), worker_enter/leave_idle(), find_worker_executing_work(), move_linked_works() and wake_up_worker() into one place. These functions will later be used to implement higher level worker management logic. No functional changes. Signed-off-by: Tejun Heo --- kernel/workqueue.c | 340 ++++++++++++++++++++++++++--------------------------- 1 file changed, 168 insertions(+), 172 deletions(-) (limited to 'kernel/workqueue.c') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index ea94ad63386e..8f5338e7331c 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -869,36 +869,6 @@ static bool too_many_workers(struct worker_pool *pool) return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy; } -/* - * Wake up functions. - */ - -/* Return the first idle worker. Called with pool->lock held. */ -static struct worker *first_idle_worker(struct worker_pool *pool) -{ - if (unlikely(list_empty(&pool->idle_list))) - return NULL; - - return list_first_entry(&pool->idle_list, struct worker, entry); -} - -/** - * wake_up_worker - wake up an idle worker - * @pool: worker pool to wake worker from - * - * Wake up the first idle worker of @pool. - * - * CONTEXT: - * raw_spin_lock_irq(pool->lock). - */ -static void wake_up_worker(struct worker_pool *pool) -{ - struct worker *worker = first_idle_worker(pool); - - if (likely(worker)) - wake_up_process(worker->task); -} - /** * worker_set_flags - set worker flags and adjust nr_running accordingly * @worker: self @@ -947,6 +917,174 @@ static inline void worker_clr_flags(struct worker *worker, unsigned int flags) pool->nr_running++; } +/* Return the first idle worker. Called with pool->lock held. */ +static struct worker *first_idle_worker(struct worker_pool *pool) +{ + if (unlikely(list_empty(&pool->idle_list))) + return NULL; + + return list_first_entry(&pool->idle_list, struct worker, entry); +} + +/** + * worker_enter_idle - enter idle state + * @worker: worker which is entering idle state + * + * @worker is entering idle state. Update stats and idle timer if + * necessary. + * + * LOCKING: + * raw_spin_lock_irq(pool->lock). + */ +static void worker_enter_idle(struct worker *worker) +{ + struct worker_pool *pool = worker->pool; + + if (WARN_ON_ONCE(worker->flags & WORKER_IDLE) || + WARN_ON_ONCE(!list_empty(&worker->entry) && + (worker->hentry.next || worker->hentry.pprev))) + return; + + /* can't use worker_set_flags(), also called from create_worker() */ + worker->flags |= WORKER_IDLE; + pool->nr_idle++; + worker->last_active = jiffies; + + /* idle_list is LIFO */ + list_add(&worker->entry, &pool->idle_list); + + if (too_many_workers(pool) && !timer_pending(&pool->idle_timer)) + mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT); + + /* Sanity check nr_running. */ + WARN_ON_ONCE(pool->nr_workers == pool->nr_idle && pool->nr_running); +} + +/** + * worker_leave_idle - leave idle state + * @worker: worker which is leaving idle state + * + * @worker is leaving idle state. Update stats. + * + * LOCKING: + * raw_spin_lock_irq(pool->lock). + */ +static void worker_leave_idle(struct worker *worker) +{ + struct worker_pool *pool = worker->pool; + + if (WARN_ON_ONCE(!(worker->flags & WORKER_IDLE))) + return; + worker_clr_flags(worker, WORKER_IDLE); + pool->nr_idle--; + list_del_init(&worker->entry); +} + +/** + * find_worker_executing_work - find worker which is executing a work + * @pool: pool of interest + * @work: work to find worker for + * + * Find a worker which is executing @work on @pool by searching + * @pool->busy_hash which is keyed by the address of @work. For a worker + * to match, its current execution should match the address of @work and + * its work function. This is to avoid unwanted dependency between + * unrelated work executions through a work item being recycled while still + * being executed. + * + * This is a bit tricky. A work item may be freed once its execution + * starts and nothing prevents the freed area from being recycled for + * another work item. If the same work item address ends up being reused + * before the original execution finishes, workqueue will identify the + * recycled work item as currently executing and make it wait until the + * current execution finishes, introducing an unwanted dependency. + * + * This function checks the work item address and work function to avoid + * false positives. Note that this isn't complete as one may construct a + * work function which can introduce dependency onto itself through a + * recycled work item. Well, if somebody wants to shoot oneself in the + * foot that badly, there's only so much we can do, and if such deadlock + * actually occurs, it should be easy to locate the culprit work function. + * + * CONTEXT: + * raw_spin_lock_irq(pool->lock). + * + * Return: + * Pointer to worker which is executing @work if found, %NULL + * otherwise. + */ +static struct worker *find_worker_executing_work(struct worker_pool *pool, + struct work_struct *work) +{ + struct worker *worker; + + hash_for_each_possible(pool->busy_hash, worker, hentry, + (unsigned long)work) + if (worker->current_work == work && + worker->current_func == work->func) + return worker; + + return NULL; +} + +/** + * move_linked_works - move linked works to a list + * @work: start of series of works to be scheduled + * @head: target list to append @work to + * @nextp: out parameter for nested worklist walking + * + * Schedule linked works starting from @work to @head. Work series to + * be scheduled starts at @work and includes any consecutive work with + * WORK_STRUCT_LINKED set in its predecessor. + * + * If @nextp is not NULL, it's updated to point to the next work of + * the last scheduled work. This allows move_linked_works() to be + * nested inside outer list_for_each_entry_safe(). + * + * CONTEXT: + * raw_spin_lock_irq(pool->lock). + */ +static void move_linked_works(struct work_struct *work, struct list_head *head, + struct work_struct **nextp) +{ + struct work_struct *n; + + /* + * Linked worklist will always end before the end of the list, + * use NULL for list head. + */ + list_for_each_entry_safe_from(work, n, NULL, entry) { + list_move_tail(&work->entry, head); + if (!(*work_data_bits(work) & WORK_STRUCT_LINKED)) + break; + } + + /* + * If we're already inside safe list traversal and have moved + * multiple works to the scheduled queue, the next position + * needs to be updated. + */ + if (nextp) + *nextp = n; +} + +/** + * wake_up_worker - wake up an idle worker + * @pool: worker pool to wake worker from + * + * Wake up the first idle worker of @pool. + * + * CONTEXT: + * raw_spin_lock_irq(pool->lock). + */ +static void wake_up_worker(struct worker_pool *pool) +{ + struct worker *worker = first_idle_worker(pool); + + if (likely(worker)) + wake_up_process(worker->task); +} + #ifdef CONFIG_WQ_CPU_INTENSIVE_REPORT /* @@ -1202,94 +1340,6 @@ work_func_t wq_worker_last_func(struct task_struct *task) return worker->last_func; } -/** - * find_worker_executing_work - find worker which is executing a work - * @pool: pool of interest - * @work: work to find worker for - * - * Find a worker which is executing @work on @pool by searching - * @pool->busy_hash which is keyed by the address of @work. For a worker - * to match, its current execution should match the address of @work and - * its work function. This is to avoid unwanted dependency between - * unrelated work executions through a work item being recycled while still - * being executed. - * - * This is a bit tricky. A work item may be freed once its execution - * starts and nothing prevents the freed area from being recycled for - * another work item. If the same work item address ends up being reused - * before the original execution finishes, workqueue will identify the - * recycled work item as currently executing and make it wait until the - * current execution finishes, introducing an unwanted dependency. - * - * This function checks the work item address and work function to avoid - * false positives. Note that this isn't complete as one may construct a - * work function which can introduce dependency onto itself through a - * recycled work item. Well, if somebody wants to shoot oneself in the - * foot that badly, there's only so much we can do, and if such deadlock - * actually occurs, it should be easy to locate the culprit work function. - * - * CONTEXT: - * raw_spin_lock_irq(pool->lock). - * - * Return: - * Pointer to worker which is executing @work if found, %NULL - * otherwise. - */ -static struct worker *find_worker_executing_work(struct worker_pool *pool, - struct work_struct *work) -{ - struct worker *worker; - - hash_for_each_possible(pool->busy_hash, worker, hentry, - (unsigned long)work) - if (worker->current_work == work && - worker->current_func == work->func) - return worker; - - return NULL; -} - -/** - * move_linked_works - move linked works to a list - * @work: start of series of works to be scheduled - * @head: target list to append @work to - * @nextp: out parameter for nested worklist walking - * - * Schedule linked works starting from @work to @head. Work series to - * be scheduled starts at @work and includes any consecutive work with - * WORK_STRUCT_LINKED set in its predecessor. - * - * If @nextp is not NULL, it's updated to point to the next work of - * the last scheduled work. This allows move_linked_works() to be - * nested inside outer list_for_each_entry_safe(). - * - * CONTEXT: - * raw_spin_lock_irq(pool->lock). - */ -static void move_linked_works(struct work_struct *work, struct list_head *head, - struct work_struct **nextp) -{ - struct work_struct *n; - - /* - * Linked worklist will always end before the end of the list, - * use NULL for list head. - */ - list_for_each_entry_safe_from(work, n, NULL, entry) { - list_move_tail(&work->entry, head); - if (!(*work_data_bits(work) & WORK_STRUCT_LINKED)) - break; - } - - /* - * If we're already inside safe list traversal and have moved - * multiple works to the scheduled queue, the next position - * needs to be updated. - */ - if (nextp) - *nextp = n; -} - /** * get_pwq - get an extra reference on the specified pool_workqueue * @pwq: pool_workqueue to get @@ -1974,60 +2024,6 @@ bool queue_rcu_work(struct workqueue_struct *wq, struct rcu_work *rwork) } EXPORT_SYMBOL(queue_rcu_work); -/** - * worker_enter_idle - enter idle state - * @worker: worker which is entering idle state - * - * @worker is entering idle state. Update stats and idle timer if - * necessary. - * - * LOCKING: - * raw_spin_lock_irq(pool->lock). - */ -static void worker_enter_idle(struct worker *worker) -{ - struct worker_pool *pool = worker->pool; - - if (WARN_ON_ONCE(worker->flags & WORKER_IDLE) || - WARN_ON_ONCE(!list_empty(&worker->entry) && - (worker->hentry.next || worker->hentry.pprev))) - return; - - /* can't use worker_set_flags(), also called from create_worker() */ - worker->flags |= WORKER_IDLE; - pool->nr_idle++; - worker->last_active = jiffies; - - /* idle_list is LIFO */ - list_add(&worker->entry, &pool->idle_list); - - if (too_many_workers(pool) && !timer_pending(&pool->idle_timer)) - mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT); - - /* Sanity check nr_running. */ - WARN_ON_ONCE(pool->nr_workers == pool->nr_idle && pool->nr_running); -} - -/** - * worker_leave_idle - leave idle state - * @worker: worker which is leaving idle state - * - * @worker is leaving idle state. Update stats. - * - * LOCKING: - * raw_spin_lock_irq(pool->lock). - */ -static void worker_leave_idle(struct worker *worker) -{ - struct worker_pool *pool = worker->pool; - - if (WARN_ON_ONCE(!(worker->flags & WORKER_IDLE))) - return; - worker_clr_flags(worker, WORKER_IDLE); - pool->nr_idle--; - list_del_init(&worker->entry); -} - static struct worker *alloc_worker(int node) { struct worker *worker; -- cgit v1.2.3 From fcecfa8f271acdf130acbb30842e7848a138af0f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 7 Aug 2023 15:57:23 -1000 Subject: workqueue: Remove module param disable_numa and sysfs knobs pool_ids and numa Unbound workqueue CPU affinity is going to receive an overhaul and the NUMA specific knobs won't make sense anymore. Remove them. Also, the pool_ids knob was used for debugging and not really meaningful given that there is no visibility into the pools associated with those IDs. Remove it too. A future patch will improve overall visibility. Signed-off-by: Tejun Heo --- kernel/workqueue.c | 73 ------------------------------------------------------ 1 file changed, 73 deletions(-) (limited to 'kernel/workqueue.c') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 8f5338e7331c..4347f493eca1 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -340,9 +340,6 @@ static cpumask_var_t *wq_numa_possible_cpumask; static unsigned long wq_cpu_intensive_thresh_us = ULONG_MAX; module_param_named(cpu_intensive_thresh_us, wq_cpu_intensive_thresh_us, ulong, 0644); -static bool wq_disable_numa; -module_param_named(disable_numa, wq_disable_numa, bool, 0444); - /* see the comment above the definition of WQ_POWER_EFFICIENT */ static bool wq_power_efficient = IS_ENABLED(CONFIG_WQ_POWER_EFFICIENT_DEFAULT); module_param_named(power_efficient, wq_power_efficient, bool, 0444); @@ -5794,10 +5791,8 @@ out_unlock: * * Unbound workqueues have the following extra attributes. * - * pool_ids RO int : the associated pool IDs for each node * nice RW int : nice value of the workers * cpumask RW mask : bitmask of allowed CPUs for the workers - * numa RW bool : whether enable NUMA affinity */ struct wq_device { struct workqueue_struct *wq; @@ -5850,28 +5845,6 @@ static struct attribute *wq_sysfs_attrs[] = { }; ATTRIBUTE_GROUPS(wq_sysfs); -static ssize_t wq_pool_ids_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct workqueue_struct *wq = dev_to_wq(dev); - const char *delim = ""; - int node, written = 0; - - cpus_read_lock(); - rcu_read_lock(); - for_each_node(node) { - written += scnprintf(buf + written, PAGE_SIZE - written, - "%s%d:%d", delim, node, - unbound_pwq_by_node(wq, node)->pool->id); - delim = " "; - } - written += scnprintf(buf + written, PAGE_SIZE - written, "\n"); - rcu_read_unlock(); - cpus_read_unlock(); - - return written; -} - static ssize_t wq_nice_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -5962,50 +5935,9 @@ out_unlock: return ret ?: count; } -static ssize_t wq_numa_show(struct device *dev, struct device_attribute *attr, - char *buf) -{ - struct workqueue_struct *wq = dev_to_wq(dev); - int written; - - mutex_lock(&wq->mutex); - written = scnprintf(buf, PAGE_SIZE, "%d\n", - !wq->unbound_attrs->no_numa); - mutex_unlock(&wq->mutex); - - return written; -} - -static ssize_t wq_numa_store(struct device *dev, struct device_attribute *attr, - const char *buf, size_t count) -{ - struct workqueue_struct *wq = dev_to_wq(dev); - struct workqueue_attrs *attrs; - int v, ret = -ENOMEM; - - apply_wqattrs_lock(); - - attrs = wq_sysfs_prep_attrs(wq); - if (!attrs) - goto out_unlock; - - ret = -EINVAL; - if (sscanf(buf, "%d", &v) == 1) { - attrs->no_numa = !v; - ret = apply_workqueue_attrs_locked(wq, attrs); - } - -out_unlock: - apply_wqattrs_unlock(); - free_workqueue_attrs(attrs); - return ret ?: count; -} - static struct device_attribute wq_sysfs_unbound_attrs[] = { - __ATTR(pool_ids, 0444, wq_pool_ids_show, NULL), __ATTR(nice, 0644, wq_nice_show, wq_nice_store), __ATTR(cpumask, 0644, wq_cpumask_show, wq_cpumask_store), - __ATTR(numa, 0644, wq_numa_show, wq_numa_store), __ATTR_NULL, }; @@ -6379,11 +6311,6 @@ static void __init wq_numa_init(void) if (num_possible_nodes() <= 1) return; - if (wq_disable_numa) { - pr_info("workqueue: NUMA affinity support disabled\n"); - return; - } - for_each_possible_cpu(cpu) { if (WARN_ON(cpu_to_node(cpu) == NUMA_NO_NODE)) { pr_warn("workqueue: NUMA node mapping not available for cpu%d, disabling NUMA support\n", cpu); -- cgit v1.2.3 From 967b494e2fd143a9c1a3201422aceadb5fa9fbfc Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 7 Aug 2023 15:57:23 -1000 Subject: workqueue: Use a kthread_worker to release pool_workqueues pool_workqueue release path is currently bounced to system_wq; however, this is a bit tricky because this bouncing occurs while holding a pool lock and thus has risk of causing a A-A deadlock. This is currently addressed by the fact that only unbound workqueues use this bouncing path and system_wq is a per-cpu workqueue. While this works, it's brittle and requires a work-around like setting the lockdep subclass for the lock of unbound pools. Besides, future changes will use the bouncing path for per-cpu workqueues too making the current approach unusable. Let's just use a dedicated kthread_worker to untangle the dependency. This is just one more kthread for all workqueues and makes the pwq release logic simpler and more robust. Signed-off-by: Tejun Heo --- kernel/workqueue.c | 40 +++++++++++++++++++++++----------------- 1 file changed, 23 insertions(+), 17 deletions(-) (limited to 'kernel/workqueue.c') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 4347f493eca1..01bf22c5d515 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -257,12 +257,12 @@ struct pool_workqueue { u64 stats[PWQ_NR_STATS]; /* - * Release of unbound pwq is punted to system_wq. See put_pwq() - * and pwq_unbound_release_workfn() for details. pool_workqueue - * itself is also RCU protected so that the first pwq can be - * determined without grabbing wq->mutex. + * Release of unbound pwq is punted to a kthread_worker. See put_pwq() + * and pwq_unbound_release_workfn() for details. pool_workqueue itself + * is also RCU protected so that the first pwq can be determined without + * grabbing wq->mutex. */ - struct work_struct unbound_release_work; + struct kthread_work unbound_release_work; struct rcu_head rcu; } __aligned(1 << WORK_STRUCT_FLAG_BITS); @@ -395,6 +395,13 @@ static struct workqueue_attrs *unbound_std_wq_attrs[NR_STD_WORKER_POOLS]; /* I: attributes used when instantiating ordered pools on demand */ static struct workqueue_attrs *ordered_wq_attrs[NR_STD_WORKER_POOLS]; +/* + * I: kthread_worker to release pwq's. pwq release needs to be bounced to a + * process context while holding a pool lock. Bounce to a dedicated kthread + * worker to avoid A-A deadlocks. + */ +static struct kthread_worker *pwq_release_worker; + struct workqueue_struct *system_wq __read_mostly; EXPORT_SYMBOL(system_wq); struct workqueue_struct *system_highpri_wq __read_mostly; @@ -1366,14 +1373,10 @@ static void put_pwq(struct pool_workqueue *pwq) if (WARN_ON_ONCE(!(pwq->wq->flags & WQ_UNBOUND))) return; /* - * @pwq can't be released under pool->lock, bounce to - * pwq_unbound_release_workfn(). This never recurses on the same - * pool->lock as this path is taken only for unbound workqueues and - * the release work item is scheduled on a per-cpu workqueue. To - * avoid lockdep warning, unbound pool->locks are given lockdep - * subclass of 1 in get_unbound_pool(). + * @pwq can't be released under pool->lock, bounce to a dedicated + * kthread_worker to avoid A-A deadlocks. */ - schedule_work(&pwq->unbound_release_work); + kthread_queue_work(pwq_release_worker, &pwq->unbound_release_work); } /** @@ -3965,7 +3968,6 @@ static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs) if (!pool || init_worker_pool(pool) < 0) goto fail; - lockdep_set_subclass(&pool->lock, 1); /* see put_pwq() */ copy_workqueue_attrs(pool->attrs, attrs); pool->node = target_node; @@ -3999,10 +4001,10 @@ static void rcu_free_pwq(struct rcu_head *rcu) } /* - * Scheduled on system_wq by put_pwq() when an unbound pwq hits zero refcnt - * and needs to be destroyed. + * Scheduled on pwq_release_worker by put_pwq() when an unbound pwq hits zero + * refcnt and needs to be destroyed. */ -static void pwq_unbound_release_workfn(struct work_struct *work) +static void pwq_unbound_release_workfn(struct kthread_work *work) { struct pool_workqueue *pwq = container_of(work, struct pool_workqueue, unbound_release_work); @@ -4110,7 +4112,8 @@ static void init_pwq(struct pool_workqueue *pwq, struct workqueue_struct *wq, INIT_LIST_HEAD(&pwq->inactive_works); INIT_LIST_HEAD(&pwq->pwqs_node); INIT_LIST_HEAD(&pwq->mayday_node); - INIT_WORK(&pwq->unbound_release_work, pwq_unbound_release_workfn); + kthread_init_work(&pwq->unbound_release_work, + pwq_unbound_release_workfn); } /* sync @pwq with the current state of its associated wq and link it */ @@ -6433,6 +6436,9 @@ static void __init wq_cpu_intensive_thresh_init(void) if (wq_cpu_intensive_thresh_us != ULONG_MAX) return; + pwq_release_worker = kthread_create_worker(0, "pool_workqueue_release"); + BUG_ON(IS_ERR(pwq_release_worker)); + /* * The default of 10ms is derived from the fact that most modern (as of * 2023) processors can do a lot in 10ms and that it's just below what -- cgit v1.2.3 From 687a9aa56f811b381e63f7f8f9149428ac708a3b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 7 Aug 2023 15:57:23 -1000 Subject: workqueue: Make per-cpu pool_workqueues allocated and released like unbound ones Currently, all per-cpu pwq's (pool_workqueue's) are allocated directly through a per-cpu allocation and thus, unlike unbound workqueues, not reference counted. This difference in lifetime management between the two types is a bit confusing. Unbound workqueues are currently accessed through wq->numa_pwq_tbl[] which isn't suitiable for the planned CPU locality related improvements. The plan is to unify pwq handling across per-cpu and unbound workqueues so that they're always accessed through wq->cpu_pwq. In preparation, this patch makes per-cpu pwq's to be allocated, reference counted and released the same way as unbound pwq's. wq->cpu_pwq now holds pointers to pwq's instead of containing them directly. pwq_unbound_release_workfn() is renamed to pwq_release_workfn() as it's now also used for per-cpu work items. Signed-off-by: Tejun Heo --- kernel/workqueue.c | 74 +++++++++++++++++++++++++++++------------------------- 1 file changed, 40 insertions(+), 34 deletions(-) (limited to 'kernel/workqueue.c') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 01bf22c5d515..05bf5427124a 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -258,11 +258,11 @@ struct pool_workqueue { /* * Release of unbound pwq is punted to a kthread_worker. See put_pwq() - * and pwq_unbound_release_workfn() for details. pool_workqueue itself - * is also RCU protected so that the first pwq can be determined without + * and pwq_release_workfn() for details. pool_workqueue itself is also + * RCU protected so that the first pwq can be determined without * grabbing wq->mutex. */ - struct kthread_work unbound_release_work; + struct kthread_work release_work; struct rcu_head rcu; } __aligned(1 << WORK_STRUCT_FLAG_BITS); @@ -321,7 +321,7 @@ struct workqueue_struct { /* hot fields used during command issue, aligned to cacheline */ unsigned int flags ____cacheline_aligned; /* WQ: WQ_* flags */ - struct pool_workqueue __percpu *cpu_pwq; /* I: per-cpu pwqs */ + struct pool_workqueue __percpu **cpu_pwq; /* I: per-cpu pwqs */ struct pool_workqueue __rcu *numa_pwq_tbl[]; /* PWR: unbound pwqs indexed by node */ }; @@ -1370,13 +1370,11 @@ static void put_pwq(struct pool_workqueue *pwq) lockdep_assert_held(&pwq->pool->lock); if (likely(--pwq->refcnt)) return; - if (WARN_ON_ONCE(!(pwq->wq->flags & WQ_UNBOUND))) - return; /* * @pwq can't be released under pool->lock, bounce to a dedicated * kthread_worker to avoid A-A deadlocks. */ - kthread_queue_work(pwq_release_worker, &pwq->unbound_release_work); + kthread_queue_work(pwq_release_worker, &pwq->release_work); } /** @@ -1685,7 +1683,7 @@ retry: } else { if (req_cpu == WORK_CPU_UNBOUND) cpu = raw_smp_processor_id(); - pwq = per_cpu_ptr(wq->cpu_pwq, cpu); + pwq = *per_cpu_ptr(wq->cpu_pwq, cpu); } pool = pwq->pool; @@ -4004,31 +4002,30 @@ static void rcu_free_pwq(struct rcu_head *rcu) * Scheduled on pwq_release_worker by put_pwq() when an unbound pwq hits zero * refcnt and needs to be destroyed. */ -static void pwq_unbound_release_workfn(struct kthread_work *work) +static void pwq_release_workfn(struct kthread_work *work) { struct pool_workqueue *pwq = container_of(work, struct pool_workqueue, - unbound_release_work); + release_work); struct workqueue_struct *wq = pwq->wq; struct worker_pool *pool = pwq->pool; bool is_last = false; /* - * when @pwq is not linked, it doesn't hold any reference to the + * When @pwq is not linked, it doesn't hold any reference to the * @wq, and @wq is invalid to access. */ if (!list_empty(&pwq->pwqs_node)) { - if (WARN_ON_ONCE(!(wq->flags & WQ_UNBOUND))) - return; - mutex_lock(&wq->mutex); list_del_rcu(&pwq->pwqs_node); is_last = list_empty(&wq->pwqs); mutex_unlock(&wq->mutex); } - mutex_lock(&wq_pool_mutex); - put_unbound_pool(pool); - mutex_unlock(&wq_pool_mutex); + if (wq->flags & WQ_UNBOUND) { + mutex_lock(&wq_pool_mutex); + put_unbound_pool(pool); + mutex_unlock(&wq_pool_mutex); + } call_rcu(&pwq->rcu, rcu_free_pwq); @@ -4112,8 +4109,7 @@ static void init_pwq(struct pool_workqueue *pwq, struct workqueue_struct *wq, INIT_LIST_HEAD(&pwq->inactive_works); INIT_LIST_HEAD(&pwq->pwqs_node); INIT_LIST_HEAD(&pwq->mayday_node); - kthread_init_work(&pwq->unbound_release_work, - pwq_unbound_release_workfn); + kthread_init_work(&pwq->release_work, pwq_release_workfn); } /* sync @pwq with the current state of its associated wq and link it */ @@ -4514,20 +4510,25 @@ static int alloc_and_link_pwqs(struct workqueue_struct *wq) int cpu, ret; if (!(wq->flags & WQ_UNBOUND)) { - wq->cpu_pwq = alloc_percpu(struct pool_workqueue); + wq->cpu_pwq = alloc_percpu(struct pool_workqueue *); if (!wq->cpu_pwq) - return -ENOMEM; + goto enomem; for_each_possible_cpu(cpu) { - struct pool_workqueue *pwq = + struct pool_workqueue **pwq_p = per_cpu_ptr(wq->cpu_pwq, cpu); - struct worker_pool *cpu_pools = - per_cpu(cpu_worker_pools, cpu); + struct worker_pool *pool = + &(per_cpu_ptr(cpu_worker_pools, cpu)[highpri]); - init_pwq(pwq, wq, &cpu_pools[highpri]); + *pwq_p = kmem_cache_alloc_node(pwq_cache, GFP_KERNEL, + pool->node); + if (!*pwq_p) + goto enomem; + + init_pwq(*pwq_p, wq, pool); mutex_lock(&wq->mutex); - link_pwq(pwq); + link_pwq(*pwq_p); mutex_unlock(&wq->mutex); } return 0; @@ -4546,6 +4547,15 @@ static int alloc_and_link_pwqs(struct workqueue_struct *wq) cpus_read_unlock(); return ret; + +enomem: + if (wq->cpu_pwq) { + for_each_possible_cpu(cpu) + kfree(*per_cpu_ptr(wq->cpu_pwq, cpu)); + free_percpu(wq->cpu_pwq); + wq->cpu_pwq = NULL; + } + return -ENOMEM; } static int wq_clamp_max_active(int max_active, unsigned int flags, @@ -4719,7 +4729,7 @@ static bool pwq_busy(struct pool_workqueue *pwq) void destroy_workqueue(struct workqueue_struct *wq) { struct pool_workqueue *pwq; - int node; + int cpu, node; /* * Remove it from sysfs first so that sanity check failure doesn't @@ -4779,12 +4789,8 @@ void destroy_workqueue(struct workqueue_struct *wq) mutex_unlock(&wq_pool_mutex); if (!(wq->flags & WQ_UNBOUND)) { - wq_unregister_lockdep(wq); - /* - * The base ref is never dropped on per-cpu pwqs. Directly - * schedule RCU free. - */ - call_rcu(&wq->rcu, rcu_free_wq); + for_each_possible_cpu(cpu) + put_pwq_unlocked(*per_cpu_ptr(wq->cpu_pwq, cpu)); } else { /* * We're the sole accessor of @wq at this point. Directly @@ -4901,7 +4907,7 @@ bool workqueue_congested(int cpu, struct workqueue_struct *wq) cpu = smp_processor_id(); if (!(wq->flags & WQ_UNBOUND)) - pwq = per_cpu_ptr(wq->cpu_pwq, cpu); + pwq = *per_cpu_ptr(wq->cpu_pwq, cpu); else pwq = unbound_pwq_by_node(wq, cpu_to_node(cpu)); -- cgit v1.2.3 From 4cbfd3de737b9d00544ff0f673cb75fc37bffb6a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 7 Aug 2023 15:57:23 -1000 Subject: workqueue: Call wq_update_unbound_numa() on all CPUs in NUMA node on CPU hotplug When a CPU went online or offline, wq_update_unbound_numa() was called only on the CPU which was going up or down. This works fine because all CPUs on the same NUMA node share the same pool_workqueue slot - one CPU updating it updates it for everyone in the node. However, future changes will make each CPU use a separate pool_workqueue even when they're sharing the same worker_pool, which requires updating pool_workqueue's for all CPUs which may be sharing the same pool_workqueue on hotplug. To accommodate the planned changes, this patch updates workqueue_on/offline_cpu() so that they call wq_update_unbound_numa() for all CPUs sharing the same NUMA node as the CPU going up or down. In the current code, the second+ calls would be noops and there shouldn't be any behavior changes. * As wq_update_unbound_numa() is now called on multiple CPUs per each hotplug event, @cpu is renamed to @hotplug_cpu and another @cpu argument is added. The former indicates the CPU being hot[un]plugged and the latter the CPU whose pool_workqueue is being updated. * In wq_update_unbound_numa(), cpu_off is renamed to off_cpu for consistency with the new @hotplug_cpu. Signed-off-by: Tejun Heo --- kernel/workqueue.c | 34 +++++++++++++++++++++++++--------- 1 file changed, 25 insertions(+), 9 deletions(-) (limited to 'kernel/workqueue.c') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 05bf5427124a..9f4341885f60 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -4422,7 +4422,8 @@ int apply_workqueue_attrs(struct workqueue_struct *wq, /** * wq_update_unbound_numa - update NUMA affinity of a wq for CPU hot[un]plug * @wq: the target workqueue - * @cpu: the CPU coming up or going down + * @cpu: the CPU to update pool association for + * @hotplug_cpu: the CPU coming up or going down * @online: whether @cpu is coming up or going down * * This function is to be called from %CPU_DOWN_PREPARE, %CPU_ONLINE and @@ -4442,10 +4443,10 @@ int apply_workqueue_attrs(struct workqueue_struct *wq, * CPU_DOWN_PREPARE. */ static void wq_update_unbound_numa(struct workqueue_struct *wq, int cpu, - bool online) + int hotplug_cpu, bool online) { int node = cpu_to_node(cpu); - int cpu_off = online ? -1 : cpu; + int off_cpu = online ? -1 : hotplug_cpu; struct pool_workqueue *old_pwq = NULL, *pwq; struct workqueue_attrs *target_attrs; cpumask_t *cpumask; @@ -4473,7 +4474,7 @@ static void wq_update_unbound_numa(struct workqueue_struct *wq, int cpu, * and create a new one if they don't match. If the target cpumask * equals the default pwq's, the default pwq should be used. */ - if (wq_calc_node_cpumask(wq->dfl_pwq->pool->attrs, node, cpu_off, cpumask)) { + if (wq_calc_node_cpumask(wq->dfl_pwq->pool->attrs, node, off_cpu, cpumask)) { if (cpumask_equal(cpumask, pwq->pool->attrs->cpumask)) return; } else { @@ -5514,8 +5515,15 @@ int workqueue_online_cpu(unsigned int cpu) } /* update NUMA affinity of unbound workqueues */ - list_for_each_entry(wq, &workqueues, list) - wq_update_unbound_numa(wq, cpu, true); + list_for_each_entry(wq, &workqueues, list) { + int tcpu; + + for_each_possible_cpu(tcpu) { + if (cpu_to_node(tcpu) == cpu_to_node(cpu)) { + wq_update_unbound_numa(wq, tcpu, cpu, true); + } + } + } mutex_unlock(&wq_pool_mutex); return 0; @@ -5533,8 +5541,15 @@ int workqueue_offline_cpu(unsigned int cpu) /* update NUMA affinity of unbound workqueues */ mutex_lock(&wq_pool_mutex); - list_for_each_entry(wq, &workqueues, list) - wq_update_unbound_numa(wq, cpu, false); + list_for_each_entry(wq, &workqueues, list) { + int tcpu; + + for_each_possible_cpu(tcpu) { + if (cpu_to_node(tcpu) == cpu_to_node(cpu)) { + wq_update_unbound_numa(wq, tcpu, cpu, false); + } + } + } mutex_unlock(&wq_pool_mutex); return 0; @@ -6509,7 +6524,8 @@ void __init workqueue_init(void) } list_for_each_entry(wq, &workqueues, list) { - wq_update_unbound_numa(wq, smp_processor_id(), true); + wq_update_unbound_numa(wq, smp_processor_id(), smp_processor_id(), + true); WARN(init_rescuer(wq), "workqueue: failed to create early rescuer for %s", wq->name); -- cgit v1.2.3 From 636b927eba5bc633753f8eb80f35e1d5be806e51 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 7 Aug 2023 15:57:23 -1000 Subject: workqueue: Make unbound workqueues to use per-cpu pool_workqueues A pwq (pool_workqueue) represents an association between a workqueue and a worker_pool. When a work item is queued, the workqueue selects the pwq to use, which in turn determines the pool, and queues the work item to the pool through the pwq. pwq is also what implements the maximum concurrency limit - @max_active. As a per-cpu workqueue should be assocaited with a different worker_pool on each CPU, it always had per-cpu pwq's that are accessed through wq->cpu_pwq. However, unbound workqueues were sharing a pwq within each NUMA node by default. The sharing has several downsides: * Because @max_active is per-pwq, the meaning of @max_active changes depending on the machine configuration and whether workqueue NUMA locality support is enabled. * Makes per-cpu and unbound code deviate. * Gets in the way of making workqueue CPU locality awareness more flexible. This patch makes unbound workqueues use per-cpu pwq's the same way per-cpu workqueues do by making the following changes: * wq->numa_pwq_tbl[] is removed and unbound workqueues now use wq->cpu_pwq just like per-cpu workqueues. wq->cpu_pwq is now RCU protected for unbound workqueues. * numa_pwq_tbl_install() is renamed to install_unbound_pwq() and installs the specified pwq to the target CPU's wq->cpu_pwq. * apply_wqattrs_prepare() now always allocates a separate pwq for each CPU unless the workqueue is ordered. If ordered, all CPUs use wq->dfl_pwq. This makes the return value of wq_calc_node_cpumask() unnecessary. It now returns void. * @max_active now means the same thing for both per-cpu and unbound workqueues. WQ_UNBOUND_MAX_ACTIVE now equals WQ_MAX_ACTIVE and documentation is updated accordingly. WQ_UNBOUND_MAX_ACTIVE is no longer used in workqueue implementation and will be removed later. * All unbound pwq operations which used to be per-numa-node are now per-cpu. For most unbound workqueue users, this shouldn't cause noticeable changes. Work item issue and completion will be a small bit faster, flush_workqueue() would become a bit more expensive, and the total concurrency limit would likely become higher. All @max_active==1 use cases are currently being audited for conversion into alloc_ordered_workqueue() and they shouldn't be affected once the audit and conversion is complete. One area where the behavior change may be more noticeable is workqueue_congested() as the reported congestion state is now per CPU instead of NUMA node. There are only two users of this interface - drivers/infiniband/hw/hfi1 and net/smc. Maintainers of both subsystems are cc'd. Inputs on the behavior change would be very much appreciated. Signed-off-by: Tejun Heo Acked-by: Dennis Dalessandro Cc: Jason Gunthorpe Cc: Leon Romanovsky Cc: Karsten Graul Cc: Wenjia Zhang Cc: Jan Karcher --- kernel/workqueue.c | 218 +++++++++++++++++++---------------------------------- 1 file changed, 77 insertions(+), 141 deletions(-) (limited to 'kernel/workqueue.c') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 9f4341885f60..48208888aee0 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -321,8 +321,7 @@ struct workqueue_struct { /* hot fields used during command issue, aligned to cacheline */ unsigned int flags ____cacheline_aligned; /* WQ: WQ_* flags */ - struct pool_workqueue __percpu **cpu_pwq; /* I: per-cpu pwqs */ - struct pool_workqueue __rcu *numa_pwq_tbl[]; /* PWR: unbound pwqs indexed by node */ + struct pool_workqueue __percpu __rcu **cpu_pwq; /* I: per-cpu pwqs */ }; static struct kmem_cache *pwq_cache; @@ -608,35 +607,6 @@ static int worker_pool_assign_id(struct worker_pool *pool) return ret; } -/** - * unbound_pwq_by_node - return the unbound pool_workqueue for the given node - * @wq: the target workqueue - * @node: the node ID - * - * This must be called with any of wq_pool_mutex, wq->mutex or RCU - * read locked. - * If the pwq needs to be used beyond the locking in effect, the caller is - * responsible for guaranteeing that the pwq stays online. - * - * Return: The unbound pool_workqueue for @node. - */ -static struct pool_workqueue *unbound_pwq_by_node(struct workqueue_struct *wq, - int node) -{ - assert_rcu_or_wq_mutex_or_pool_mutex(wq); - - /* - * XXX: @node can be NUMA_NO_NODE if CPU goes offline while a - * delayed item is pending. The plan is to keep CPU -> NODE - * mapping valid and stable across CPU on/offlines. Once that - * happens, this workaround can be removed. - */ - if (unlikely(node == NUMA_NO_NODE)) - return wq->dfl_pwq; - - return rcu_dereference_raw(wq->numa_pwq_tbl[node]); -} - static unsigned int work_color_to_flags(int color) { return color << WORK_STRUCT_COLOR_SHIFT; @@ -1676,16 +1646,14 @@ static void __queue_work(int cpu, struct workqueue_struct *wq, rcu_read_lock(); retry: /* pwq which will be used unless @work is executing elsewhere */ - if (wq->flags & WQ_UNBOUND) { - if (req_cpu == WORK_CPU_UNBOUND) + if (req_cpu == WORK_CPU_UNBOUND) { + if (wq->flags & WQ_UNBOUND) cpu = wq_select_unbound_cpu(raw_smp_processor_id()); - pwq = unbound_pwq_by_node(wq, cpu_to_node(cpu)); - } else { - if (req_cpu == WORK_CPU_UNBOUND) + else cpu = raw_smp_processor_id(); - pwq = *per_cpu_ptr(wq->cpu_pwq, cpu); } + pwq = rcu_dereference(*per_cpu_ptr(wq->cpu_pwq, cpu)); pool = pwq->pool; /* @@ -1715,12 +1683,11 @@ retry: } /* - * pwq is determined and locked. For unbound pools, we could have - * raced with pwq release and it could already be dead. If its - * refcnt is zero, repeat pwq selection. Note that pwqs never die - * without another pwq replacing it in the numa_pwq_tbl or while - * work items are executing on it, so the retrying is guaranteed to - * make forward-progress. + * pwq is determined and locked. For unbound pools, we could have raced + * with pwq release and it could already be dead. If its refcnt is zero, + * repeat pwq selection. Note that unbound pwqs never die without + * another pwq replacing it in cpu_pwq or while work items are executing + * on it, so the retrying is guaranteed to make forward-progress. */ if (unlikely(!pwq->refcnt)) { if (wq->flags & WQ_UNBOUND) { @@ -3818,12 +3785,8 @@ static void rcu_free_wq(struct rcu_head *rcu) container_of(rcu, struct workqueue_struct, rcu); wq_free_lockdep(wq); - - if (!(wq->flags & WQ_UNBOUND)) - free_percpu(wq->cpu_pwq); - else - free_workqueue_attrs(wq->unbound_attrs); - + free_percpu(wq->cpu_pwq); + free_workqueue_attrs(wq->unbound_attrs); kfree(wq); } @@ -4174,11 +4137,8 @@ static struct pool_workqueue *alloc_unbound_pwq(struct workqueue_struct *wq, * * The caller is responsible for ensuring that the cpumask of @node stays * stable. - * - * Return: %true if the resulting @cpumask is different from @attrs->cpumask, - * %false if equal. */ -static bool wq_calc_node_cpumask(const struct workqueue_attrs *attrs, int node, +static void wq_calc_node_cpumask(const struct workqueue_attrs *attrs, int node, int cpu_going_down, cpumask_t *cpumask) { if (!wq_numa_enabled || attrs->no_numa) @@ -4195,23 +4155,18 @@ static bool wq_calc_node_cpumask(const struct workqueue_attrs *attrs, int node, /* yeap, return possible CPUs in @node that @attrs wants */ cpumask_and(cpumask, attrs->cpumask, wq_numa_possible_cpumask[node]); - if (cpumask_empty(cpumask)) { + if (cpumask_empty(cpumask)) pr_warn_once("WARNING: workqueue cpumask: online intersect > " "possible intersect\n"); - return false; - } - - return !cpumask_equal(cpumask, attrs->cpumask); + return; use_dfl: cpumask_copy(cpumask, attrs->cpumask); - return false; } -/* install @pwq into @wq's numa_pwq_tbl[] for @node and return the old pwq */ -static struct pool_workqueue *numa_pwq_tbl_install(struct workqueue_struct *wq, - int node, - struct pool_workqueue *pwq) +/* install @pwq into @wq's cpu_pwq and return the old pwq */ +static struct pool_workqueue *install_unbound_pwq(struct workqueue_struct *wq, + int cpu, struct pool_workqueue *pwq) { struct pool_workqueue *old_pwq; @@ -4221,8 +4176,8 @@ static struct pool_workqueue *numa_pwq_tbl_install(struct workqueue_struct *wq, /* link_pwq() can handle duplicate calls */ link_pwq(pwq); - old_pwq = rcu_access_pointer(wq->numa_pwq_tbl[node]); - rcu_assign_pointer(wq->numa_pwq_tbl[node], pwq); + old_pwq = rcu_access_pointer(*per_cpu_ptr(wq->cpu_pwq, cpu)); + rcu_assign_pointer(*per_cpu_ptr(wq->cpu_pwq, cpu), pwq); return old_pwq; } @@ -4239,10 +4194,10 @@ struct apply_wqattrs_ctx { static void apply_wqattrs_cleanup(struct apply_wqattrs_ctx *ctx) { if (ctx) { - int node; + int cpu; - for_each_node(node) - put_pwq_unlocked(ctx->pwq_tbl[node]); + for_each_possible_cpu(cpu) + put_pwq_unlocked(ctx->pwq_tbl[cpu]); put_pwq_unlocked(ctx->dfl_pwq); free_workqueue_attrs(ctx->attrs); @@ -4259,11 +4214,11 @@ apply_wqattrs_prepare(struct workqueue_struct *wq, { struct apply_wqattrs_ctx *ctx; struct workqueue_attrs *new_attrs, *tmp_attrs; - int node; + int cpu; lockdep_assert_held(&wq_pool_mutex); - ctx = kzalloc(struct_size(ctx, pwq_tbl, nr_node_ids), GFP_KERNEL); + ctx = kzalloc(struct_size(ctx, pwq_tbl, nr_cpu_ids), GFP_KERNEL); new_attrs = alloc_workqueue_attrs(); tmp_attrs = alloc_workqueue_attrs(); @@ -4297,14 +4252,16 @@ apply_wqattrs_prepare(struct workqueue_struct *wq, if (!ctx->dfl_pwq) goto out_free; - for_each_node(node) { - if (wq_calc_node_cpumask(new_attrs, node, -1, tmp_attrs->cpumask)) { - ctx->pwq_tbl[node] = alloc_unbound_pwq(wq, tmp_attrs); - if (!ctx->pwq_tbl[node]) - goto out_free; - } else { + for_each_possible_cpu(cpu) { + if (new_attrs->no_numa) { ctx->dfl_pwq->refcnt++; - ctx->pwq_tbl[node] = ctx->dfl_pwq; + ctx->pwq_tbl[cpu] = ctx->dfl_pwq; + } else { + wq_calc_node_cpumask(new_attrs, cpu_to_node(cpu), -1, + tmp_attrs->cpumask); + ctx->pwq_tbl[cpu] = alloc_unbound_pwq(wq, tmp_attrs); + if (!ctx->pwq_tbl[cpu]) + goto out_free; } } @@ -4327,7 +4284,7 @@ out_free: /* set attrs and install prepared pwqs, @ctx points to old pwqs on return */ static void apply_wqattrs_commit(struct apply_wqattrs_ctx *ctx) { - int node; + int cpu; /* all pwqs have been created successfully, let's install'em */ mutex_lock(&ctx->wq->mutex); @@ -4335,9 +4292,9 @@ static void apply_wqattrs_commit(struct apply_wqattrs_ctx *ctx) copy_workqueue_attrs(ctx->wq->unbound_attrs, ctx->attrs); /* save the previous pwq and install the new one */ - for_each_node(node) - ctx->pwq_tbl[node] = numa_pwq_tbl_install(ctx->wq, node, - ctx->pwq_tbl[node]); + for_each_possible_cpu(cpu) + ctx->pwq_tbl[cpu] = install_unbound_pwq(ctx->wq, cpu, + ctx->pwq_tbl[cpu]); /* @dfl_pwq might not have been used, ensure it's linked */ link_pwq(ctx->dfl_pwq); @@ -4466,20 +4423,13 @@ static void wq_update_unbound_numa(struct workqueue_struct *wq, int cpu, cpumask = target_attrs->cpumask; copy_workqueue_attrs(target_attrs, wq->unbound_attrs); - pwq = unbound_pwq_by_node(wq, node); - /* - * Let's determine what needs to be done. If the target cpumask is - * different from the default pwq's, we need to compare it to @pwq's - * and create a new one if they don't match. If the target cpumask - * equals the default pwq's, the default pwq should be used. - */ - if (wq_calc_node_cpumask(wq->dfl_pwq->pool->attrs, node, off_cpu, cpumask)) { - if (cpumask_equal(cpumask, pwq->pool->attrs->cpumask)) - return; - } else { - goto use_dfl_pwq; - } + /* nothing to do if the target cpumask matches the current pwq */ + wq_calc_node_cpumask(wq->dfl_pwq->pool->attrs, node, off_cpu, cpumask); + pwq = rcu_dereference_protected(*per_cpu_ptr(wq->cpu_pwq, cpu), + lockdep_is_held(&wq_pool_mutex)); + if (cpumask_equal(cpumask, pwq->pool->attrs->cpumask)) + return; /* create a new pwq */ pwq = alloc_unbound_pwq(wq, target_attrs); @@ -4491,7 +4441,7 @@ static void wq_update_unbound_numa(struct workqueue_struct *wq, int cpu, /* Install the new pwq. */ mutex_lock(&wq->mutex); - old_pwq = numa_pwq_tbl_install(wq, node, pwq); + old_pwq = install_unbound_pwq(wq, cpu, pwq); goto out_unlock; use_dfl_pwq: @@ -4499,7 +4449,7 @@ use_dfl_pwq: raw_spin_lock_irq(&wq->dfl_pwq->pool->lock); get_pwq(wq->dfl_pwq); raw_spin_unlock_irq(&wq->dfl_pwq->pool->lock); - old_pwq = numa_pwq_tbl_install(wq, node, wq->dfl_pwq); + old_pwq = install_unbound_pwq(wq, cpu, wq->dfl_pwq); out_unlock: mutex_unlock(&wq->mutex); put_pwq_unlocked(old_pwq); @@ -4510,11 +4460,11 @@ static int alloc_and_link_pwqs(struct workqueue_struct *wq) bool highpri = wq->flags & WQ_HIGHPRI; int cpu, ret; - if (!(wq->flags & WQ_UNBOUND)) { - wq->cpu_pwq = alloc_percpu(struct pool_workqueue *); - if (!wq->cpu_pwq) - goto enomem; + wq->cpu_pwq = alloc_percpu(struct pool_workqueue *); + if (!wq->cpu_pwq) + goto enomem; + if (!(wq->flags & WQ_UNBOUND)) { for_each_possible_cpu(cpu) { struct pool_workqueue **pwq_p = per_cpu_ptr(wq->cpu_pwq, cpu); @@ -4562,13 +4512,11 @@ enomem: static int wq_clamp_max_active(int max_active, unsigned int flags, const char *name) { - int lim = flags & WQ_UNBOUND ? WQ_UNBOUND_MAX_ACTIVE : WQ_MAX_ACTIVE; - - if (max_active < 1 || max_active