-rw-r--r--   Documentation/admin-guide/kernel-parameters.txt |   28
-rw-r--r--   Documentation/core-api/workqueue.rst            |  356
-rw-r--r--   include/linux/workqueue.h                       |  115
-rw-r--r--   init/main.c                                     |    1
-rw-r--r--   kernel/workqueue.c                              | 1616
-rw-r--r--   kernel/workqueue_internal.h                     |    2
-rw-r--r--   tools/workqueue/wq_dump.py                      |  177
-rw-r--r--   tools/workqueue/wq_monitor.py                   |   21
8 files changed, 1512 insertions, 804 deletions
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index b2b128034225..0a1731a0f0ef 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -7076,6 +7076,13 @@ disables both lockup detectors. Default is 10 seconds. + workqueue.unbound_cpus= + [KNL,SMP] Specify to constrain one or some CPUs + to use in unbound workqueues. + Format: <cpu-list> + By default, all online CPUs are available for + unbound workqueues. + workqueue.watchdog_thresh= If CONFIG_WQ_WATCHDOG is configured, workqueue can warn stall conditions and dump internal state to @@ -7097,15 +7104,6 @@ threshold repeatedly. They are likely good candidates for using WQ_UNBOUND workqueues instead. - workqueue.disable_numa - By default, all work items queued to unbound - workqueues are affine to the NUMA nodes they're - issued on, which results in better behavior in - general. If NUMA affinity needs to be disabled for - whatever reason, this option can be used. Note - that this also can be controlled per-workqueue for - workqueues visible under /sys/bus/workqueue/. - workqueue.power_efficient Per-cpu workqueues are generally preferred because they show better performance thanks to cache @@ -7121,6 +7119,18 @@ The default value of this parameter is determined by the config option CONFIG_WQ_POWER_EFFICIENT_DEFAULT. + workqueue.default_affinity_scope= + Select the default affinity scope to use for unbound + workqueues. Can be one of "cpu", "smt", "cache", + "numa" and "system". Default is "cache". For more + information, see the Affinity Scopes section in + Documentation/core-api/workqueue.rst. + + This can be changed after boot by writing to the + matching /sys/module/workqueue/parameters file. All + workqueues with the "default" affinity scope will be + updated accordignly. + workqueue.debug_force_rr_cpu Workqueue used to implicitly guarantee that work items queued without explicit CPU specified are put diff --git a/Documentation/core-api/workqueue.rst b/Documentation/core-api/workqueue.rst index a4c9b9d1905f..5d7b01aed1fe 100644 --- a/Documentation/core-api/workqueue.rst +++ b/Documentation/core-api/workqueue.rst @@ -1,6 +1,6 @@ -==================================== -Concurrency Managed Workqueue (cmwq) -==================================== +========= +Workqueue +========= :Date: September, 2010 :Author: Tejun Heo <tj@kernel.org> @@ -25,8 +25,8 @@ there is no work item left on the workqueue the worker becomes idle. When a new work item gets queued, the worker begins executing again. -Why cmwq? -========= +Why Concurrency Managed Workqueue? +================================== In the original wq implementation, a multi threaded (MT) wq had one worker thread per CPU and a single threaded (ST) wq had one worker @@ -220,17 +220,16 @@ resources, scheduled and executed. ``max_active`` -------------- -``@max_active`` determines the maximum number of execution contexts -per CPU which can be assigned to the work items of a wq. For example, -with ``@max_active`` of 16, at most 16 work items of the wq can be -executing at the same time per CPU. +``@max_active`` determines the maximum number of execution contexts per +CPU which can be assigned to the work items of a wq. For example, with +``@max_active`` of 16, at most 16 work items of the wq can be executing +at the same time per CPU. This is always a per-CPU attribute, even for +unbound workqueues. 
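For illustration, ``@max_active`` is the third argument to ``alloc_workqueue()``; a minimal sketch with a hypothetical workqueue name and flags (not taken from this patch): ::

    /* hypothetical caller; "my_wq" and WQ_UNBOUND are illustrative only */
    struct workqueue_struct *my_wq;

    /* allow at most 16 in-flight work items per CPU */
    my_wq = alloc_workqueue("my_wq", WQ_UNBOUND, 16);
    if (!my_wq)
        return -ENOMEM;

    /* passing 0 instead of 16 selects the default described below (256) */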
-Currently, for a bound wq, the maximum limit for ``@max_active`` is -512 and the default value used when 0 is specified is 256. For an -unbound wq, the limit is higher of 512 and 4 * -``num_possible_cpus()``. These values are chosen sufficiently high -such that they are not the limiting factor while providing protection -in runaway cases. +The maximum limit for ``@max_active`` is 512 and the default value used +when 0 is specified is 256. These values are chosen sufficiently high +such that they are not the limiting factor while providing protection in +runaway cases. The number of active work items of a wq is usually regulated by the users of the wq, more specifically, by how many work items the users @@ -348,27 +347,346 @@ Guidelines level of locality in wq operations and work item execution. +Affinity Scopes +=============== + +An unbound workqueue groups CPUs according to its affinity scope to improve +cache locality. For example, if a workqueue is using the default affinity +scope of "cache", it will group CPUs according to last level cache +boundaries. A work item queued on the workqueue will be assigned to a worker +on one of the CPUs which share the last level cache with the issuing CPU. +Once started, the worker may or may not be allowed to move outside the scope +depending on the ``affinity_strict`` setting of the scope. + +Workqueue currently supports the following affinity scopes. + +``default`` + Use the scope in module parameter ``workqueue.default_affinity_scope`` + which is always set to one of the scopes below. + +``cpu`` + CPUs are not grouped. A work item issued on one CPU is processed by a + worker on the same CPU. This makes unbound workqueues behave as per-cpu + workqueues without concurrency management. + +``smt`` + CPUs are grouped according to SMT boundaries. This usually means that the + logical threads of each physical CPU core are grouped together. + +``cache`` + CPUs are grouped according to cache boundaries. Which specific cache + boundary is used is determined by the arch code. L3 is used in a lot of + cases. This is the default affinity scope. + +``numa`` + CPUs are grouped according to NUMA bounaries. + +``system`` + All CPUs are put in the same group. Workqueue makes no effort to process a + work item on a CPU close to the issuing CPU. + +The default affinity scope can be changed with the module parameter +``workqueue.default_affinity_scope`` and a specific workqueue's affinity +scope can be changed using ``apply_workqueue_attrs()``. + +If ``WQ_SYSFS`` is set, the workqueue will have the following affinity scope +related interface files under its ``/sys/devices/virtual/WQ_NAME/`` +directory. + +``affinity_scope`` + Read to see the current affinity scope. Write to change. + + When default is the current scope, reading this file will also show the + current effective scope in parentheses, for example, ``default (cache)``. + +``affinity_strict`` + 0 by default indicating that affinity scopes are not strict. When a work + item starts execution, workqueue makes a best-effort attempt to ensure + that the worker is inside its affinity scope, which is called + repatriation. Once started, the scheduler is free to move the worker + anywhere in the system as it sees fit. This enables benefiting from scope + locality while still being able to utilize other CPUs if necessary and + available. + + If set to 1, all workers of the scope are guaranteed always to be in the + scope. 
This may be useful when crossing affinity scopes has other + implications, for example, in terms of power consumption or workload + isolation. Strict NUMA scope can also be used to match the workqueue + behavior of older kernels. + + +Affinity Scopes and Performance +=============================== + +It'd be ideal if an unbound workqueue's behavior is optimal for vast +majority of use cases without further tuning. Unfortunately, in the current +kernel, there exists a pronounced trade-off between locality and utilization +necessitating explicit configurations when workqueues are heavily used. + +Higher locality leads to higher efficiency where more work is performed for +the same number of consumed CPU cycles. However, higher locality may also +cause lower overall system utilization if the work items are not spread +enough across the affinity scopes by the issuers. The following performance +testing with dm-crypt clearly illustrates this trade-off. + +The tests are run on a CPU with 12-cores/24-threads split across four L3 +caches (AMD Ryzen 9 3900x). CPU clock boost is turned off for consistency. +``/dev/dm-0`` is a dm-crypt device created on NVME SSD (Samsung 990 PRO) and +opened with ``cryptsetup`` with default settings. + + +Scenario 1: Enough issuers and work spread across the machine +------------------------------------------------------------- + +The command used: :: + + $ fio --filename=/dev/dm-0 --direct=1 --rw=randrw --bs=32k --ioengine=libaio \ + --iodepth=64 --runtime=60 --numjobs=24 --time_based --group_reporting \ + --name=iops-test-job --verify=sha512 + +There are 24 issuers, each issuing 64 IOs concurrently. ``--verify=sha512`` +makes ``fio`` generate and read back the content each time which makes +execution locality matter between the issuer and ``kcryptd``. The followings +are the read bandwidths and CPU utilizations depending on different affinity +scope settings on ``kcryptd`` measured over five runs. Bandwidths are in +MiBps, and CPU util in percents. + +.. list-table:: + :widths: 16 20 20 + :header-rows: 1 + + * - Affinity + - Bandwidth (MiBps) + - CPU util (%) + + * - system + - 1159.40 ±1.34 + - 99.31 ±0.02 + + * - cache + - 1166.40 ±0.89 + - 99.34 ±0.01 + + * - cache (strict) + - 1166.00 ±0.71 + - 99.35 ±0.01 + +With enough issuers spread across the system, there is no downside to +"cache", strict or otherwise. All three configurations saturate the whole +machine but the cache-affine ones outperform by 0.6% thanks to improved +locality. + + +Scenario 2: Fewer issuers, enough work for saturation +----------------------------------------------------- + +The command used: :: + + $ fio --filename=/dev/dm-0 --direct=1 --rw=randrw --bs=32k \ + --ioengine=libaio --iodepth=64 --runtime=60 --numjobs=8 \ + --time_based --group_reporting --name=iops-test-job --verify=sha512 + +The only difference from the previous scenario is ``--numjobs=8``. There are +a third of the issuers but is still enough total work to saturate the +system. + +.. list-table:: + :widths: 16 20 20 + :header-rows: 1 + + * - Affinity + - Bandwidth (MiBps) + - CPU util (%) + + * - system + - 1155.40 ±0.89 + - 97.41 ±0.05 + + * - cache + - 1154.40 ±1.14 + - 96.15 ±0.09 + + * - cache (strict) + - 1112.00 ±4.64 + - 93.26 ±0.35 + +This is more than enough work to saturate the system. Both "system" and +"cache" are nearly saturating the machine but not fully. "cache" is using +less CPU but the better efficiency puts it at the same bandwidth as +"system". 
+
+Eight issuers moving around over four L3 cache scopes still allow "cache
+(strict)" to mostly saturate the machine but the loss of work conservation
+is now starting to hurt, with a 3.7% bandwidth loss.
+
+
+Scenario 3: Even fewer issuers, not enough work to saturate
+-----------------------------------------------------------
+
+The command used: ::
+
+  $ fio --filename=/dev/dm-0 --direct=1 --rw=randrw --bs=32k \
+    --ioengine=libaio --iodepth=64 --runtime=60 --numjobs=4 \
+    --time_based --group_reporting --name=iops-test-job --verify=sha512
+
+Again, the only difference is ``--numjobs=4``. With the number of issuers
+reduced to four, there now isn't enough work to saturate the whole system
+and the bandwidth becomes dependent on completion latencies.
+
+.. list-table::
+   :widths: 16 20 20
+   :header-rows: 1
+
+   * - Affinity
+     - Bandwidth (MiBps)
+     - CPU util (%)
+
+   * - system
+     - 993.60 ±1.82
+     - 75.49 ±0.06
+
+   * - cache
+     - 973.40 ±1.52
+     - 74.90 ±0.07
+
+   * - cache (strict)
+     - 828.20 ±4.49
+     - 66.84 ±0.29
+
+Now, the trade-off between locality and utilization is clearer. "cache" shows
+a 2% bandwidth loss compared to "system" and "cache (strict)" a whopping 20%.
+
+
+Conclusion and Recommendations
+------------------------------
+
+In the above experiments, the efficiency advantage of the "cache" affinity
+scope over "system" is, while consistent and noticeable, small. However, the
+impact is dependent on the distances between the scopes and may be more
+pronounced in processors with more complex topologies.
+
+While the loss of work-conservation in certain scenarios hurts, "cache" is
+still a lot better than "cache (strict)" and maximizing workqueue utilization
+is unlikely to be the common case anyway. As such, "cache" is the default
+affinity scope for unbound pools.
+
+* As there is no one option which is great for most cases, workqueue usages
+  that may consume a significant amount of CPU are recommended to configure
+  the workqueues using ``apply_workqueue_attrs()`` and/or enable
+  ``WQ_SYSFS``; see the sketch after this list.
+
+* An unbound workqueue with the strict "cpu" affinity scope behaves the same
+  as a ``WQ_CPU_INTENSIVE`` per-cpu workqueue. There is no real advantage to
+  the latter and an unbound workqueue provides a lot more flexibility.
+
+* Affinity scopes were introduced in Linux v6.5. To emulate the previous
+  behavior, use the strict "numa" affinity scope.
+
+* The loss of work-conservation in non-strict affinity scopes is likely
+  originating from the scheduler. There is no theoretical reason why the
+  kernel wouldn't be able to do the right thing and maintain
+  work-conservation in most cases. As such, it is possible that future
+  scheduler improvements may make most of these tunables unnecessary.
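A minimal sketch of the ``apply_workqueue_attrs()`` route recommended above, using the strict "numa" scope from the emulation note; ``my_unbound_wq`` is a hypothetical ``WQ_UNBOUND`` workqueue and error handling is abbreviated: ::

    struct workqueue_attrs *attrs;
    int ret;

    attrs = alloc_workqueue_attrs();
    if (!attrs)
        return -ENOMEM;

    attrs->nice = 0;
    attrs->affn_scope = WQ_AFFN_NUMA;    /* one worker pool per NUMA node */
    attrs->affn_strict = true;           /* keep workers inside their pod */

    ret = apply_workqueue_attrs(my_unbound_wq, attrs);  /* hypothetical wq */
    free_workqueue_attrs(attrs);

When ``WQ_SYSFS`` is set, the same settings are also reachable at runtime through the ``affinity_scope`` and ``affinity_strict`` files described earlier.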
+ + +Examining Configuration +======================= + +Use tools/workqueue/wq_dump.py to examine unbound CPU affinity +configuration, worker pools and how workqueues map to the pools: :: + + $ tools/workqueue/wq_dump.py + Affinity Scopes + =============== + wq_unbound_cpumask=0000000f + + CPU + nr_pods 4 + pod_cpus [0]=00000001 [1]=00000002 [2]=00000004 [3]=00000008 + pod_node [0]=0 [1]=0 [2]=1 [3]=1 + cpu_pod [0]=0 [1]=1 [2]=2 [3]=3 + + SMT + nr_pods 4 + pod_cpus [0]=00000001 [1]=00000002 [2]=00000004 [3]=00000008 + pod_node [0]=0 [1]=0 [2]=1 [3]=1 + cpu_pod [0]=0 [1]=1 [2]=2 [3]=3 + + CACHE (default) + nr_pods 2 + pod_cpus [0]=00000003 [1]=0000000c + pod_node [0]=0 [1]=1 + cpu_pod [0]=0 [1]=0 [2]=1 [3]=1 + + NUMA + nr_pods 2 + pod_cpus [0]=00000003 [1]=0000000c + pod_node [0]=0 [1]=1 + cpu_pod [0]=0 [1]=0 [2]=1 [3]=1 + + SYSTEM + nr_pods 1 + pod_cpus [0]=0000000f + pod_node [0]=-1 + cpu_pod [0]=0 [1]=0 [2]=0 [3]=0 + + Worker Pools + ============ + pool[00] ref= 1 nice= 0 idle/workers= 4/ 4 cpu= 0 + pool[01] ref= 1 nice=-20 idle/workers= 2/ 2 cpu= 0 + pool[02] ref= 1 nice= 0 idle/workers= 4/ 4 cpu= 1 + pool[03] ref= 1 nice=-20 idle/workers= 2/ 2 cpu= 1 + pool[04] ref= 1 nice= 0 idle/workers= 4/ 4 cpu= 2 + pool[05] ref= 1 nice=-20 idle/workers= 2/ 2 cpu= 2 + pool[06] ref= 1 nice= 0 idle/workers= 3/ 3 cpu= 3 + pool[07] ref= 1 nice=-20 idle/workers= 2/ 2 cpu= 3 + pool[08] ref=42 nice= 0 idle/workers= 6/ 6 cpus=0000000f + pool[09] ref=28 nice= 0 idle/workers= 3/ 3 cpus=00000003 + pool[10] ref=28 nice= 0 idle/workers= 17/ 17 cpus=0000000c + pool[11] ref= 1 nice=-20 idle/workers= 1/ 1 cpus=0000000f + pool[12] ref= 2 nice=-20 idle/workers= 1/ 1 cpus=00000003 + pool[13] ref= 2 nice=-20 idle/workers= 1/ 1 cpus=0000000c + + Workqueue CPU -> pool + ===================== + [ workqueue \ CPU 0 1 2 3 dfl] + events percpu 0 2 4 6 + events_highpri percpu 1 3 5 7 + events_long percpu 0 2 4 6 + events_unbound unbound 9 9 10 10 8 + events_freezable percpu 0 2 4 6 + events_power_efficient percpu 0 2 4 6 + events_freezable_power_ percpu 0 2 4 6 + rcu_gp percpu 0 2 4 6 + rcu_par_gp percpu 0 2 4 6 + slub_flushwq percpu 0 2 4 6 + netns ordered 8 8 8 8 8 + ... + +See the command's help message for more info. 
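For readers unfamiliar with the pod tables above, the following standalone sketch (plain userspace C, not kernel code) models just the ``CACHE`` scope from the example dump: ``cpu_pod[]`` maps each CPU to a pod, while ``pod_cpus[]`` and ``pod_node[]`` describe that pod's CPU mask and NUMA node: ::

    #include <stdio.h>

    /* values copied from the CACHE section of the dump above (4 CPUs, 2 LLCs) */
    static const unsigned int pod_cpus[] = { 0x3, 0xc };  /* pod -> cpumask   */
    static const int pod_node[]          = { 0, 1 };      /* pod -> NUMA node */
    static const int cpu_pod[]           = { 0, 0, 1, 1 };/* cpu -> pod       */

    int main(void)
    {
        for (int cpu = 0; cpu < 4; cpu++) {
            int pod = cpu_pod[cpu];

            printf("CPU%d -> pod %d (cpus=0x%x, node=%d)\n",
                   cpu, pod, pod_cpus[pod], pod_node[pod]);
        }
        return 0;
    }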
+ + Monitoring ========== Use tools/workqueue/wq_monitor.py to monitor workqueue operations: :: $ tools/workqueue/wq_monitor.py events - total infl CPUtime CPUhog CMwake mayday rescued + total infl CPUtime CPUhog CMW/RPR mayday rescued events 18545 0 6.1 0 5 - - events_highpri 8 0 0.0 0 0 - - events_long 3 0 0.0 0 0 - - - events_unbound 38306 0 0.1 - - - - + events_unbound 38306 0 0.1 - 7 - - events_freezable 0 0 0.0 0 0 - - events_power_efficient 29598 0 0.2 0 0 - - events_freezable_power_ 10 0 0.0 0 0 - - sock_diag_events 0 0 0.0 0 0 - - - total infl CPUtime CPUhog CMwake mayday rescued + total infl CPUtime CPUhog CMW/RPR mayday rescued events 18548 0 6.1 0 5 - - events_highpri 8 0 0.0 0 0 - - events_long 3 0 0.0 0 0 - - - events_unbound 38322 0 0.1 - - - - + events_unbound 38322 0 0.1 - 7 - - events_freezable 0 0 0.0 0 0 - - events_power_efficient 29603 0 0.2 0 0 - - events_freezable_power_ 10 0 0.0 0 0 - - diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 683efe29fa69..1c1d06804d45 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -125,6 +125,17 @@ struct rcu_work { struct workqueue_struct *wq; }; +enum wq_affn_scope { + WQ_AFFN_DFL, /* use system default */ + WQ_AFFN_CPU, /* one pod per CPU */ + WQ_AFFN_SMT, /* one pod poer SMT */ + WQ_AFFN_CACHE, /* one pod per LLC */ + WQ_AFFN_NUMA, /* one pod per NUMA node */ + WQ_AFFN_SYSTEM, /* one pod across the whole system */ + + WQ_AFFN_NR_TYPES, +}; + /** * struct workqueue_attrs - A struct for workqueue attributes. * @@ -138,17 +149,58 @@ struct workqueue_attrs { /** * @cpumask: allowed CPUs + * + * Work items in this workqueue are affine to these CPUs and not allowed + * to execute on other CPUs. A pool serving a workqueue must have the + * same @cpumask. */ cpumask_var_t cpumask; /** - * @no_numa: disable NUMA affinity + * @__pod_cpumask: internal attribute used to create per-pod pools + * + * Internal use only. + * + * Per-pod unbound worker pools are used to improve locality. Always a + * subset of ->cpumask. A workqueue can be associated with multiple + * worker pools with disjoint @__pod_cpumask's. Whether the enforcement + * of a pool's @__pod_cpumask is strict depends on @affn_strict. + */ + cpumask_var_t __pod_cpumask; + + /** + * @affn_strict: affinity scope is strict + * + * If clear, workqueue will make a best-effort attempt at starting the + * worker inside @__pod_cpumask but the scheduler is free to migrate it + * outside. * - * Unlike other fields, ``no_numa`` isn't a property of a worker_pool. It - * only modifies how :c:func:`apply_workqueue_attrs` select pools and thus - * doesn't participate in pool hash calculations or equality comparisons. + * If set, workers are only allowed to run inside @__pod_cpumask. + */ + bool affn_strict; + + /* + * Below fields aren't properties of a worker_pool. They only modify how + * :c:func:`apply_workqueue_attrs` select pools and thus don't + * participate in pool hash calculations or equality comparisons. + */ + + /** + * @affn_scope: unbound CPU affinity scope + * + * CPU pods are used to improve execution locality of unbound work + * items. There are multiple pod types, one for each wq_affn_scope, and + * every CPU in the system belongs to one pod in every pod type. CPUs + * that belong to the same pod share the worker pool. For example, + * selecting %WQ_AFFN_NUMA makes the workqueue use a separate worker + * pool for each NUMA node. 
+ */ + enum wq_affn_scope affn_scope; + + /** + * @ordered: work items must be executed one by one in queueing order */ - bool no_numa; + bool ordered; }; static inline struct delayed_work *to_delayed_work(struct work_struct *work) @@ -343,14 +395,10 @@ enum { __WQ_ORDERED_EXPLICIT = 1 << 19, /* internal: alloc_ordered_workqueue() */ WQ_MAX_ACTIVE = 512, /* I like 512, better ideas? */ - WQ_MAX_UNBOUND_PER_CPU = 4, /* 4 * #cpus for unbound wq */ + WQ_UNBOUND_MAX_ACTIVE = WQ_MAX_ACTIVE, WQ_DFL_ACTIVE = WQ_MAX_ACTIVE / 2, }; -/* unbound wq's aren't per-cpu, scale max_active according to #cpus */ -#define WQ_UNBOUND_MAX_ACTIVE \ - max_t(int, WQ_MAX_ACTIVE, num_possible_cpus() * WQ_MAX_UNBOUND_PER_CPU) - /* * System-wide workqueues which are always present. * @@ -391,7 +439,7 @@ extern struct workqueue_struct *system_freezable_power_efficient_wq; * alloc_workqueue - allocate a workqueue * @fmt: printf format for the name of the workqueue * @flags: WQ_* flags - * @max_active: max in-flight work items, 0 for default + * @max_active: max in-flight work items per CPU, 0 for default * remaining args: args for @fmt * * Allocate a workqueue with the specified parameters. For detailed @@ -569,6 +617,7 @@ static inline bool schedule_work(struct work_struct *work) /* * Detect attempt to flush system-wide workqueues at compile time when possible. + * Warn attempt to flush system-wide workqueues at runtime. * * See https://lkml.kernel.org/r/49925af7-78a8-a3dd-bce6-cfc02e1a9236@I-love.SAKURA.ne.jp * for reasons and steps for converting system-wide workqueues into local workqueues. @@ -576,52 +625,13 @@ static inline bool schedule_work(struct work_struct *work) extern void __warn_flushing_systemwide_wq(void) __compiletime_warning("Please avoid flushing system-wide workqueues."); -/** - * flush_scheduled_work - ensure that any scheduled work has run to completion. - * - * Forces execution of the kernel-global workqueue and blocks until its - * completion. - * - * It's very easy to get into trouble if you don't take great care. - * Either of the following situations will lead to deadlock: - * - * One of the work items currently on the workqueue needs to acquire - * a lock held by your code or its caller. - * - * Your code is running in the context of a work routine. - * - * They will be detected by lockdep when they occur, but the first might not - * occur very often. It depends on what work items are on the workqueue and - * what locks they need, which you have no control over. - * - * In most situations flushing the entire workqueue is overkill; you merely - * need to know that a particular work item isn't queued and isn't running. - * In such cases you should use cancel_delayed_work_sync() or - * cancel_work_sync() instead. - * - * Please stop calling this function! A conversion to stop flushing system-wide - * workqueues is in progress. This function will be removed after all in-tree - * users stopped calling this function. - */ -/* - * The background of commit 771c035372a036f8 ("deprecate the - * '__deprecated' attribute warnings entirely and for good") is that, - * since Linus builds all modules between every single pull he does, - * the standard kernel build needs to be _clean_ in order to be able to - * notice when new problems happen. Therefore, don't emit warning while - * there are in-tree users. - */ +/* Please stop using this function, for this function will be removed in near future. 
*/ #define flush_scheduled_work() \ ({ \ - if (0) \ - __warn_flushing_systemwide_wq(); \ + __warn_flushing_systemwide_wq(); \ __flush_workqueue(system_wq); \ }) -/* - * Although there is no longer in-tree caller, for now just emit warning - * in order to give out-of-tree callers time to update. - */ #define flush_workqueue(wq) \ ({ \ struct workqueue_struct *_wq = (wq); \ @@ -714,5 +724,6 @@ int workqueue_offline_cpu(unsigned int cpu); void __init workqueue_init_early(void); void __init workqueue_init(void); +void __init workqueue_init_topology(void); #endif diff --git a/init/main.c b/init/main.c index ad920fac325c..436d73261810 100644 --- a/init/main.c +++ b/init/main.c @@ -1540,6 +1540,7 @@ static noinline void __init kernel_init_freeable(void) smp_init(); sched_init_smp(); + workqueue_init_topology(); padata_init(); page_alloc_init_late(); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 800b4208dba9..c85825e17df8 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -122,11 +122,6 @@ enum { * * L: pool->lock protected. Access with pool->lock held. * - * X: During normal operation, modification requires pool->lock and should - * be done only from local cpu. Either disabling preemption on local - * cpu or grabbing pool->lock is enough for read access. If - * POOL_DISASSOCIATED is set, it's identical to L. - * * K: Only modified by worker while holding pool->lock. Can be safely read by * self, while holding pool->lock or from IRQ context if %current is the * kworker. @@ -160,7 +155,7 @@ struct worker_pool { int cpu; /* I: the associated cpu */ int node; /* I: the associated node ID */ int id; /* I: pool ID */ - unsigned int flags; /* X: flags */ + unsigned int flags; /* L: flags */ unsigned long watchdog_ts; /* L: watchdog timestamp */ bool cpu_stall; /* WD: stalled cpu bound pool */ @@ -216,6 +211,7 @@ enum pool_workqueue_stats { PWQ_STAT_CPU_TIME, /* total CPU time consumed */ PWQ_STAT_CPU_INTENSIVE, /* wq_cpu_intensive_thresh_us violations */ PWQ_STAT_CM_WAKEUP, /* concurrency-management worker wakeups */ + PWQ_STAT_REPATRIATED, /* unbound workers brought back into scope */ PWQ_STAT_MAYDAY, /* maydays to rescuer */ PWQ_STAT_RESCUED, /* linked work items executed by rescuer */ @@ -262,12 +258,12 @@ struct pool_workqueue { u64 stats[PWQ_NR_STATS]; /* - * Release of unbound pwq is punted to system_wq. See put_pwq() - * and pwq_unbound_release_workfn() for details. pool_workqueue - * itself is also RCU protected so that the first pwq can be - * determined without grabbing wq->mutex. + * Release of unbound pwq is punted to a kthread_worker. See put_pwq() + * and pwq_release_workfn() for details. pool_workqueue itself is also + * RCU protected so that the first pwq can be determined without + * grabbing wq->mutex. */ - struct work_struct unbound_release_work; + struct kthread_work release_work; struct rcu_head rcu; } __aligned(1 << WORK_STRUCT_FLAG_BITS); @@ -326,14 +322,33 @@ struct workqueue_struct { /* hot fields used during command issue, aligned to cacheline */ unsigned int flags ____cacheline_aligned; /* WQ: WQ_* flags */ - struct pool_workqueue __percpu *cpu_pwqs; /* I: per-cpu pwqs */ - struct pool_workqueue __rcu *numa_pwq_tbl[]; /* PWR: unbound pwqs indexed by node */ + struct pool_workqueue __percpu __rcu **cpu_pwq; /* I: per-cpu pwqs */ }; static struct kmem_cache *pwq_cache; -static cpumask_var_t *wq_numa_possible_cpumask; - /* possible CPUs of each node */ +/* + * Each pod type describes how CPUs should be grouped for unbound workqueues. 
+ * See the comment above workqueue_attrs->affn_scope. + */ +struct wq_pod_type { + int nr_pods; /* number of pods */ + cpumask_var_t *pod_cpus; /* pod -> cpus */ + int *pod_node; /* pod -> node */ + int *cpu_pod; /* cpu -> pod */ +}; + +static struct wq_pod_type wq_pod_types[WQ_AFFN_NR_TYPES]; +static enum wq_affn_scope wq_affn_dfl = WQ_AFFN_CACHE; + +static const char *wq_affn_names[WQ_AFFN_NR_TYPES] = { + [WQ_AFFN_DFL] = "default", + [WQ_AFFN_CPU] = "cpu", + [WQ_AFFN_SMT] = "smt", + [WQ_AFFN_CACHE] = "cache", + [WQ_AFFN_NUMA] = "numa", + [WQ_AFFN_SYSTEM] = "system", +}; /* * Per-cpu work items which run for longer than the following threshold are @@ -345,19 +360,14 @@ static cpumask_var_t *wq_numa_possible_cpumask; static unsigned long wq_cpu_intensive_thresh_us = ULONG_MAX; module_param_named(cpu_intensive_thresh_us, wq_cpu_intensive_thresh_us, ulong, 0644); -static bool wq_disable_numa; -module_param_named(disable_numa, wq_disable_numa, bool, 0444); - /* see the comment above the definition of WQ_POWER_EFFICIENT */ static bool wq_power_efficient = IS_ENABLED(CONFIG_WQ_POWER_EFFICIENT_DEFAULT); module_param_named(power_efficient, wq_power_efficient, bool, 0444); static bool wq_online; /* can kworkers be created yet? */ -static bool wq_numa_enabled; /* unbound NUMA affinity enabled */ - -/* buf for wq_update_unbound_numa_attrs(), protected by CPU hotplug exclusion */ -static struct workqueue_attrs *wq_update_unbound_numa_attrs_buf; +/* buf for wq_update_unbound_pod_attrs(), protected by CPU hotplug exclusion */ +static struct workqueue_attrs *wq_update_pod_attrs_buf; static DEFINE_MUTEX(wq_pool_mutex); /* protects pools and workqueues list */ static DEFINE_MUTEX(wq_pool_attach_mutex); /* protects worker attach/detach */ @@ -371,6 +381,9 @@ static bool workqueue_freezing; /* PL: have wqs started freezing? */ /* PL&A: allowable cpus for unbound wqs and work items */ static cpumask_var_t wq_unbound_cpumask; +/* for further constrain wq_unbound_cpumask by cmdline parameter*/ +static struct cpumask wq_cmdline_cpumask __initdata; + /* CPU where unbound work was last round robin scheduled from this CPU */ static DEFINE_PER_CPU(int, wq_rr_cpu_last); @@ -400,6 +413,13 @@ static struct workqueue_attrs *unbound_std_wq_attrs[NR_STD_WORKER_POOLS]; /* I: attributes used when instantiating ordered pools on demand */ static struct workqueue_attrs *ordered_wq_attrs[NR_STD_WORKER_POOLS]; +/* + * I: kthread_worker to release pwq's. pwq release needs to be bounced to a + * process context while holding a pool lock. Bounce to a dedicated kthread + * worker to avoid A-A deadlocks. + */ +static struct kthread_worker *pwq_release_worker; + struct workqueue_struct *system_wq __read_mostly; EXPORT_SYMBOL(system_wq); struct workqueue_struct *system_highpri_wq __read_mostly; @@ -606,35 +626,6 @@ static int worker_pool_assign_id(struct worker_pool *pool) return ret; } -/** - * unbound_pwq_by_node - return the unbound pool_workqueue for the given node - * @wq: the target workqueue - * @node: the node ID - * - * This must be called with any of wq_pool_mutex, wq->mutex or RCU - * read locked. - * If the pwq needs to be used beyond the locking in effect, the caller is - * responsible for guaranteeing that the pwq stays online. - * - * Return: The unbound pool_workqueue for @node. - */ -static struct pool_workqueue *unbound_pwq_by_node(struct workqueue_struct *wq, - int node) -{ - assert_rcu_or_wq_mutex_or_pool_mutex(wq); - - /* - * XXX: @node can be NUMA_NO_NODE if CPU goes offline while a - * delayed item is pending. 
The plan is to keep CPU -> NODE - * mapping valid and stable across CPU on/offlines. Once that - * happens, this workaround can be removed. - */ - if (unlikely(node == NUMA_NO_NODE)) - return wq->dfl_pwq; - - return rcu_dereference_raw(wq->numa_pwq_tbl[node]); -} - static unsigned int work_color_to_flags(int color) { return color << WORK_STRUCT_COLOR_SHIFT; @@ -825,11 +816,6 @@ static bool work_is_canceling(struct work_struct *work) * they're being called with pool->lock held. */ -static bool __need_more_worker(struct worker_pool *pool) -{ - return !pool->nr_running; -} - /* * Need to wake up a worker? Called from anything but currently * running workers. @@ -840,7 +826,7 @@ static bool __need_more_worker(struct worker_pool *pool) */ static bool need_more_worker(struct worker_pool *pool) { - return !list_empty(&pool->worklist) && __need_more_worker(pool); + return !list_empty(&pool->worklist) && !pool->nr_running; } /* Can I start working? Called from busy but !running workers. */ @@ -871,51 +857,18 @@ static bool too_many_workers(struct worker_pool *pool) return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy; } -/* - * Wake up functions. - */ - -/* Return the first idle worker. Called with pool->lock held. */ -static struct worker *first_idle_worker(struct worker_pool *pool) -{ - if (unlikely(list_empty(&pool->idle_list))) - return NULL; - - return list_first_entry(&pool->idle_list, struct worker, entry); -} - -/** - * wake_up_worker - wake up an idle worker - * @pool: worker pool to wake worker from - * - * Wake up the first idle worker of @pool. - * - * CONTEXT: - * raw_spin_lock_irq(pool->lock). - */ -static void wake_up_worker(struct worker_pool *pool) -{ - struct worker *worker = first_idle_worker(pool); - - if (likely(worker)) - wake_up_process(worker->task); -} - /** * worker_set_flags - set worker flags and adjust nr_running accordingly * @worker: self * @flags: flags to set * * Set @flags in @worker->flags and adjust nr_running accordingly. - * - * CONTEXT: - * raw_spin_lock_irq(pool->lock) */ static inline void worker_set_flags(struct worker *worker, unsigned int flags) { struct worker_pool *pool = worker->pool; - WARN_ON_ONCE(worker->task != current); + lockdep_assert_held(&pool->lock); /* If transitioning into NOT_RUNNING, adjust nr_running. */ if ((flags & WORKER_NOT_RUNNING) && @@ -932,16 +885,13 @@ static inline void worker_set_flags(struct worker *worker, unsigned int flags) * @flags: flags to clear * * Clear @flags in @worker->flags and adjust nr_running accordingly. - * - * CONTEXT: - * raw_spin_lock_irq(pool->lock) */ static inline void worker_clr_flags(struct worker *worker, unsigned int flags) { struct worker_pool *pool = worker->pool; unsigned int oflags = worker->flags; - WARN_ON_ONCE(worker->task != current); + lockdep_assert_held(&pool->lock); worker->flags &= ~flags; @@ -955,6 +905,244 @@ static inline void worker_clr_flags(struct worker *worker, unsigned int flags) pool->nr_running++; } +/* Return the first idle worker. Called with pool->lock held. */ +static struct worker *first_idle_worker(struct worker_pool *pool) +{ + if (unlikely(list_empty(&pool->idle_list))) + return NULL; + + return list_first_entry(&pool->idle_list, struct worker, entry); +} + +/** + * worker_enter_idle - enter idle state + * @worker: worker which is entering idle state + * + * @worker is entering idle state. Update stats and idle timer if + * necessary. + * + * LOCKING: + * raw_spin_lock_irq(pool->lock). 
+ */ +static void worker_enter_idle(struct worker *worker) +{ + struct |