-rw-r--r--  Documentation/admin-guide/kernel-parameters.txt    28
-rw-r--r--  Documentation/core-api/workqueue.rst               356
-rw-r--r--  include/linux/workqueue.h                          115
-rw-r--r--  init/main.c                                          1
-rw-r--r--  kernel/workqueue.c                                1616
-rw-r--r--  kernel/workqueue_internal.h                          2
-rw-r--r--  tools/workqueue/wq_dump.py                         177
-rw-r--r--  tools/workqueue/wq_monitor.py                       21
8 files changed, 1512 insertions, 804 deletions
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index b2b128034225..0a1731a0f0ef 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -7076,6 +7076,13 @@
disables both lockup detectors. Default is 10
seconds.
+ workqueue.unbound_cpus=
+ [KNL,SMP] Restrict unbound workqueues to the
+ specified CPUs.
+ Format: <cpu-list>
+ By default, all online CPUs are available for
+ unbound workqueues.
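+ For example, workqueue.unbound_cpus=0-3 restricts
+ unbound workqueues to CPUs 0-3.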
+
workqueue.watchdog_thresh=
If CONFIG_WQ_WATCHDOG is configured, workqueue can
warn stall conditions and dump internal state to
@@ -7097,15 +7104,6 @@
threshold repeatedly. They are likely good
candidates for using WQ_UNBOUND workqueues instead.
- workqueue.disable_numa
- By default, all work items queued to unbound
- workqueues are affine to the NUMA nodes they're
- issued on, which results in better behavior in
- general. If NUMA affinity needs to be disabled for
- whatever reason, this option can be used. Note
- that this also can be controlled per-workqueue for
- workqueues visible under /sys/bus/workqueue/.
-
workqueue.power_efficient
Per-cpu workqueues are generally preferred because
they show better performance thanks to cache
@@ -7121,6 +7119,18 @@
The default value of this parameter is determined by
the config option CONFIG_WQ_POWER_EFFICIENT_DEFAULT.
+ workqueue.default_affinity_scope=
+ Select the default affinity scope to use for unbound
+ workqueues. Can be one of "cpu", "smt", "cache",
+ "numa" and "system". Default is "cache". For more
+ information, see the Affinity Scopes section in
+ Documentation/core-api/workqueue.rst.
+
+ This can be changed after boot by writing to the
+ matching /sys/module/workqueue/parameters file. All
+ workqueues with the "default" affinity scope will be
+ updated accordingly.
+
workqueue.debug_force_rr_cpu
Workqueue used to implicitly guarantee that work
items queued without explicit CPU specified are put
diff --git a/Documentation/core-api/workqueue.rst b/Documentation/core-api/workqueue.rst
index a4c9b9d1905f..5d7b01aed1fe 100644
--- a/Documentation/core-api/workqueue.rst
+++ b/Documentation/core-api/workqueue.rst
@@ -1,6 +1,6 @@
-====================================
-Concurrency Managed Workqueue (cmwq)
-====================================
+=========
+Workqueue
+=========
:Date: September, 2010
:Author: Tejun Heo <tj@kernel.org>
@@ -25,8 +25,8 @@ there is no work item left on the workqueue the worker becomes idle.
When a new work item gets queued, the worker begins executing again.
-Why cmwq?
-=========
+Why Concurrency Managed Workqueue?
+==================================
In the original wq implementation, a multi threaded (MT) wq had one
worker thread per CPU and a single threaded (ST) wq had one worker
@@ -220,17 +220,16 @@ resources, scheduled and executed.
``max_active``
--------------
-``@max_active`` determines the maximum number of execution contexts
-per CPU which can be assigned to the work items of a wq. For example,
-with ``@max_active`` of 16, at most 16 work items of the wq can be
-executing at the same time per CPU.
+``@max_active`` determines the maximum number of execution contexts per
+CPU which can be assigned to the work items of a wq. For example, with
+``@max_active`` of 16, at most 16 work items of the wq can be executing
+at the same time per CPU. This is always a per-CPU attribute, even for
+unbound workqueues.
-Currently, for a bound wq, the maximum limit for ``@max_active`` is
-512 and the default value used when 0 is specified is 256. For an
-unbound wq, the limit is higher of 512 and 4 *
-``num_possible_cpus()``. These values are chosen sufficiently high
-such that they are not the limiting factor while providing protection
-in runaway cases.
+The maximum limit for ``@max_active`` is 512 and the default value used
+when 0 is specified is 256. These values are chosen sufficiently high
+such that they are not the limiting factor while providing protection in
+runaway cases.
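+
+For example, a workqueue created as below (the name "my_wq" is arbitrary) may
+have up to 16 of its work items executing concurrently on each CPU::
+
+  struct workqueue_struct *wq;
+
+  /* 16 execution contexts per CPU; passing 0 would use the default of 256 */
+  wq = alloc_workqueue("my_wq", WQ_UNBOUND, 16);
+  if (!wq)
+          return -ENOMEM;
+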
The number of active work items of a wq is usually regulated by the
users of the wq, more specifically, by how many work items the users
@@ -348,27 +347,346 @@ Guidelines
level of locality in wq operations and work item execution.
+Affinity Scopes
+===============
+
+An unbound workqueue groups CPUs according to its affinity scope to improve
+cache locality. For example, if a workqueue is using the default affinity
+scope of "cache", it will group CPUs according to last level cache
+boundaries. A work item queued on the workqueue will be assigned to a worker
+on one of the CPUs which share the last level cache with the issuing CPU.
+Once started, the worker may or may not be allowed to move outside the scope
+depending on the ``affinity_strict`` setting of the scope.
+
+Workqueue currently supports the following affinity scopes.
+
+``default``
+ Use the scope in module parameter ``workqueue.default_affinity_scope``
+ which is always set to one of the scopes below.
+
+``cpu``
+ CPUs are not grouped. A work item issued on one CPU is processed by a
+ worker on the same CPU. This makes unbound workqueues behave as per-cpu
+ workqueues without concurrency management.
+
+``smt``
+ CPUs are grouped according to SMT boundaries. This usually means that the
+ logical threads of each physical CPU core are grouped together.
+
+``cache``
+ CPUs are grouped according to cache boundaries. Which specific cache
+ boundary is used is determined by the arch code. L3 is used in a lot of
+ cases. This is the default affinity scope.
+
+``numa``
+ CPUs are grouped according to NUMA boundaries.
+
+``system``
+ All CPUs are put in the same group. Workqueue makes no effort to process a
+ work item on a CPU close to the issuing CPU.
+
+The default affinity scope can be changed with the module parameter
+``workqueue.default_affinity_scope`` and a specific workqueue's affinity
+scope can be changed using ``apply_workqueue_attrs()``.
+
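+A minimal in-kernel sketch of the latter, assuming ``wq`` is an existing
+unbound workqueue and abbreviating error handling::
+
+  struct workqueue_attrs *attrs = alloc_workqueue_attrs();
+
+  if (attrs) {
+          attrs->affn_scope = WQ_AFFN_NUMA;   /* pick an affinity scope */
+          attrs->affn_strict = true;          /* optionally make it strict */
+
+          /* apply_workqueue_attrs() expects CPU hotplug read exclusion */
+          cpus_read_lock();
+          if (apply_workqueue_attrs(wq, attrs))
+                  pr_warn("failed to update workqueue attributes\n");
+          cpus_read_unlock();
+
+          free_workqueue_attrs(attrs);
+  }
+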
+If ``WQ_SYSFS`` is set, the workqueue will have the following affinity scope
+related interface files under its ``/sys/devices/virtual/WQ_NAME/``
+directory.
+
+``affinity_scope``
+ Read to see the current affinity scope. Write to change.
+
+ When default is the current scope, reading this file will also show the
+ current effective scope in parentheses, for example, ``default (cache)``.
+
+``affinity_strict``
+ 0 by default indicating that affinity scopes are not strict. When a work
+ item starts execution, workqueue makes a best-effort attempt to ensure
+ that the worker is inside its affinity scope, which is called
+ repatriation. Once started, the scheduler is free to move the worker
+ anywhere in the system as it sees fit. This enables benefiting from scope
+ locality while still being able to utilize other CPUs if necessary and
+ available.
+
+ If set to 1, all workers of the scope are guaranteed always to be in the
+ scope. This may be useful when crossing affinity scopes has other
+ implications, for example, in terms of power consumption or workload
+ isolation. Strict NUMA scope can also be used to match the workqueue
+ behavior of older kernels.
+
+
+Affinity Scopes and Performance
+===============================
+
+It would be ideal if an unbound workqueue's behavior were optimal for the
+vast majority of use cases without further tuning. Unfortunately, in the
+current kernel, there is a pronounced trade-off between locality and
+utilization, necessitating explicit configuration when workqueues are
+heavily used.
+
+Higher locality leads to higher efficiency where more work is performed for
+the same number of consumed CPU cycles. However, higher locality may also
+cause lower overall system utilization if the work items are not spread
+enough across the affinity scopes by the issuers. The following performance
+testing with dm-crypt clearly illustrates this trade-off.
+
+The tests are run on a CPU with 12 cores / 24 threads split across four L3
+caches (AMD Ryzen 9 3900x). CPU clock boost is turned off for consistency.
+``/dev/dm-0`` is a dm-crypt device created on an NVMe SSD (Samsung 990 PRO)
+and opened with ``cryptsetup`` with default settings.
+
+
+Scenario 1: Enough issuers and work spread across the machine
+-------------------------------------------------------------
+
+The command used: ::
+
+ $ fio --filename=/dev/dm-0 --direct=1 --rw=randrw --bs=32k --ioengine=libaio \
+ --iodepth=64 --runtime=60 --numjobs=24 --time_based --group_reporting \
+ --name=iops-test-job --verify=sha512
+
+There are 24 issuers, each issuing 64 IOs concurrently. ``--verify=sha512``
+makes ``fio`` generate and read back the content each time, which makes
+execution locality matter between the issuer and ``kcryptd``. The following
+are the read bandwidths and CPU utilizations for different affinity scope
+settings on ``kcryptd``, measured over five runs. Bandwidths are in MiBps,
+and CPU utilization is in percent.
+
+.. list-table::
+ :widths: 16 20 20
+ :header-rows: 1
+
+ * - Affinity
+ - Bandwidth (MiBps)
+ - CPU util (%)
+
+ * - system
+ - 1159.40 ±1.34
+ - 99.31 ±0.02
+
+ * - cache
+ - 1166.40 ±0.89
+ - 99.34 ±0.01
+
+ * - cache (strict)
+ - 1166.00 ±0.71
+ - 99.35 ±0.01
+
+With enough issuers spread across the system, there is no downside to
+"cache", strict or otherwise. All three configurations saturate the whole
+machine but the cache-affine ones outperform by 0.6% thanks to improved
+locality.
+
+
+Scenario 2: Fewer issuers, enough work for saturation
+-----------------------------------------------------
+
+The command used: ::
+
+ $ fio --filename=/dev/dm-0 --direct=1 --rw=randrw --bs=32k \
+ --ioengine=libaio --iodepth=64 --runtime=60 --numjobs=8 \
+ --time_based --group_reporting --name=iops-test-job --verify=sha512
+
+The only difference from the previous scenario is ``--numjobs=8``. There are
+a third as many issuers, but there is still enough total work to saturate
+the system.
+
+.. list-table::
+ :widths: 16 20 20
+ :header-rows: 1
+
+ * - Affinity
+ - Bandwidth (MiBps)
+ - CPU util (%)
+
+ * - system
+ - 1155.40 ±0.89
+ - 97.41 ±0.05
+
+ * - cache
+ - 1154.40 ±1.14
+ - 96.15 ±0.09
+
+ * - cache (strict)
+ - 1112.00 ±4.64
+ - 93.26 ±0.35
+
+This is more than enough work to saturate the system. Both "system" and
+"cache" are nearly saturating the machine but not fully. "cache" is using
+less CPU but the better efficiency puts it at the same bandwidth as
+"system".
+
+Eight issuers moving around across the four L3 cache scopes still allow
+"cache (strict)" to mostly saturate the machine, but the loss of work
+conservation is now starting to hurt, with a 3.7% bandwidth loss.
+
+
+Scenario 3: Even fewer issuers, not enough work to saturate
+-----------------------------------------------------------
+
+The command used: ::
+
+ $ fio --filename=/dev/dm-0 --direct=1 --rw=randrw --bs=32k \
+ --ioengine=libaio --iodepth=64 --runtime=60 --numjobs=4 \
+ --time_based --group_reporting --name=iops-test-job --verify=sha512
+
+Again, the only difference is ``--numjobs=4``. With the number of issuers
+reduced to four, there now isn't enough work to saturate the whole system
+and the bandwidth becomes dependent on completion latencies.
+
+.. list-table::
+ :widths: 16 20 20
+ :header-rows: 1
+
+ * - Affinity
+ - Bandwidth (MiBps)
+ - CPU util (%)
+
+ * - system
+ - 993.60 ±1.82
+ - 75.49 ±0.06
+
+ * - cache
+ - 973.40 ±1.52
+ - 74.90 ±0.07
+
+ * - cache (strict)
+ - 828.20 ±4.49
+ - 66.84 ±0.29
+
+Now, the trade-off between locality and utilization is clearer. "cache" shows
+a 2% bandwidth loss compared to "system", and "cache (strict)" a whopping 20%.
+
+
+Conclusion and Recommendations
+------------------------------
+
+In the above experiments, the efficiency advantage of the "cache" affinity
+scope over "system" is, while consistent and noticeable, small. However, the
+impact is dependent on the distances between the scopes and may be more
+pronounced in processors with more complex topologies.
+
+While the loss of work-conservation in certain scenarios hurts "cache", it is
+still a lot better off than "cache (strict)", and maximizing workqueue
+utilization is unlikely to be the common case anyway. As such, "cache" is the
+default affinity scope for unbound pools.
+
+* As there is no one option which is great for most cases, workqueue usages
+ that may consume a significant amount of CPU are recommended to configure
+ the workqueues using ``apply_workqueue_attrs()`` and/or enable
+ ``WQ_SYSFS``.
+
+* An unbound workqueue with strict "cpu" affinity scope behaves the same as a
+  ``WQ_CPU_INTENSIVE`` per-cpu workqueue. There is no real advantage to the
+  latter, and an unbound workqueue provides a lot more flexibility.
+
+* Affinity scopes were introduced in Linux v6.5. To emulate the previous
+  behavior, use strict "numa" affinity scope.
+
+* The loss of work-conservation in non-strict affinity scopes is likely
+ originating from the scheduler. There is no theoretical reason why the
+ kernel wouldn't be able to do the right thing and maintain
+ work-conservation in most cases. As such, it is possible that future
+ scheduler improvements may make most of these tunables unnecessary.
+
+
+Examining Configuration
+=======================
+
+Use tools/workqueue/wq_dump.py to examine unbound CPU affinity
+configuration, worker pools and how workqueues map to the pools: ::
+
+ $ tools/workqueue/wq_dump.py
+ Affinity Scopes
+ ===============
+ wq_unbound_cpumask=0000000f
+
+ CPU
+ nr_pods 4
+ pod_cpus [0]=00000001 [1]=00000002 [2]=00000004 [3]=00000008
+ pod_node [0]=0 [1]=0 [2]=1 [3]=1
+ cpu_pod [0]=0 [1]=1 [2]=2 [3]=3
+
+ SMT
+ nr_pods 4
+ pod_cpus [0]=00000001 [1]=00000002 [2]=00000004 [3]=00000008
+ pod_node [0]=0 [1]=0 [2]=1 [3]=1
+ cpu_pod [0]=0 [1]=1 [2]=2 [3]=3
+
+ CACHE (default)
+ nr_pods 2
+ pod_cpus [0]=00000003 [1]=0000000c
+ pod_node [0]=0 [1]=1
+ cpu_pod [0]=0 [1]=0 [2]=1 [3]=1
+
+ NUMA
+ nr_pods 2
+ pod_cpus [0]=00000003 [1]=0000000c
+ pod_node [0]=0 [1]=1
+ cpu_pod [0]=0 [1]=0 [2]=1 [3]=1
+
+ SYSTEM
+ nr_pods 1
+ pod_cpus [0]=0000000f
+ pod_node [0]=-1
+ cpu_pod [0]=0 [1]=0 [2]=0 [3]=0
+
+ Worker Pools
+ ============
+ pool[00] ref= 1 nice= 0 idle/workers= 4/ 4 cpu= 0
+ pool[01] ref= 1 nice=-20 idle/workers= 2/ 2 cpu= 0
+ pool[02] ref= 1 nice= 0 idle/workers= 4/ 4 cpu= 1
+ pool[03] ref= 1 nice=-20 idle/workers= 2/ 2 cpu= 1
+ pool[04] ref= 1 nice= 0 idle/workers= 4/ 4 cpu= 2
+ pool[05] ref= 1 nice=-20 idle/workers= 2/ 2 cpu= 2
+ pool[06] ref= 1 nice= 0 idle/workers= 3/ 3 cpu= 3
+ pool[07] ref= 1 nice=-20 idle/workers= 2/ 2 cpu= 3
+ pool[08] ref=42 nice= 0 idle/workers= 6/ 6 cpus=0000000f
+ pool[09] ref=28 nice= 0 idle/workers= 3/ 3 cpus=00000003
+ pool[10] ref=28 nice= 0 idle/workers= 17/ 17 cpus=0000000c
+ pool[11] ref= 1 nice=-20 idle/workers= 1/ 1 cpus=0000000f
+ pool[12] ref= 2 nice=-20 idle/workers= 1/ 1 cpus=00000003
+ pool[13] ref= 2 nice=-20 idle/workers= 1/ 1 cpus=0000000c
+
+ Workqueue CPU -> pool
+ =====================
+ [ workqueue \ CPU 0 1 2 3 dfl]
+ events percpu 0 2 4 6
+ events_highpri percpu 1 3 5 7
+ events_long percpu 0 2 4 6
+ events_unbound unbound 9 9 10 10 8
+ events_freezable percpu 0 2 4 6
+ events_power_efficient percpu 0 2 4 6
+ events_freezable_power_ percpu 0 2 4 6
+ rcu_gp percpu 0 2 4 6
+ rcu_par_gp percpu 0 2 4 6
+ slub_flushwq percpu 0 2 4 6
+ netns ordered 8 8 8 8 8
+ ...
+
+See the command's help message for more info.
+
+
Monitoring
==========
Use tools/workqueue/wq_monitor.py to monitor workqueue operations: ::
$ tools/workqueue/wq_monitor.py events
- total infl CPUtime CPUhog CMwake mayday rescued
+ total infl CPUtime CPUhog CMW/RPR mayday rescued
events 18545 0 6.1 0 5 - -
events_highpri 8 0 0.0 0 0 - -
events_long 3 0 0.0 0 0 - -
- events_unbound 38306 0 0.1 - - - -
+ events_unbound 38306 0 0.1 - 7 - -
events_freezable 0 0 0.0 0 0 - -
events_power_efficient 29598 0 0.2 0 0 - -
events_freezable_power_ 10 0 0.0 0 0 - -
sock_diag_events 0 0 0.0 0 0 - -
- total infl CPUtime CPUhog CMwake mayday rescued
+ total infl CPUtime CPUhog CMW/RPR mayday rescued
events 18548 0 6.1 0 5 - -
events_highpri 8 0 0.0 0 0 - -
events_long 3 0 0.0 0 0 - -
- events_unbound 38322 0 0.1 - - - -
+ events_unbound 38322 0 0.1 - 7 - -
events_freezable 0 0 0.0 0 0 - -
events_power_efficient 29603 0 0.2 0 0 - -
events_freezable_power_ 10 0 0.0 0 0 - -
diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index 683efe29fa69..1c1d06804d45 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -125,6 +125,17 @@ struct rcu_work {
struct workqueue_struct *wq;
};
+enum wq_affn_scope {
+ WQ_AFFN_DFL, /* use system default */
+ WQ_AFFN_CPU, /* one pod per CPU */
+ WQ_AFFN_SMT, /* one pod per SMT */
+ WQ_AFFN_CACHE, /* one pod per LLC */
+ WQ_AFFN_NUMA, /* one pod per NUMA node */
+ WQ_AFFN_SYSTEM, /* one pod across the whole system */
+
+ WQ_AFFN_NR_TYPES,
+};
+
/**
* struct workqueue_attrs - A struct for workqueue attributes.
*
@@ -138,17 +149,58 @@ struct workqueue_attrs {
/**
* @cpumask: allowed CPUs
+ *
+ * Work items in this workqueue are affine to these CPUs and not allowed
+ * to execute on other CPUs. A pool serving a workqueue must have the
+ * same @cpumask.
*/
cpumask_var_t cpumask;
/**
- * @no_numa: disable NUMA affinity
+ * @__pod_cpumask: internal attribute used to create per-pod pools
+ *
+ * Internal use only.
+ *
+ * Per-pod unbound worker pools are used to improve locality. Always a
+ * subset of ->cpumask. A workqueue can be associated with multiple
+ * worker pools with disjoint @__pod_cpumask's. Whether the enforcement
+ * of a pool's @__pod_cpumask is strict depends on @affn_strict.
+ */
+ cpumask_var_t __pod_cpumask;
+
+ /**
+ * @affn_strict: affinity scope is strict
+ *
+ * If clear, workqueue will make a best-effort attempt at starting the
+ * worker inside @__pod_cpumask but the scheduler is free to migrate it
+ * outside.
*
- * Unlike other fields, ``no_numa`` isn't a property of a worker_pool. It
- * only modifies how :c:func:`apply_workqueue_attrs` select pools and thus
- * doesn't participate in pool hash calculations or equality comparisons.
+ * If set, workers are only allowed to run inside @__pod_cpumask.
+ */
+ bool affn_strict;
+
+ /*
+ * The fields below aren't properties of a worker_pool. They only modify
+ * how :c:func:`apply_workqueue_attrs` selects pools and thus don't
+ * participate in pool hash calculations or equality comparisons.
+ */
+
+ /**
+ * @affn_scope: unbound CPU affinity scope
+ *
+ * CPU pods are used to improve execution locality of unbound work
+ * items. There are multiple pod types, one for each wq_affn_scope, and
+ * every CPU in the system belongs to one pod in every pod type. CPUs
+ * that belong to the same pod share the worker pool. For example,
+ * selecting %WQ_AFFN_NUMA makes the workqueue use a separate worker
+ * pool for each NUMA node.
+ */
+ enum wq_affn_scope affn_scope;
+
+ /**
+ * @ordered: work items must be executed one by one in queueing order
*/
- bool no_numa;
+ bool ordered;
};
static inline struct delayed_work *to_delayed_work(struct work_struct *work)
@@ -343,14 +395,10 @@ enum {
__WQ_ORDERED_EXPLICIT = 1 << 19, /* internal: alloc_ordered_workqueue() */
WQ_MAX_ACTIVE = 512, /* I like 512, better ideas? */
- WQ_MAX_UNBOUND_PER_CPU = 4, /* 4 * #cpus for unbound wq */
+ WQ_UNBOUND_MAX_ACTIVE = WQ_MAX_ACTIVE,
WQ_DFL_ACTIVE = WQ_MAX_ACTIVE / 2,
};
-/* unbound wq's aren't per-cpu, scale max_active according to #cpus */
-#define WQ_UNBOUND_MAX_ACTIVE \
- max_t(int, WQ_MAX_ACTIVE, num_possible_cpus() * WQ_MAX_UNBOUND_PER_CPU)
-
/*
* System-wide workqueues which are always present.
*
@@ -391,7 +439,7 @@ extern struct workqueue_struct *system_freezable_power_efficient_wq;
* alloc_workqueue - allocate a workqueue
* @fmt: printf format for the name of the workqueue
* @flags: WQ_* flags
- * @max_active: max in-flight work items, 0 for default
+ * @max_active: max in-flight work items per CPU, 0 for default
* remaining args: args for @fmt
*
* Allocate a workqueue with the specified parameters. For detailed
@@ -569,6 +617,7 @@ static inline bool schedule_work(struct work_struct *work)
/*
* Detect attempt to flush system-wide workqueues at compile time when possible.
+ * Warn on attempts to flush system-wide workqueues at runtime.
*
* See https://lkml.kernel.org/r/49925af7-78a8-a3dd-bce6-cfc02e1a9236@I-love.SAKURA.ne.jp
* for reasons and steps for converting system-wide workqueues into local workqueues.
@@ -576,52 +625,13 @@ static inline bool schedule_work(struct work_struct *work)
extern void __warn_flushing_systemwide_wq(void)
__compiletime_warning("Please avoid flushing system-wide workqueues.");
-/**
- * flush_scheduled_work - ensure that any scheduled work has run to completion.
- *
- * Forces execution of the kernel-global workqueue and blocks until its
- * completion.
- *
- * It's very easy to get into trouble if you don't take great care.
- * Either of the following situations will lead to deadlock:
- *
- * One of the work items currently on the workqueue needs to acquire
- * a lock held by your code or its caller.
- *
- * Your code is running in the context of a work routine.
- *
- * They will be detected by lockdep when they occur, but the first might not
- * occur very often. It depends on what work items are on the workqueue and
- * what locks they need, which you have no control over.
- *
- * In most situations flushing the entire workqueue is overkill; you merely
- * need to know that a particular work item isn't queued and isn't running.
- * In such cases you should use cancel_delayed_work_sync() or
- * cancel_work_sync() instead.
- *
- * Please stop calling this function! A conversion to stop flushing system-wide
- * workqueues is in progress. This function will be removed after all in-tree
- * users stopped calling this function.
- */
-/*
- * The background of commit 771c035372a036f8 ("deprecate the
- * '__deprecated' attribute warnings entirely and for good") is that,
- * since Linus builds all modules between every single pull he does,
- * the standard kernel build needs to be _clean_ in order to be able to
- * notice when new problems happen. Therefore, don't emit warning while
- * there are in-tree users.
- */
+/* Please stop using this function; it will be removed in the near future. */
#define flush_scheduled_work() \
({ \
- if (0) \
- __warn_flushing_systemwide_wq(); \
+ __warn_flushing_systemwide_wq(); \
__flush_workqueue(system_wq); \
})
-/*
- * Although there is no longer in-tree caller, for now just emit warning
- * in order to give out-of-tree callers time to update.
- */
#define flush_workqueue(wq) \
({ \
struct workqueue_struct *_wq = (wq); \
@@ -714,5 +724,6 @@ int workqueue_offline_cpu(unsigned int cpu);
void __init workqueue_init_early(void);
void __init workqueue_init(void);
+void __init workqueue_init_topology(void);
#endif
diff --git a/init/main.c b/init/main.c
index ad920fac325c..436d73261810 100644
--- a/init/main.c
+++ b/init/main.c
@@ -1540,6 +1540,7 @@ static noinline void __init kernel_init_freeable(void)
smp_init();
sched_init_smp();
+ workqueue_init_topology();
padata_init();
page_alloc_init_late();
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 800b4208dba9..c85825e17df8 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -122,11 +122,6 @@ enum {
*
* L: pool->lock protected. Access with pool->lock held.
*
- * X: During normal operation, modification requires pool->lock and should
- * be done only from local cpu. Either disabling preemption on local
- * cpu or grabbing pool->lock is enough for read access. If
- * POOL_DISASSOCIATED is set, it's identical to L.
- *
* K: Only modified by worker while holding pool->lock. Can be safely read by
* self, while holding pool->lock or from IRQ context if %current is the
* kworker.
@@ -160,7 +155,7 @@ struct worker_pool {
int cpu; /* I: the associated cpu */
int node; /* I: the associated node ID */
int id; /* I: pool ID */
- unsigned int flags; /* X: flags */
+ unsigned int flags; /* L: flags */
unsigned long watchdog_ts; /* L: watchdog timestamp */
bool cpu_stall; /* WD: stalled cpu bound pool */
@@ -216,6 +211,7 @@ enum pool_workqueue_stats {
PWQ_STAT_CPU_TIME, /* total CPU time consumed */
PWQ_STAT_CPU_INTENSIVE, /* wq_cpu_intensive_thresh_us violations */
PWQ_STAT_CM_WAKEUP, /* concurrency-management worker wakeups */
+ PWQ_STAT_REPATRIATED, /* unbound workers brought back into scope */
PWQ_STAT_MAYDAY, /* maydays to rescuer */
PWQ_STAT_RESCUED, /* linked work items executed by rescuer */
@@ -262,12 +258,12 @@ struct pool_workqueue {
u64 stats[PWQ_NR_STATS];
/*
- * Release of unbound pwq is punted to system_wq. See put_pwq()
- * and pwq_unbound_release_workfn() for details. pool_workqueue
- * itself is also RCU protected so that the first pwq can be
- * determined without grabbing wq->mutex.
+ * Release of unbound pwq is punted to a kthread_worker. See put_pwq()
+ * and pwq_release_workfn() for details. pool_workqueue itself is also
+ * RCU protected so that the first pwq can be determined without
+ * grabbing wq->mutex.
*/
- struct work_struct unbound_release_work;
+ struct kthread_work release_work;
struct rcu_head rcu;
} __aligned(1 << WORK_STRUCT_FLAG_BITS);
@@ -326,14 +322,33 @@ struct workqueue_struct {
/* hot fields used during command issue, aligned to cacheline */
unsigned int flags ____cacheline_aligned; /* WQ: WQ_* flags */
- struct pool_workqueue __percpu *cpu_pwqs; /* I: per-cpu pwqs */
- struct pool_workqueue __rcu *numa_pwq_tbl[]; /* PWR: unbound pwqs indexed by node */
+ struct pool_workqueue __percpu __rcu **cpu_pwq; /* I: per-cpu pwqs */
};
static struct kmem_cache *pwq_cache;
-static cpumask_var_t *wq_numa_possible_cpumask;
- /* possible CPUs of each node */
+/*
+ * Each pod type describes how CPUs should be grouped for unbound workqueues.
+ * See the comment above workqueue_attrs->affn_scope.
+ */
+struct wq_pod_type {
+ int nr_pods; /* number of pods */
+ cpumask_var_t *pod_cpus; /* pod -> cpus */
+ int *pod_node; /* pod -> node */
+ int *cpu_pod; /* cpu -> pod */
+};
+
+static struct wq_pod_type wq_pod_types[WQ_AFFN_NR_TYPES];
+static enum wq_affn_scope wq_affn_dfl = WQ_AFFN_CACHE;
+
+static const char *wq_affn_names[WQ_AFFN_NR_TYPES] = {
+ [WQ_AFFN_DFL] = "default",
+ [WQ_AFFN_CPU] = "cpu",
+ [WQ_AFFN_SMT] = "smt",
+ [WQ_AFFN_CACHE] = "cache",
+ [WQ_AFFN_NUMA] = "numa",
+ [WQ_AFFN_SYSTEM] = "system",
+};
/*
* Per-cpu work items which run for longer than the following threshold are
@@ -345,19 +360,14 @@ static cpumask_var_t *wq_numa_possible_cpumask;
static unsigned long wq_cpu_intensive_thresh_us = ULONG_MAX;
module_param_named(cpu_intensive_thresh_us, wq_cpu_intensive_thresh_us, ulong, 0644);
-static bool wq_disable_numa;
-module_param_named(disable_numa, wq_disable_numa, bool, 0444);
-
/* see the comment above the definition of WQ_POWER_EFFICIENT */
static bool wq_power_efficient = IS_ENABLED(CONFIG_WQ_POWER_EFFICIENT_DEFAULT);
module_param_named(power_efficient, wq_power_efficient, bool, 0444);
static bool wq_online; /* can kworkers be created yet? */
-static bool wq_numa_enabled; /* unbound NUMA affinity enabled */
-
-/* buf for wq_update_unbound_numa_attrs(), protected by CPU hotplug exclusion */
-static struct workqueue_attrs *wq_update_unbound_numa_attrs_buf;
+/* buf for wq_update_unbound_pod_attrs(), protected by CPU hotplug exclusion */
+static struct workqueue_attrs *wq_update_pod_attrs_buf;
static DEFINE_MUTEX(wq_pool_mutex); /* protects pools and workqueues list */
static DEFINE_MUTEX(wq_pool_attach_mutex); /* protects worker attach/detach */
@@ -371,6 +381,9 @@ static bool workqueue_freezing; /* PL: have wqs started freezing? */
/* PL&A: allowable cpus for unbound wqs and work items */
static cpumask_var_t wq_unbound_cpumask;
+/* to further constrain wq_unbound_cpumask by the cmdline parameter */
+static struct cpumask wq_cmdline_cpumask __initdata;
+
/* CPU where unbound work was last round robin scheduled from this CPU */
static DEFINE_PER_CPU(int, wq_rr_cpu_last);
@@ -400,6 +413,13 @@ static struct workqueue_attrs *unbound_std_wq_attrs[NR_STD_WORKER_POOLS];
/* I: attributes used when instantiating ordered pools on demand */
static struct workqueue_attrs *ordered_wq_attrs[NR_STD_WORKER_POOLS];
+/*
+ * I: kthread_worker to release pwq's. pwq release needs to be bounced to a
+ * process context while holding a pool lock. Bounce to a dedicated kthread
+ * worker to avoid A-A deadlocks.
+ */
+static struct kthread_worker *pwq_release_worker;
+
struct workqueue_struct *system_wq __read_mostly;
EXPORT_SYMBOL(system_wq);
struct workqueue_struct *system_highpri_wq __read_mostly;
@@ -606,35 +626,6 @@ static int worker_pool_assign_id(struct worker_pool *pool)
return ret;
}
-/**
- * unbound_pwq_by_node - return the unbound pool_workqueue for the given node
- * @wq: the target workqueue
- * @node: the node ID
- *
- * This must be called with any of wq_pool_mutex, wq->mutex or RCU
- * read locked.
- * If the pwq needs to be used beyond the locking in effect, the caller is
- * responsible for guaranteeing that the pwq stays online.
- *
- * Return: The unbound pool_workqueue for @node.
- */
-static struct pool_workqueue *unbound_pwq_by_node(struct workqueue_struct *wq,
- int node)
-{
- assert_rcu_or_wq_mutex_or_pool_mutex(wq);
-
- /*
- * XXX: @node can be NUMA_NO_NODE if CPU goes offline while a
- * delayed item is pending. The plan is to keep CPU -> NODE
- * mapping valid and stable across CPU on/offlines. Once that
- * happens, this workaround can be removed.
- */
- if (unlikely(node == NUMA_NO_NODE))
- return wq->dfl_pwq;
-
- return rcu_dereference_raw(wq->numa_pwq_tbl[node]);
-}
-
static unsigned int work_color_to_flags(int color)
{
return color << WORK_STRUCT_COLOR_SHIFT;
@@ -825,11 +816,6 @@ static bool work_is_canceling(struct work_struct *work)
* they're being called with pool->lock held.
*/
-static bool __need_more_worker(struct worker_pool *pool)
-{
- return !pool->nr_running;
-}
-
/*
* Need to wake up a worker? Called from anything but currently
* running workers.
@@ -840,7 +826,7 @@ static bool __need_more_worker(struct worker_pool *pool)
*/
static bool need_more_worker(struct worker_pool *pool)
{
- return !list_empty(&pool->worklist) && __need_more_worker(pool);
+ return !list_empty(&pool->worklist) && !pool->nr_running;
}
/* Can I start working? Called from busy but !running workers. */
@@ -871,51 +857,18 @@ static bool too_many_workers(struct worker_pool *pool)
return nr_idle > 2 && (nr_idle - 2) * MAX_IDLE_WORKERS_RATIO >= nr_busy;
}
-/*
- * Wake up functions.
- */
-
-/* Return the first idle worker. Called with pool->lock held. */
-static struct worker *first_idle_worker(struct worker_pool *pool)
-{
- if (unlikely(list_empty(&pool->idle_list)))
- return NULL;
-
- return list_first_entry(&pool->idle_list, struct worker, entry);
-}
-
-/**
- * wake_up_worker - wake up an idle worker
- * @pool: worker pool to wake worker from
- *
- * Wake up the first idle worker of @pool.
- *
- * CONTEXT:
- * raw_spin_lock_irq(pool->lock).
- */
-static void wake_up_worker(struct worker_pool *pool)
-{
- struct worker *worker = first_idle_worker(pool);
-
- if (likely(worker))
- wake_up_process(worker->task);
-}
-
/**
* worker_set_flags - set worker flags and adjust nr_running accordingly
* @worker: self
* @flags: flags to set
*
* Set @flags in @worker->flags and adjust nr_running accordingly.
- *
- * CONTEXT:
- * raw_spin_lock_irq(pool->lock)
*/
static inline void worker_set_flags(struct worker *worker, unsigned int flags)
{
struct worker_pool *pool = worker->pool;
- WARN_ON_ONCE(worker->task != current);
+ lockdep_assert_held(&pool->lock);
/* If transitioning into NOT_RUNNING, adjust nr_running. */
if ((flags & WORKER_NOT_RUNNING) &&
@@ -932,16 +885,13 @@ static inline void worker_set_flags(struct worker *worker, unsigned int flags)
* @flags: flags to clear
*
* Clear @flags in @worker->flags and adjust nr_running accordingly.
- *
- * CONTEXT:
- * raw_spin_lock_irq(pool->lock)
*/
static inline void worker_clr_flags(struct worker *worker, unsigned int flags)
{
struct worker_pool *pool = worker->pool;
unsigned int oflags = worker->flags;
- WARN_ON_ONCE(worker->task != current);
+ lockdep_assert_held(&pool->lock);
worker->flags &= ~flags;
@@ -955,6 +905,244 @@ static inline void worker_clr_flags(struct worker *worker, unsigned int flags)
pool->nr_running++;
}
+/* Return the first idle worker. Called with pool->lock held. */
+static struct worker *first_idle_worker(struct worker_pool *pool)
+{
+ if (unlikely(list_empty(&pool->idle_list)))
+ return NULL;
+
+ return list_first_entry(&pool->idle_list, struct worker, entry);
+}
+
+/**
+ * worker_enter_idle - enter idle state
+ * @worker: worker which is entering idle state
+ *
+ * @worker is entering idle state. Update stats and idle timer if
+ * necessary.
+ *
+ * LOCKING:
+ * raw_spin_lock_irq(pool->lock).
+ */
+static void worker_enter_idle(struct worker *worker)
+{
+ struct