diff options
Diffstat (limited to 'kernel/workqueue.c')
-rw-r--r-- | kernel/workqueue.c | 1847 |
1 files changed, 1412 insertions, 435 deletions
diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 7b482a26d741..0066c8f6c154 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -29,6 +29,7 @@ #include <linux/kernel.h> #include <linux/sched.h> #include <linux/init.h> +#include <linux/interrupt.h> #include <linux/signal.h> #include <linux/completion.h> #include <linux/workqueue.h> @@ -53,10 +54,11 @@ #include <linux/nmi.h> #include <linux/kvm_para.h> #include <linux/delay.h> +#include <linux/irq_work.h> #include "workqueue_internal.h" -enum { +enum worker_pool_flags { /* * worker_pool flags * @@ -72,10 +74,17 @@ enum { * Note that DISASSOCIATED should be flipped only while holding * wq_pool_attach_mutex to avoid changing binding state while * worker_attach_to_pool() is in progress. + * + * As there can only be one concurrent BH execution context per CPU, a + * BH pool is per-CPU and always DISASSOCIATED. */ - POOL_MANAGER_ACTIVE = 1 << 0, /* being managed */ + POOL_BH = 1 << 0, /* is a BH pool */ + POOL_MANAGER_ACTIVE = 1 << 1, /* being managed */ POOL_DISASSOCIATED = 1 << 2, /* cpu can't serve workers */ + POOL_BH_DRAINING = 1 << 3, /* draining after CPU offline */ +}; +enum worker_flags { /* worker flags */ WORKER_DIE = 1 << 1, /* die die die */ WORKER_IDLE = 1 << 2, /* is idle */ @@ -86,7 +95,13 @@ enum { WORKER_NOT_RUNNING = WORKER_PREP | WORKER_CPU_INTENSIVE | WORKER_UNBOUND | WORKER_REBOUND, +}; +enum work_cancel_flags { + WORK_CANCEL_DELAYED = 1 << 0, /* canceling a delayed_work */ +}; + +enum wq_internal_consts { NR_STD_WORKER_POOLS = 2, /* # standard pools per cpu */ UNBOUND_POOL_HASH_ORDER = 6, /* hashed by pool->attrs */ @@ -108,10 +123,18 @@ enum { RESCUER_NICE_LEVEL = MIN_NICE, HIGHPRI_NICE_LEVEL = MIN_NICE, - WQ_NAME_LEN = 24, + WQ_NAME_LEN = 32, }; /* + * We don't want to trap softirq for too long. See MAX_SOFTIRQ_TIME and + * MAX_SOFTIRQ_RESTART in kernel/softirq.c. These are macros because + * msecs_to_jiffies() can't be an initializer. + */ +#define BH_WORKER_JIFFIES msecs_to_jiffies(2) +#define BH_WORKER_RESTARTS 10 + +/* * Structure fields follow one of the following exclusion rules. * * I: Modifiable by initialization/destruction paths and read-only for @@ -122,6 +145,9 @@ enum { * * L: pool->lock protected. Access with pool->lock held. * + * LN: pool->lock and wq_node_nr_active->lock protected for writes. Either for + * reads. + * * K: Only modified by worker while holding pool->lock. Can be safely read by * self, while holding pool->lock or from IRQ context if %current is the * kworker. @@ -143,6 +169,9 @@ enum { * * WR: wq->mutex protected for writes. RCU protected for reads. * + * WO: wq->mutex protected for writes. Updated with WRITE_ONCE() and can be read + * with READ_ONCE() without locking. + * * MD: wq_mayday_lock protected. * * WD: Used internally by the watchdog. @@ -219,7 +248,7 @@ enum pool_workqueue_stats { }; /* - * The per-pool workqueue. While queued, the lower WORK_STRUCT_FLAG_BITS + * The per-pool workqueue. While queued, bits below WORK_PWQ_SHIFT * of work_struct->data are used for flags and the remaining high bits * point to the pwq; thus, pwqs need to be aligned at two's power of the * number of flag bits. @@ -232,6 +261,7 @@ struct pool_workqueue { int refcnt; /* L: reference count */ int nr_in_flight[WORK_NR_COLORS]; /* L: nr of in_flight works */ + bool plugged; /* L: execution suspended */ /* * nr_active management and WORK_STRUCT_INACTIVE: @@ -240,18 +270,18 @@ struct pool_workqueue { * pwq->inactive_works instead of pool->worklist and marked with * WORK_STRUCT_INACTIVE. * - * All work items marked with WORK_STRUCT_INACTIVE do not participate - * in pwq->nr_active and all work items in pwq->inactive_works are - * marked with WORK_STRUCT_INACTIVE. But not all WORK_STRUCT_INACTIVE - * work items are in pwq->inactive_works. Some of them are ready to - * run in pool->worklist or worker->scheduled. Those work itmes are - * only struct wq_barrier which is used for flush_work() and should - * not participate in pwq->nr_active. For non-barrier work item, it - * is marked with WORK_STRUCT_INACTIVE iff it is in pwq->inactive_works. + * All work items marked with WORK_STRUCT_INACTIVE do not participate in + * nr_active and all work items in pwq->inactive_works are marked with + * WORK_STRUCT_INACTIVE. But not all WORK_STRUCT_INACTIVE work items are + * in pwq->inactive_works. Some of them are ready to run in + * pool->worklist or worker->scheduled. Those work itmes are only struct + * wq_barrier which is used for flush_work() and should not participate + * in nr_active. For non-barrier work item, it is marked with + * WORK_STRUCT_INACTIVE iff it is in pwq->inactive_works. */ int nr_active; /* L: nr of active works */ - int max_active; /* L: max active works */ struct list_head inactive_works; /* L: inactive works */ + struct list_head pending_node; /* LN: node on wq_node_nr_active->pending_pwqs */ struct list_head pwqs_node; /* WR: node on wq->pwqs */ struct list_head mayday_node; /* MD: node on wq->maydays */ @@ -265,7 +295,7 @@ struct pool_workqueue { */ struct kthread_work release_work; struct rcu_head rcu; -} __aligned(1 << WORK_STRUCT_FLAG_BITS); +} __aligned(1 << WORK_STRUCT_PWQ_SHIFT); /* * Structure used to wait for workqueue flush. @@ -279,6 +309,26 @@ struct wq_flusher { struct wq_device; /* + * Unlike in a per-cpu workqueue where max_active limits its concurrency level + * on each CPU, in an unbound workqueue, max_active applies to the whole system. + * As sharing a single nr_active across multiple sockets can be very expensive, + * the counting and enforcement is per NUMA node. + * + * The following struct is used to enforce per-node max_active. When a pwq wants + * to start executing a work item, it should increment ->nr using + * tryinc_node_nr_active(). If acquisition fails due to ->nr already being over + * ->max, the pwq is queued on ->pending_pwqs. As in-flight work items finish + * and decrement ->nr, node_activate_pending_pwq() activates the pending pwqs in + * round-robin order. + */ +struct wq_node_nr_active { + int max; /* per-node max_active */ + atomic_t nr; /* per-node nr_active */ + raw_spinlock_t lock; /* nests inside pool locks */ + struct list_head pending_pwqs; /* LN: pwqs with inactive works */ +}; + +/* * The externally visible workqueue. It relays the issued work items to * the appropriate worker_pool through its pool_workqueues. */ @@ -298,10 +348,15 @@ struct workqueue_struct { struct worker *rescuer; /* MD: rescue worker */ int nr_drainers; /* WQ: drain in progress */ - int saved_max_active; /* WQ: saved pwq max_active */ + + /* See alloc_workqueue() function comment for info on min/max_active */ + int max_active; /* WO: max active works */ + int min_active; /* WO: min active works */ + int saved_max_active; /* WQ: saved max_active */ + int saved_min_active; /* WQ: saved min_active */ struct workqueue_attrs *unbound_attrs; /* PW: only for unbound wqs */ - struct pool_workqueue *dfl_pwq; /* PW: only for unbound wqs */ + struct pool_workqueue __rcu *dfl_pwq; /* PW: only for unbound wqs */ #ifdef CONFIG_SYSFS struct wq_device *wq_dev; /* I: for sysfs interface */ @@ -323,10 +378,9 @@ struct workqueue_struct { /* hot fields used during command issue, aligned to cacheline */ unsigned int flags ____cacheline_aligned; /* WQ: WQ_* flags */ struct pool_workqueue __percpu __rcu **cpu_pwq; /* I: per-cpu pwqs */ + struct wq_node_nr_active *node_nr_active[]; /* I: per-node nr_active */ }; -static struct kmem_cache *pwq_cache; - /* * Each pod type describes how CPUs should be grouped for unbound workqueues. * See the comment above workqueue_attrs->affn_scope. @@ -338,16 +392,13 @@ struct wq_pod_type { int *cpu_pod; /* cpu -> pod */ }; -static struct wq_pod_type wq_pod_types[WQ_AFFN_NR_TYPES]; -static enum wq_affn_scope wq_affn_dfl = WQ_AFFN_CACHE; - static const char *wq_affn_names[WQ_AFFN_NR_TYPES] = { - [WQ_AFFN_DFL] = "default", - [WQ_AFFN_CPU] = "cpu", - [WQ_AFFN_SMT] = "smt", - [WQ_AFFN_CACHE] = "cache", - [WQ_AFFN_NUMA] = "numa", - [WQ_AFFN_SYSTEM] = "system", + [WQ_AFFN_DFL] = "default", + [WQ_AFFN_CPU] = "cpu", + [WQ_AFFN_SMT] = "smt", + [WQ_AFFN_CACHE] = "cache", + [WQ_AFFN_NUMA] = "numa", + [WQ_AFFN_SYSTEM] = "system", }; /* @@ -359,12 +410,22 @@ static const char *wq_affn_names[WQ_AFFN_NR_TYPES] = { */ static unsigned long wq_cpu_intensive_thresh_us = ULONG_MAX; module_param_named(cpu_intensive_thresh_us, wq_cpu_intensive_thresh_us, ulong, 0644); +#ifdef CONFIG_WQ_CPU_INTENSIVE_REPORT +static unsigned int wq_cpu_intensive_warning_thresh = 4; +module_param_named(cpu_intensive_warning_thresh, wq_cpu_intensive_warning_thresh, uint, 0644); +#endif /* see the comment above the definition of WQ_POWER_EFFICIENT */ static bool wq_power_efficient = IS_ENABLED(CONFIG_WQ_POWER_EFFICIENT_DEFAULT); module_param_named(power_efficient, wq_power_efficient, bool, 0444); static bool wq_online; /* can kworkers be created yet? */ +static bool wq_topo_initialized __read_mostly = false; + +static struct kmem_cache *pwq_cache; + +static struct wq_pod_type wq_pod_types[WQ_AFFN_NR_TYPES]; +static enum wq_affn_scope wq_affn_dfl = WQ_AFFN_CACHE; /* buf for wq_update_unbound_pod_attrs(), protected by CPU hotplug exclusion */ static struct workqueue_attrs *wq_update_pod_attrs_buf; @@ -405,8 +466,17 @@ static bool wq_debug_force_rr_cpu = false; #endif module_param_named(debug_force_rr_cpu, wq_debug_force_rr_cpu, bool, 0644); +/* to raise softirq for the BH worker pools on other CPUs */ +static DEFINE_PER_CPU_SHARED_ALIGNED(struct irq_work [NR_STD_WORKER_POOLS], + bh_pool_irq_works); + +/* the BH worker pools */ +static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS], + bh_worker_pools); + /* the per-cpu worker pools */ -static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS], cpu_worker_pools); +static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS], + cpu_worker_pools); static DEFINE_IDR(worker_pool_idr); /* PR: idr of all pools */ @@ -420,6 +490,12 @@ static struct workqueue_attrs *unbound_std_wq_attrs[NR_STD_WORKER_POOLS]; static struct workqueue_attrs *ordered_wq_attrs[NR_STD_WORKER_POOLS]; /* + * Used to synchronize multiple cancel_sync attempts on the same work item. See + * work_grab_pending() and __cancel_work_sync(). + */ +static DECLARE_WAIT_QUEUE_HEAD(wq_cancel_waitq); + +/* * I: kthread_worker to release pwq's. pwq release needs to be bounced to a * process context while holding a pool lock. Bounce to a dedicated kthread * worker to avoid A-A deadlocks. @@ -440,6 +516,10 @@ struct workqueue_struct *system_power_efficient_wq __ro_after_init; EXPORT_SYMBOL_GPL(system_power_efficient_wq); struct workqueue_struct *system_freezable_power_efficient_wq __ro_after_init; EXPORT_SYMBOL_GPL(system_freezable_power_efficient_wq); +struct workqueue_struct *system_bh_wq; +EXPORT_SYMBOL_GPL(system_bh_wq); +struct workqueue_struct *system_bh_highpri_wq; +EXPORT_SYMBOL_GPL(system_bh_highpri_wq); static int worker_thread(void *__worker); static void workqueue_sysfs_unregister(struct workqueue_struct *wq); @@ -450,16 +530,21 @@ static void show_one_worker_pool(struct worker_pool *pool); #include <trace/events/workqueue.h> #define assert_rcu_or_pool_mutex() \ - RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \ + RCU_LOCKDEP_WARN(!rcu_read_lock_any_held() && \ !lockdep_is_held(&wq_pool_mutex), \ "RCU or wq_pool_mutex should be held") #define assert_rcu_or_wq_mutex_or_pool_mutex(wq) \ - RCU_LOCKDEP_WARN(!rcu_read_lock_held() && \ + RCU_LOCKDEP_WARN(!rcu_read_lock_any_held() && \ !lockdep_is_held(&wq->mutex) && \ !lockdep_is_held(&wq_pool_mutex), \ "RCU, wq->mutex or wq_pool_mutex should be held") +#define for_each_bh_worker_pool(pool, cpu) \ + for ((pool) = &per_cpu(bh_worker_pools, cpu)[0]; \ + (pool) < &per_cpu(bh_worker_pools, cpu)[NR_STD_WORKER_POOLS]; \ + (pool)++) + #define for_each_cpu_worker_pool(pool, cpu) \ for ((pool) = &per_cpu(cpu_worker_pools, cpu)[0]; \ (pool) < &per_cpu(cpu_worker_pools, cpu)[NR_STD_WORKER_POOLS]; \ @@ -632,6 +717,36 @@ static int worker_pool_assign_id(struct worker_pool *pool) return ret; } +static struct pool_workqueue __rcu ** +unbound_pwq_slot(struct workqueue_struct *wq, int cpu) +{ + if (cpu >= 0) + return per_cpu_ptr(wq->cpu_pwq, cpu); + else + return &wq->dfl_pwq; +} + +/* @cpu < 0 for dfl_pwq */ +static struct pool_workqueue *unbound_pwq(struct workqueue_struct *wq, int cpu) +{ + return rcu_dereference_check(*unbound_pwq_slot(wq, cpu), + lockdep_is_held(&wq_pool_mutex) || + lockdep_is_held(&wq->mutex)); +} + +/** + * unbound_effective_cpumask - effective cpumask of an unbound workqueue + * @wq: workqueue of interest + * + * @wq->unbound_attrs->cpumask contains the cpumask requested by the user which + * is masked with wq_unbound_cpumask to determine the effective cpumask. The + * default pwq is always mapped to the pool with the current effective cpumask. + */ +static struct cpumask *unbound_effective_cpumask(struct workqueue_struct *wq) +{ + return unbound_pwq(wq, -1)->pool->attrs->__pod_cpumask; +} + static unsigned int work_color_to_flags(int color) { return color << WORK_STRUCT_COLOR_SHIFT; @@ -653,10 +768,9 @@ static int work_next_color(int color) * contain the pointer to the queued pwq. Once execution starts, the flag * is cleared and the high bits contain OFFQ flags and pool ID. * - * set_work_pwq(), set_work_pool_and_clear_pending(), mark_work_canceling() - * and clear_work_data() can be used to set the pwq, pool or clear - * work->data. These functions should only be called while the work is - * owned - ie. while the PENDING bit is set. + * set_work_pwq(), set_work_pool_and_clear_pending() and mark_work_canceling() + * can be used to set the pwq, pool or clear work->data. These functions should + * only be called while the work is owned - ie. while the PENDING bit is set. * * get_work_pool() and get_work_pwq() can be used to obtain the pool or pwq * corresponding to a work. Pool is available once the work has been @@ -668,29 +782,28 @@ static int work_next_color(int color) * but stay off timer and worklist for arbitrarily long and nobody should * try to steal the PENDING bit. */ -static inline void set_work_data(struct work_struct *work, unsigned long data, - unsigned long flags) +static inline void set_work_data(struct work_struct *work, unsigned long data) { WARN_ON_ONCE(!work_pending(work)); - atomic_long_set(&work->data, data | flags | work_static(work)); + atomic_long_set(&work->data, data | work_static(work)); } static void set_work_pwq(struct work_struct *work, struct pool_workqueue *pwq, - unsigned long extra_flags) + unsigned long flags) { - set_work_data(work, (unsigned long)pwq, - WORK_STRUCT_PENDING | WORK_STRUCT_PWQ | extra_flags); + set_work_data(work, (unsigned long)pwq | WORK_STRUCT_PENDING | + WORK_STRUCT_PWQ | flags); } static void set_work_pool_and_keep_pending(struct work_struct *work, - int pool_id) + int pool_id, unsigned long flags) { - set_work_data(work, (unsigned long)pool_id << WORK_OFFQ_POOL_SHIFT, - WORK_STRUCT_PENDING); + set_work_data(work, ((unsigned long)pool_id << WORK_OFFQ_POOL_SHIFT) | + WORK_STRUCT_PENDING | flags); } static void set_work_pool_and_clear_pending(struct work_struct *work, - int pool_id) + int pool_id, unsigned long flags) { /* * The following wmb is paired with the implied mb in @@ -699,7 +812,8 @@ static void set_work_pool_and_clear_pending(struct work_struct *work, * owner. */ smp_wmb(); - set_work_data(work, (unsigned long)pool_id << WORK_OFFQ_POOL_SHIFT, 0); + set_work_data(work, ((unsigned long)pool_id << WORK_OFFQ_POOL_SHIFT) | + flags); /* * The following mb guarantees that previous clear of a PENDING bit * will not be reordered with any speculative LOADS or STORES from @@ -731,15 +845,9 @@ static void set_work_pool_and_clear_pending(struct work_struct *work, smp_mb(); } -static void clear_work_data(struct work_struct *work) -{ - smp_wmb(); /* see set_work_pool_and_clear_pending() */ - set_work_data(work, WORK_STRUCT_NO_POOL, 0); -} - static inline struct pool_workqueue *work_struct_pwq(unsigned long data) { - return (struct pool_workqueue *)(data & WORK_STRUCT_WQ_DATA_MASK); + return (struct pool_workqueue *)(data & WORK_STRUCT_PWQ_MASK); } static struct pool_workqueue *get_work_pwq(struct work_struct *work) @@ -806,7 +914,7 @@ static void mark_work_canceling(struct work_struct *work) unsigned long pool_id = get_work_pool_id(work); pool_id <<= WORK_OFFQ_POOL_SHIFT; - set_work_data(work, pool_id | WORK_OFFQ_CANCELING, WORK_STRUCT_PENDING); + set_work_data(work, pool_id | WORK_STRUCT_PENDING | WORK_OFFQ_CANCELING); } static bool work_is_canceling(struct work_struct *work) @@ -1101,6 +1209,29 @@ static bool assign_work(struct work_struct *work, struct worker *worker, return true; } +static struct irq_work *bh_pool_irq_work(struct worker_pool *pool) +{ + int high = pool->attrs->nice == HIGHPRI_NICE_LEVEL ? 1 : 0; + + return &per_cpu(bh_pool_irq_works, pool->cpu)[high]; +} + +static void kick_bh_pool(struct worker_pool *pool) +{ +#ifdef CONFIG_SMP + /* see drain_dead_softirq_workfn() for BH_DRAINING */ + if (unlikely(pool->cpu != smp_processor_id() && + !(pool->flags & POOL_BH_DRAINING))) { + irq_work_queue_on(bh_pool_irq_work(pool), pool->cpu); + return; + } +#endif + if (pool->attrs->nice == HIGHPRI_NICE_LEVEL) + raise_softirq_irqoff(HI_SOFTIRQ); + else + raise_softirq_irqoff(TASKLET_SOFTIRQ); +} + /** * kick_pool - wake up an idle worker if necessary * @pool: pool to kick @@ -1118,6 +1249,11 @@ static bool kick_pool(struct worker_pool *pool) if (!need_more_worker(pool) || !worker) return false; + if (pool->flags & POOL_BH) { + kick_bh_pool(pool); + return true; + } + p = worker->task; #ifdef CONFIG_SMP @@ -1198,11 +1334,13 @@ restart: u64 cnt; /* - * Start reporting from the fourth time and back off + * Start reporting from the warning_thresh and back off * exponentially. */ cnt = atomic64_inc_return_relaxed(&ent->cnt); - if (cnt >= 4 && is_power_of_2(cnt)) + if (wq_cpu_intensive_warning_thresh && + cnt >= wq_cpu_intensive_warning_thresh && + is_power_of_2(cnt + 1 - wq_cpu_intensive_warning_thresh)) printk_deferred(KERN_WARNING "workqueue: %ps hogged CPU for >%luus %llu times, consider switching to WQ_UNBOUND\n", ent->func, wq_cpu_intensive_thresh_us, atomic64_read(&ent->cnt)); @@ -1231,10 +1369,12 @@ restart: ent = &wci_ents[wci_nr_ents++]; ent->func = func; - atomic64_set(&ent->cnt, 1); + atomic64_set(&ent->cnt, 0); hash_add_rcu(wci_hash, &ent->hash_node, (unsigned long)func); raw_spin_unlock(&wci_lock); + + goto restart; } #else /* CONFIG_WQ_CPU_INTENSIVE_REPORT */ @@ -1402,6 +1542,74 @@ work_func_t wq_worker_last_func(struct task_struct *task) } /** + * wq_node_nr_active - Determine wq_node_nr_active to use + * @wq: workqueue of interest + * @node: NUMA node, can be %NUMA_NO_NODE + * + * Determine wq_node_nr_active to use for @wq on @node. Returns: + * + * - %NULL for per-cpu workqueues as they don't need to use shared nr_active. + * + * - node_nr_active[nr_node_ids] if @node is %NUMA_NO_NODE. + * + * - Otherwise, node_nr_active[@node]. + */ +static struct wq_node_nr_active *wq_node_nr_active(struct workqueue_struct *wq, + int node) +{ + if (!(wq->flags & WQ_UNBOUND)) + return NULL; + + if (node == NUMA_NO_NODE) + node = nr_node_ids; + + return wq->node_nr_active[node]; +} + +/** + * wq_update_node_max_active - Update per-node max_actives to use + * @wq: workqueue to update + * @off_cpu: CPU that's going down, -1 if a CPU is not going down + * + * Update @wq->node_nr_active[]->max. @wq must be unbound. max_active is + * distributed among nodes according to the proportions of numbers of online + * cpus. The result is always between @wq->min_active and max_active. + */ +static void wq_update_node_max_active(struct workqueue_struct *wq, int off_cpu) +{ + struct cpumask *effective = unbound_effective_cpumask(wq); + int min_active = READ_ONCE(wq->min_active); + int max_active = READ_ONCE(wq->max_active); + int total_cpus, node; + + lockdep_assert_held(&wq->mutex); + + if (!wq_topo_initialized) + return; + + if (off_cpu >= 0 && !cpumask_test_cpu(off_cpu, effective)) + off_cpu = -1; + + total_cpus = cpumask_weight_and(effective, cpu_online_mask); + if (off_cpu >= 0) + total_cpus--; + + for_each_node(node) { + int node_cpus; + + node_cpus = cpumask_weight_and(effective, cpumask_of_node(node)); + if (off_cpu >= 0 && cpu_to_node(off_cpu) == node) + node_cpus--; + + wq_node_nr_active(wq, node)->max = + clamp(DIV_ROUND_UP(max_active * node_cpus, total_cpus), + min_active, max_active); + } + + wq_node_nr_active(wq, NUMA_NO_NODE)->max = min_active; +} + +/** * get_pwq - get an extra reference on the specified pool_workqueue * @pwq: pool_workqueue to get * @@ -1453,24 +1661,336 @@ static void put_pwq_unlocked(struct pool_workqueue *pwq) } } -static void pwq_activate_inactive_work(struct work_struct *work) +static bool pwq_is_empty(struct pool_workqueue *pwq) { - struct pool_workqueue *pwq = get_work_pwq(work); + return !pwq->nr_active && list_empty(&pwq->inactive_works); +} +static void __pwq_activate_work(struct pool_workqueue *pwq, + struct work_struct *work) +{ + unsigned long *wdb = work_data_bits(work); + + WARN_ON_ONCE(!(*wdb & WORK_STRUCT_INACTIVE)); trace_workqueue_activate_work(work); if (list_empty(&pwq->pool->worklist)) pwq->pool->watchdog_ts = jiffies; move_linked_works(work, &pwq->pool->worklist, NULL); - __clear_bit(WORK_STRUCT_INACTIVE_BIT, work_data_bits(work)); + __clear_bit(WORK_STRUCT_INACTIVE_BIT, wdb); +} + +/** + * pwq_activate_work - Activate a work item if inactive + * @pwq: pool_workqueue @work belongs to + * @work: work item to activate + * + * Returns %true if activated. %false if already active. + */ +static bool pwq_activate_work(struct pool_workqueue *pwq, + struct work_struct *work) +{ + struct worker_pool *pool = pwq->pool; + struct wq_node_nr_active *nna; + + lockdep_assert_held(&pool->lock); + + if (!(*work_data_bits(work) & WORK_STRUCT_INACTIVE)) + return false; + + nna = wq_node_nr_active(pwq->wq, pool->node); + if (nna) + atomic_inc(&nna->nr); + pwq->nr_active++; + __pwq_activate_work(pwq, work); + return true; } -static void pwq_activate_first_inactive(struct pool_workqueue *pwq) +static bool tryinc_node_nr_active(struct wq_node_nr_active *nna) { - struct work_struct *work = list_first_entry(&pwq->inactive_works, - struct work_struct, entry); + int max = READ_ONCE(nna->max); - pwq_activate_inactive_work(work); + while (true) { + int old, tmp; + + old = atomic_read(&nna->nr); + if (old >= max) + return false; + tmp = atomic_cmpxchg_relaxed(&nna->nr, old, old + 1); + if (tmp == old) + return true; + } +} + +/** + * pwq_tryinc_nr_active - Try to increment nr_active for a pwq + * @pwq: pool_workqueue of interest + * @fill: max_active may have increased, try to increase concurrency level + * + * Try to increment nr_active for @pwq. Returns %true if an nr_active count is + * successfully obtained. %false otherwise. + */ +static bool pwq_tryinc_nr_active(struct pool_workqueue *pwq, bool fill) +{ + struct workqueue_struct *wq = pwq->wq; + struct worker_pool *pool = pwq->pool; + struct wq_node_nr_active *nna = wq_node_nr_active(wq, pool->node); + bool obtained = false; + + lockdep_assert_held(&pool->lock); + + if (!nna) { + /* BH or per-cpu workqueue, pwq->nr_active is sufficient */ + obtained = pwq->nr_active < READ_ONCE(wq->max_active); + goto out; + } + + if (unlikely(pwq->plugged)) + return false; + + /* + * Unbound workqueue uses per-node shared nr_active $nna. If @pwq is + * already waiting on $nna, pwq_dec_nr_active() will maintain the + * concurrency level. Don't jump the line. + * + * We need to ignore the pending test after max_active has increased as + * pwq_dec_nr_active() can only maintain the concurrency level but not + * increase it. This is indicated by @fill. + */ + if (!list_empty(&pwq->pending_node) && likely(!fill)) + goto out; + + obtained = tryinc_node_nr_active(nna); + if (obtained) + goto out; + + /* + * Lockless acquisition failed. Lock, add ourself to $nna->pending_pwqs + * and try again. The smp_mb() is paired with the implied memory barrier + * of atomic_dec_return() in pwq_dec_nr_active() to ensure that either + * we see the decremented $nna->nr or they see non-empty + * $nna->pending_pwqs. + */ + raw_spin_lock(&nna->lock); + + if (list_empty(&pwq->pending_node)) + list_add_tail(&pwq->pending_node, &nna->pending_pwqs); + else if (likely(!fill)) + goto out_unlock; + + smp_mb(); + + obtained = tryinc_node_nr_active(nna); + + /* + * If @fill, @pwq might have already been pending. Being spuriously + * pending in cold paths doesn't affect anything. Let's leave it be. + */ + if (obtained && likely(!fill)) + list_del_init(&pwq->pending_node); + +out_unlock: + raw_spin_unlock(&nna->lock); +out: + if (obtained) + pwq->nr_active++; + return obtained; +} + +/** + * pwq_activate_first_inactive - Activate the first inactive work item on a pwq + * @pwq: pool_workqueue of interest + * @fill: max_active may have increased, try to increase concurrency level + * + * Activate the first inactive work item of @pwq if available and allowed by + * max_active limit. + * + * Returns %true if an inactive work item has been activated. %false if no + * inactive work item is found or max_active limit is reached. + */ +static bool pwq_activate_first_inactive(struct pool_workqueue *pwq, bool fill) +{ + struct work_struct *work = + list_first_entry_or_null(&pwq->inactive_works, + struct work_struct, entry); + + if (work && pwq_tryinc_nr_active(pwq, fill)) { + __pwq_activate_work(pwq, work); + return true; + } else { + return false; + } +} + +/** + * unplug_oldest_pwq - unplug the oldest pool_workqueue + * @wq: workqueue_struct where its oldest pwq is to be unplugged + * + * This function should only be called for ordered workqueues where only the + * oldest pwq is unplugged, the others are plugged to suspend execution to + * ensure proper work item ordering:: + * + * dfl_pwq --------------+ [P] - plugged + * | + * v + * pwqs -> A -> B [P] -> C [P] (newest) + * | | | + * 1 3 5 + * | | | + * 2 4 6 + * + * When the oldest pwq is drained and removed, this function should be called + * to unplug the next oldest one to start its work item execution. Note that + * pwq's are linked into wq->pwqs with the oldest first, so the first one in + * the list is the oldest. + */ +static void unplug_oldest_pwq(struct workqueue_struct *wq) +{ + struct pool_workqueue *pwq; + + lockdep_assert_held(&wq->mutex); + + /* Caller should make sure that pwqs isn't empty before calling */ + pwq = list_first_entry_or_null(&wq->pwqs, struct pool_workqueue, + pwqs_node); + raw_spin_lock_irq(&pwq->pool->lock); + if (pwq->plugged) { + pwq->plugged = false; + if (pwq_activate_first_inactive(pwq, true)) + kick_pool(pwq->pool); + } + raw_spin_unlock_irq(&pwq->pool->lock); +} + +/** + * node_activate_pending_pwq - Activate a pending pwq on a wq_node_nr_active + * @nna: wq_node_nr_active to activate a pending pwq for + * @caller_pool: worker_pool the caller is locking + * + * Activate a pwq in @nna->pending_pwqs. Called with @caller_pool locked. + * @caller_pool may be unlocked and relocked to lock other worker_pools. + */ +static void node_activate_pending_pwq(struct wq_node_nr_active *nna, + struct worker_pool *caller_pool) +{ + struct worker_pool *locked_pool = caller_pool; + struct pool_workqueue *pwq; + struct work_struct *work; + + lockdep_assert_held(&caller_pool->lock); + + raw_spin_lock(&nna->lock); +retry: + pwq = list_first_entry_or_null(&nna->pending_pwqs, + struct pool_workqueue, pending_node); + if (!pwq) + goto out_unlock; + + /* + * If @pwq is for a different pool than @locked_pool, we need to lock + * @pwq->pool->lock. Let's trylock first. If unsuccessful, do the unlock + * / lock dance. For that, we also need to release @nna->lock as it's + * nested inside pool locks. + */ + if (pwq->pool != locked_pool) { + raw_spin_unlock(&locked_pool->lock); + locked_pool = pwq->pool; + if (!raw_spin_trylock(&locked_pool->lock)) { + raw_spin_unlock(&nna->lock); + raw_spin_lock(&locked_pool->lock); + raw_spin_lock(&nna->lock); + goto retry; + } + } + + /* + * $pwq may not have any inactive work items due to e.g. cancellations. + * Drop it from pending_pwqs and see if there's another one. + */ + work = list_first_entry_or_null(&pwq->inactive_works, + struct work_struct, entry); + if (!work) { + list_del_init(&pwq->pending_node); + goto retry; + } + + /* + * Acquire an nr_active count and activate the inactive work item. If + * $pwq still has inactive work items, rotate it to the end of the + * pending_pwqs so that we round-robin through them. This means that + * inactive work items are not activated in queueing order which is fine + * given that there has never been any ordering across different pwqs. + */ + if (likely(tryinc_node_nr_active(nna))) { + pwq->nr_active++; + __pwq_activate_work(pwq, work); + + if (list_empty(&pwq->inactive_works)) + list_del_init(&pwq->pending_node); + else + list_move_tail(&pwq->pending_node, &nna->pending_pwqs); + + /* if activating a foreign pool, make sure it's running */ + if (pwq->pool != caller_pool) + kick_pool(pwq->pool); + } + +out_unlock: + raw_spin_unlock(&nna->lock); + if (locked_pool != caller_pool) { + raw_spin_unlock(&locked_pool->lock); + raw_spin_lock(&caller_pool->lock); + } +} + +/** + * pwq_dec_nr_active - Retire an active count + * @pwq: pool_workqueue of interest + * + * Decrement @pwq's nr_active and try to activate the first inactive work item. + * For unbound workqueues, this function may temporarily drop @pwq->pool->lock. + */ +static void pwq_dec_nr_active(struct pool_workqueue *pwq) +{ + struct worker_pool *pool = pwq->pool; + struct wq_node_nr_active *nna = wq_node_nr_active(pwq->wq, pool->node); + + lockdep_assert_held(&pool->lock); + + /* + * @pwq->nr_active should be decremented for both percpu and unbound + * workqueues. + */ + pwq->nr_active--; + + /* + * For a percpu workqueue, it's simple. Just need to kick the first + * inactive work item on @pwq itself. + */ + if (!nna) { + pwq_activate_first_inactive(pwq, false); + return; + } + + /* + * If @pwq is for an unbound workqueue, it's more complicated because + * multiple pwqs and pools may be sharing the nr_active count. When a + * pwq needs to wait for an nr_active count, it puts itself on + * $nna->pending_pwqs. The following atomic_dec_return()'s implied + * memory barrier is paired with smp_mb() in pwq_tryinc_nr_active() to + * guarantee that either we see non-empty pending_pwqs or they see + * decremented $nna->nr. + * + * $nna->max may change as CPUs come online/offline and @pwq->wq's + * max_active gets updated. However, it is guaranteed to be equal to or + * larger than @pwq->wq->min_active which is above zero unless freezing. + * This maintains the forward progress guarantee. + */ + if (atomic_dec_return(&nna->nr) >= READ_ONCE(nna->max)) + return; + + if (!list_empty(&nna->pending_pwqs)) + node_activate_pending_pwq(nna, pool); } /** @@ -1481,6 +2001,11 @@ static void pwq_activate_first_inactive(struct pool_workqueue *pwq) * A work either has completed or is removed from pending queue, * decrement nr_in_flight of its pwq and handle workqueue flushing. * + * NOTE: + * For unbound workqueues, this function may temporarily drop @pwq->pool->lock + * and thus should be called after all other state updates for the in-flight + * work item is complete. + * * CONTEXT: * raw_spin_lock_irq(pool->lock). */ @@ -1488,14 +2013,8 @@ static void pwq_dec_nr_in_flight(struct pool_workqueue *pwq, unsigned long work_ { int color = get_work_color(work_data); - if (!(work_data & WORK_STRUCT_INACTIVE)) { - pwq->nr_active--; - if (!list_empty(&pwq->inactive_works)) { - /* one down, submit an inactive one */ - if (pwq->nr_active < pwq->max_active) - pwq_activate_first_inactive(pwq); - } - } + if (!(work_data & WORK_STRUCT_INACTIVE)) + pwq_dec_nr_active(pwq); pwq->nr_in_flight[color]--; @@ -1523,8 +2042,8 @@ out_put: /** * try_to_grab_pending - steal work item from worklist and disable irq * @work: work item to steal - * @is_dwork: @work is a delayed_work - * @flags: place to store irq state + * @cflags: %WORK_CANCEL_ flags + * @irq_flags: place to store irq state * * Try to grab PENDING bit of @work. This function can handle @work in any * stable state - idle, on timer or on worklist. @@ -1546,20 +2065,20 @@ out_put: * irqsafe, ensures that we return -EAGAIN for finite short period of time. * * On successful return, >= 0, irq is disabled and the caller is - * responsible for releasing it using local_irq_restore(*@flags). + * responsible for releasing it using local_irq_restore(*@irq_flags). * * This function is safe to call from any context including IRQ handler. */ -static int try_to_grab_pending(struct work_struct *work, bool is_dwork, - unsigned long *flags) +static int try_to_grab_pending(struct work_struct *work, u32 cflags, + unsigned long *irq_flags) { struct worker_pool *pool; struct pool_workqueue *pwq; - local_irq_save(*flags); + local_irq_save(*irq_flags); /* try to steal the timer if it exists */ - if (is_dwork) { + if (cflags & WORK_CANCEL_DELAYED) { struct delayed_work *dwork = to_delayed_work(work); /* @@ -1595,6 +2114,8 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork, */ pwq = get_work_pwq(work); if (pwq && pwq->pool == pool) { + unsigned long work_data; + debug_work_deactivate(work); /* @@ -1608,14 +2129,19 @@ static int try_to_grab_pending(struct work_struct *work, bool is_dwork, * management later on and cause stall. Make sure the work * item is activated before grabbing. */ - if (*work_data_bits(work) & WORK_STRUCT_INACTIVE) - pwq_activate_inactive_work(work); + pwq_activate_work(pwq, work); list_del_init(&work->entry); - pwq_dec_nr_in_flight(pwq, *work_data_bits(work)); - /* work->dat |