From b3b3cc618ee07f5f4c409e8ca86ac7fbac085ebd Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 5 Aug 2021 15:53:10 -0700 Subject: locktorture: Warn on individual lock_torture_init() error conditions When running locktorture as a module, any lock_torture_init() issues will be reflected in the error code from modprobe or insmod, as the case may be. However, these error codes are not available when running locktorture built-in, for example, when using the kvm.sh script. This commit therefore adds WARN_ON_ONCE() to allow distinguishing lock_torture_init() errors when running locktorture built-in. Signed-off-by: Paul E. McKenney --- kernel/locking/locktorture.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'kernel/locking') diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index 7c5a4a087cc7..397ac13d2ef7 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -1022,23 +1022,23 @@ static int __init lock_torture_init(void) if (onoff_interval > 0) { firsterr = torture_onoff_init(onoff_holdoff * HZ, onoff_interval * HZ, NULL); - if (firsterr) + if (torture_init_error(firsterr)) goto unwind; } if (shuffle_interval > 0) { firsterr = torture_shuffle_init(shuffle_interval); - if (firsterr) + if (torture_init_error(firsterr)) goto unwind; } if (shutdown_secs > 0) { firsterr = torture_shutdown_init(shutdown_secs, lock_torture_cleanup); - if (firsterr) + if (torture_init_error(firsterr)) goto unwind; } if (stutter > 0) { firsterr = torture_stutter_init(stutter, stutter); - if (firsterr) + if (torture_init_error(firsterr)) goto unwind; } @@ -1082,7 +1082,7 @@ static int __init lock_torture_init(void) /* Create writer. */ firsterr = torture_create_kthread(lock_torture_writer, &cxt.lwsa[i], writer_tasks[i]); - if (firsterr) + if (torture_init_error(firsterr)) goto unwind; create_reader: @@ -1091,13 +1091,13 @@ static int __init lock_torture_init(void) /* Create reader. */ firsterr = torture_create_kthread(lock_torture_reader, &cxt.lrsa[j], reader_tasks[j]); - if (firsterr) + if (torture_init_error(firsterr)) goto unwind; } if (stat_interval > 0) { firsterr = torture_create_kthread(lock_torture_stats, NULL, stats_task); - if (firsterr) + if (torture_init_error(firsterr)) goto unwind; } torture_init_end(); -- cgit v1.2.3 From ce0b9c805dd66d5e49fd53ec5415ae398f4c56e6 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 24 Jun 2021 11:41:10 +0200 Subject: locking/lockdep: Avoid RCU-induced noinstr fail vmlinux.o: warning: objtool: look_up_lock_class()+0xc7: call to rcu_read_lock_any_held() leaves .noinstr.text section Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20210624095148.311980536@infradead.org --- kernel/locking/lockdep.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/locking') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index bf1c00c881e4..8a509672a4cc 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -888,7 +888,7 @@ look_up_lock_class(const struct lockdep_map *lock, unsigned int subclass) if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) return NULL; - hlist_for_each_entry_rcu(class, hash_head, hash_entry) { + hlist_for_each_entry_rcu_notrace(class, hash_head, hash_entry) { if (class->key == key) { /* * Huh! same key, different name? Did someone trample -- cgit v1.2.3 From 12235da8c80a1f9909008e4ca6036d5772b81192 Mon Sep 17 00:00:00 2001 From: Maarten Lankhorst Date: Thu, 9 Sep 2021 11:32:18 +0200 Subject: kernel/locking: Add context to ww_mutex_trylock() i915 will soon gain an eviction path that trylock a whole lot of locks for eviction, getting dmesg failures like below: BUG: MAX_LOCK_DEPTH too low! turning off the locking correctness validator. depth: 48 max: 48! 48 locks held by i915_selftest/5776: #0: ffff888101a79240 (&dev->mutex){....}-{3:3}, at: __driver_attach+0x88/0x160 #1: ffffc900009778c0 (reservation_ww_class_acquire){+.+.}-{0:0}, at: i915_vma_pin.constprop.63+0x39/0x1b0 [i915] #2: ffff88800cf74de8 (reservation_ww_class_mutex){+.+.}-{3:3}, at: i915_vma_pin.constprop.63+0x5f/0x1b0 [i915] #3: ffff88810c7f9e38 (&vm->mutex/1){+.+.}-{3:3}, at: i915_vma_pin_ww+0x1c4/0x9d0 [i915] #4: ffff88810bad5768 (reservation_ww_class_mutex){+.+.}-{3:3}, at: i915_gem_evict_something+0x110/0x860 [i915] #5: ffff88810bad60e8 (reservation_ww_class_mutex){+.+.}-{3:3}, at: i915_gem_evict_something+0x110/0x860 [i915] ... #46: ffff88811964d768 (reservation_ww_class_mutex){+.+.}-{3:3}, at: i915_gem_evict_something+0x110/0x860 [i915] #47: ffff88811964e0e8 (reservation_ww_class_mutex){+.+.}-{3:3}, at: i915_gem_evict_something+0x110/0x860 [i915] INFO: lockdep is turned off. Fixing eviction to nest into ww_class_acquire is a high priority, but it requires a rework of the entire driver, which can only be done one step at a time. As an intermediate solution, add an acquire context to ww_mutex_trylock, which allows us to do proper nesting annotations on the trylocks, making the above lockdep splat disappear. This is also useful in regulator_lock_nested, which may avoid dropping regulator_nesting_mutex in the uncontended path, so use it there. TTM may be another user for this, where we could lock a buffer in a fastpath with list locks held, without dropping all locks we hold. [peterz: rework actual ww_mutex_trylock() implementations] Signed-off-by: Maarten Lankhorst Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/YUBGPdDDjKlxAuXJ@hirez.programming.kicks-ass.net --- kernel/locking/mutex.c | 41 ++++++++++++++++++++ kernel/locking/test-ww_mutex.c | 86 +++++++++++++++++++++++++++++++----------- kernel/locking/ww_rt_mutex.c | 25 ++++++++++++ 3 files changed, 131 insertions(+), 21 deletions(-) (limited to 'kernel/locking') diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index d456579d0952..2fede72b6af5 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -94,6 +94,9 @@ static inline unsigned long __owner_flags(unsigned long owner) return owner & MUTEX_FLAGS; } +/* + * Returns: __mutex_owner(lock) on failure or NULL on success. + */ static inline struct task_struct *__mutex_trylock_common(struct mutex *lock, bool handoff) { unsigned long owner, curr = (unsigned long)current; @@ -736,6 +739,44 @@ __ww_mutex_lock(struct mutex *lock, unsigned int state, unsigned int subclass, return __mutex_lock_common(lock, state, subclass, NULL, ip, ww_ctx, true); } +/** + * ww_mutex_trylock - tries to acquire the w/w mutex with optional acquire context + * @ww: mutex to lock + * @ww_ctx: optional w/w acquire context + * + * Trylocks a mutex with the optional acquire context; no deadlock detection is + * possible. Returns 1 if the mutex has been acquired successfully, 0 otherwise. + * + * Unlike ww_mutex_lock, no deadlock handling is performed. However, if a @ctx is + * specified, -EALREADY handling may happen in calls to ww_mutex_trylock. + * + * A mutex acquired with this function must be released with ww_mutex_unlock. + */ +int ww_mutex_trylock(struct ww_mutex *ww, struct ww_acquire_ctx *ww_ctx) +{ + if (!ww_ctx) + return mutex_trylock(&ww->base); + + MUTEX_WARN_ON(ww->base.magic != &ww->base); + + /* + * Reset the wounded flag after a kill. No other process can + * race and wound us here, since they can't have a valid owner + * pointer if we don't have any locks held. + */ + if (ww_ctx->acquired == 0) + ww_ctx->wounded = 0; + + if (__mutex_trylock(&ww->base)) { + ww_mutex_set_context_fastpath(ww, ww_ctx); + mutex_acquire_nest(&ww->base.dep_map, 0, 1, &ww_ctx->dep_map, _RET_IP_); + return 1; + } + + return 0; +} +EXPORT_SYMBOL(ww_mutex_trylock); + #ifdef CONFIG_DEBUG_LOCK_ALLOC void __sched mutex_lock_nested(struct mutex *lock, unsigned int subclass) diff --git a/kernel/locking/test-ww_mutex.c b/kernel/locking/test-ww_mutex.c index 3e82f449b4ff..d63ac411f367 100644 --- a/kernel/locking/test-ww_mutex.c +++ b/kernel/locking/test-ww_mutex.c @@ -16,6 +16,15 @@ static DEFINE_WD_CLASS(ww_class); struct workqueue_struct *wq; +#ifdef CONFIG_DEBUG_WW_MUTEX_SLOWPATH +#define ww_acquire_init_noinject(a, b) do { \ + ww_acquire_init((a), (b)); \ + (a)->deadlock_inject_countdown = ~0U; \ + } while (0) +#else +#define ww_acquire_init_noinject(a, b) ww_acquire_init((a), (b)) +#endif + struct test_mutex { struct work_struct work; struct ww_mutex mutex; @@ -36,7 +45,7 @@ static void test_mutex_work(struct work_struct *work) wait_for_completion(&mtx->go); if (mtx->flags & TEST_MTX_TRY) { - while (!ww_mutex_trylock(&mtx->mutex)) + while (!ww_mutex_trylock(&mtx->mutex, NULL)) cond_resched(); } else { ww_mutex_lock(&mtx->mutex, NULL); @@ -109,19 +118,38 @@ static int test_mutex(void) return 0; } -static int test_aa(void) +static int test_aa(bool trylock) { struct ww_mutex mutex; struct ww_acquire_ctx ctx; int ret; + const char *from = trylock ? "trylock" : "lock"; ww_mutex_init(&mutex, &ww_class); ww_acquire_init(&ctx, &ww_class); - ww_mutex_lock(&mutex, &ctx); + if (!trylock) { + ret = ww_mutex_lock(&mutex, &ctx); + if (ret) { + pr_err("%s: initial lock failed!\n", __func__); + goto out; + } + } else { + if (!ww_mutex_trylock(&mutex, &ctx)) { + pr_err("%s: initial trylock failed!\n", __func__); + goto out; + } + } - if (ww_mutex_trylock(&mutex)) { - pr_err("%s: trylocked itself!\n", __func__); + if (ww_mutex_trylock(&mutex, NULL)) { + pr_err("%s: trylocked itself without context from %s!\n", __func__, from); + ww_mutex_unlock(&mutex); + ret = -EINVAL; + goto out; + } + + if (ww_mutex_trylock(&mutex, &ctx)) { + pr_err("%s: trylocked itself with context from %s!\n", __func__, from); ww_mutex_unlock(&mutex); ret = -EINVAL; goto out; @@ -129,17 +157,17 @@ static int test_aa(void) ret = ww_mutex_lock(&mutex, &ctx); if (ret != -EALREADY) { - pr_err("%s: missed deadlock for recursing, ret=%d\n", - __func__, ret); + pr_err("%s: missed deadlock for recursing, ret=%d from %s\n", + __func__, ret, from); if (!ret) ww_mutex_unlock(&mutex); ret = -EINVAL; goto out; } + ww_mutex_unlock(&mutex); ret = 0; out: - ww_mutex_unlock(&mutex); ww_acquire_fini(&ctx); return ret; } @@ -150,7 +178,7 @@ struct test_abba { struct ww_mutex b_mutex; struct completion a_ready; struct completion b_ready; - bool resolve; + bool resolve, trylock; int result; }; @@ -160,8 +188,13 @@ static void test_abba_work(struct work_struct *work) struct ww_acquire_ctx ctx; int err; - ww_acquire_init(&ctx, &ww_class); - ww_mutex_lock(&abba->b_mutex, &ctx); + ww_acquire_init_noinject(&ctx, &ww_class); + if (!abba->trylock) + ww_mutex_lock(&abba->b_mutex, &ctx); + else + WARN_ON(!ww_mutex_trylock(&abba->b_mutex, &ctx)); + + WARN_ON(READ_ONCE(abba->b_mutex.ctx) != &ctx); complete(&abba->b_ready); wait_for_completion(&abba->a_ready); @@ -181,7 +214,7 @@ static void test_abba_work(struct work_struct *work) abba->result = err; } -static int test_abba(bool resolve) +static int test_abba(bool trylock, bool resolve) { struct test_abba abba; struct ww_acquire_ctx ctx; @@ -192,12 +225,18 @@ static int test_abba(bool resolve) INIT_WORK_ONSTACK(&abba.work, test_abba_work); init_completion(&abba.a_ready); init_completion(&abba.b_ready); + abba.trylock = trylock; abba.resolve = resolve; schedule_work(&abba.work); - ww_acquire_init(&ctx, &ww_class); - ww_mutex_lock(&abba.a_mutex, &ctx); + ww_acquire_init_noinject(&ctx, &ww_class); + if (!trylock) + ww_mutex_lock(&abba.a_mutex, &ctx); + else + WARN_ON(!ww_mutex_trylock(&abba.a_mutex, &ctx)); + + WARN_ON(READ_ONCE(abba.a_mutex.ctx) != &ctx); complete(&abba.a_ready); wait_for_completion(&abba.b_ready); @@ -249,7 +288,7 @@ static void test_cycle_work(struct work_struct *work) struct ww_acquire_ctx ctx; int err, erra = 0; - ww_acquire_init(&ctx, &ww_class); + ww_acquire_init_noinject(&ctx, &ww_class); ww_mutex_lock(&cycle->a_mutex, &ctx); complete(cycle->a_signal); @@ -581,7 +620,9 @@ static int stress(int nlocks, int nthreads, unsigned int flags) static int __init test_ww_mutex_init(void) { int ncpus = num_online_cpus(); - int ret; + int ret, i; + + printk(KERN_INFO "Beginning ww mutex selftests\n"); wq = alloc_workqueue("test-ww_mutex", WQ_UNBOUND, 0); if (!wq) @@ -591,17 +632,19 @@ static int __init test_ww_mutex_init(void) if (ret) return ret; - ret = test_aa(); + ret = test_aa(false); if (ret) return ret; - ret = test_abba(false); + ret = test_aa(true); if (ret) return ret; - ret = test_abba(true); - if (ret) - return ret; + for (i = 0; i < 4; i++) { + ret = test_abba(i & 1, i & 2); + if (ret) + return ret; + } ret = test_cycle(ncpus); if (ret) @@ -619,6 +662,7 @@ static int __init test_ww_mutex_init(void) if (ret) return ret; + printk(KERN_INFO "All ww mutex selftests passed\n"); return 0; } diff --git a/kernel/locking/ww_rt_mutex.c b/kernel/locking/ww_rt_mutex.c index 3f1fff7d2780..0e00205cf467 100644 --- a/kernel/locking/ww_rt_mutex.c +++ b/kernel/locking/ww_rt_mutex.c @@ -9,6 +9,31 @@ #define WW_RT #include "rtmutex.c" +int ww_mutex_trylock(struct ww_mutex *lock, struct ww_acquire_ctx *ww_ctx) +{ + struct rt_mutex *rtm = &lock->base; + + if (!ww_ctx) + return rt_mutex_trylock(rtm); + + /* + * Reset the wounded flag after a kill. No other process can + * race and wound us here, since they can't have a valid owner + * pointer if we don't have any locks held. + */ + if (ww_ctx->acquired == 0) + ww_ctx->wounded = 0; + + if (__rt_mutex_trylock(&rtm->rtmutex)) { + ww_mutex_set_context_fastpath(lock, ww_ctx); + mutex_acquire_nest(&rtm->dep_map, 0, 1, ww_ctx->dep_map, _RET_IP_); + return 1; + } + + return 0; +} +EXPORT_SYMBOL(ww_mutex_trylock); + static int __sched __ww_rt_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ww_ctx, unsigned int state, unsigned long ip) -- cgit v1.2.3 From 2507003a1d10917c9158077bf6030719d02c941e Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Fri, 3 Sep 2021 10:40:01 +0200 Subject: lockdep: Let lock_is_held_type() detect recursive read as read lock_is_held_type(, 1) detects acquired read locks. It only recognized locks acquired with lock_acquire_shared(). Read locks acquired with lock_acquire_shared_recursive() are not recognized because a `2' is stored as the read value. Rework the check to additionally recognise lock's read value one and two as a read held lock. Fixes: e918188611f07 ("locking: More accurate annotations for read_lock()") Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Peter Zijlstra (Intel) Acked-by: Boqun Feng Acked-by: Waiman Long Link: https://lkml.kernel.org/r/20210903084001.lblecrvz4esl4mrr@linutronix.de --- kernel/locking/lockdep.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/locking') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index bf1c00c881e4..bfa0a347f27c 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -5366,7 +5366,7 @@ int __lock_is_held(const struct lockdep_map *lock, int read) struct held_lock *hlock = curr->held_locks + i; if (match_held_lock(hlock, lock)) { - if (read == -1 || hlock->read == read) + if (read == -1 || !!hlock->read == read) return LOCK_STATE_HELD; return LOCK_STATE_NOT_HELD; -- cgit v1.2.3 From a2e05ddda11b0bd529f443df9089ab498b2c2642 Mon Sep 17 00:00:00 2001 From: Zhouyi Zhou Date: Wed, 11 Aug 2021 10:59:20 +0800 Subject: lockdep: Improve comments in wait-type checks Comments in wait-type checks be improved by mentioning the PREEPT_RT kernel configure option. Signed-off-by: Zhouyi Zhou Signed-off-by: Peter Zijlstra (Intel) Acked-by: Paul E. McKenney Link: https://lkml.kernel.org/r/20210811025920.20751-1-zhouzhouyi@gmail.com --- kernel/locking/lockdep.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/locking') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index bfa0a347f27c..4e6312977ffb 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -4671,7 +4671,7 @@ print_lock_invalid_wait_context(struct task_struct *curr, /* * Verify the wait_type context. * - * This check validates we takes locks in the right wait-type order; that is it + * This check validates we take locks in the right wait-type order; that is it * ensures that we do not take mutexes inside spinlocks and do not attempt to * acquire spinlocks inside raw_spinlocks and the sort. * -- cgit v1.2.3 From 1415b49bcd321bca7347f43f8b269c91ec46d1dc Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Wed, 22 Sep 2021 07:58:22 -0700 Subject: locking/ww-mutex: Fix uninitialized use of ret in test_aa() Clang warns: kernel/locking/test-ww_mutex.c:138:7: error: variable 'ret' is used uninitialized whenever 'if' condition is true [-Werror,-Wsometimes-uninitialized] if (!ww_mutex_trylock(&mutex, &ctx)) { ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ kernel/locking/test-ww_mutex.c:172:9: note: uninitialized use occurs here return ret; ^~~ kernel/locking/test-ww_mutex.c:138:3: note: remove the 'if' if its condition is always false if (!ww_mutex_trylock(&mutex, &ctx)) { ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ kernel/locking/test-ww_mutex.c:125:9: note: initialize the variable 'ret' to silence this warning int ret; ^ = 0 1 error generated. Assign !ww_mutex_trylock(...) to ret so that it is always initialized. Fixes: 12235da8c80a ("kernel/locking: Add context to ww_mutex_trylock()") Reported-by: "kernelci.org bot" Reported-by: Stephen Rothwell Signed-off-by: Nathan Chancellor Signed-off-by: Peter Zijlstra (Intel) Acked-by: Waiman Long Link: https://lore.kernel.org/r/20210922145822.3935141-1-nathan@kernel.org --- kernel/locking/test-ww_mutex.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel/locking') diff --git a/kernel/locking/test-ww_mutex.c b/kernel/locking/test-ww_mutex.c index d63ac411f367..353004155d65 100644 --- a/kernel/locking/test-ww_mutex.c +++ b/kernel/locking/test-ww_mutex.c @@ -135,7 +135,8 @@ static int test_aa(bool trylock) goto out; } } else { - if (!ww_mutex_trylock(&mutex, &ctx)) { + ret = !ww_mutex_trylock(&mutex, &ctx); + if (ret) { pr_err("%s: initial trylock failed!\n", __func__); goto out; } -- cgit v1.2.3 From 874f670e6088d3bff3972ecd44c1cb00610f9183 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 23 Sep 2021 18:54:35 +0200 Subject: sched: Clean up the might_sleep() underscore zoo __might_sleep() vs. ___might_sleep() is hard to distinguish. Aside of that the three underscore variant is exposed to provide a checkpoint for rescheduling points which are distinct from blocking points. They are semantically a preemption point which means that scheduling is state preserving. A real blocking operation, e.g. mutex_lock(), wait*(), which cannot preserve a task state which is not equal to RUNNING. While technically blocking on a "sleeping" spinlock in RT enabled kernels falls into the voluntary scheduling category because it has to wait until the contended spin/rw lock becomes available, the RT lock substitution code can semantically be mapped to a voluntary preemption because the RT lock substitution code and the scheduler are providing mechanisms to preserve the task state and to take regular non-lock related wakeups into account. Rename ___might_sleep() to __might_resched() to make the distinction of these functions clear. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20210923165357.928693482@linutronix.de --- kernel/locking/spinlock_rt.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel/locking') diff --git a/kernel/locking/spinlock_rt.c b/kernel/locking/spinlock_rt.c index d2912e44d61f..c5289240cfb4 100644 --- a/kernel/locking/spinlock_rt.c +++ b/kernel/locking/spinlock_rt.c @@ -32,7 +32,7 @@ static __always_inline void rtlock_lock(struct rt_mutex_base *rtm) static __always_inline void __rt_spin_lock(spinlock_t *lock) { - ___might_sleep(__FILE__, __LINE__, 0); + __might_resched(__FILE__, __LINE__, 0); rtlock_lock(&lock->lock); rcu_read_lock(); migrate_disable(); @@ -210,7 +210,7 @@ EXPORT_SYMBOL(rt_write_trylock); void __sched rt_read_lock(rwlock_t *rwlock) { - ___might_sleep(__FILE__, __LINE__, 0); + __might_resched(__FILE__, __LINE__, 0); rwlock_acquire_read(&rwlock->dep_map, 0, 0, _RET_IP_); rwbase_read_lock(&rwlock->rwbase, TASK_RTLOCK_WAIT); rcu_read_lock(); @@ -220,7 +220,7 @@ EXPORT_SYMBOL(rt_read_lock); void __sched rt_write_lock(rwlock_t *rwlock) { - ___might_sleep(__FILE__, __LINE__, 0); + __might_resched(__FILE__, __LINE__, 0); rwlock_acquire(&rwlock->dep_map, 0, 0, _RET_IP_); rwbase_write_lock(&rwlock->rwbase, TASK_RTLOCK_WAIT); rcu_read_lock(); -- cgit v1.2.3 From ef1f4804b27a54da34de6984d16f1fe8f2cc7011 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 23 Sep 2021 18:54:46 +0200 Subject: locking/rt: Take RCU nesting into account for __might_resched() The general rule that rcu_read_lock() held sections cannot voluntary sleep does apply even on RT kernels. Though the substitution of spin/rw locks on RT enabled kernels has to be exempt from that rule. On !RT a spin_lock() can obviously nest inside a RCU read side critical section as the lock acquisition is not going to block, but on RT this is not longer the case due to the 'sleeping' spinlock substitution. The RT patches contained a cheap hack to ignore the RCU nesting depth in might_sleep() checks, which was a pragmatic but incorrect workaround. Instead of generally ignoring the RCU nesting depth in __might_sleep() and __might_resched() checks, pass the rcu_preempt_depth() via the offsets argument to __might_resched() from spin/read/write_lock() which makes the checks work correctly even in RCU read side critical sections. The actual blocking on such a substituted lock within a RCU read side critical section is already handled correctly in __schedule() by treating it as a "preemption" of the RCU read side critical section. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20210923165358.368305497@linutronix.de --- kernel/locking/spinlock_rt.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) (limited to 'kernel/locking') diff --git a/kernel/locking/spinlock_rt.c b/kernel/locking/spinlock_rt.c index c5289240cfb4..b2e553f9255b 100644 --- a/kernel/locking/spinlock_rt.c +++ b/kernel/locking/spinlock_rt.c @@ -24,6 +24,17 @@ #define RT_MUTEX_BUILD_SPINLOCKS #include "rtmutex.c" +/* + * __might_resched() skips the state check as rtlocks are state + * preserving. Take RCU nesting into account as spin/read/write_lock() can + * legitimately nest into an RCU read side critical section. + */ +#define RTLOCK_RESCHED_OFFSETS \ + (rcu_preempt_depth() << MIGHT_RESCHED_RCU_SHIFT) + +#define rtlock_might_resched() \ + __might_resched(__FILE__, __LINE__, RTLOCK_RESCHED_OFFSETS) + static __always_inline void rtlock_lock(struct rt_mutex_base *rtm) { if (unlikely(!rt_mutex_cmpxchg_acquire(rtm, NULL, current))) @@ -32,7 +43,7 @@ static __always_inline void rtlock_lock(struct rt_mutex_base *rtm) static __always_inline void __rt_spin_lock(spinlock_t *lock) { - __might_resched(__FILE__, __LINE__, 0); + rtlock_might_resched(); rtlock_lock(&lock->lock); rcu_read_lock(); migrate_disable(); @@ -210,7 +221,7 @@ EXPORT_SYMBOL(rt_write_trylock); void __sched rt_read_lock(rwlock_t *rwlock) { - __might_resched(__FILE__, __LINE__, 0); + rtlock_might_resched(); rwlock_acquire_read(&rwlock->dep_map, 0, 0, _RET_IP_); rwbase_read_lock(&rwlock->rwbase, TASK_RTLOCK_WAIT); rcu_read_lock(); @@ -220,7 +231,7 @@ EXPORT_SYMBOL(rt_read_lock); void __sched rt_write_lock(rwlock_t *rwlock) { - __might_resched(__FILE__, __LINE__, 0); + rtlock_might_resched(); rwlock_acquire(&rwlock->dep_map, 0, 0, _RET_IP_); rwbase_write_lock(&rwlock->rwbase, TASK_RTLOCK_WAIT); rcu_read_lock(); -- cgit v1.2.3 From 8fe46535e10dbfebad68ad9f2f8260e49f5852c9 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 28 Sep 2021 17:00:05 +0200 Subject: rtmutex: Check explicit for TASK_RTLOCK_WAIT. rt_mutex_wake_q_add() needs to need to distiguish between sleeping locks (TASK_RTLOCK_WAIT) and normal locks which use TASK_NORMAL to use the proper wake mechanism. Instead of checking for != TASK_NORMAL make it more robust and check explicit for TASK_RTLOCK_WAIT which is the reason why a different wake mechanism is used. No functional change. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20210928150006.597310-2-bigeasy@linutronix.de --- kernel/locking/rtmutex.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/locking') diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 6bb116c559b4..cafc259ec59d 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -449,7 +449,7 @@ static __always_inline void rt_mutex_adjust_prio(struct task_struct *p) static __always_inline void rt_mutex_wake_q_add(struct rt_wake_q_head *wqh, struct rt_mutex_waiter *w) { - if (IS_ENABLED(CONFIG_PREEMPT_RT) && w->wake_state != TASK_NORMAL) { + if (IS_ENABLED(CONFIG_PREEMPT_RT) && w->wake_state == TASK_RTLOCK_WAIT) { if (IS_ENABLED(CONFIG_PROVE_LOCKING)) WARN_ON_ONCE(wqh->rtlock_task); get_task_struct(w->task); -- cgit v1.2.3 From 9321f8152d9a764208c3f0dad49e0c55f293b7ab Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 28 Sep 2021 17:00:06 +0200 Subject: rtmutex: Wake up the waiters lockless while dropping the read lock. The rw_semaphore and rwlock_t implementation both wake the waiter while holding the rt_mutex_base::wait_lock acquired. This can be optimized by waking the waiter lockless outside of the locked section to avoid a needless contention on the rt_mutex_base::wait_lock lock. Extend rt_mutex_wake_q_add() to also accept task and state and use it in __rwbase_read_unlock(). Suggested-by: Davidlohr Bueso Signed-off-by: Thomas Gleixner Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20210928150006.597310-3-bigeasy@linutronix.de --- kernel/locking/rtmutex.c | 19 +++++++++++++------ kernel/locking/rwbase_rt.c | 6 +++++- 2 files changed, 18 insertions(+), 7 deletions(-) (limited to 'kernel/locking') diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index cafc259ec59d..0c6a48dfcecb 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -446,19 +446,26 @@ static __always_inline void rt_mutex_adjust_prio(struct task_struct *p) } /* RT mutex specific wake_q wrappers */ -static __always_inline void rt_mutex_wake_q_add(struct rt_wake_q_head *wqh, - struct rt_mutex_waiter *w) +static __always_inline void rt_mutex_wake_q_add_task(struct rt_wake_q_head *wqh, + struct task_struct *task, + unsigned int wake_state) { - if (IS_ENABLED(CONFIG_PREEMPT_RT) && w->wake_state == TASK_RTLOCK_WAIT) { + if (IS_ENABLED(CONFIG_PREEMPT_RT) && wake_state == TASK_RTLOCK_WAIT) { if (IS_ENABLED(CONFIG_PROVE_LOCKING)) WARN_ON_ONCE(wqh->rtlock_task); - get_task_struct(w->task); - wqh->rtlock_task = w->task; + get_task_struct(task); + wqh->rtlock_task = task; } else { - wake_q_add(&wqh->head, w->task); + wake_q_add(&wqh->head, task); } } +static __always_inline void rt_mutex_wake_q_add(struct rt_wake_q_head *wqh, + struct rt_mutex_waiter *w) +{ + rt_mutex_wake_q_add_task(wqh, w->task, w->wake_state); +} + static __always_inline void rt_mutex_wake_up_q(struct rt_wake_q_head *wqh) { if (IS_ENABLED(CONFIG_PREEMPT_RT) && wqh->rtlock_task) { diff --git a/kernel/locking/rwbase_rt.c b/kernel/locking/rwbase_rt.c index 4ba15088e640..6b143fb294f6 100644 --- a/kernel/locking/rwbase_rt.c +++ b/kernel/locking/rwbase_rt.c @@ -141,6 +141,7 @@ static void __sched __rwbase_read_unlock(struct rwbase_rt *rwb, { struct rt_mutex_base *rtm = &rwb->rtmutex; struct task_struct *owner; + DEFINE_RT_WAKE_Q(wqh); raw_spin_lock_irq(&rtm->wait_lock); /* @@ -151,9 +152,12 @@ static void __sched __rwbase_read_unlock(struct rwbase_rt *rwb, */ owner = rt_mutex_owner(rtm); if (owner) - wake_up_state(owner, state); + rt_mutex_wake_q_add_task(&wqh, owner, state); + /* Pairs with the preempt_enable in rt_mutex_wake_up_q() */ + preempt_disable(); raw_spin_unlock_irq(&rtm->wait_lock); + rt_mutex_wake_up_q(&wqh); } static __always_inline void rwbase_read_unlock(struct rwbase_rt *rwb, -- cgit v1.2.3 From c78416d122243c92992a1d1063f17ddd0bc80e6c Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Sun, 19 Sep 2021 22:20:30 -0700 Subject: locking/rwbase: Optimize rwbase_read_trylock Instead of a full barrier around the Rmw insn, micro-optimize for weakly ordered archs such that we only provide the required ACQUIRE semantics when taking the read lock. Signed-off-by: Davidlohr Bueso Signed-off-by: Peter Zijlstra (Intel) Acked-by: Waiman Long Link: https://lkml.kernel.org/r/20210920052031.54220-2-dave@stgolabs.net --- kernel/locking/rwbase_rt.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'kernel/locking') diff --git a/kernel/locking/rwbase_rt.c b/kernel/locking/rwbase_rt.c index 15c81100f0e2..6fd3162e4098 100644 --- a/kernel/locking/rwbase_rt.c +++ b/kernel/locking/rwbase_rt.c @@ -59,8 +59,7 @@ static __always_inline int rwbase_read_trylock(struct rwbase_rt *rwb) * set. */ for (r = atomic_read(&rwb->readers); r < 0;) { - /* Fully-ordered if cmpxchg() succeeds, provides ACQUIRE */ - if (likely(atomic_try_cmpxchg(&rwb->readers, &r, r + 1))) + if (likely(atomic_try_cmpxchg_acquire(&rwb->readers, &r, r + 1))) return 1; } return 0; @@ -187,7 +186,7 @@ static inline void __rwbase_write_unlock(struct rwbase_rt *rwb, int bias, /* * _release() is needed in case that reader is in fast path, pairing - * with atomic_try_cmpxchg() in rwbase_read_trylock(), provides RELEASE + * with atomic_try_cmpxchg_acquire() in rwbase_read_trylock(). */ (void)atomic_add_return_release(READER_BIAS - bias, &rwb->readers); raw_spin_unlock_irqrestore(&rtm->wait_lock, flags); -- cgit v1.2.3 From 7cdacc5f52d68a9370f182c844b5b3e6cc975cc1 Mon Sep 17 00:00:00 2001 From: Yanfei Xu Date: Wed, 13 Oct 2021 21:41:53 +0800 Subject: locking/rwsem: Disable preemption for spinning region The spinning region rwsem_spin_on_owner() should not be preempted, however the rwsem_down_write_slowpath() invokes it and don't disable preemption. Fix it by adding a pair of preempt_disable/enable(). Signed-off-by: Yanfei Xu [peterz: Fix CONFIG_RWSEM_SPIN_ON_OWNER=n build] Signed-off-by: Peter Zijlstra (Intel) Acked-by: Waiman Long Link: https://lore.kernel.org/r/20211013134154.1085649-3-yanfei.xu@windriver.com --- kernel/locking/rwsem.c | 53 ++++++++++++++++++++++++++++---------------------- 1 file changed, 30 insertions(+), 23 deletions(-) (limited to 'kernel/locking') diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index 000e8d5a2884..29eea50a3e67 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -577,6 +577,24 @@ static inline bool rwsem_try_write_lock(struct rw_semaphore *sem, return true; } +/* + * The rwsem_spin_on_owner() function returns the following 4 values + * depending on the lock owner state. + * OWNER_NULL : owner is currently NULL + * OWNER_WRITER: when owner changes and is a writer + * OWNER_READER: when owner changes and the new owner may be a reader. + * OWNER_NONSPINNABLE: + * when optimistic spinning has to stop because either the + * owner stops running, is unknown, or its timeslice has + * been used up. + */ +enum owner_state { + OWNER_NULL = 1 << 0, + OWNER_WRITER = 1 << 1, + OWNER_READER = 1 << 2, + OWNER_NONSPINNABLE = 1 << 3, +}; + #ifdef CONFIG_RWSEM_SPIN_ON_OWNER /* * Try to acquire write lock before the writer has been put on wait queue. @@ -632,23 +650,6 @@ static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem) return ret; } -/* - * The rwsem_spin_on_owner() function returns the following 4 values - * depending on the lock owner state. - * OWNER_NULL : owner is currently NULL - * OWNER_WRITER: when owner changes and is a writer - * OWNER_READER: when owner changes and the new owner may be a reader. - * OWNER_NONSPINNABLE: - * when optimistic spinning has to stop because either the - * owner stops running, is unknown, or its timeslice has - * been used up. - */ -enum owner_state { - OWNER_NULL = 1 << 0, - OWNER_WRITER = 1 << 1, - OWNER_READER = 1 << 2, - OWNER_NONSPINNABLE = 1 << 3, -}; #define OWNER_SPINNABLE (OWNER_NULL | OWNER_WRITER | OWNER_READER) static inline enum owner_state @@ -878,12 +879,11 @@ static inline bool rwsem_optimistic_spin(struct rw_semaphore *sem) static inline void clear_nonspinnable(struct rw_semaphore *sem) { } -static inline int +static inline enum owner_state rwsem_spin_on_owner(struct rw_semaphore *sem) { - return 0; + return OWNER_NONSPINNABLE; } -#define OWNER_NULL 1 #endif /* @@ -1095,9 +1095,16 @@ wait: * In this case, we attempt to acquire the lock again * without sleeping. */ - if (wstate == WRITER_HANDOFF && - rwsem_spin_on_owner(sem) == OWNER_NULL) - goto trylock_again; + if (wstate == WRITER_HANDOFF) { + enum owner_state owner_state; + + preempt_disable(); + owner_state = rwsem_spin_on_owner(sem); + preempt_enable(); + + if (owner_state == OWNER_NULL) + goto trylock_again; + } /* Block until there are no active lockers. */ for (;;) { -- cgit v1.2.3 From 6c2787f2a20ceb49c98bd06f7dad1589eed1c951 Mon Sep 17 00:00:00 2001 From: Yanfei Xu Date: Wed, 13 Oct 2021 21:41:52 +0800 Subject: locking: Remove rcu_read_{,un}lock() for preempt_{dis,en}able() preempt_disable/enable() is equal to RCU read-side crital section, and the spinning codes in mutex and rwsem could ensure that the preemption is disabled. So let's remove the unnecessary rcu_read_lock/unlock for saving some cycles in hot codes. Signed-off-by: Yanfei Xu Signed-off-by: Peter Zijlstra (Intel) Acked-by: Waiman Long Link: https://lore.kernel.org/r/20211013134154.1085649-2-yanfei.xu@windriver.com --- kernel/locking/mutex.c | 22 +++++++++++++++------- kernel/locking/rwsem.c | 14 +++++++++----- 2 files changed, 24 insertions(+), 12 deletions(-) (limited to 'kernel/locking') diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 2fede72b6af5..db1913611192 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -351,13 +351,16 @@ bool mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner, { bool ret = true; - rcu_read_lock(); + lockdep_assert_preemption_disabled(); + while (__mutex_owner(lock) == owner) { /* * Ensure we emit the owner->on_cpu, dereference _after_ - * checking lock->owner still matches owner. If that fails, - * owner might point to freed memory. If it still matches, - * the rcu_read_lock() ensures the memory stays valid. + * checking lock->owner still matches owner. And we already + * disabled preemption which is equal to the RCU read-side + * crital section in optimistic spinning code. Thus the + * task_strcut structure won't go away during the spinning + * period */ barrier(); @@ -377,7 +380,6 @@ bool mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner, cpu_relax(); } - rcu_read_unlock(); return ret; } @@ -390,19 +392,25 @@ static inline int mutex_can_spin_on_owner(struct mutex *lock) struct task_struct *owner; int retval = 1; + lockdep_assert_preemption_disabled(); + if (need_resched()) return 0; - rcu_read_lock(); + /* + * We already disabled preemption which is equal to the RCU read-side + * crital section in optimistic spinning code. Thus the task_strcut + * structure won't go away during the spinning period. + */ owner = __mutex_owner(lock); /* * As lock holder preemption issue, we both skip spinning if task is not * on cpu or its cpu is preempted */ + if (owner) retval = owner->on_cpu && !vcpu_is_preempted(task_cpu(owner)); - rcu_read_unlock(); /* * If lock->owner is not set, the mutex has been released. Return true diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index 29eea50a3e67..884aa08e0624 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -635,7 +635,10 @@ static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem) } preempt_disable(); - rcu_read_lock(); + /* + * Disable preemption is equal to the RCU read-side crital section, + * thus the task_strcut structure won't go away. + */ owner = rwsem_owner_flags(sem, &flags); /* * Don't check the read-owner as the entry may be stale. @@ -643,7 +646,6 @@ static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem) if ((flags & RWSEM_NONSPINNABLE) || (owner && !(flags & RWSEM_READER_OWNED) && !owner_on_cpu(owner))) ret = false; - rcu_read_unlock(); preempt_enable(); lockevent_cond_inc(rwsem_opt_fail, !ret); @@ -671,12 +673,13 @@ rwsem_spin_on_owner(struct rw_semaphore *sem) unsigned long flags, new_flags; enum owner_state state; + lockdep_assert_preemption_disabled(); + owner = rwsem_owner_flags(sem, &flags); state = rwsem_owner_state(owner, flags); if (state != OWNER_WRITER) return state; - rcu_read_lock(); for (;;) { /* * When a waiting writer set the handoff flag, it may spin @@ -694,7 +697,9 @@ rwsem_spin_on_owner(struct rw_semaphore *sem) * Ensure we emit the owner->on_cpu, dereference _after_ * checking sem->owner still matches owner, if that fails, * owner might point to free()d memory, if it still matches, - * the rcu_read_lock() ensures the memory stays valid. + * our spinning context already disabled preemption which is + * equal to RCU read-side crital section ensures the memory + * stays valid. */ barrier(); @@ -705,7 +710,6 @@ rwsem_spin_on_owner(struct rw_semaphore *sem) cpu_relax(); } - rcu_read_unlock(); return state; } -- cgit v1.2.3 From 5197fcd09ab6dcc4df79edec7e8e27575276374c Mon Sep 17 00:00:00 2001 From: Yanfei Xu Date: Wed, 13 Oct 2021 21:41:54 +0800 Subject: locking/rwsem: Fix comments about reader optimistic lock stealing conditions After the commit 617f3ef95177 ("locking/rwsem: Remove reader optimistic spinning"), reader doesn't support optimistic spinning anymore, there is no need meet the condition which OSQ is empty. BTW, add an unlikely() for the max reader wakeup check in the loop. Signed-off-by: Yanfei Xu Signed-off-by: Peter Zijlstra (Intel) Acked-by: Waiman Long Link: https://lore.kernel.org/r/20211013134154.1085649-4-yanfei.xu@windriver.com --- kernel/locking/rwsem.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel/locking') diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index 884aa08e0624..c51387a43265 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -56,7 +56,6 @@ * * A fast path reader optimistic lock stealing is supported when the rwsem * is previously owned by a writer and the following conditions are met: - * - OSQ is empty * - rwsem is not currently writer owned * - the handoff isn't set. */ @@ -485,7 +484,7 @@ static void rwsem_mark_wake(struct rw_semaphore *sem, /* * Limit # of readers that can be woken up per wakeup call. */ - if (woken >= MAX_READERS_WAKEUP) + if (unlikely(woken >= MAX_READERS_WAKEUP)) break; } -- cgit v1.2.3 From f98a3dccfcb0b9b9c3bef8df9edd61cda80ad937 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 22 Oct 2021 13:59:38 +0200 Subject: locking: Remove spin_lock_flags() etc parisc, ia64 and powerpc32 are the only remaining architectures that provide custom arch_{spin,read,write}_lock_flags() functions, which are meant to re-enable interrupts while waiting for a spinlock. However, none of these can actually run into this codepath, because it is only called on architectures without CONFIG_GENERIC_LOCKBREAK, or when CONFIG_DEBUG_LOCK_ALLOC is set without CONFIG_LOCKDEP, and none of those combinations are possible on the three architectures. Going back in the git history, it appears that arch/mn10300 may have been able to run into this code path, but there is a good chance that it never worked. On the architectures that still exist, it was already impossible to hit back in 2008 after the introduction of CONFIG_GENERIC_LOCKBREAK, and possibly earlier. As this is all dead code, just remove it and the helper functions built around it. For arch/ia64, the inline asm could be cleaned up, but it seems safer to leave it untouched. Signed-off-by: Arnd Bergmann Signed-off-by: Peter Zijlstra (Intel) Acked-by: Helge Deller # parisc Link: https://lore.kernel.org/r/20211022120058.1031690-1-arnd@kernel.org --- kernel/locking/spinlock.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel/locking') diff --git a/kernel/locking/spinlock.c b/kernel/locking/spinlock.c index c5830cfa379a..b562f9289372 100644 --- a/kernel/locking/spinlock.c +++ b/kernel/locking/spinlock.c @@ -378,8 +378,7 @@ unsigned long __lockfunc _raw_spin_lock_irqsave_nested(raw_spinlock_t *lock, local_irq_save(flags); preempt_disable(); spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_); - LOCK_CONTENDED_FLAGS(lock, do_raw_spin_trylock, do_raw_spin_lock, - do_raw_spin_lock_flags, &flags); + LOCK_CONTENDED(lock, do_raw_spin_trylock, do_raw_spin_lock); return flags; } EXPORT_SYMBOL(_raw_spin_lock_irqsave_nested); -- cgit v1.2.3 From e5ae3728327fda5209454d738eebdb20443fdfac Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 5 Nov 2021 13:40:43 -0700 Subject: mm: make generic arch_is_kernel_initmem_freed() do what it says Commit 7a5da02de8d6 ("locking/lockdep: check for freed initmem in static_obj()") added arch_is_kernel_initmem_freed() which is supposed to report whether an object is part of already freed init memory. For the time being, the generic version of arch_is_kernel_initmem_freed() always reports 'false', allthough free_initmem() is generically called on all architectures. Therefore, change the generic version of arch_is_kernel_initmem_freed() to check whether free_initmem() has been called. If so, then check if a given address falls into init memory. To ease the use of system_state, move it out of line into its only caller which is lockdep.c Link: https://lkml.kernel.org/r/1d40783e676e07858be97d881f449ee7ea8adfb1.1633001016.git.christophe.leroy@csgroup.eu Signed-off-by: Christophe Leroy Cc: Gerald Schaefer Cc: Kefeng Wang Cc: Benjamin Herrenschmidt Cc: Heiko Carstens Cc: Paul Mackerras Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/locking/lockdep.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'kernel/locking') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index bf1c00c881e4..8e118caf835e 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -788,6 +788,21 @@ static int very_verbose(struct lock_class *class) * Is this the address of a static object: */ #ifdef __KERNEL__ +/* + * Check if an address is part of freed initmem. After initmem is freed, + * memory can be allocated from it, and such allocations would then have + * addresses within the range [_stext, _end]. + */ +#ifndef arch_is_kernel_initmem_freed +static int arch_is_kernel_initmem_freed(unsigned long addr) +{ + if (system_state < SYSTEM_FREEING_INITMEM) + return 0; + + return init_section_contains((void *)addr, 1); +} +#endif + static int static_obj(const void *obj) { unsigned long start = (unsigned long) &_stext, -- cgit v1.2.3 From 1b1ad288b8f1b11f83396e537003722897ecc12b Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Mon, 8 Nov 2021 18:33:43 -0800 Subject: kallsyms: remove arch specific text and data check Patch series "sections: Unify kernel sections range check and use", v4. There are three head files(kallsyms.h, kernel.h and sections.h) which include the kernel sections range check, let's make some cleanup and unify them. 1. cleanup arch specific text/data check and fix address boundary check in kallsyms.h 2. make all the basic/core kernel range check function into sections.h 3. update all the callers, and use the helper in sections.h to simplify the code After this series, we have 5 APIs about kernel sections range check in sections.h * is_kernel_rodata() --- already in sections.h * is_kernel_core_data() --- come from core_kernel_data() in kernel.h * is_kernel_inittext() --- come from kernel.h and kallsyms.h * __is_kernel_text() --- add new internal helper * __is_kernel() --- add new internal helper Note: For the last two helpers, people should not use directly, consider to use corresponding function in kallsyms.h. This patch (of 11): Remove arch specific text and data check after commit 4ba66a976072 ("arch: remove blackfin port"), no need arch-specific text/data check. Link: https://lkml.kernel.org/r/20210930071143.63410-1-wangkefeng.wang@huawei.com Link: https://lkml.kernel.org/r/20210930071143.63410-2-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Reviewed-by: Sergey Senozhatsky Cc: Arnd Bergmann Cc: Steven Rostedt Cc: Ingo Molnar Cc: David S. Miller Cc: Alexei Starovoitov Cc: Andrey Ryabinin Cc: Michael Ellerman Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Christophe Leroy Cc: Kefeng Wang Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Borislav Petkov Cc: Dmitry Vyukov Cc: Ivan Kokshaysky Cc: Matt Turner Cc: Michal Simek Cc: Petr Mladek Cc: Richard Henderson Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/locking/lockdep.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel/locking') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 8e118caf835e..f2d3bf4960f4 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -818,9 +818,6 @@ static int static_obj(const void *obj) if ((addr >= start) && (addr < end)) return 1; - if (arch_is_kernel_data(addr)) - return 1; - /* * in-kernel percpu var? */ -- cgit v1.2.3 From 2202e15b2b1a946ce760d96748cd7477589701ab Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 4 Nov 2021 13:27:06 +0100 Subject: kernel/locking: Use a pointer in ww_mutex_trylock(). mutex_acquire_nest() expects a pointer, pass the pointer. Fixes: 12235da8c80a1 ("kernel/locking: Add context to ww_mutex_trylock()") Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20211104122706.frk52zxbjorso2kv@linutronix.de --- kernel/locking/ww_rt_mutex.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/locking') diff --git a/kernel/locking/ww_rt_mutex.c b/kernel/locking/ww_rt_mutex.c index 0e00205cf467..d1473c624105 100644 --- a/kernel/locking/ww_rt_mutex.c +++ b/kernel/locking/ww_rt_mutex.c @@ -26,7 +26,7 @@ int ww_mutex_trylock(struct ww_mutex *lock, struct ww_acquire_ctx *ww_ctx) if (__rt_mutex_trylock(&rtm->rtmutex)) { ww_mutex_set_context_fastpath(lock, ww_ctx); - mutex_acquire_nest(&rtm->dep_map, 0, 1, ww_ctx->dep_map, _RET_IP_); + mutex_acquire_nest(&rtm->dep_map, 0, 1, &ww_ctx->dep_map, _RET_IP_); return 1; } -- cgit v1.2.3 From d257cc8cb8d5355ffc43a96bab94db7b5a324803 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Mon, 15 Nov 2021 20:29:12 -0500 Subject: locking/rwsem: Make handoff bit handling more consistent There are some inconsistency in the way that the handoff bit is being handled in readers and writers that lead to a race condition. Firstly, when a queue head writer set the handoff bit, it will clear it when the writer is being killed or interrupted on its way out without acquiring the lock. That is not the case for a queue head reader. The handoff bit will simply be inherited by the next waiter. Secondly, in the out_nolock path of rwsem_down_read_slowpath(), both the waiter and handoff bits are cleared if the wait queue becomes empty. For rwsem_down_write_slowpath(), however, the handoff bit is not checked and cleared if the wait queue is empty. This can potentially make the handoff bit set with empty wait queue. Worse, the situation in rwsem_down_write_slowpath() relies on wstate, a variable set outside of the critical section containing the ->count manipulation, this leads to race condition where RWSEM_FLAG_HANDOFF can be double subtracted, corrupting ->count. To make the handoff bit handling more consistent and robust, extract out handoff bit clearing code into the new rwsem_del_waiter() helper function. Also, completely eradicate wstate; always evaluate everything inside the same critical section. The common function will only use atomic_long_andnot() to clear bits when the wait queue is empty to avoid possible race condition. If the first waiter with handoff bit set is killed or interrupted to exit the slowpath without acquiring the lock, the next waiter will inherit the handoff bit. While at it, simplify the trylock for loop in rwsem_down_write_slowpath() to make it easier to read. Fixes: 4f23dbc1e657 ("locking/rwsem: Implement lock handoff to prevent lock starvation") Reported-by: Zhenhua Ma Suggested-by: Peter Zijlstra Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20211116012912.723980-1-longman@redhat.com --- kernel/locking/rwsem.c | 171 ++++++++++++++++++++++++------------------------- 1 file changed, 85 insertions(+), 86 deletions(-) (limited to 'kernel/locking') diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index c51387a43265..e039cf1605af 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -105,9 +105,9 @@ * atomic_long_cmpxchg() will be used to obtain writer lock. * * There are three places where the lock handoff bit may be set or cleared. - * 1) rwsem_mark_wake() for readers. - * 2) rwsem_try_write_lock() for writers. - * 3) Error path of rwsem_down_write_slowpath(). + * 1) rwsem_mark_wake() for readers -- set, clear + * 2) rwsem_try_write_lock() for writers -- set, clear + * 3) rwsem_del_waiter() -- clear * * For all the above cases, wait_lock will be held. A writer must also * be the first one in the wait_list to be eligible for setting the handoff @@ -334,6 +334,9 @@ struct rwsem_waiter { struct task_struct *task; enum rwsem_waiter_type type; unsigned long timeout; + + /* Writer only, not initialized in reader */ + bool handoff_set; }; #define rwsem_first_waiter(sem) \ list_first_entry(&sem->wait_list, struct rwsem_waiter, list) @@ -344,12 +347,6 @@ enum rwsem_wake_type { RWSEM_WAKE_READ_OWNED /* Waker thread holds the read lock */ }; -enum writer_wait_state { - WRITER_NOT_FIRST, /* Writer is not first in wait list */ - WRITER_FIRST, /* Writer is first in wait list */ - WRITER_HANDOFF /* Writer is first & handoff needed */ -}; - /* * The typical HZ value is either 250 or 1000. So set the minimum waiting * time to at least 4ms or 1 jiffy (if it is higher than 4ms) in the wait @@ -365,6 +362,31 @@ enum writer_wait_state { */ #define MAX_READERS_WAKEUP 0x100 +static inline void +rwsem_add_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter) +{ + lockdep_assert_held(&sem->wait_lock); + list_add_tail(&waiter->list, &sem->wait_list); + /* caller will set RWSEM_FLAG_WAITERS */ +} + +/* + * Remove a waiter from the wait_list and clear flags. + * + * Both rwsem_mark_wake() and rwsem_try_write_lock() contain a full 'copy' of + * this function. Modify with care. + */ +static inline void +rwsem_del_waiter(struct rw_semaphore *sem, struct rwsem_waiter *waiter) +{ + lockdep_assert_held(&sem->wait_lock); + list_del(&waiter->list); + if (likely(!list_empty(&sem->wait_list))) + return; + + atomic_long_andnot(RWSEM_FLAG_HANDOFF | RWSEM_FLAG_WAITERS, &sem->count); +} + /* * handle the lock release when processes blocked on it that can now run * - if we come here from up_xxxx(), then the RWSEM_FLAG_WAITERS bit must @@ -376,6 +398,8 @@ enum writer_wait_state { * preferably when the wait_lock is released * - woken process blocks are discarded from the list after having task zeroed * - writers are only marked woken if downgrading is false + * + * Implies rwsem_del_waiter() for all woken readers. */ static void rwsem_mark_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type, @@ -490,18 +514,25 @@ static void rwsem_mark_wake(struct rw_semaphore *sem, adjustment = woken * RWSEM_READER_BIAS - adjustment; lockevent_cond_inc(rwsem_wake_reader, woken); + + oldcount = atomic_long_read(&sem->count); if (list_empty(&sem->wait_list)) { - /* hit end of list above */ + /* + * Combined with list_move_tail() above, this implies + * rwsem_del_waiter(). + */ adjustment -= RWSEM_FLAG_WAITERS; + if (oldcount & RWSEM_FLAG_HANDOFF) + adjustment -= RWSEM_FLAG_HANDOFF; + } else if (woken) { + /* + * When we've woken a reader, we no longer need to force + * writers to give up the lock and we can clear HANDOFF. + */ + if (oldcount & RWSEM_FLAG_HANDOFF) + adjustment -= RWSEM_FLAG_HANDOFF; } - /* - * When we've woken a reader, we no longer need to force writers - * to give up the lock and we can clear HANDOFF. - */ - if (woken && (atomic_long_read(&sem->count) & RWSEM_FLAG_HANDOFF)) - adjustment -= RWSEM_FLAG_HANDOFF; - if (adjustment) atomic_long_add(adjustment, &sem->count); @@ -532,12 +563,12 @@ static void rwsem_mark_wake(struct rw_semaphore *sem, * race conditions between checking the rwsem wait list and setting the * sem->count accordingly. * - * If wstate is WRITER_HANDOFF, it will make sure that either the handoff - * bit is set or the lock is acquired with handoff bit cleared. + * Implies rwsem_del_waiter() on success. */ static inline bool rwsem_try_write_lock(struct rw_semaphore *sem, - enum writer_wait_state wstate) + struct rwsem_waiter *waiter) { + bool first = rwsem_first_waiter(sem) == waiter; long count, new; lockdep_assert_held(&sem->wait_lock); @@ -546,13 +577,19 @@ static inline bool rwsem_try_write_lock(struct rw_semaphore *sem, do { bool has_handoff = !!(count & RWSEM_FLAG_HANDOFF); - if (has_handoff && wstate == WRITER_NOT_FIRST) - return false; + if (has_handoff) { + if (!first) + return false; + + /* First waiter inherits a previously set handoff bit */ + waiter->handoff_set = true; + } new = count; if (count & RWSEM_LOCK_MASK) { - if (has_handoff || (wstate != WRITER_HANDOFF)) + if (has_handoff || (!rt_task(waiter->task) && + !time_after(jiffies, waiter->timeout))) return false; new |= RWSEM_FLAG_HANDOFF; @@ -569,9 +606,17 @@ static inline bool rwsem_try_write_lock(struct rw_semaphore *sem, * We have either acquired the lock with handoff bit cleared or * set the handoff bit. */ - if (new & RWSEM_FLAG_HANDOFF) + if (new & RWSEM_FLAG_HANDOFF) { + waiter->handoff_set = true; + lockevent_inc(rwsem_wlock_handoff); return false; + } + /* + * Have rwsem_try_write_lock() fully imply rwsem_del_waiter() on + * success. + */ + list_del(&waiter->list); rwsem_set_owner(sem); return true; } @@ -956,7 +1001,7 @@ queue: } adjustment += RWSEM_FLAG_WAITERS; } - list_add_tail(&waiter.list, &sem->wait_list); + rwsem_add_waiter(sem, &waiter); /* we're now waiting on the lock, but no longer actively locking */ count = atomic_long_add_return(adjustment, &sem->count); @@ -1002,11 +1047,7 @@ queue: return sem; out_nolock: - list_del(&waiter.list); - if (list_empty(&sem->wait_list)) { - atomic_long_andnot(RWSEM_FLAG_WAITERS|RWSEM_FLAG_HANDOFF, - &sem->count); - } + rwsem_del_waiter(sem, &waiter); raw_spin_unlock_irq(&sem->wait_lock); __set_current_state(TASK_RUNNING); lockevent_inc(rwsem_rlock_fail); @@ -1020,9 +1061,7 @@ static struct rw_semaphore * rwsem_down_write_slowpath(struct rw_semaphore *sem, int state) { long count; - enum writer_wait_state wstate; struct rwsem_waiter waiter; - struct rw_semaphore *ret = sem; DEFINE_WAKE_Q(wake_q); /* do optimistic spinning and steal lock if possible */ @@ -1038,16 +1077,13 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state) waiter.task = current; waiter.type = RWSEM_WAITING_FOR_WRITE; waiter.timeout = jiffies + RWSEM_WAIT_TIMEOUT; + waiter.handoff_set = false; raw_spin_lock_irq(&sem->wait_lock); - - /* account for this before adding a new element to the list */ - wstate = list_empty(&sem->wait_list) ? WRITER_FIRST : WRITER_NOT_FIRST; - - list_add_tail(&waiter.list, &sem->wait_list); + rwsem_add_waiter(sem, &waiter); /* we're now waiting on the lock */ - if (wstate == WRITER_NOT_FIRST) { + if (rwsem_first_waiter(sem) != &waiter) { count = atomic_long_read(&sem->count); /* @@ -1083,13 +1119,16 @@ wait: /* wait until we successfully acquire the lock */ set_current_state(state); for (;;) { - if (rwsem_try_write_lock(sem, wstate)) { + if (rwsem_try_write_lock(sem, &waiter)) { /* rwsem_try_write_lock() implies ACQUIRE on success */ break; } raw_spin_unlock_irq(&sem->wait_lock); + if (signal_pending_state(state, current)) + goto out_nolock; + /* * After setting the handoff bit and failing to acquire * the lock, attempt to spin on owner to accelerate lock @@ -1098,7 +1137,7 @@ wait: * In this case, we attempt to acquire the lock again * without sleeping. */ - if (wstate == WRITER_HANDOFF) { + if (waiter.handoff_set) { enum owner_state owner_state; preempt_disable(); @@ -1109,66 +1148,26 @@ wait: goto trylock_again; } - /* Block until there are no active lockers. */ - for (;;) { - if (signal_pending_state(state, current)) - goto out_nolock; - - schedule(); - lockevent_inc(rwsem_sleep_writer); - set_current_state(state); - /* - * If HANDOFF bit is set, unconditionally do - * a trylock. - */ - if (wstate == WRITER_HANDOFF) - break; - - if ((wstate == WRITER_NOT_FIRST) && - (rwsem_first_waiter(sem) == &waiter)) - wstate = WRITER_FIRST; - - count = atomic_long_read(&sem->count); - if (!(count & RWSEM_LOCK_MASK)) - break; - - /* - * The setting of the handoff bit is deferred - * until rwsem_try_write_lock() is called. - */ - if ((wstate == WRITER_FIRST) && (rt_task(current) || - time_after(jiffies, waiter.timeout))) { - wstate = WRITER_HANDOFF; - lockevent_inc(rwsem_wlock_handoff); - break; - } - } + schedule(); + lockevent_inc(rwsem_sleep_writer); + set_current_state(state); trylock_again: raw_spin_lock_irq(&sem->wait_lock); } __set_current_state(TASK_RUNNING); - list_del(&waiter.list); raw_spin_unlock_irq(&sem->wait_lock); lockevent_inc(rwsem_wlock); - - return ret; + return sem; out_nolock: __set_current_state(TASK_RUNNING); raw_spin_lock_irq(&sem->wait_lock); - list_del(&waiter.list); - - if (unlikely(wstate == WRITER_HANDOFF)) - atomic_long_add(-RWSEM_FLAG_HANDOFF, &sem->count); - - if (list_empty(&sem->wait_list)) - atomic_long_andnot(RWSEM_FLAG_WAITERS, &sem->count); - else + rwsem_del_waiter(sem, &waiter); + if (!list_empty(&sem->wait_list)) rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q); raw_spin_unlock_irq(&sem->wait_lock); wake_up_q(&wake_q); lockevent_inc(rwsem_wlock_fail); - return ERR_PTR(-EINTR); } -- cgit v1.2.3 From 14c24048841151548a3f4d9e218510c844c1b737 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Thu, 18 Nov 2021 17:44:55 +0800 Subject: locking/rwsem: Optimize down_read_trylock() under highly contended case MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We found that a process with 10 thousnads threads has been encountered a regression problem from Linux-v4.14 to Linux-v5.4. It is a kind of workload which will concurrently allocate lots of memory in different threads sometimes. In this case, we will see the down_read_trylock() with a high hotspot. Therefore, we suppose that rwsem has a regression at least since Linux-v5.4. In order to easily debug this problem, we write a simply benchmark to create the similar situation lile the following. ```c++ #include #include #include #include #include #include #include #include #include volatile int mutex; void trigger(int cpu, char* ptr, std::size_t sz) { cpu_set_t set; CPU_ZERO(&set); CPU_SET(cpu, &set); assert(pthread_setaffinity_np(pthread_self(), sizeof(set), &set) == 0); while (mutex); for (std::size_t i = 0; i < sz; i += 4096) { *ptr = '\0'; ptr += 4096; } } int main(int argc, char* argv[]) { std::size_t sz = 100; if (argc > 1) sz = atoi(argv[1]); auto nproc = std::thread::hardware_concurrency(); std::vector thr; sz <<= 30; auto* ptr = mmap(nullptr, sz, PROT_READ | PROT_WRITE, MAP_ANON | MAP_PRIVATE, -1, 0); assert(ptr != MAP_FAILED); char* cptr = static_cast(ptr); auto run = sz / nproc; run = (run >> 12) << 12; mutex = 1; for (auto i = 0U; i < nproc; ++i) { thr.emplace_back(std::thread([i, cptr, run]() { trigger(i, cptr, run); })); cptr += run; } rusage usage_start; getrusage(RUSAGE_SELF, &usage_start); auto start = std::chrono::system_clock::now(); mutex = 0; for (auto& t : thr) t.join(); rusage usage_end; getrusage(RUSAGE_SELF, &usage_end); auto end = std::chrono::system_clock::now(); timeval utime; timeval stime; timersub(&usage_end.ru_utime, &usage_start.ru_utime, &utime); timersub(&usage_end.ru_stime, &usage_start.ru_stime, &stime); printf("usr: %ld.%06ld\n", utime.tv_sec, utime.tv_usec); printf("sys: %ld.%06ld\n", stime.tv_sec, stime.tv_usec); printf("real: %lu\n", std::chrono::duration_cast(end - start).count()); return 0; } ``` The functionality of above program is simply which cr