From d16dbd1b8a29bb9f8aca2c2f3bd1a0d2b7621126 Mon Sep 17 00:00:00 2001 From: Yuyang Du Date: Mon, 6 May 2019 16:19:22 +0800 Subject: locking/lockdep: Update obsolete struct field description The lock_chain struct definition has outdated comment, update it and add struct member description. Signed-off-by: Yuyang Du Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bvanassche@acm.org Cc: frederic@kernel.org Cc: ming.lei@redhat.com Cc: will.deacon@arm.com Link: https://lkml.kernel.org/r/20190506081939.74287-7-duyuyang@gmail.com Signed-off-by: Ingo Molnar --- include/linux/lockdep.h | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 6e2377e6c1d6..851d44fa5457 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -203,11 +203,17 @@ struct lock_list { struct lock_list *parent; }; -/* - * We record lock dependency chains, so that we can cache them: +/** + * struct lock_chain - lock dependency chain record + * + * @irq_context: the same as irq_context in held_lock below + * @depth: the number of held locks in this chain + * @base: the index in chain_hlocks for this chain + * @entry: the collided lock chains in lock_chain hash list + * @chain_key: the hash key of this lock_chain */ struct lock_chain { - /* see BUILD_BUG_ON()s in lookup_chain_cache() */ + /* see BUILD_BUG_ON()s in add_chain_cache() */ unsigned int irq_context : 2, depth : 6, base : 24; -- cgit v1.2.3 From e196e479a3b844da6e6e71e0d2a8694040cb4e52 Mon Sep 17 00:00:00 2001 From: Yuyang Du Date: Mon, 6 May 2019 16:19:23 +0800 Subject: locking/lockdep: Use lockdep_init_task for task initiation consistently Despite that there is a lockdep_init_task() which does nothing, lockdep initiates tasks by assigning lockdep fields and does so inconsistently. Fix this by using lockdep_init_task(). Signed-off-by: Yuyang Du Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bvanassche@acm.org Cc: frederic@kernel.org Cc: ming.lei@redhat.com Cc: will.deacon@arm.com Link: https://lkml.kernel.org/r/20190506081939.74287-8-duyuyang@gmail.com Signed-off-by: Ingo Molnar --- include/linux/lockdep.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 851d44fa5457..5d05b8149f19 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -287,6 +287,8 @@ extern void lockdep_free_key_range(void *start, unsigned long size); extern asmlinkage void lockdep_sys_exit(void); extern void lockdep_set_selftest_task(struct task_struct *task); +extern void lockdep_init_task(struct task_struct *task); + extern void lockdep_off(void); extern void lockdep_on(void); @@ -411,6 +413,10 @@ extern void lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie); #else /* !CONFIG_LOCKDEP */ +static inline void lockdep_init_task(struct task_struct *task) +{ +} + static inline void lockdep_off(void) { } @@ -503,7 +509,6 @@ enum xhlock_context_t { { .name = (_name), .key = (void *)(_key), } static inline void lockdep_invariant_state(bool force) {} -static inline void lockdep_init_task(struct task_struct *task) {} static inline void lockdep_free_task(struct task_struct *task) {} #ifdef CONFIG_LOCK_STAT -- cgit v1.2.3 From f6ec8829ac9d59b637366c13038f15d6f6156fe1 Mon Sep 17 00:00:00 2001 From: Yuyang Du Date: Mon, 6 May 2019 16:19:24 +0800 Subject: locking/lockdep: Define INITIAL_CHAIN_KEY for chain keys to start with Chain keys are computed using Jenkins hash function, which needs an initial hash to start with. Dedicate a macro to make this clear and configurable. A later patch changes this initial chain key. Signed-off-by: Yuyang Du Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bvanassche@acm.org Cc: frederic@kernel.org Cc: ming.lei@redhat.com Cc: will.deacon@arm.com Link: https://lkml.kernel.org/r/20190506081939.74287-9-duyuyang@gmail.com Signed-off-by: Ingo Molnar --- include/linux/lockdep.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 5d05b8149f19..d4e69595dbd4 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -229,6 +229,7 @@ struct lock_chain { * bitfield and hitting the BUG in hlock_class(). */ #define MAX_LOCKDEP_KEYS ((1UL << MAX_LOCKDEP_KEYS_BITS) - 1) +#define INITIAL_CHAIN_KEY 0 struct held_lock { /* -- cgit v1.2.3 From 01bb6f0af992a1e6b7797d92fd31a7864872e347 Mon Sep 17 00:00:00 2001 From: Yuyang Du Date: Mon, 6 May 2019 16:19:25 +0800 Subject: locking/lockdep: Change the range of class_idx in held_lock struct held_lock->class_idx is used to point to the class of the held lock. The index is shifted by 1 to make index 0 mean no class, which results in class index shifting back and forth but is not worth doing so. The reason is: (1) there will be no "no-class" held_lock to begin with, and (2) index 0 seems to be used for error checking, but if something wrong indeed happened, the index can't be counted on to distinguish it as that something won't set the class_idx to 0 on purpose to tell us it is wrong. Therefore, change the index to start from 0. This saves a lot of back-and-forth shifts and a class slot back to lock_classes. Since index 0 is now used for lock class, we change the initial chain key to -1 to avoid key collision, which is due to the fact that __jhash_mix(0, 0, 0) = 0. Actually, the initial chain key can be any arbitrary value other than 0. In addition, a bitmap is maintained to keep track of the used lock classes, and we check the validity of the held lock against that bitmap. Signed-off-by: Yuyang Du Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: bvanassche@acm.org Cc: frederic@kernel.org Cc: ming.lei@redhat.com Cc: will.deacon@arm.com Link: https://lkml.kernel.org/r/20190506081939.74287-10-duyuyang@gmail.com Signed-off-by: Ingo Molnar --- include/linux/lockdep.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index d4e69595dbd4..30a0f81aa130 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -223,13 +223,8 @@ struct lock_chain { }; #define MAX_LOCKDEP_KEYS_BITS 13 -/* - * Subtract one because we offset hlock->class_idx by 1 in order - * to make 0 mean no class. This avoids overflowing the class_idx - * bitfield and hitting the BUG in hlock_class(). - */ -#define MAX_LOCKDEP_KEYS ((1UL << MAX_LOCKDEP_KEYS_BITS) - 1) -#define INITIAL_CHAIN_KEY 0 +#define MAX_LOCKDEP_KEYS (1UL << MAX_LOCKDEP_KEYS_BITS) +#define INITIAL_CHAIN_KEY -1 struct held_lock { /* @@ -254,6 +249,11 @@ struct held_lock { u64 waittime_stamp; u64 holdtime_stamp; #endif + /* + * class_idx is zero-indexed; it points to the element in + * lock_classes this held lock instance belongs to. class_idx is in + * the range from 0 to (MAX_LOCKDEP_KEYS-1) inclusive. + */ unsigned int class_idx:MAX_LOCKDEP_KEYS_BITS; /* * The lock-stack is unified in that the lock chains of interrupt -- cgit v1.2.3 From 3724921396dd1a07c93e3516b8d7c9ff570d17a9 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Wed, 22 May 2019 14:22:48 +0100 Subject: locking/atomic: Use s64 for atomic64_t on 64-bit Now that all architectures use 64 consistently as the base type for the atomic64 API, let's have the CONFIG_64BIT definition of atomic64_t use s64 as the underlying type for atomic64_t, rather than long, matching the generated headers. On architectures where atomic64_read(v) is READ_ONCE(v->counter), this patch will cause the return type of atomic64_read() to be s64. As of this patch, the atomic64 API can be relied upon to consistently return s64 where a value rather than boolean condition is returned. This should make code more robust, and simpler, allowing for the removal of casts previously required to ensure consistent types. Signed-off-by: Mark Rutland Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Will Deacon Cc: aou@eecs.berkeley.edu Cc: arnd@arndb.de Cc: bp@alien8.de Cc: catalin.marinas@arm.com Cc: davem@davemloft.net Cc: fenghua.yu@intel.com Cc: heiko.carstens@de.ibm.com Cc: herbert@gondor.apana.org.au Cc: ink@jurassic.park.msu.ru Cc: jhogan@kernel.org Cc: linux@armlinux.org.uk Cc: mattst88@gmail.com Cc: mpe@ellerman.id.au Cc: palmer@sifive.com Cc: paul.burton@mips.com Cc: paulus@samba.org Cc: ralf@linux-mips.org Cc: rth@twiddle.net Cc: tony.luck@intel.com Cc: vgupta@synopsys.com Link: https://lkml.kernel.org/r/20190522132250.26499-17-mark.rutland@arm.com Signed-off-by: Ingo Molnar --- include/linux/types.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/types.h b/include/linux/types.h index 231114ae38f4..05030f608be3 100644 --- a/include/linux/types.h +++ b/include/linux/types.h @@ -174,7 +174,7 @@ typedef struct { #ifdef CONFIG_64BIT typedef struct { - long counter; + s64 counter; } atomic64_t; #endif -- cgit v1.2.3 From c2ba8a15f310d915f8748dd8324c91c82b12b5ff Mon Sep 17 00:00:00 2001 From: Daniel Bristot de Oliveira Date: Wed, 12 Jun 2019 11:57:30 +0200 Subject: jump_label: Batch updates if arch supports it If the architecture supports the batching of jump label updates, use it! An easy way to see the benefits of this patch is switching the schedstats on and off. For instance: -------------------------- %< ---------------------------- #!/bin/sh while [ true ]; do sysctl -w kernel.sched_schedstats=1 sleep 2 sysctl -w kernel.sched_schedstats=0 sleep 2 done -------------------------- >% ---------------------------- while watching the IPI count: -------------------------- %< ---------------------------- # watch -n1 "cat /proc/interrupts | grep Function" -------------------------- >% ---------------------------- With the current mode, it is possible to see +- 168 IPIs each 2 seconds, while with this patch the number of IPIs goes to 3 each 2 seconds. Regarding the performance impact of this patch set, I made two measurements: The time to update a key (the task that is causing the change) The time to run the int3 handler (the side effect on a thread that hits the code being changed) The schedstats static key was chosen as the key to being switched on and off. The reason being is that it is used in more than 56 places, in a hot path. The change in the schedstats static key will be done with the following command: while [ true ]; do sysctl -w kernel.sched_schedstats=1 usleep 500000 sysctl -w kernel.sched_schedstats=0 usleep 500000 done In this way, they key will be updated twice per second. To force the hit of the int3 handler, the system will also run a kernel compilation with two jobs per CPU. The test machine is a two nodes/24 CPUs box with an Intel Xeon processor @2.27GHz. Regarding the update part, on average, the regular kernel takes 57 ms to update the schedstats key, while the kernel with the batch updates takes just 1.4 ms on average. Although it seems to be too good to be true, it makes sense: the schedstats key is used in 56 places, so it was expected that it would take around 56 times to update the keys with the current implementation, as the IPIs are the most expensive part of the update. Regarding the int3 handler, the non-batch handler takes 45 ns on average, while the batch version takes around 180 ns. At first glance, it seems to be a high value. But it is not, considering that it is doing 56 updates, rather than one! It is taking four times more, only. This gain is possible because the patch uses a binary search in the vector: log2(56)=5.8. So, it was expected to have an overhead within four times. (voice of tv propaganda) But, that is not all! As the int3 handler keeps on for a shorter period (because the update part is on for a shorter time), the number of hits in the int3 handler decreased by 10%. The question then is: Is it worth paying the price of "135 ns" more in the int3 handler? Considering that, in this test case, we are saving the handling of 53 IPIs, that takes more than these 135 ns, it seems to be a meager price to be paid. Moreover, the test case was forcing the hit of the int3, in practice, it does not take that often. While the IPI takes place on all CPUs, hitting the int3 handler or not! For instance, in an isolated CPU with a process running in user-space (nohz_full use-case), the chances of hitting the int3 handler is barely zero, while there is no way to avoid the IPIs. By bounding the IPIs, we are improving a lot this scenario. Signed-off-by: Daniel Bristot de Oliveira Signed-off-by: Peter Zijlstra (Intel) Cc: Borislav Petkov Cc: Chris von Recklinghausen Cc: Clark Williams Cc: Greg Kroah-Hartman Cc: H. Peter Anvin Cc: Jason Baron Cc: Jiri Kosina Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Marcelo Tosatti Cc: Masami Hiramatsu Cc: Peter Zijlstra Cc: Scott Wood Cc: Steven Rostedt (VMware) Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/acc891dbc2dbc9fd616dd680529a2337b1d1274c.1560325897.git.bristot@redhat.com Signed-off-by: Ingo Molnar --- include/linux/jump_label.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h index 3e113a1fa0f1..3526c0aee954 100644 --- a/include/linux/jump_label.h +++ b/include/linux/jump_label.h @@ -215,6 +215,9 @@ extern void arch_jump_label_transform(struct jump_entry *entry, enum jump_label_type type); extern void arch_jump_label_transform_static(struct jump_entry *entry, enum jump_label_type type); +extern bool arch_jump_label_transform_queue(struct jump_entry *entry, + enum jump_label_type type); +extern void arch_jump_label_transform_apply(void); extern int jump_label_text_reserved(void *start, void *end); extern void static_key_slow_inc(struct static_key *key); extern void static_key_slow_dec(struct static_key *key); -- cgit v1.2.3 From 9ffbe8ac05dbb4ab4a4836a55a47fc6be945a38f Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Fri, 31 May 2019 13:06:51 +0300 Subject: locking/lockdep: Rename lockdep_assert_held_exclusive() -> lockdep_assert_held_write() All callers of lockdep_assert_held_exclusive() use it to verify the correct locking state of either a semaphore (ldisc_sem in tty, mmap_sem for perf events, i_rwsem of inode for dax) or rwlock by apparmor. Thus it makes sense to rename _exclusive to _write since that's the semantics callers care. Additionally there is already lockdep_assert_held_read(), which this new naming is more consistent with. No functional changes. Signed-off-by: Nikolay Borisov Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20190531100651.3969-1-nborisov@suse.com Signed-off-by: Ingo Molnar --- include/linux/lockdep.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 30a0f81aa130..151d55711082 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -394,7 +394,7 @@ extern void lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie); WARN_ON(debug_locks && !lockdep_is_held(l)); \ } while (0) -#define lockdep_assert_held_exclusive(l) do { \ +#define lockdep_assert_held_write(l) do { \ WARN_ON(debug_locks && !lockdep_is_held_type(l, 0)); \ } while (0) @@ -479,7 +479,7 @@ struct lockdep_map { }; #define lockdep_is_held_type(l, r) (1) #define lockdep_assert_held(l) do { (void)(l); } while (0) -#define lockdep_assert_held_exclusive(l) do { (void)(l); } while (0) +#define lockdep_assert_held_write(l) do { (void)(l); } while (0) #define lockdep_assert_held_read(l) do { (void)(l); } while (0) #define lockdep_assert_held_once(l) do { (void)(l); } while (0) -- cgit v1.2.3 From c71fd893f614f205dbc050d60299cc5496491c19 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Mon, 20 May 2019 16:59:00 -0400 Subject: locking/rwsem: Make owner available even if !CONFIG_RWSEM_SPIN_ON_OWNER The owner field in the rw_semaphore structure is used primarily for optimistic spinning. However, identifying the rwsem owner can also be helpful in debugging as well as tracing locking related issues when analyzing crash dump. The owner field may also store state information that can be important to the operation of the rwsem. So the owner field is now made a permanent member of the rw_semaphore structure irrespective of CONFIG_RWSEM_SPIN_ON_OWNER. Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Cc: Borislav Petkov Cc: Davidlohr Bueso Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tim Chen Cc: Will Deacon Cc: huang ying Link: https://lkml.kernel.org/r/20190520205918.22251-2-longman@redhat.com Signed-off-by: Ingo Molnar --- include/linux/rwsem.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h index 2ea18a3def04..148983e21d47 100644 --- a/include/linux/rwsem.h +++ b/include/linux/rwsem.h @@ -34,12 +34,12 @@ */ struct rw_semaphore { atomic_long_t count; -#ifdef CONFIG_RWSEM_SPIN_ON_OWNER /* - * Write owner. Used as a speculative check to see - * if the owner is running on the cpu. + * Write owner or one of the read owners. Can be used as a + * speculative check to see if the owner is running on the cpu. */ struct task_struct *owner; +#ifdef CONFIG_RWSEM_SPIN_ON_OWNER struct optimistic_spin_queue osq; /* spinner MCS lock */ #endif raw_spinlock_t wait_lock; @@ -73,13 +73,14 @@ static inline int rwsem_is_locked(struct rw_semaphore *sem) #endif #ifdef CONFIG_RWSEM_SPIN_ON_OWNER -#define __RWSEM_OPT_INIT(lockname) , .osq = OSQ_LOCK_UNLOCKED, .owner = NULL +#define __RWSEM_OPT_INIT(lockname) , .osq = OSQ_LOCK_UNLOCKED #else #define __RWSEM_OPT_INIT(lockname) #endif #define __RWSEM_INITIALIZER(name) \ { __RWSEM_INIT_COUNT(name), \ + .owner = NULL, \ .wait_list = LIST_HEAD_INIT((name).wait_list), \ .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock) \ __RWSEM_OPT_INIT(name) \ -- cgit v1.2.3 From 00f3c5a3df2c1e3dab14d0dd2b71f852d46be97f Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Mon, 20 May 2019 16:59:07 -0400 Subject: locking/rwsem: Always release wait_lock before waking up tasks With the use of wake_q, we can do task wakeups without holding the wait_lock. There is one exception in the rwsem code, though. It is when the writer in the slowpath detects that there are waiters ahead but the rwsem is not held by a writer. This can lead to a long wait_lock hold time especially when a large number of readers are to be woken up. Remediate this situation by releasing the wait_lock before waking up tasks and re-acquiring it afterward. The rwsem_try_write_lock() function is also modified to read the rwsem count directly to avoid stale count value. Suggested-by: Peter Zijlstra Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Cc: Borislav Petkov Cc: Davidlohr Bueso Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Thomas Gleixner Cc: Tim Chen Cc: Will Deacon Cc: huang ying Link: https://lkml.kernel.org/r/20190520205918.22251-9-longman@redhat.com Signed-off-by: Ingo Molnar --- include/linux/sched/wake_q.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sched/wake_q.h b/include/linux/sched/wake_q.h index ad826d2a4557..26a2013ac39c 100644 --- a/include/linux/sched/wake_q.h +++ b/include/linux/sched/wake_q.h @@ -51,6 +51,11 @@ static inline void wake_q_init(struct wake_q_head *head) head->lastp = &head->first; } +static inline bool wake_q_empty(struct wake_q_head *head) +{ + return head->first == WAKE_Q_TAIL; +} + extern void wake_q_add(struct wake_q_head *head, struct task_struct *task); extern void wake_q_add_safe(struct wake_q_head *head, struct task_struct *task); extern void wake_up_q(struct wake_q_head *head); -- cgit v1.2.3 From 02f1082b003a0cd48f48f12533d969cdbf1c2b63 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Mon, 20 May 2019 16:59:10 -0400 Subject: locking/rwsem: Clarify usage of owner's nonspinaable bit Bit 1 of sem->owner (RWSEM_ANONYMOUSLY_OWNED) is used to designate an anonymous owner - readers or an anonymous writer. The setting of this anonymous bit is used as an indicator that optimistic spinning cannot be done on this rwsem. With the upcoming reader optimistic spinning patches, a reader-owned rwsem can be spinned on for a limit period of time. We still need this bit to indicate a rwsem is nonspinnable, but not setting this bit loses its meaning that the owner is known. So rename the bit to RWSEM_NONSPINNABLE to clarify its meaning. This patch also fixes a DEBUG_RWSEMS_WARN_ON() bug in __up_write(). Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Cc: Borislav Petkov Cc: Davidlohr Bueso Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tim Chen Cc: Will Deacon Cc: huang ying Link: https://lkml.kernel.org/r/20190520205918.22251-12-longman@redhat.com Signed-off-by: Ingo Molnar --- include/linux/rwsem.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h index 148983e21d47..bb76e82398b2 100644 --- a/include/linux/rwsem.h +++ b/include/linux/rwsem.h @@ -50,7 +50,7 @@ struct rw_semaphore { }; /* - * Setting bit 1 of the owner field but not bit 0 will indicate + * Setting all bits of the owner field except bit 0 will indicate * that the rwsem is writer-owned with an unknown owner. */ #define RWSEM_OWNER_UNKNOWN ((struct task_struct *)-2L) -- cgit v1.2.3 From 94a9717b3c40e77a54e4afacd8f19a9a86bfeead Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Mon, 20 May 2019 16:59:12 -0400 Subject: locking/rwsem: Make rwsem->owner an atomic_long_t The rwsem->owner contains not just the task structure pointer, it also holds some flags for storing the current state of the rwsem. Some of the flags may have to be atomically updated. To reflect the new reality, the owner is now changed to an atomic_long_t type. New helper functions are added to properly separate out the task structure pointer and the embedded flags. Suggested-by: Peter Zijlstra Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Cc: Borislav Petkov Cc: Davidlohr Bueso Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Thomas Gleixner Cc: Tim Chen Cc: Will Deacon Cc: huang ying Link: https://lkml.kernel.org/r/20190520205918.22251-14-longman@redhat.com Signed-off-by: Ingo Molnar --- include/linux/percpu-rwsem.h | 4 ++-- include/linux/rwsem.h | 11 ++++++----- 2 files changed, 8 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/percpu-rwsem.h b/include/linux/percpu-rwsem.h index 03cb4b6f842e..0a43830f1932 100644 --- a/include/linux/percpu-rwsem.h +++ b/include/linux/percpu-rwsem.h @@ -117,7 +117,7 @@ static inline void percpu_rwsem_release(struct percpu_rw_semaphore *sem, lock_release(&sem->rw_sem.dep_map, 1, ip); #ifdef CONFIG_RWSEM_SPIN_ON_OWNER if (!read) - sem->rw_sem.owner = RWSEM_OWNER_UNKNOWN; + atomic_long_set(&sem->rw_sem.owner, RWSEM_OWNER_UNKNOWN); #endif } @@ -127,7 +127,7 @@ static inline void percpu_rwsem_acquire(struct percpu_rw_semaphore *sem, lock_acquire(&sem->rw_sem.dep_map, 0, 1, read, 1, NULL, ip); #ifdef CONFIG_RWSEM_SPIN_ON_OWNER if (!read) - sem->rw_sem.owner = current; + atomic_long_set(&sem->rw_sem.owner, (long)current); #endif } diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h index bb76e82398b2..e401358c4e7e 100644 --- a/include/linux/rwsem.h +++ b/include/linux/rwsem.h @@ -35,10 +35,11 @@ struct rw_semaphore { atomic_long_t count; /* - * Write owner or one of the read owners. Can be used as a - * speculative check to see if the owner is running on the cpu. + * Write owner or one of the read owners as well flags regarding + * the current state of the rwsem. Can be used as a speculative + * check to see if the write owner is running on the cpu. */ - struct task_struct *owner; + atomic_long_t owner; #ifdef CONFIG_RWSEM_SPIN_ON_OWNER struct optimistic_spin_queue osq; /* spinner MCS lock */ #endif @@ -53,7 +54,7 @@ struct rw_semaphore { * Setting all bits of the owner field except bit 0 will indicate * that the rwsem is writer-owned with an unknown owner. */ -#define RWSEM_OWNER_UNKNOWN ((struct task_struct *)-2L) +#define RWSEM_OWNER_UNKNOWN (-2L) /* In all implementations count != 0 means locked */ static inline int rwsem_is_locked(struct rw_semaphore *sem) @@ -80,7 +81,7 @@ static inline int rwsem_is_locked(struct rw_semaphore *sem) #define __RWSEM_INITIALIZER(name) \ { __RWSEM_INIT_COUNT(name), \ - .owner = NULL, \ + .owner = ATOMIC_LONG_INIT(0), \ .wait_list = LIST_HEAD_INIT((name).wait_list), \ .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock) \ __RWSEM_OPT_INIT(name) \ -- cgit v1.2.3 From 9ed7d75b2f09d836e71d597cd5879abb1a44e7a9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 27 Feb 2019 09:48:51 +0100 Subject: x86/percpu: Relax smp_processor_id() Nadav reported that since this_cpu_read() became asm-volatile, many smp_processor_id() users generated worse code due to the extra constraints. However since smp_processor_id() is reading a stable value, we can use __this_cpu_read(). While this does reduce text size somewhat, this mostly results in code movement to .text.unlikely as a result of more/larger .cold. subfunctions. Less text on the hotpath is good for I$. $ ./compare.sh defconfig-build1 defconfig-build2 vmlinux.o setup_APIC_ibs 90 98 -12,+20 force_ibs_eilvt_setup 400 413 -57,+70 pci_serr_error 109 104 -54,+49 pci_serr_error 109 104 -54,+49 unknown_nmi_error 125 120 -76,+71 unknown_nmi_error 125 120 -76,+71 io_check_error 125 132 -97,+104 intel_thermal_interrupt 730 822 +92,+0 intel_init_thermal 951 945 -6,+0 generic_get_mtrr 301 294 -7,+0 generic_get_mtrr 301 294 -7,+0 generic_set_all 749 754 -44,+49 get_fixed_ranges 352 360 -41,+49 x86_acpi_suspend_lowlevel 369 363 -6,+0 check_tsc_sync_source 412 412 -71,+71 irq_migrate_all_off_this_cpu 662 674 -14,+26 clocksource_watchdog 748 748 -113,+113 __perf_event_account_interrupt 204 197 -7,+0 attempt_merge 1748 1741 -7,+0 intel_guc_send_ct 1424 1409 -15,+0 __fini_doorbell 235 231 -4,+0 bdw_set_cdclk 928 923 -5,+0 gen11_dsi_disable 1571 1556 -15,+0 gmbus_wait 493 488 -5,+0 md_make_request 376 369 -7,+0 __split_and_process_bio 543 536 -7,+0 delay_tsc 96 89 -7,+0 hsw_disable_pc8 696 691 -5,+0 tsc_verify_tsc_adjust 215 228 -22,+35 cpuidle_driver_unref 56 49 -7,+0 blk_account_io_completion 159 148 -11,+0 mtrr_wrmsr 95 99 -29,+33 __intel_wait_for_register_fw 401 419 +18,+0 cpuidle_driver_ref 43 36 -7,+0 cpuidle_get_driver 15 8 -7,+0 blk_account_io_done 535 528 -7,+0 irq_migrate_all_off_this_cpu 662 674 -14,+26 check_tsc_sync_source 412 412 -71,+71 irq_wait_for_poll 170 163 -7,+0 generic_end_io_acct 329 322 -7,+0 x86_acpi_suspend_lowlevel 369 363 -6,+0 nohz_balance_enter_idle 198 191 -7,+0 generic_start_io_acct 254 247 -7,+0 blk_account_io_start 341 334 -7,+0 perf_event_task_tick 682 675 -7,+0 intel_init_thermal 951 945 -6,+0 amd_e400_c1e_apic_setup 47 51 -28,+32 setup_APIC_eilvt 350 328 -22,+0 hsw_enable_pc8 1611 1605 -6,+0 total 12985947 12985892 -994,+939 Reported-by: Nadav Amit Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- include/linux/smp.h | 45 +++++++++++++++++++++++++++++++-------------- 1 file changed, 31 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/smp.h b/include/linux/smp.h index a56f08ff3097..aa9e5e82d8c3 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -181,29 +181,46 @@ static inline int get_boot_cpu_id(void) #endif /* !SMP */ -/* - * smp_processor_id(): get the current CPU ID. +/** + * raw_processor_id() - get the current (unstable) CPU id + * + * For then you know what you are doing and need an unstable + * CPU id. + */ + +/** + * smp_processor_id() - get the current (stable) CPU id + * + * This is the normal accessor to the CPU id and should be used + * whenever possible. + * + * The CPU id is stable when: * - * if DEBUG_PREEMPT is enabled then we check whether it is - * used in a preemption-safe way. (smp_processor_id() is safe - * if it's used in a preemption-off critical section, or in - * a thread that is bound to the current CPU.) + * - IRQs are disabled; + * - preemption is disabled; + * - the task is CPU affine. * - * NOTE: raw_smp_processor_id() is for internal use only - * (smp_processor_id() is the preferred variant), but in rare - * instances it might also be used to turn off false positives - * (i.e. smp_processor_id() use that the debugging code reports but - * which use for some reason is legal). Don't use this to hack around - * the warning message, as your code might not work under PREEMPT. + * When CONFIG_DEBUG_PREEMPT; we verify these assumption and WARN + * when smp_processor_id() is used when the CPU id is not stable. */ + +/* + * Allow the architecture to differentiate between a stable and unstable read. + * For example, x86 uses an IRQ-safe asm-volatile read for the unstable but a + * regular asm read for the stable. + */ +#ifndef __smp_processor_id +#define __smp_processor_id(x) raw_smp_processor_id(x) +#endif + #ifdef CONFIG_DEBUG_PREEMPT extern unsigned int debug_smp_processor_id(void); # define smp_processor_id() debug_smp_processor_id() #else -# define smp_processor_id() raw_smp_processor_id() +# define smp_processor_id() __smp_processor_id() #endif -#define get_cpu() ({ preempt_disable(); smp_processor_id(); }) +#define get_cpu() ({ preempt_disable(); __smp_processor_id(); }) #define put_cpu() preempt_enable() /* -- cgit v1.2.3