From d16dbd1b8a29bb9f8aca2c2f3bd1a0d2b7621126 Mon Sep 17 00:00:00 2001
From: Yuyang Du <duyuyang@gmail.com>
Date: Mon, 6 May 2019 16:19:22 +0800
Subject: locking/lockdep: Update obsolete struct field description

The lock_chain struct definition has outdated comment, update it and add
struct member description.

Signed-off-by: Yuyang Du <duyuyang@gmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: bvanassche@acm.org
Cc: frederic@kernel.org
Cc: ming.lei@redhat.com
Cc: will.deacon@arm.com
Link: https://lkml.kernel.org/r/20190506081939.74287-7-duyuyang@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/lockdep.h | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h
index 6e2377e6c1d6..851d44fa5457 100644
--- a/include/linux/lockdep.h
+++ b/include/linux/lockdep.h
@@ -203,11 +203,17 @@ struct lock_list {
 	struct lock_list		*parent;
 };
 
-/*
- * We record lock dependency chains, so that we can cache them:
+/**
+ * struct lock_chain - lock dependency chain record
+ *
+ * @irq_context: the same as irq_context in held_lock below
+ * @depth:       the number of held locks in this chain
+ * @base:        the index in chain_hlocks for this chain
+ * @entry:       the collided lock chains in lock_chain hash list
+ * @chain_key:   the hash key of this lock_chain
  */
 struct lock_chain {
-	/* see BUILD_BUG_ON()s in lookup_chain_cache() */
+	/* see BUILD_BUG_ON()s in add_chain_cache() */
 	unsigned int			irq_context :  2,
 					depth       :  6,
 					base	    : 24;
-- 
cgit v1.2.3


From e196e479a3b844da6e6e71e0d2a8694040cb4e52 Mon Sep 17 00:00:00 2001
From: Yuyang Du <duyuyang@gmail.com>
Date: Mon, 6 May 2019 16:19:23 +0800
Subject: locking/lockdep: Use lockdep_init_task for task initiation
 consistently

Despite that there is a lockdep_init_task() which does nothing, lockdep
initiates tasks by assigning lockdep fields and does so inconsistently. Fix
this by using lockdep_init_task().

Signed-off-by: Yuyang Du <duyuyang@gmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: bvanassche@acm.org
Cc: frederic@kernel.org
Cc: ming.lei@redhat.com
Cc: will.deacon@arm.com
Link: https://lkml.kernel.org/r/20190506081939.74287-8-duyuyang@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/lockdep.h | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h
index 851d44fa5457..5d05b8149f19 100644
--- a/include/linux/lockdep.h
+++ b/include/linux/lockdep.h
@@ -287,6 +287,8 @@ extern void lockdep_free_key_range(void *start, unsigned long size);
 extern asmlinkage void lockdep_sys_exit(void);
 extern void lockdep_set_selftest_task(struct task_struct *task);
 
+extern void lockdep_init_task(struct task_struct *task);
+
 extern void lockdep_off(void);
 extern void lockdep_on(void);
 
@@ -411,6 +413,10 @@ extern void lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie);
 
 #else /* !CONFIG_LOCKDEP */
 
+static inline void lockdep_init_task(struct task_struct *task)
+{
+}
+
 static inline void lockdep_off(void)
 {
 }
@@ -503,7 +509,6 @@ enum xhlock_context_t {
 	{ .name = (_name), .key = (void *)(_key), }
 
 static inline void lockdep_invariant_state(bool force) {}
-static inline void lockdep_init_task(struct task_struct *task) {}
 static inline void lockdep_free_task(struct task_struct *task) {}
 
 #ifdef CONFIG_LOCK_STAT
-- 
cgit v1.2.3


From f6ec8829ac9d59b637366c13038f15d6f6156fe1 Mon Sep 17 00:00:00 2001
From: Yuyang Du <duyuyang@gmail.com>
Date: Mon, 6 May 2019 16:19:24 +0800
Subject: locking/lockdep: Define INITIAL_CHAIN_KEY for chain keys to start
 with

Chain keys are computed using Jenkins hash function, which needs an initial
hash to start with. Dedicate a macro to make this clear and configurable. A
later patch changes this initial chain key.

Signed-off-by: Yuyang Du <duyuyang@gmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: bvanassche@acm.org
Cc: frederic@kernel.org
Cc: ming.lei@redhat.com
Cc: will.deacon@arm.com
Link: https://lkml.kernel.org/r/20190506081939.74287-9-duyuyang@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/lockdep.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include/linux')

diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h
index 5d05b8149f19..d4e69595dbd4 100644
--- a/include/linux/lockdep.h
+++ b/include/linux/lockdep.h
@@ -229,6 +229,7 @@ struct lock_chain {
  * bitfield and hitting the BUG in hlock_class().
  */
 #define MAX_LOCKDEP_KEYS		((1UL << MAX_LOCKDEP_KEYS_BITS) - 1)
+#define INITIAL_CHAIN_KEY		0
 
 struct held_lock {
 	/*
-- 
cgit v1.2.3


From 01bb6f0af992a1e6b7797d92fd31a7864872e347 Mon Sep 17 00:00:00 2001
From: Yuyang Du <duyuyang@gmail.com>
Date: Mon, 6 May 2019 16:19:25 +0800
Subject: locking/lockdep: Change the range of class_idx in held_lock struct

held_lock->class_idx is used to point to the class of the held lock. The
index is shifted by 1 to make index 0 mean no class, which results in class
index shifting back and forth but is not worth doing so.

The reason is: (1) there will be no "no-class" held_lock to begin with, and
(2) index 0 seems to be used for error checking, but if something wrong
indeed happened, the index can't be counted on to distinguish it as that
something won't set the class_idx to 0 on purpose to tell us it is wrong.

Therefore, change the index to start from 0. This saves a lot of
back-and-forth shifts and a class slot back to lock_classes.

Since index 0 is now used for lock class, we change the initial chain key to
-1 to avoid key collision, which is due to the fact that __jhash_mix(0, 0, 0) = 0.
Actually, the initial chain key can be any arbitrary value other than 0.

In addition, a bitmap is maintained to keep track of the used lock classes,
and we check the validity of the held lock against that bitmap.

Signed-off-by: Yuyang Du <duyuyang@gmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: bvanassche@acm.org
Cc: frederic@kernel.org
Cc: ming.lei@redhat.com
Cc: will.deacon@arm.com
Link: https://lkml.kernel.org/r/20190506081939.74287-10-duyuyang@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/lockdep.h | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h
index d4e69595dbd4..30a0f81aa130 100644
--- a/include/linux/lockdep.h
+++ b/include/linux/lockdep.h
@@ -223,13 +223,8 @@ struct lock_chain {
 };
 
 #define MAX_LOCKDEP_KEYS_BITS		13
-/*
- * Subtract one because we offset hlock->class_idx by 1 in order
- * to make 0 mean no class. This avoids overflowing the class_idx
- * bitfield and hitting the BUG in hlock_class().
- */
-#define MAX_LOCKDEP_KEYS		((1UL << MAX_LOCKDEP_KEYS_BITS) - 1)
-#define INITIAL_CHAIN_KEY		0
+#define MAX_LOCKDEP_KEYS		(1UL << MAX_LOCKDEP_KEYS_BITS)
+#define INITIAL_CHAIN_KEY		-1
 
 struct held_lock {
 	/*
@@ -254,6 +249,11 @@ struct held_lock {
 	u64 				waittime_stamp;
 	u64				holdtime_stamp;
 #endif
+	/*
+	 * class_idx is zero-indexed; it points to the element in
+	 * lock_classes this held lock instance belongs to. class_idx is in
+	 * the range from 0 to (MAX_LOCKDEP_KEYS-1) inclusive.
+	 */
 	unsigned int			class_idx:MAX_LOCKDEP_KEYS_BITS;
 	/*
 	 * The lock-stack is unified in that the lock chains of interrupt
-- 
cgit v1.2.3


From 3724921396dd1a07c93e3516b8d7c9ff570d17a9 Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Wed, 22 May 2019 14:22:48 +0100
Subject: locking/atomic: Use s64 for atomic64_t on 64-bit

Now that all architectures use 64 consistently as the base type for the
atomic64 API, let's have the CONFIG_64BIT definition of atomic64_t use
s64 as the underlying type for atomic64_t, rather than long, matching
the generated headers.

On architectures where atomic64_read(v) is READ_ONCE(v->counter), this
patch will cause the return type of atomic64_read() to be s64.

As of this patch, the atomic64 API can be relied upon to consistently
return s64 where a value rather than boolean condition is returned. This
should make code more robust, and simpler, allowing for the removal of
casts previously required to ensure consistent types.

Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Will Deacon <will.deacon@arm.com>
Cc: aou@eecs.berkeley.edu
Cc: arnd@arndb.de
Cc: bp@alien8.de
Cc: catalin.marinas@arm.com
Cc: davem@davemloft.net
Cc: fenghua.yu@intel.com
Cc: heiko.carstens@de.ibm.com
Cc: herbert@gondor.apana.org.au
Cc: ink@jurassic.park.msu.ru
Cc: jhogan@kernel.org
Cc: linux@armlinux.org.uk
Cc: mattst88@gmail.com
Cc: mpe@ellerman.id.au
Cc: palmer@sifive.com
Cc: paul.burton@mips.com
Cc: paulus@samba.org
Cc: ralf@linux-mips.org
Cc: rth@twiddle.net
Cc: tony.luck@intel.com
Cc: vgupta@synopsys.com
Link: https://lkml.kernel.org/r/20190522132250.26499-17-mark.rutland@arm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/types.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/types.h b/include/linux/types.h
index 231114ae38f4..05030f608be3 100644
--- a/include/linux/types.h
+++ b/include/linux/types.h
@@ -174,7 +174,7 @@ typedef struct {
 
 #ifdef CONFIG_64BIT
 typedef struct {
-	long counter;
+	s64 counter;
 } atomic64_t;
 #endif
 
-- 
cgit v1.2.3


From c2ba8a15f310d915f8748dd8324c91c82b12b5ff Mon Sep 17 00:00:00 2001
From: Daniel Bristot de Oliveira <bristot@redhat.com>
Date: Wed, 12 Jun 2019 11:57:30 +0200
Subject: jump_label: Batch updates if arch supports it

If the architecture supports the batching of jump label updates, use it!

An easy way to see the benefits of this patch is switching the
schedstats on and off. For instance:

-------------------------- %< ----------------------------
  #!/bin/sh
  while [ true ]; do
      sysctl -w kernel.sched_schedstats=1
      sleep 2
      sysctl -w kernel.sched_schedstats=0
      sleep 2
  done
-------------------------- >% ----------------------------

while watching the IPI count:

-------------------------- %< ----------------------------
  # watch -n1 "cat /proc/interrupts | grep Function"
-------------------------- >% ----------------------------

With the current mode, it is possible to see +- 168 IPIs each 2 seconds,
while with this patch the number of IPIs goes to 3 each 2 seconds.

Regarding the performance impact of this patch set, I made two measurements:

    The time to update a key (the task that is causing the change)
    The time to run the int3 handler (the side effect on a thread that
                                      hits the code being changed)

The schedstats static key was chosen as the key to being switched on and off.
The reason being is that it is used in more than 56 places, in a hot path. The
change in the schedstats static key will be done with the following command:

while [ true ]; do
    sysctl -w kernel.sched_schedstats=1
    usleep 500000
    sysctl -w kernel.sched_schedstats=0
    usleep 500000
done

In this way, they key will be updated twice per second. To force the hit of the
int3 handler, the system will also run a kernel compilation with two jobs per
CPU. The test machine is a two nodes/24 CPUs box with an Intel Xeon processor
@2.27GHz.

Regarding the update part, on average, the regular kernel takes 57 ms to update
the schedstats key, while the kernel with the batch updates takes just 1.4 ms
on average. Although it seems to be too good to be true, it makes sense: the
schedstats key is used in 56 places, so it was expected that it would take
around 56 times to update the keys with the current implementation, as the
IPIs are the most expensive part of the update.

Regarding the int3 handler, the non-batch handler takes 45 ns on average, while
the batch version takes around 180 ns. At first glance, it seems to be a high
value. But it is not, considering that it is doing 56 updates, rather than one!
It is taking four times more, only. This gain is possible because the patch
uses a binary search in the vector: log2(56)=5.8. So, it was expected to have
an overhead within four times.

(voice of tv propaganda) But, that is not all! As the int3 handler keeps on for
a shorter period (because the update part is on for a shorter time), the number
of hits in the int3 handler decreased by 10%.

The question then is: Is it worth paying the price of "135 ns" more in the int3
handler?

Considering that, in this test case, we are saving the handling of 53 IPIs,
that takes more than these 135 ns, it seems to be a meager price to be paid.
Moreover, the test case was forcing the hit of the int3, in practice, it
does not take that often. While the IPI takes place on all CPUs, hitting
the int3 handler or not!

For instance, in an isolated CPU with a process running in user-space
(nohz_full use-case), the chances of hitting the int3 handler is barely zero,
while there is no way to avoid the IPIs. By bounding the IPIs, we are improving
a lot this scenario.

Signed-off-by: Daniel Bristot de Oliveira <bristot@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Chris von Recklinghausen <crecklin@redhat.com>
Cc: Clark Williams <williams@redhat.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Jason Baron <jbaron@akamai.com>
Cc: Jiri Kosina <jkosina@suse.cz>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Scott Wood <swood@redhat.com>
Cc: Steven Rostedt (VMware) <rostedt@goodmis.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: https://lkml.kernel.org/r/acc891dbc2dbc9fd616dd680529a2337b1d1274c.1560325897.git.bristot@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/jump_label.h | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h
index 3e113a1fa0f1..3526c0aee954 100644
--- a/include/linux/jump_label.h
+++ b/include/linux/jump_label.h
@@ -215,6 +215,9 @@ extern void arch_jump_label_transform(struct jump_entry *entry,
 				      enum jump_label_type type);
 extern void arch_jump_label_transform_static(struct jump_entry *entry,
 					     enum jump_label_type type);
+extern bool arch_jump_label_transform_queue(struct jump_entry *entry,
+					    enum jump_label_type type);
+extern void arch_jump_label_transform_apply(void);
 extern int jump_label_text_reserved(void *start, void *end);
 extern void static_key_slow_inc(struct static_key *key);
 extern void static_key_slow_dec(struct static_key *key);
-- 
cgit v1.2.3


From 9ffbe8ac05dbb4ab4a4836a55a47fc6be945a38f Mon Sep 17 00:00:00 2001
From: Nikolay Borisov <nborisov@suse.com>
Date: Fri, 31 May 2019 13:06:51 +0300
Subject: locking/lockdep: Rename lockdep_assert_held_exclusive() ->
 lockdep_assert_held_write()

All callers of lockdep_assert_held_exclusive() use it to verify the
correct locking state of either a semaphore (ldisc_sem in tty,
mmap_sem for perf events, i_rwsem of inode for dax) or rwlock by
apparmor. Thus it makes sense to rename _exclusive to _write since
that's the semantics callers care. Additionally there is already
lockdep_assert_held_read(), which this new naming is more consistent with.

No functional changes.

Signed-off-by: Nikolay Borisov <nborisov@suse.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: https://lkml.kernel.org/r/20190531100651.3969-1-nborisov@suse.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/lockdep.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h
index 30a0f81aa130..151d55711082 100644
--- a/include/linux/lockdep.h
+++ b/include/linux/lockdep.h
@@ -394,7 +394,7 @@ extern void lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie);
 		WARN_ON(debug_locks && !lockdep_is_held(l));	\
 	} while (0)
 
-#define lockdep_assert_held_exclusive(l)	do {			\
+#define lockdep_assert_held_write(l)	do {			\
 		WARN_ON(debug_locks && !lockdep_is_held_type(l, 0));	\
 	} while (0)
 
@@ -479,7 +479,7 @@ struct lockdep_map { };
 #define lockdep_is_held_type(l, r)		(1)
 
 #define lockdep_assert_held(l)			do { (void)(l); } while (0)
-#define lockdep_assert_held_exclusive(l)	do { (void)(l); } while (0)
+#define lockdep_assert_held_write(l)	do { (void)(l); } while (0)
 #define lockdep_assert_held_read(l)		do { (void)(l); } while (0)
 #define lockdep_assert_held_once(l)		do { (void)(l); } while (0)
 
-- 
cgit v1.2.3


From c71fd893f614f205dbc050d60299cc5496491c19 Mon Sep 17 00:00:00 2001
From: Waiman Long <longman@redhat.com>
Date: Mon, 20 May 2019 16:59:00 -0400
Subject: locking/rwsem: Make owner available even if
 !CONFIG_RWSEM_SPIN_ON_OWNER

The owner field in the rw_semaphore structure is used primarily for
optimistic spinning. However, identifying the rwsem owner can also be
helpful in debugging as well as tracing locking related issues when
analyzing crash dump. The owner field may also store state information
that can be important to the operation of the rwsem.

So the owner field is now made a permanent member of the rw_semaphore
structure irrespective of CONFIG_RWSEM_SPIN_ON_OWNER.

Signed-off-by: Waiman Long <longman@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tim Chen <tim.c.chen@linux.intel.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: huang ying <huang.ying.caritas@gmail.com>
Link: https://lkml.kernel.org/r/20190520205918.22251-2-longman@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/rwsem.h | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h
index 2ea18a3def04..148983e21d47 100644
--- a/include/linux/rwsem.h
+++ b/include/linux/rwsem.h
@@ -34,12 +34,12 @@
  */
 struct rw_semaphore {
 	atomic_long_t count;
-#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
 	/*
-	 * Write owner. Used as a speculative check to see
-	 * if the owner is running on the cpu.
+	 * Write owner or one of the read owners. Can be used as a
+	 * speculative check to see if the owner is running on the cpu.
 	 */
 	struct task_struct *owner;
+#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
 	struct optimistic_spin_queue osq; /* spinner MCS lock */
 #endif
 	raw_spinlock_t wait_lock;
@@ -73,13 +73,14 @@ static inline int rwsem_is_locked(struct rw_semaphore *sem)
 #endif
 
 #ifdef CONFIG_RWSEM_SPIN_ON_OWNER
-#define __RWSEM_OPT_INIT(lockname) , .osq = OSQ_LOCK_UNLOCKED, .owner = NULL
+#define __RWSEM_OPT_INIT(lockname) , .osq = OSQ_LOCK_UNLOCKED
 #else
 #define __RWSEM_OPT_INIT(lockname)
 #endif
 
 #define __RWSEM_INITIALIZER(name)				\
 	{ __RWSEM_INIT_COUNT(name),				\
+	  .owner = NULL,					\
 	  .wait_list = LIST_HEAD_INIT((name).wait_list),	\
 	  .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock)	\
 	  __RWSEM_OPT_INIT(name)				\
-- 
cgit v1.2.3


From 00f3c5a3df2c1e3dab14d0dd2b71f852d46be97f Mon Sep 17 00:00:00 2001
From: Waiman Long <longman@redhat.com>
Date: Mon, 20 May 2019 16:59:07 -0400
Subject: locking/rwsem: Always release wait_lock before waking up tasks

With the use of wake_q, we can do task wakeups without holding the
wait_lock. There is one exception in the rwsem code, though. It is
when the writer in the slowpath detects that there are waiters ahead
but the rwsem is not held by a writer. This can lead to a long wait_lock
hold time especially when a large number of readers are to be woken up.

Remediate this situation by releasing the wait_lock before waking
up tasks and re-acquiring it afterward. The rwsem_try_write_lock()
function is also modified to read the rwsem count directly to avoid
stale count value.

Suggested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Waiman Long <longman@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tim Chen <tim.c.chen@linux.intel.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: huang ying <huang.ying.caritas@gmail.com>
Link: https://lkml.kernel.org/r/20190520205918.22251-9-longman@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched/wake_q.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'include/linux')

diff --git a/include/linux/sched/wake_q.h b/include/linux/sched/wake_q.h
index ad826d2a4557..26a2013ac39c 100644
--- a/include/linux/sched/wake_q.h
+++ b/include/linux/sched/wake_q.h
@@ -51,6 +51,11 @@ static inline void wake_q_init(struct wake_q_head *head)
 	head->lastp = &head->first;
 }
 
+static inline bool wake_q_empty(struct wake_q_head *head)
+{
+	return head->first == WAKE_Q_TAIL;
+}
+
 extern void wake_q_add(struct wake_q_head *head, struct task_struct *task);
 extern void wake_q_add_safe(struct wake_q_head *head, struct task_struct *task);
 extern void wake_up_q(struct wake_q_head *head);
-- 
cgit v1.2.3


From 02f1082b003a0cd48f48f12533d969cdbf1c2b63 Mon Sep 17 00:00:00 2001
From: Waiman Long <longman@redhat.com>
Date: Mon, 20 May 2019 16:59:10 -0400
Subject: locking/rwsem: Clarify usage of owner's nonspinaable bit

Bit 1 of sem->owner (RWSEM_ANONYMOUSLY_OWNED) is used to designate an
anonymous owner - readers or an anonymous writer. The setting of this
anonymous bit is used as an indicator that optimistic spinning cannot
be done on this rwsem.

With the upcoming reader optimistic spinning patches, a reader-owned
rwsem can be spinned on for a limit period of time. We still need
this bit to indicate a rwsem is nonspinnable, but not setting this
bit loses its meaning that the owner is known. So rename the bit
to RWSEM_NONSPINNABLE to clarify its meaning.

This patch also fixes a DEBUG_RWSEMS_WARN_ON() bug in __up_write().

Signed-off-by: Waiman Long <longman@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tim Chen <tim.c.chen@linux.intel.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: huang ying <huang.ying.caritas@gmail.com>
Link: https://lkml.kernel.org/r/20190520205918.22251-12-longman@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/rwsem.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include/linux')

diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h
index 148983e21d47..bb76e82398b2 100644
--- a/include/linux/rwsem.h
+++ b/include/linux/rwsem.h
@@ -50,7 +50,7 @@ struct rw_semaphore {
 };
 
 /*
- * Setting bit 1 of the owner field but not bit 0 will indicate
+ * Setting all bits of the owner field except bit 0 will indicate
  * that the rwsem is writer-owned with an unknown owner.
  */
 #define RWSEM_OWNER_UNKNOWN	((struct task_struct *)-2L)
-- 
cgit v1.2.3


From 94a9717b3c40e77a54e4afacd8f19a9a86bfeead Mon Sep 17 00:00:00 2001
From: Waiman Long <longman@redhat.com>
Date: Mon, 20 May 2019 16:59:12 -0400
Subject: locking/rwsem: Make rwsem->owner an atomic_long_t

The rwsem->owner contains not just the task structure pointer, it also
holds some flags for storing the current state of the rwsem. Some of
the flags may have to be atomically updated. To reflect the new reality,
the owner is now changed to an atomic_long_t type.

New helper functions are added to properly separate out the task
structure pointer and the embedded flags.

Suggested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Waiman Long <longman@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tim Chen <tim.c.chen@linux.intel.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: huang ying <huang.ying.caritas@gmail.com>
Link: https://lkml.kernel.org/r/20190520205918.22251-14-longman@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/percpu-rwsem.h |  4 ++--
 include/linux/rwsem.h        | 11 ++++++-----
 2 files changed, 8 insertions(+), 7 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/percpu-rwsem.h b/include/linux/percpu-rwsem.h
index 03cb4b6f842e..0a43830f1932 100644
--- a/include/linux/percpu-rwsem.h
+++ b/include/linux/percpu-rwsem.h
@@ -117,7 +117,7 @@ static inline void percpu_rwsem_release(struct percpu_rw_semaphore *sem,
 	lock_release(&sem->rw_sem.dep_map, 1, ip);
 #ifdef CONFIG_RWSEM_SPIN_ON_OWNER
 	if (!read)
-		sem->rw_sem.owner = RWSEM_OWNER_UNKNOWN;
+		atomic_long_set(&sem->rw_sem.owner, RWSEM_OWNER_UNKNOWN);
 #endif
 }
 
@@ -127,7 +127,7 @@ static inline void percpu_rwsem_acquire(struct percpu_rw_semaphore *sem,
 	lock_acquire(&sem->rw_sem.dep_map, 0, 1, read, 1, NULL, ip);
 #ifdef CONFIG_RWSEM_SPIN_ON_OWNER
 	if (!read)
-		sem->rw_sem.owner = current;
+		atomic_long_set(&sem->rw_sem.owner, (long)current);
 #endif
 }
 
diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h
index bb76e82398b2..e401358c4e7e 100644
--- a/include/linux/rwsem.h
+++ b/include/linux/rwsem.h
@@ -35,10 +35,11 @@
 struct rw_semaphore {
 	atomic_long_t count;
 	/*
-	 * Write owner or one of the read owners. Can be used as a
-	 * speculative check to see if the owner is running on the cpu.
+	 * Write owner or one of the read owners as well flags regarding
+	 * the current state of the rwsem. Can be used as a speculative
+	 * check to see if the write owner is running on the cpu.
 	 */
-	struct task_struct *owner;
+	atomic_long_t owner;
 #ifdef CONFIG_RWSEM_SPIN_ON_OWNER
 	struct optimistic_spin_queue osq; /* spinner MCS lock */
 #endif
@@ -53,7 +54,7 @@ struct rw_semaphore {
  * Setting all bits of the owner field except bit 0 will indicate
  * that the rwsem is writer-owned with an unknown owner.
  */
-#define RWSEM_OWNER_UNKNOWN	((struct task_struct *)-2L)
+#define RWSEM_OWNER_UNKNOWN	(-2L)
 
 /* In all implementations count != 0 means locked */
 static inline int rwsem_is_locked(struct rw_semaphore *sem)
@@ -80,7 +81,7 @@ static inline int rwsem_is_locked(struct rw_semaphore *sem)
 
 #define __RWSEM_INITIALIZER(name)				\
 	{ __RWSEM_INIT_COUNT(name),				\
-	  .owner = NULL,					\
+	  .owner = ATOMIC_LONG_INIT(0),				\
 	  .wait_list = LIST_HEAD_INIT((name).wait_list),	\
 	  .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock)	\
 	  __RWSEM_OPT_INIT(name)				\
-- 
cgit v1.2.3


From 9ed7d75b2f09d836e71d597cd5879abb1a44e7a9 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 27 Feb 2019 09:48:51 +0100
Subject: x86/percpu: Relax smp_processor_id()

Nadav reported that since this_cpu_read() became asm-volatile, many
smp_processor_id() users generated worse code due to the extra
constraints.

However since smp_processor_id() is reading a stable value, we can use
__this_cpu_read().

While this does reduce text size somewhat, this mostly results in code
movement to .text.unlikely as a result of more/larger .cold.
subfunctions. Less text on the hotpath is good for I$.

  $ ./compare.sh defconfig-build1 defconfig-build2 vmlinux.o
  setup_APIC_ibs                                             90         98   -12,+20
  force_ibs_eilvt_setup                                     400        413   -57,+70
  pci_serr_error                                            109        104   -54,+49
  pci_serr_error                                            109        104   -54,+49
  unknown_nmi_error                                         125        120   -76,+71
  unknown_nmi_error                                         125        120   -76,+71
  io_check_error                                            125        132   -97,+104
  intel_thermal_interrupt                                   730        822   +92,+0
  intel_init_thermal                                        951        945   -6,+0
  generic_get_mtrr                                          301        294   -7,+0
  generic_get_mtrr                                          301        294   -7,+0
  generic_set_all                                           749        754   -44,+49
  get_fixed_ranges                                          352        360   -41,+49
  x86_acpi_suspend_lowlevel                                 369        363   -6,+0
  check_tsc_sync_source                                     412        412   -71,+71
  irq_migrate_all_off_this_cpu                              662        674   -14,+26
  clocksource_watchdog                                      748        748   -113,+113
  __perf_event_account_interrupt                            204        197   -7,+0
  attempt_merge                                            1748       1741   -7,+0
  intel_guc_send_ct                                        1424       1409   -15,+0
  __fini_doorbell                                           235        231   -4,+0
  bdw_set_cdclk                                             928        923   -5,+0
  gen11_dsi_disable                                        1571       1556   -15,+0
  gmbus_wait                                                493        488   -5,+0
  md_make_request                                           376        369   -7,+0
  __split_and_process_bio                                   543        536   -7,+0
  delay_tsc                                                  96         89   -7,+0
  hsw_disable_pc8                                           696        691   -5,+0
  tsc_verify_tsc_adjust                                     215        228   -22,+35
  cpuidle_driver_unref                                       56         49   -7,+0
  blk_account_io_completion                                 159        148   -11,+0
  mtrr_wrmsr                                                 95         99   -29,+33
  __intel_wait_for_register_fw                              401        419   +18,+0
  cpuidle_driver_ref                                         43         36   -7,+0
  cpuidle_get_driver                                         15          8   -7,+0
  blk_account_io_done                                       535        528   -7,+0
  irq_migrate_all_off_this_cpu                              662        674   -14,+26
  check_tsc_sync_source                                     412        412   -71,+71
  irq_wait_for_poll                                         170        163   -7,+0
  generic_end_io_acct                                       329        322   -7,+0
  x86_acpi_suspend_lowlevel                                 369        363   -6,+0
  nohz_balance_enter_idle                                   198        191   -7,+0
  generic_start_io_acct                                     254        247   -7,+0
  blk_account_io_start                                      341        334   -7,+0
  perf_event_task_tick                                      682        675   -7,+0
  intel_init_thermal                                        951        945   -6,+0
  amd_e400_c1e_apic_setup                                    47         51   -28,+32
  setup_APIC_eilvt                                          350        328   -22,+0
  hsw_enable_pc8                                           1611       1605   -6,+0
                                               total   12985947   12985892   -994,+939

Reported-by: Nadav Amit <nadav.amit@gmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/smp.h | 45 +++++++++++++++++++++++++++++++--------------
 1 file changed, 31 insertions(+), 14 deletions(-)

(limited to 'include/linux')

diff --git a/include/linux/smp.h b/include/linux/smp.h
index a56f08ff3097..aa9e5e82d8c3 100644
--- a/include/linux/smp.h
+++ b/include/linux/smp.h
@@ -181,29 +181,46 @@ static inline int get_boot_cpu_id(void)
 
 #endif /* !SMP */
 
-/*
- * smp_processor_id(): get the current CPU ID.
+/**
+ * raw_processor_id() - get the current (unstable) CPU id
+ *
+ * For then you know what you are doing and need an unstable
+ * CPU id.
+ */
+
+/**
+ * smp_processor_id() - get the current (stable) CPU id
+ *
+ * This is the normal accessor to the CPU id and should be used
+ * whenever possible.
+ *
+ * The CPU id is stable when:
  *
- * if DEBUG_PREEMPT is enabled then we check whether it is
- * used in a preemption-safe way. (smp_processor_id() is safe
- * if it's used in a preemption-off critical section, or in
- * a thread that is bound to the current CPU.)
+ *  - IRQs are disabled;
+ *  - preemption is disabled;
+ *  - the task is CPU affine.
  *
- * NOTE: raw_smp_processor_id() is for internal use only
- * (smp_processor_id() is the preferred variant), but in rare
- * instances it might also be used to turn off false positives
- * (i.e. smp_processor_id() use that the debugging code reports but
- * which use for some reason is legal). Don't use this to hack around
- * the warning message, as your code might not work under PREEMPT.
+ * When CONFIG_DEBUG_PREEMPT; we verify these assumption and WARN
+ * when smp_processor_id() is used when the CPU id is not stable.
  */
+
+/*
+ * Allow the architecture to differentiate between a stable and unstable read.
+ * For example, x86 uses an IRQ-safe asm-volatile read for the unstable but a
+ * regular asm read for the stable.
+ */
+#ifndef __smp_processor_id
+#define __smp_processor_id(x) raw_smp_processor_id(x)
+#endif
+
 #ifdef CONFIG_DEBUG_PREEMPT
   extern unsigned int debug_smp_processor_id(void);
 # define smp_processor_id() debug_smp_processor_id()
 #else
-# define smp_processor_id() raw_smp_processor_id()
+# define smp_processor_id() __smp_processor_id()
 #endif
 
-#define get_cpu()		({ preempt_disable(); smp_processor_id(); })
+#define get_cpu()		({ preempt_disable(); __smp_processor_id(); })
 #define put_cpu()		preempt_enable()
 
 /*
-- 
cgit v1.2.3