From 35e481761cdc688dbee0ef552a13f49af8eba6cc Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Thu, 19 May 2016 17:08:59 -0700
Subject: fsnotify: avoid spurious EMFILE errors from inotify_init()

Inotify instance is destroyed when all references to it are dropped.
That not only means that the corresponding file descriptor needs to be
closed but also that all corresponding instance marks are freed (as each
mark holds a reference to the inotify instance).  However marks are
freed only after SRCU period ends which can take some time and thus if
user rapidly creates and frees inotify instances, number of existing
inotify instances can exceed max_user_instances limit although from user
point of view there is always at most one existing instance.  Thus
inotify_init() returns EMFILE error which is hard to justify from user
point of view.  This problem is exposed by LTP inotify06 testcase on
some machines.

We fix the problem by making sure all group marks are properly freed
while destroying inotify instance.  We wait for SRCU period to end in
that path anyway since we have to make sure there is no event being
added to the instance while we are tearing down the instance.  So it
takes only some plumbing to allow for marks to be destroyed in that path
as well and not from a dedicated work item.

[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Jan Kara <jack@suse.cz>
Reported-by: Xiaoguang Wang <wangxg.fnst@cn.fujitsu.com>
Tested-by: Xiaoguang Wang <wangxg.fnst@cn.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/fsnotify_backend.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index 1259e53d9296..29f917517299 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -359,8 +359,6 @@ extern void fsnotify_clear_vfsmount_marks_by_group(struct fsnotify_group *group)
 extern void fsnotify_clear_inode_marks_by_group(struct fsnotify_group *group);
 /* run all the marks in a group, and clear all of the marks where mark->flags & flags is true*/
 extern void fsnotify_clear_marks_by_group_flags(struct fsnotify_group *group, unsigned int flags);
-/* run all the marks in a group, and flag them to be freed */
-extern void fsnotify_clear_marks_by_group(struct fsnotify_group *group);
 extern void fsnotify_get_mark(struct fsnotify_mark *mark);
 extern void fsnotify_put_mark(struct fsnotify_mark *mark);
 extern void fsnotify_unmount_inodes(struct super_block *sb);
-- 
cgit v1.2.3


From bc2c53e5f1a2bae69ae50ce3a592633da7fcf6d9 Mon Sep 17 00:00:00 2001
From: Deepa Dinamani <deepa.kernel@gmail.com>
Date: Thu, 19 May 2016 17:09:02 -0700
Subject: time: add missing implementation for timespec64_add_safe()

timespec64_add_safe() has been defined in time64.h for 64 bit systems.
But, 32 bit systems only have an extern function prototype defined.
Provide a definition for the above function.

The function will be necessary as part of y2038 changes.  struct
timespec is not y2038 safe.  All references to timespec will be replaced
by struct timespec64.  The function is meant to be a replacement for
timespec_add_safe().

The implementation is similar to timespec_add_safe().

Link: http://lkml.kernel.org/r/1461947989-21926-2-git-send-email-deepa.kernel@gmail.com
Signed-off-by: Deepa Dinamani <deepa.kernel@gmail.com>
Acked-by: John Stultz <john.stultz@linaro.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/time64.h | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/time64.h b/include/linux/time64.h
index 367d5af899e8..1778937221bf 100644
--- a/include/linux/time64.h
+++ b/include/linux/time64.h
@@ -136,13 +136,11 @@ extern void set_normalized_timespec64(struct timespec64 *ts, time64_t sec, s64 n
 
 /*
  * timespec64_add_safe assumes both values are positive and checks for
- * overflow. It will return TIME_T_MAX if the returned value would be
- * smaller then either of the arguments.
+ * overflow. It will return TIME64_MAX in case of overflow.
  */
 extern struct timespec64 timespec64_add_safe(const struct timespec64 lhs,
 					 const struct timespec64 rhs);
 
-
 static inline struct timespec64 timespec64_add(struct timespec64 lhs,
 						struct timespec64 rhs)
 {
-- 
cgit v1.2.3


From 766b9f928bd5b9b185d986d40355d1f143484136 Mon Sep 17 00:00:00 2001
From: Deepa Dinamani <deepa.kernel@gmail.com>
Date: Thu, 19 May 2016 17:09:05 -0700
Subject: fs: poll/select/recvmmsg: use timespec64 for timeout events

struct timespec is not y2038 safe.  Even though timespec might be
sufficient to represent timeouts, use struct timespec64 here as the plan
is to get rid of all timespec reference in the kernel.

The patch transitions the common functions: poll_select_set_timeout()
and select_estimate_accuracy() to use timespec64.  And, all the syscalls
that use these functions are transitioned in the same patch.

The restart block parameters for poll uses monotonic time.  Use
timespec64 here as well to assign timeout value.  This parameter in the
restart block need not change because this only holds the monotonic
timestamp at which timeout should occur.  And, unsigned long data type
should be big enough for this timestamp.

The system call interfaces will be handled in a separate series.

Compat interfaces need not change as timespec64 is an alias to struct
timespec on a 64 bit system.

Link: http://lkml.kernel.org/r/1461947989-21926-3-git-send-email-deepa.kernel@gmail.com
Signed-off-by: Deepa Dinamani <deepa.kernel@gmail.com>
Acked-by: John Stultz <john.stultz@linaro.org>
Acked-by: David S. Miller <davem@davemloft.net>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/poll.h | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/linux/poll.h b/include/linux/poll.h
index 9fb4f40d9a26..37b057b63b46 100644
--- a/include/linux/poll.h
+++ b/include/linux/poll.h
@@ -96,7 +96,7 @@ extern void poll_initwait(struct poll_wqueues *pwq);
 extern void poll_freewait(struct poll_wqueues *pwq);
 extern int poll_schedule_timeout(struct poll_wqueues *pwq, int state,
 				 ktime_t *expires, unsigned long slack);
-extern u64 select_estimate_accuracy(struct timespec *tv);
+extern u64 select_estimate_accuracy(struct timespec64 *tv);
 
 
 static inline int poll_schedule(struct poll_wqueues *pwq, int state)
@@ -153,12 +153,13 @@ void zero_fd_set(unsigned long nr, unsigned long *fdset)
 
 #define MAX_INT64_SECONDS (((s64)(~((u64)0)>>1)/HZ)-1)
 
-extern int do_select(int n, fd_set_bits *fds, struct timespec *end_time);
+extern int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time);
 extern int do_sys_poll(struct pollfd __user * ufds, unsigned int nfds,
-		       struct timespec *end_time);
+		       struct timespec64 *end_time);
 extern int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp,
-			   fd_set __user *exp, struct timespec *end_time);
+			   fd_set __user *exp, struct timespec64 *end_time);
 
-extern int poll_select_set_timeout(struct timespec *to, long sec, long nsec);
+extern int poll_select_set_timeout(struct timespec64 *to, time64_t sec,
+				   long nsec);
 
 #endif /* _LINUX_POLL_H */
-- 
cgit v1.2.3


From 8e4f70e21877297577dce13cca97599a5864a91f Mon Sep 17 00:00:00 2001
From: Deepa Dinamani <deepa.kernel@gmail.com>
Date: Thu, 19 May 2016 17:09:08 -0700
Subject: time: remove timespec_add_safe()

All references to timespec_add_safe() now use timespec64_add_safe().

The plan is to replace struct timespec references with struct timespec64
throughout the kernel as timespec is not y2038 safe.

Drop timespec_add_safe() and use timespec64_add_safe() for all
architectures.

Link: http://lkml.kernel.org/r/1461947989-21926-4-git-send-email-deepa.kernel@gmail.com
Signed-off-by: Deepa Dinamani <deepa.kernel@gmail.com>
Acked-by: John Stultz <john.stultz@linaro.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/time64.h | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

(limited to 'include')

diff --git a/include/linux/time64.h b/include/linux/time64.h
index 1778937221bf..7e5d2fa9ac46 100644
--- a/include/linux/time64.h
+++ b/include/linux/time64.h
@@ -65,7 +65,6 @@ static inline struct itimerspec64 itimerspec_to_itimerspec64(struct itimerspec *
 # define timespec64_equal		timespec_equal
 # define timespec64_compare		timespec_compare
 # define set_normalized_timespec64	set_normalized_timespec
-# define timespec64_add_safe		timespec_add_safe
 # define timespec64_add			timespec_add
 # define timespec64_sub			timespec_sub
 # define timespec64_valid		timespec_valid
@@ -134,13 +133,6 @@ static inline int timespec64_compare(const struct timespec64 *lhs, const struct
 
 extern void set_normalized_timespec64(struct timespec64 *ts, time64_t sec, s64 nsec);
 
-/*
- * timespec64_add_safe assumes both values are positive and checks for
- * overflow. It will return TIME64_MAX in case of overflow.
- */
-extern struct timespec64 timespec64_add_safe(const struct timespec64 lhs,
-					 const struct timespec64 rhs);
-
 static inline struct timespec64 timespec64_add(struct timespec64 lhs,
 						struct timespec64 rhs)
 {
@@ -222,4 +214,11 @@ static __always_inline void timespec64_add_ns(struct timespec64 *a, u64 ns)
 
 #endif
 
+/*
+ * timespec64_add_safe assumes both values are positive and checks for
+ * overflow. It will return TIME64_MAX in case of overflow.
+ */
+extern struct timespec64 timespec64_add_safe(const struct timespec64 lhs,
+					 const struct timespec64 rhs);
+
 #endif /* _LINUX_TIME64_H */
-- 
cgit v1.2.3


From b1e4d9d82df8ab9097f80aa208c40eab6fc29858 Mon Sep 17 00:00:00 2001
From: "Du, Changbin" <changbin.du@intel.com>
Date: Thu, 19 May 2016 17:09:20 -0700
Subject: debugobjects: make fixup functions return bool instead of int

I am going to introduce debugobjects infrastructure to USB subsystem.
But before this, I found the code of debugobjects could be improved.
This patchset will make fixup functions return bool type instead of int.
Because fixup only need report success or no.  boolean is the 'real'
type.

This patch (of 7):

The object debugging infrastructure core provides some fixup callbacks
for the subsystem who use it.  These callbacks are called from the debug
code whenever a problem in debug_object_init is detected.  And
debugobjects core suppose them returns 1 when the fixup was successful,
otherwise 0.  So the return type is boolean.

A bad thing is that debug_object_fixup use the return value for
arithmetic operation.  It confused me that what is the reall return
type.

Reading over the whole code, I found some place do use the return value
incorrectly(see next patch).  So why use bool type instead?

Signed-off-by: Du, Changbin <changbin.du@intel.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Josh Triplett <josh@kernel.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tejun Heo <tj@kernel.org>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/debugobjects.h | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/linux/debugobjects.h b/include/linux/debugobjects.h
index 98ffcbd4888e..a899f10c9365 100644
--- a/include/linux/debugobjects.h
+++ b/include/linux/debugobjects.h
@@ -39,7 +39,8 @@ struct debug_obj {
  * @debug_hint:		function returning address, which have associated
  *			kernel symbol, to allow identify the object
  * @fixup_init:		fixup function, which is called when the init check
- *			fails
+ *			fails. All fixup functions must return true if fixup
+ *			was successful, otherwise return false
  * @fixup_activate:	fixup function, which is called when the activate check
  *			fails
  * @fixup_destroy:	fixup function, which is called when the destroy check
@@ -51,12 +52,12 @@ struct debug_obj {
  */
 struct debug_obj_descr {
 	const char		*name;
-	void *(*debug_hint)	(void *addr);
-	int (*fixup_init)	(void *addr, enum debug_obj_state state);
-	int (*fixup_activate)	(void *addr, enum debug_obj_state state);
-	int (*fixup_destroy)	(void *addr, enum debug_obj_state state);
-	int (*fixup_free)	(void *addr, enum debug_obj_state state);
-	int (*fixup_assert_init)(void *addr, enum debug_obj_state state);
+	void *(*debug_hint)(void *addr);
+	bool (*fixup_init)(void *addr, enum debug_obj_state state);
+	bool (*fixup_activate)(void *addr, enum debug_obj_state state);
+	bool (*fixup_destroy)(void *addr, enum debug_obj_state state);
+	bool (*fixup_free)(void *addr, enum debug_obj_state state);
+	bool (*fixup_assert_init)(void *addr, enum debug_obj_state state);
 };
 
 #ifdef CONFIG_DEBUG_OBJECTS
-- 
cgit v1.2.3


From b9fdac7f660609abb157500e468d2165b3c9cf08 Mon Sep 17 00:00:00 2001
From: "Du, Changbin" <changbin.du@intel.com>
Date: Thu, 19 May 2016 17:09:41 -0700
Subject: debugobjects: insulate non-fixup logic related to static obj from
 fixup callbacks

When activating a static object we need make sure that the object is
tracked in the object tracker.  If it is a non-static object then the
activation is illegal.

In previous implementation, each subsystem need take care of this in
their fixup callbacks.  Actually we can put it into debugobjects core.
Thus we can save duplicated code, and have *pure* fixup callbacks.

To achieve this, a new callback "is_static_object" is introduced to let
the type specific code decide whether a object is static or not.  If
yes, we take it into object tracker, otherwise give warning and invoke
fixup callback.

This change has paassed debugobjects selftest, and I also do some test
with all debugobjects supports enabled.

At last, I have a concern about the fixups that can it change the object
which is in incorrect state on fixup? Because the 'addr' may not point
to any valid object if a non-static object is not tracked.  Then Change
such object can overwrite someone's memory and cause unexpected
behaviour.  For example, the timer_fixup_activate bind timer to function
stub_timer.

Link: http://lkml.kernel.org/r/1462576157-14539-1-git-send-email-changbin.du@intel.com
[changbin.du@intel.com: improve code comments where invoke the new is_static_object callback]
  Link: http://lkml.kernel.org/r/1462777431-8171-1-git-send-email-changbin.du@intel.com
Signed-off-by: Du, Changbin <changbin.du@intel.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Josh Triplett <josh@kernel.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tejun Heo <tj@kernel.org>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/debugobjects.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'include')

diff --git a/include/linux/debugobjects.h b/include/linux/debugobjects.h
index a899f10c9365..46056cb161fc 100644
--- a/include/linux/debugobjects.h
+++ b/include/linux/debugobjects.h
@@ -38,6 +38,7 @@ struct debug_obj {
  * @name:		name of the object typee
  * @debug_hint:		function returning address, which have associated
  *			kernel symbol, to allow identify the object
+ * @is_static_object	return true if the obj is static, otherwise return false
  * @fixup_init:		fixup function, which is called when the init check
  *			fails. All fixup functions must return true if fixup
  *			was successful, otherwise return false
@@ -53,6 +54,7 @@ struct debug_obj {
 struct debug_obj_descr {
 	const char		*name;
 	void *(*debug_hint)(void *addr);
+	bool (*is_static_object)(void *addr);
 	bool (*fixup_init)(void *addr, enum debug_obj_state state);
 	bool (*fixup_activate)(void *addr, enum debug_obj_state state);
 	bool (*fixup_destroy)(void *addr, enum debug_obj_state state);
-- 
cgit v1.2.3


From 815613da6a67c196d7458d0e6c278ea88e21933f Mon Sep 17 00:00:00 2001
From: Richard Cochran <rcochran@linutronix.de>
Date: Thu, 19 May 2016 17:09:56 -0700
Subject: kernel/padata.c: removed unused code

By accident I stumbled across code that has never been used.  This
driver has EXPORT_SYMBOL functions, and the only user of the code is
pcrypt.c, but this only uses a subset of the exported symbols.

According to 'git log -G', the functions, padata_set_cpumasks,
padata_add_cpu, and padata_remove_cpu have never been used since they
were first introduced.  This patch removes the unused code.

On one 64 bit build, with CRYPTO_PCRYPT built in, the text is more than
4k smaller.

  kbuild_hp> size $KBUILD_OUTPUT/vmlinux
      text    data     bss      dec hex    filename
  10566658 4678360 1122304 16367322 f9beda vmlinux
  10561984 4678360 1122304 16362648 f9ac98 vmlinux

On another config, 32 bit, the saving is about 0.5k bytes.

  kbuild_hp-x86> size $KBUILD_OUTPUT/vmlinux
  6012005 2409513 2785280 11206798 ab008e vmlinux
  6011491 2409513 2785280 11206284 aafe8c vmlinux

Signed-off-by: Richard Cochran <rcochran@linutronix.de>
Cc: Steffen Klassert <steffen.klassert@secunet.com>
Cc: Herbert Xu <herbert@gondor.apana.org.au>
Cc: "David S. Miller" <davem@davemloft.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/padata.h | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'include')

diff --git a/include/linux/padata.h b/include/linux/padata.h
index 438694650471..113ee626a4dc 100644
--- a/include/linux/padata.h
+++ b/include/linux/padata.h
@@ -175,11 +175,6 @@ extern int padata_do_parallel(struct padata_instance *pinst,
 extern void padata_do_serial(struct padata_priv *padata);
 extern int padata_set_cpumask(struct padata_instance *pinst, int cpumask_type,
 			      cpumask_var_t cpumask);
-extern int padata_set_cpumasks(struct padata_instance *pinst,
-			       cpumask_var_t pcpumask,
-			       cpumask_var_t cbcpumask);
-extern int padata_add_cpu(struct padata_instance *pinst, int cpu, int mask);
-extern int padata_remove_cpu(struct padata_instance *pinst, int cpu, int mask);
 extern int padata_start(struct padata_instance *pinst);
 extern void padata_stop(struct padata_instance *pinst);
 extern int padata_register_cpumask_notifier(struct padata_instance *pinst,
-- 
cgit v1.2.3


From c7ce4f60ac199fb3521c5fcd64da21cee801ec2b Mon Sep 17 00:00:00 2001
From: Thomas Garnier <thgarnie@google.com>
Date: Thu, 19 May 2016 17:10:37 -0700
Subject: mm: SLAB freelist randomization

Provides an optional config (CONFIG_SLAB_FREELIST_RANDOM) to randomize
the SLAB freelist.  The list is randomized during initialization of a
new set of pages.  The order on different freelist sizes is pre-computed
at boot for performance.  Each kmem_cache has its own randomized
freelist.  Before pre-computed lists are available freelists are
generated dynamically.  This security feature reduces the predictability
of the kernel SLAB allocator against heap overflows rendering attacks
much less stable.

For example this attack against SLUB (also applicable against SLAB)
would be affected:

  https://jon.oberheide.org/blog/2010/09/10/linux-kernel-can-slub-overflow/

Also, since v4.6 the freelist was moved at the end of the SLAB.  It
means a controllable heap is opened to new attacks not yet publicly
discussed.  A kernel heap overflow can be transformed to multiple
use-after-free.  This feature makes this type of attack harder too.

To generate entropy, we use get_random_bytes_arch because 0 bits of
entropy is available in the boot stage.  In the worse case this function
will fallback to the get_random_bytes sub API.  We also generate a shift
random number to shift pre-computed freelist for each new set of pages.

The config option name is not specific to the SLAB as this approach will
be extended to other allocators like SLUB.

Performance results highlighted no major changes:

Hackbench (running 90 10 times):

  Before average: 0.0698
  After average: 0.0663 (-5.01%)

slab_test 1 run on boot.  Difference only seen on the 2048 size test
being the worse case scenario covered by freelist randomization.  New
slab pages are constantly being created on the 10000 allocations.
Variance should be mainly due to getting new pages every few
allocations.

Before:

  Single thread testing
  =====================
  1. Kmalloc: Repeatedly allocate then free test
  10000 times kmalloc(8) -> 99 cycles kfree -> 112 cycles
  10000 times kmalloc(16) -> 109 cycles kfree -> 140 cycles
  10000 times kmalloc(32) -> 129 cycles kfree -> 137 cycles
  10000 times kmalloc(64) -> 141 cycles kfree -> 141 cycles
  10000 times kmalloc(128) -> 152 cycles kfree -> 148 cycles
  10000 times kmalloc(256) -> 195 cycles kfree -> 167 cycles
  10000 times kmalloc(512) -> 257 cycles kfree -> 199 cycles
  10000 times kmalloc(1024) -> 393 cycles kfree -> 251 cycles
  10000 times kmalloc(2048) -> 649 cycles kfree -> 228 cycles
  10000 times kmalloc(4096) -> 806 cycles kfree -> 370 cycles
  10000 times kmalloc(8192) -> 814 cycles kfree -> 411 cycles
  10000 times kmalloc(16384) -> 892 cycles kfree -> 455 cycles
  2. Kmalloc: alloc/free test
  10000 times kmalloc(8)/kfree -> 121 cycles
  10000 times kmalloc(16)/kfree -> 121 cycles
  10000 times kmalloc(32)/kfree -> 121 cycles
  10000 times kmalloc(64)/kfree -> 121 cycles
  10000 times kmalloc(128)/kfree -> 121 cycles
  10000 times kmalloc(256)/kfree -> 119 cycles
  10000 times kmalloc(512)/kfree -> 119 cycles
  10000 times kmalloc(1024)/kfree -> 119 cycles
  10000 times kmalloc(2048)/kfree -> 119 cycles
  10000 times kmalloc(4096)/kfree -> 121 cycles
  10000 times kmalloc(8192)/kfree -> 119 cycles
  10000 times kmalloc(16384)/kfree -> 119 cycles

After:

  Single thread testing
  =====================
  1. Kmalloc: Repeatedly allocate then free test
  10000 times kmalloc(8) -> 130 cycles kfree -> 86 cycles
  10000 times kmalloc(16) -> 118 cycles kfree -> 86 cycles
  10000 times kmalloc(32) -> 121 cycles kfree -> 85 cycles
  10000 times kmalloc(64) -> 176 cycles kfree -> 102 cycles
  10000 times kmalloc(128) -> 178 cycles kfree -> 100 cycles
  10000 times kmalloc(256) -> 205 cycles kfree -> 109 cycles
  10000 times kmalloc(512) -> 262 cycles kfree -> 136 cycles
  10000 times kmalloc(1024) -> 342 cycles kfree -> 157 cycles
  10000 times kmalloc(2048) -> 701 cycles kfree -> 238 cycles
  10000 times kmalloc(4096) -> 803 cycles kfree -> 364 cycles
  10000 times kmalloc(8192) -> 835 cycles kfree -> 404 cycles
  10000 times kmalloc(16384) -> 896 cycles kfree -> 441 cycles
  2. Kmalloc: alloc/free test
  10000 times kmalloc(8)/kfree -> 121 cycles
  10000 times kmalloc(16)/kfree -> 121 cycles
  10000 times kmalloc(32)/kfree -> 123 cycles
  10000 times kmalloc(64)/kfree -> 142 cycles
  10000 times kmalloc(128)/kfree -> 121 cycles
  10000 times kmalloc(256)/kfree -> 119 cycles
  10000 times kmalloc(512)/kfree -> 119 cycles
  10000 times kmalloc(1024)/kfree -> 119 cycles
  10000 times kmalloc(2048)/kfree -> 119 cycles
  10000 times kmalloc(4096)/kfree -> 119 cycles
  10000 times kmalloc(8192)/kfree -> 119 cycles
  10000 times kmalloc(16384)/kfree -> 119 cycles

[akpm@linux-foundation.org: propagate gfp_t into cache_random_seq_create()]
Signed-off-by: Thomas Garnier <thgarnie@google.com>
Acked-by: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Greg Thelen <gthelen@google.com>
Cc: Laura Abbott <labbott@fedoraproject.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/slab_def.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include')

diff --git a/include/linux/slab_def.h b/include/linux/slab_def.h
index 9edbbf352340..8694f7a5d92b 100644
--- a/include/linux/slab_def.h
+++ b/include/linux/slab_def.h
@@ -80,6 +80,10 @@ struct kmem_cache {
 	struct kasan_cache kasan_info;
 #endif
 
+#ifdef CONFIG_SLAB_FREELIST_RANDOM
+	void *random_seq;
+#endif
+
 	struct kmem_cache_node *node[MAX_NUMNODES];
 };
 
-- 
cgit v1.2.3


From 0139aa7b7fa12ceef095d99dc36606a5b10ab83a Mon Sep 17 00:00:00 2001
From: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Date: Thu, 19 May 2016 17:10:49 -0700
Subject: mm: rename _count, field of the struct page, to _refcount

Many developers already know that field for reference count of the
struct page is _count and atomic type.  They would try to handle it
directly and this could break the purpose of page reference count
tracepoint.  To prevent direct _count modification, this patch rename it
to _refcount and add warning message on the code.  After that, developer
who need to handle reference count will find that field should not be
accessed directly.

[akpm@linux-foundation.org: fix comments, per Vlastimil]
[akpm@linux-foundation.org: Documentation/vm/transhuge.txt too]
[sfr@canb.auug.org.au: sync ethernet driver changes]
Signed-off-by: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Signed-off-by: Stephen Rothwell <sfr@canb.auug.org.au>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Hugh Dickins <hughd@google.com>
Cc: Johannes Berg <johannes@sipsolutions.net>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Sunil Goutham <sgoutham@cavium.com>
Cc: Chris Metcalf <cmetcalf@mellanox.com>
Cc: Manish Chopra <manish.chopra@qlogic.com>
Cc: Yuval Mintz <yuval.mintz@qlogic.com>
Cc: Tariq Toukan <tariqt@mellanox.com>
Cc: Saeed Mahameed <saeedm@mellanox.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm.h       |  2 +-
 include/linux/mm_types.h | 14 +++++++++-----
 include/linux/page_ref.h | 26 +++++++++++++-------------
 include/linux/pagemap.h  |  8 ++++----
 4 files changed, 27 insertions(+), 23 deletions(-)

(limited to 'include')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 727f799757ab..1193a54ea2b3 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -734,7 +734,7 @@ static inline void get_page(struct page *page)
 	page = compound_head(page);
 	/*
 	 * Getting a normal page or the head of a compound page
-	 * requires to already have an elevated page->_count.
+	 * requires to already have an elevated page->_refcount.
 	 */
 	VM_BUG_ON_PAGE(page_ref_count(page) <= 0, page);
 	page_ref_inc(page);
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index c2d75b4fa86c..1fda9c99ef95 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -73,9 +73,9 @@ struct page {
 			unsigned long counters;
 #else
 			/*
-			 * Keep _count separate from slub cmpxchg_double data.
-			 * As the rest of the double word is protected by
-			 * slab_lock but _count is not.
+			 * Keep _refcount separate from slub cmpxchg_double
+			 * data.  As the rest of the double word is protected by
+			 * slab_lock but _refcount is not.
 			 */
 			unsigned counters;
 #endif
@@ -97,7 +97,11 @@ struct page {
 					};
 					int units;	/* SLOB */
 				};
-				atomic_t _count;		/* Usage count, see below. */
+				/*
+				 * Usage count, *USE WRAPPER FUNCTION*
+				 * when manual accounting. See page_ref.h
+				 */
+				atomic_t _refcount;
 			};
 			unsigned int active;	/* SLAB */
 		};
@@ -248,7 +252,7 @@ struct page_frag_cache {
 	__u32 offset;
 #endif
 	/* we maintain a pagecount bias, so that we dont dirty cache line
-	 * containing page->_count every time we allocate a fragment.
+	 * containing page->_refcount every time we allocate a fragment.
 	 */
 	unsigned int		pagecnt_bias;
 	bool pfmemalloc;
diff --git a/include/linux/page_ref.h b/include/linux/page_ref.h
index e596d5d9540e..8b5e0a9f2431 100644
--- a/include/linux/page_ref.h
+++ b/include/linux/page_ref.h
@@ -63,17 +63,17 @@ static inline void __page_ref_unfreeze(struct page *page, int v)
 
 static inline int page_ref_count(struct page *page)
 {
-	return atomic_read(&page->_count);
+	return atomic_read(&page->_refcount);
 }
 
 static inline int page_count(struct page *page)
 {
-	return atomic_read(&compound_head(page)->_count);
+	return atomic_read(&compound_head(page)->_refcount);
 }
 
 static inline void set_page_count(struct page *page, int v)
 {
-	atomic_set(&page->_count, v);
+	atomic_set(&page->_refcount, v);
 	if (page_ref_tracepoint_active(__tracepoint_page_ref_set))
 		__page_ref_set(page, v);
 }
@@ -89,35 +89,35 @@ static inline void init_page_count(struct page *page)
 
 static inline void page_ref_add(struct page *page, int nr)
 {
-	atomic_add(nr, &page->_count);
+	atomic_add(nr, &page->_refcount);
 	if (page_ref_tracepoint_active(__tracepoint_page_ref_mod))
 		__page_ref_mod(page, nr);
 }
 
 static inline void page_ref_sub(struct page *page, int nr)
 {
-	atomic_sub(nr, &page->_count);
+	atomic_sub(nr, &page->_refcount);
 	if (page_ref_tracepoint_active(__tracepoint_page_ref_mod))
 		__page_ref_mod(page, -nr);
 }
 
 static inline void page_ref_inc(struct page *page)
 {
-	atomic_inc(&page->_count);
+	atomic_inc(&page->_refcount);
 	if (page_ref_tracepoint_active(__tracepoint_page_ref_mod))
 		__page_ref_mod(page, 1);
 }
 
 static inline void page_ref_dec(struct page *page)
 {
-	atomic_dec(&page->_count);
+	atomic_dec(&page->_refcount);
 	if (page_ref_tracepoint_active(__tracepoint_page_ref_mod))
 		__page_ref_mod(page, -1);
 }
 
 static inline int page_ref_sub_and_test(struct page *page, int nr)
 {
-	int ret = atomic_sub_and_test(nr, &page->_count);
+	int ret = atomic_sub_and_test(nr, &page->_refcount);
 
 	if (page_ref_tracepoint_active(__tracepoint_page_ref_mod_and_test))
 		__page_ref_mod_and_test(page, -nr, ret);
@@ -126,7 +126,7 @@ static inline int page_ref_sub_and_test(struct page *page, int nr)
 
 static inline int page_ref_dec_and_test(struct page *page)
 {
-	int ret = atomic_dec_and_test(&page->_count);
+	int ret = atomic_dec_and_test(&page->_refcount);
 
 	if (page_ref_tracepoint_active(__tracepoint_page_ref_mod_and_test))
 		__page_ref_mod_and_test(page, -1, ret);
@@ -135,7 +135,7 @@ static inline int page_ref_dec_and_test(struct page *page)
 
 static inline int page_ref_dec_return(struct page *page)
 {
-	int ret = atomic_dec_return(&page->_count);
+	int ret = atomic_dec_return(&page->_refcount);
 
 	if (page_ref_tracepoint_active(__tracepoint_page_ref_mod_and_return))
 		__page_ref_mod_and_return(page, -1, ret);
@@ -144,7 +144,7 @@ static inline int page_ref_dec_return(struct page *page)
 
 static inline int page_ref_add_unless(struct page *page, int nr, int u)
 {
-	int ret = atomic_add_unless(&page->_count, nr, u);
+	int ret = atomic_add_unless(&page->_refcount, nr, u);
 
 	if (page_ref_tracepoint_active(__tracepoint_page_ref_mod_unless))
 		__page_ref_mod_unless(page, nr, ret);
@@ -153,7 +153,7 @@ static inline int page_ref_add_unless(struct page *page, int nr, int u)
 
 static inline int page_ref_freeze(struct page *page, int count)
 {
-	int ret = likely(atomic_cmpxchg(&page->_count, count, 0) == count);
+	int ret = likely(atomic_cmpxchg(&page->_refcount, count, 0) == count);
 
 	if (page_ref_tracepoint_active(__tracepoint_page_ref_freeze))
 		__page_ref_freeze(page, count, ret);
@@ -165,7 +165,7 @@ static inline void page_ref_unfreeze(struct page *page, int count)
 	VM_BUG_ON_PAGE(page_count(page) != 0, page);
 	VM_BUG_ON(count == 0);
 
-	atomic_set(&page->_count, count);
+	atomic_set(&page->_refcount, count);
 	if (page_ref_tracepoint_active(__tracepoint_page_ref_unfreeze))
 		__page_ref_unfreeze(page, count);
 }
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 7e1ab155c67c..fe1513ffb7bf 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -90,12 +90,12 @@ void release_pages(struct page **pages, int nr, bool cold);
 
 /*
  * speculatively take a reference to a page.
- * If the page is free (_count == 0), then _count is untouched, and 0
- * is returned. Otherwise, _count is incremented by 1 and 1 is returned.
+ * If the page is free (_refcount == 0), then _refcount is untouched, and 0
+ * is returned. Otherwise, _refcount is incremented by 1 and 1 is returned.
  *
  * This function must be called inside the same rcu_read_lock() section as has
  * been used to lookup the page in the pagecache radix-tree (or page table):
- * this allows allocators to use a synchronize_rcu() to stabilize _count.
+ * this allows allocators to use a synchronize_rcu() to stabilize _refcount.
  *
  * Unless an RCU grace period has passed, the count of all pages coming out
  * of the allocator must be considered unstable. page_count may return higher
@@ -111,7 +111,7 @@ void release_pages(struct page **pages, int nr, bool cold);
  * 2. conditionally increment refcount
  * 3. check the page is still in pagecache (if no, goto 1)
  *
- * Remove-side that cares about stability of _count (eg. reclaim) has the
+ * Remove-side that cares about stability of _refcount (eg. reclaim) has the
  * following (with tree_lock held for write):
  * A. atomically check refcount is correct and set it to 0 (atomic_cmpxchg)
  * B. remove page from pagecache
-- 
cgit v1.2.3


From d64e85d3e1c59c3664b9ec1183052ec4641ea1e2 Mon Sep 17 00:00:00 2001
From: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Date: Thu, 19 May 2016 17:10:52 -0700
Subject: compiler.h: add support for malloc attribute

gcc as far back as at least 3.04 documents the function attribute
__malloc__.  Add a shorthand for attaching that to a function
declaration.  This was also suggested by Andi Kleen way back in 2002
[1], but didn't get applied, perhaps because gcc at that time generated
the exact same code with and without this attribute.

This attribute tells the compiler that the return value (if non-NULL)
can be assumed not to alias any other valid pointers at the time of the
call.

Please note that the documentation for a range of gcc versions (starting
from around 4.7) contained a somewhat confusing and self-contradicting
text:

  The malloc attribute is used to tell the compiler that a function may
  be treated as if any non-NULL pointer it returns cannot alias any other
  pointer valid when the function returns and *that the memory has
  undefined content*.  [...] Standard functions with this property include
  malloc and *calloc*.

(emphasis mine). The intended meaning has later been clarified [2]:

  This tells the compiler that a function is malloc-like, i.e., that the
  pointer P returned by the function cannot alias any other pointer valid
  when the function returns, and moreover no pointers to valid objects
  occur in any storage addressed by P.

What this means is that we can apply the attribute to kmalloc and
friends, and it is ok for the returned memory to have well-defined
contents (__GFP_ZERO).  But it is not ok to apply it to kmemdup(), nor
to other functions which both allocate and possibly initialize the
memory with existing pointers.  So unless someone is doing something
pretty perverted kstrdup() should also be a fine candidate.

[1] http://thread.gmane.org/gmane.linux.kernel/57172
[2] https://gcc.gnu.org/bugzilla/show_bug.cgi?id=56955

Signed-off-by: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/compiler-gcc.h | 1 +
 include/linux/compiler.h     | 4 ++++
 2 files changed, 5 insertions(+)

(limited to 'include')

diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h
index 3d5202eda22f..e2949397c19b 100644
--- a/include/linux/compiler-gcc.h
+++ b/include/linux/compiler-gcc.h
@@ -142,6 +142,7 @@
 
 #if GCC_VERSION >= 30400
 #define __must_check		__attribute__((warn_unused_result))
+#define __malloc		__attribute__((__malloc__))
 #endif
 
 #if GCC_VERSION >= 40000
diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index b5ff9881bef8..793c0829e3a3 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -357,6 +357,10 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s
 #define __deprecated_for_modules
 #endif
 
+#ifndef __malloc
+#define __malloc
+#endif
+
 /*
  * Allow us to avoid 'defined but not used' warnings on functions and data,
  * as well as force them to be emitted to the assembly file.
-- 
cgit v1.2.3


From 48a270554a3251681ae11173f2fd6389d943e183 Mon Sep 17 00:00:00 2001
From: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Date: Thu, 19 May 2016 17:10:55 -0700
Subject: include/linux: apply __malloc attribute

Attach the malloc attribute to a few allocation functions.  This helps
gcc generate better code by telling it that the return value doesn't
alias any existing pointers (which is even more valuable given the
pessimizations implied by -fno-strict-aliasing).

A simple example of what this allows gcc to do can be seen by looking at
the last part of drm_atomic_helper_plane_reset:

	plane->state = kzalloc(sizeof(*plane->state), GFP_KERNEL);

	if (plane->state) {
		plane->state->plane = plane;
		plane->state->rotation = BIT(DRM_ROTATE_0);
	}

which compiles to

    e8 99 bf d6 ff          callq  ffffffff8116d540 <kmem_cache_alloc_trace>
    48 85 c0                test   %rax,%rax
    48 89 83 40 02 00 00    mov    %rax,0x240(%rbx)
    74 11                   je     ffffffff814015c4 <drm_atomic_helper_plane_reset+0x64>
    48 89 18                mov    %rbx,(%rax)
    48 8b 83 40 02 00 00    mov    0x240(%rbx),%rax [*]
    c7 40 40 01 00 00 00    movl   $0x1,0x40(%rax)

With this patch applied, the instruction at [*] is elided, since the
store to plane->state->plane is known to not alter the value of
plane->state.

[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Rasmus Villemoes <linux@rasmusvillemoes.dk>
Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/bootmem.h | 16 ++++++++--------
 include/linux/device.h  | 12 ++++++------
 include/linux/kernel.h  |  4 ++--
 include/linux/mempool.h |  3 ++-
 include/linux/slab.h    | 16 ++++++++--------
 include/linux/string.h  |  2 +-
 6 files changed, 27 insertions(+), 26 deletions(-)

(limited to 'include')

diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h
index 35b22f94d2d2..f9be32691718 100644
--- a/include/linux/bootmem.h
+++ b/include/linux/bootmem.h
@@ -83,34 +83,34 @@ extern void *__alloc_bootmem(unsigned long size,
 			     unsigned long goal);
 extern void *__alloc_bootmem_nopanic(unsigned long size,
 				     unsigned long align,
-				     unsigned long goal);
+				     unsigned long goal) __malloc;
 extern void *__alloc_bootmem_node(pg_data_t *pgdat,
 				  unsigned long size,
 				  unsigned long align,
-				  unsigned long goal);
+				  unsigned long goal) __malloc;
 void *__alloc_bootmem_node_high(pg_data_t *pgdat,
 				  unsigned long size,
 				  unsigned long align,
-				  unsigned long goal);
+				  unsigned long goal) __malloc;
 extern void *__alloc_bootmem_node_nopanic(pg_data_t *pgdat,
 				  unsigned long size,
 				  unsigned long align,
-				  unsigned long goal);
+				  unsigned long goal) __malloc;
 void *___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
 				  unsigned long size,
 				  unsigned long align,
 				  unsigned long goal,
-				  unsigned long limit);
+				  unsigned long limit) __malloc;
 extern void *__alloc_bootmem_low(unsigned long size,
 				 unsigned long align,
-				 unsigned long goal);
+				 unsigned long goal) __malloc;
 void *__alloc_bootmem_low_nopanic(unsigned long size,
 				 unsigned long align,
-				 unsigned long goal);
+				 unsigned long goal) __malloc;
 extern void *__alloc_bootmem_low_node(pg_data_t *pgdat,
 				      unsigned long size,
 				      unsigned long align,
-				      unsigned long goal);
+				      unsigned long goal) __malloc;
 
 #ifdef CONFIG_NO_BOOTMEM
 /* We are using top down, so it is safe to use 0 here */
diff --git a/include/linux/device.h b/include/linux/device.h
index b130304f9b1b..ca90ad8bcd61 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -609,14 +609,14 @@ typedef int (*dr_match_t)(struct device *dev, void *res, void *match_data);
 
 #ifdef CONFIG_DEBUG_DEVRES
 extern void *__devres_alloc_node(dr_release_t release, size_t size, gfp_t gfp,
-				 int nid, const char *name);
+				 int nid, const char *name) __malloc;
 #define devres_alloc(release, size, gfp) \
 	__devres_alloc_node(release, size, gfp, NUMA_NO_NODE, #release)
 #define devres_alloc_node(release, size, gfp, nid) \
 	__devres_alloc_node(release, size, gfp, nid, #release)
 #else
 extern void *devres_alloc_node(dr_release_t release, size_t size, gfp_t gfp,
-			       int nid);
+			       int nid) __malloc;
 static inline void *devres_alloc(dr_release_t release, size_t size, gfp_t gfp)
 {
 	return devres_alloc_node(release, size, gfp, NUMA_NO_NODE);
@@ -648,12 +648,12 @@ extern void devres_remove_group(struct device *dev, void *id);
 extern int devres_release_group(struct device *dev, void *id);
 
 /* managed devm_k.alloc/kfree for device drivers */
-extern void *devm_kmalloc(struct device *dev, size_t size, gfp_t gfp);
+extern void *devm_kmalloc(struct device *dev, size_t size, gfp_t gfp) __malloc;
 extern __printf(3, 0)
 char *devm_kvasprintf(struct device *dev, gfp_t gfp, const char *fmt,
-		      va_list ap);
+		      va_list ap) __malloc;
 extern __printf(3, 4)
-char *devm_kasprintf(struct device *dev, gfp_t gfp, const char *fmt, ...);
+char *devm_kasprintf(struct device *dev, gfp_t gfp, const char *fmt, ...) __malloc;
 static inline void *devm_kzalloc(struct device *dev, size_t size, gfp_t gfp)
 {
 	return devm_kmalloc(dev, size, gfp | __GFP_ZERO);
@@ -671,7 +671,7 @@ static inline void *devm_kcalloc(struct device *dev,
 	return devm_kmalloc_array(dev, n, size, flags | __GFP_ZERO);
 }
 extern void devm_kfree(struct device *dev, void *p);
-extern char *devm_kstrdup(struct device *dev, const char *s, gfp_t gfp);
+extern char *devm_kstrdup(struct device *dev, const char *s, gfp_t gfp) __malloc;
 extern void *devm_kmemdup(struct device *dev, const void *src, size_t len,
 			  gfp_t gfp);
 
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 2f7775e229b0..cc7398287fdd 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -412,9 +412,9 @@ extern __printf(3, 4)
 int scnprintf(char *buf, size_t size, const char *fmt, ...);
 extern __printf(3, 0)
 int vscnprintf(char *buf, size_t size, const char *fmt, va_list args);
-extern __printf(2, 3)
+extern __printf(2, 3) __malloc
 char *kasprintf(gfp_t gfp, const char *fmt, ...);
-extern __printf(2, 0)
+extern __printf(2, 0) __malloc
 char *kvasprintf(gfp_t gfp, const char *fmt, va_list args);
 extern __printf(2, 0)
 const char *kvasprintf_const(gfp_t gfp, const char *fmt, va_list args);
diff --git a/include/linux/mempool.h b/include/linux/mempool.h
index 69b6951e8fd2..b1086c936507 100644
--- a/include/linux/mempool.h
+++ b/include/linux/mempool.h
@@ -5,6 +5,7 @@
 #define _LINUX_MEMPOOL_H
 
 #include <linux/wait.h>
+#include <linux/compiler.h>
 
 struct kmem_cache;
 
@@ -31,7 +32,7 @@ extern mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn,
 
 extern int mempool_resize(mempool_t *pool, int new_min_nr);
 extern void mempool_destroy(mempool_t *pool);
-extern void * mempool_alloc(mempool_t *pool, gfp_t gfp_mask);
+extern void *mempool_alloc(mempool_t *pool, gfp_t gfp_mask) __malloc;
 extern void mempool_free(void *element, mempool_t *pool);
 
 /*
diff --git a/include/linux/slab.h b/include/linux/slab.h
index 508bd827e6dc..aeb3e6d00a66 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -315,8 +315,8 @@ static __always_inline int kmalloc_index(size_t size)
 }
 #endif /* !CONFIG_SLOB */
 
-void *__kmalloc(size_t size, gfp_t flags) __assume_kmalloc_alignment;
-void *kmem_cache_alloc(struct kmem_cache *, gfp_t flags) __assume_slab_alignment;
+void *__kmalloc(size_t size, gfp_t flags) __assume_kmalloc_alignment __malloc;
+void *kmem_cache_alloc(struct kmem_cache *, gfp_t flags) __assume_slab_alignment __malloc;
 void kmem_cache_free(struct kmem_cache *, void *);
 
 /*
@@ -339,8 +339,8 @@ static __always_inline void kfree_bulk(size_t size, void **p)
 }
 
 #ifdef CONFIG_NUMA
-void *__kmalloc_node(size_t size, gfp_t flags, int node) __assume_kmalloc_alignment;
-void *kmem_cache_alloc_node(struct kmem_cache *, gfp_t flags, int node) __assume_slab_alignment;
+void *__kmalloc_node(size_t size, gfp_t flags, int node) __assume_kmalloc_alignment __malloc;
+void *kmem_cache_alloc_node(struct kmem_cache *, gfp_t flags, int node) __assume_slab_alignment __malloc;
 #else
 static __always_inline void *__kmalloc_node(size_t size, gfp_t flags, int node)
 {
@@ -354,12 +354,12 @@ static __always_inline void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t f
 #endif
 
 #ifdef CONFIG_TRACING
-extern void *kmem_cache_alloc_trace(struct kmem_cache *, gfp_t, size_t) __assume_slab_alignment;
+extern void *kmem_cache_alloc_trace(struct kmem_cache *, gfp_t, size_t) __assume_slab_alignment __malloc;
 
 #ifdef CONFIG_NUMA
 extern void *kmem_cache_alloc_node_trace(struct kmem_cache *s,
 					   gfp_t gfpflags,
-					   int node, size_t size) __assume_slab_alignment;
+					   int node, size_t size) __assume_slab_alignment __malloc;
 #else
 static __always_inline void *
 kmem_cache_alloc_node_trace(struct kmem_cache *s,
@@ -392,10 +392,10 @@ kmem_cache_alloc_node_trace(struct kmem_cache *s,
 }
 #endif /* CONFIG_TRACING */
 
-extern void *kmalloc_order(size_t size, gfp_t flags, unsigned int order) __assume_page_alignment;
+extern void *kmalloc_order(size_t size, gfp_t flags, unsigned int order) __assume_page_alignment __malloc;
 
 #ifdef CONFIG_TRACING
-extern void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order) __assume_page_alignment;
+extern void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order) __assume_page_alignment __malloc;
 #else
 static __always_inline void *
 kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order)
diff --git a/include/linux/string.h b/include/linux/string.h
index d3993a79a325..26b6f6a66f83 100644
--- a/include/linux/string.h
+++ b/include/linux/string.h
@@ -119,7 +119,7 @@ char *strreplace(char *s, char old, char new);
 
 extern void kfree_const(const void *x);
 
-extern char *kstrdup(const char *s, gfp_t gfp);
+extern char *kstrdup(const char *s, gfp_t gfp) __malloc;
 extern const char *kstrdup_const(const char *s, gfp_t gfp);
 extern char *kstrndup(const char *s, size_t len, gfp_t gfp);
 extern void *kmemdup(const void *src, size_t len, gfp_t gfp);
-- 
cgit v1.2.3


From 0edaf86cf1a6a97d811fc34765ddbcbc310de564 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Thu, 19 May 2016 17:10:58 -0700
Subject: include/linux/nodemask.h: create next_node_in() helper

Lots of code does

	node = next_node(node, XXX);
	if (node == MAX_NUMNODES)
		node = first_node(XXX);

so create next_node_in() to do this and use it in various places.

[mhocko@suse.com: use next_node_in() helper]
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: Michal Hocko <mhocko@kernel.org>
Signed-off-by: Michal Hocko <mhocko@suse.com>
Cc: Xishi Qiu <qiuxishi@huawei.com>
Cc: Joonsoo Kim <js1304@gmail.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: Laura Abbott <lauraa@codeaurora.org>
Cc: Hui Zhu <zhuhui@xiaomi.com>
Cc: Wang Xiaoqiang <wangxq10@lzu.edu.cn>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/nodemask.h | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h
index 6e85889cf9ab..f746e44d4046 100644
--- a/include/linux/nodemask.h
+++ b/include/linux/nodemask.h
@@ -43,8 +43,10 @@
  *
  * int first_node(mask)			Number lowest set bit, or MAX_NUMNODES
  * int next_node(node, mask)		Next node past 'node', or MAX_NUMNODES
+ * int next_node_in(node, mask)		Next node past 'node', or wrap to first,
+ *					or MAX_NUMNODES
  * int first_unset_node(mask)		First node not set in mask, or 
- *					MAX_NUMNODES.
+ *					MAX_NUMNODES
  *
  * nodemask_t nodemask_of_node(node)	Return nodemask with bit 'node' set
  * NODE_MASK_ALL			Initializer - all bits set
@@ -259,6 +261,13 @@ static inline int __next_node(int n, const nodemask_t *srcp)
 	return min_t(int,MAX_NUMNODES,find_next_bit(srcp->bits, MAX_NUMNODES, n+1));
 }
 
+/*
+ * Find the next present node in src, starting after node n, wrapping around to
+ * the first node in src if needed.  Returns MAX_NUMNODES if src is empty.
+ */
+#define next_node_in(n, src) __next_node_in((n), &(src))
+int __next_node_in(int node, const nodemask_t *srcp);
+
 static inline void init_nodemask_of_node(nodemask_t *mask, int node)
 {
 	nodes_clear(*mask);
-- 
cgit v1.2.3


From 9fee021d15ddd884d40d1540913474e8112313fe Mon Sep 17 00:00:00 2001
From: Vaishali Thakkar <vaishali.thakkar@oracle.com>
Date: Thu, 19 May 2016 17:11:04 -0700
Subject: mm/hugetlb: introduce hugetlb_bad_size()

When any unsupported hugepage size is specified, 'hugepagesz=' and
'hugepages=' should be ignored during command line parsing until any
supported hugepage size is found.  But currently incorrect number of
hugepages are allocated when unsupported size is specified as it fails
to ignore the 'hugepages=' command.

Test case:

Note that this is specific to x86 architecture.

Boot the kernel with command line option 'hugepagesz=256M hugepages=X'.
After boot, dmesg output shows that X number of hugepages of the size 2M
is pre-allocated instead of 0.

So, to handle such command line options, introduce new routine
hugetlb_bad_size.  The routine hugetlb_bad_size sets the global variable
parsed_valid_hugepagesz.  We are using parsed_valid_hugepagesz to save
the state when unsupported hugepagesize is found so that we can ignore
the 'hugepages=' parameters after that and then reset the variable when
supported hugepage size is found.

The routine hugetlb_bad_size can be called while setting 'hugepagesz='
parameter in an architecture specific code.

Signed-off-by: Vaishali Thakkar <vaishali.thakkar@oracle.com>
Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
Reviewed-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Hillf Danton <hillf.zj@alibaba-inc.com>
Cc: Yaowei Bai <baiyaowei@cmss.chinamobile.com>
Cc: Dominik Dingel <dingel@linux.vnet.ibm.com>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Cc: Paul Gortmaker <paul.gortmaker@windriver.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: James Hogan <james.hogan@imgtec.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/hugetlb.h | 1 +
 1 file changed, 1 insertion(+)

(limited to 'include')

diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 7d953c2542a8..e44c57876e89 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -338,6 +338,7 @@ int huge_add_to_page_cache(struct page *page, struct address_space *mapping,
 /* arch callback */
 int __init alloc_bootmem_huge_page(struct hstate *h);
 
+void __init hugetlb_bad_size(void);
 void __init hugetlb_add_hstate(unsigned order);
 struct hstate *size_to_hstate(unsigned long size);
 
-- 
cgit v1.2.3


From 32f6271dbdc351dce96b11c5f3567bae8188004f Mon Sep 17 00:00:00 2001
From: Yaowei Bai <baiyaowei@cmss.chinamobile.com>
Date: Thu, 19 May 2016 17:11:23 -0700
Subject: mm/hugetlb: is_vm_hugetlb_page() can return bool

Make is_vm_hugetlb_page() return bool to improve readability due to this
particular function only using either one or zero as its return value.

Signed-off-by: Yaowei Bai <baiyaowei@cmss.chinamobile.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/hugetlb_inline.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/hugetlb_inline.h b/include/linux/hugetlb_inline.h
index 2bb681fbeb35..a4e7ca0f3585 100644
--- a/include/linux/hugetlb_inline.h
+++ b/include/linux/hugetlb_inline.h
@@ -5,16 +5,16 @@
 
 #include <linux/mm.h>
 
-static inline int is_vm_hugetlb_page(struct vm_area_struct *vma)
+static inline bool is_vm_hugetlb_page(struct vm_area_struct *vma)
 {
 	return !!(vma->vm_flags & VM_HUGETLB);
 }
 
 #else
 
-static inline int is_vm_hugetlb_page(struct vm_area_struct *vma)
+static inline bool is_vm_hugetlb_page(struct vm_area_struct *vma)
 {
-	return 0;
+	return false;
 }
 
 #endif
-- 
cgit v1.2.3


From c98940f6fa3d06fa8fec75aa2362b25227573d06 Mon Sep 17 00:00:00 2001
From: Yaowei Bai <baiyaowei@cmss.chinamobile.com>
Date: Thu, 19 May 2016 17:11:26 -0700
Subject: mm/memory_hotplug: is_mem_section_removable() can return bool

Make is_mem_section_removable() return bool to improve readability due
to this particular function only using either one or zero as its return
value.

Signed-off-by: Yaowei Bai <baiyaowei@cmss.chinamobile.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memory_hotplug.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index adbef586e696..20d8a5d4d133 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -247,16 +247,16 @@ static inline void mem_hotplug_done(void) {}
 
 #ifdef CONFIG_MEMORY_HOTREMOVE
 
-extern int is_mem_section_removable(unsigned long pfn, unsigned long nr_pages);
+extern bool is_mem_section_removable(unsigned long pfn, unsigned long nr_pages);
 extern void try_offline_node(int nid);
 extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages);
 extern void remove_memory(int nid, u64 start, u64 size);
 
 #else
-static inline int is_mem_section_removable(unsigned long pfn,
+static inline bool is_mem_section_removable(unsigned long pfn,
 					unsigned long nr_pages)
 {
-	return 0;
+	return false;
 }
 
 static inline void try_offline_node(int nid) {}
-- 
cgit v1.2.3


From bb00a789e565b96c52b2224c2280f7ac83175bec Mon Sep 17 00:00:00 2001
From: Yaowei Bai <baiyaowei@cmss.chinamobile.com>
Date: Thu, 19 May 2016 17:11:29 -0700
Subject: mm/vmalloc.c: is_vmalloc_addr() can return bool

Make is_vmalloc_addr() return bool to improve readability due to this
particular function only using either one or zero as its return value.

Signed-off-by: Yaowei Bai <baiyaowei@cmss.chinamobile.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 1193a54ea2b3..5b375133c695 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -447,14 +447,14 @@ unsigned long vmalloc_to_pfn(const void *addr);
  * On nommu, vmalloc/vfree wrap through kmalloc/kfree directly, so there
  * is no special casing required.
  */
-static inline int is_vmalloc_addr(const void *x)
+static inline bool is_vmalloc_addr(const void *x)
 {
 #ifdef CONFIG_MMU
 	unsigned long addr = (unsigned long)x;
 
 	return addr >= VMALLOC_START && addr < VMALLOC_END;
 #else
-	return 0;
+	return false;
 #endif
 }
 #ifdef CONFIG_MMU
-- 
cgit v1.2.3


From 4ee815be1d34a6f254b3d09bdebcb27f294f2bd3 Mon Sep 17 00:00:00 2001
From: Yaowei Bai <baiyaowei@cmss.chinamobile.com>
Date: Thu, 19 May 2016 17:11:32 -0700
Subject: mm/mempolicy.c: vma_migratable() can return bool

Make vma_migratable() return bool due to this particular function only
using either one or zero as its return value.

Signed-off-by: Yaowei Bai <baiyaowei@cmss.chinamobile.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mempolicy.h | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'include')

diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index 2696c1f05ed1..6978a99e571f 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -172,14 +172,14 @@ extern int mpol_parse_str(char *str, struct mempolicy **mpol);
 extern void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol);
 
 /* Check if a vma is migratable */
-static inline int vma_migratable(struct vm_area_struct *vma)
+static inline bool vma_migratable(struct vm_area_struct *vma)
 {
 	if (vma->vm_flags & (VM_IO | VM_PFNMAP))
-		return 0;
+		return false;
 
 #ifndef CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION
 	if (vma->vm_flags & VM_HUGETLB)
-		return 0;
+		return false;
 #endif
 
 	/*
@@ -190,8 +190,8 @@ static inline int vma_migratable(struct vm_area_struct *vma)
 	if (vma->vm_file &&
 		gfp_zone(mapping_gfp_mask(vma->vm_file->f_mapping))
 								< policy_zone)
-			return 0;
-	return 1;
+			return false;
+	return true;
 }
 
 extern int mpol_misplaced(struct page *, struct vm_area_struct *, unsigned long);
-- 
cgit v1.2.3


From 29f9cb53d25cd9916537b44b0af7f0b95a2e4438 Mon Sep 17 00:00:00 2001
From: Chanho Min <chanho.min@lge.com>
Date: Thu, 19 May 2016 17:11:57 -0700
Subject: mm/highmem: simplify is_highmem()

is_highmem() can be simplified by use of is_highmem_idx().  This patch
removes redundant code and will make it easier to maintain if the zone
policy is changed or a new zone is added.

(akpm: saves me 25 bytes of text per is_highmem() callsite)

Signed-off-by: Chanho Min <chanho.min@lge.com>
Reviewed-by: Dan Williams <dan.j.williams@intel.com>
Cc: Michal Hocko <mhocko@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mmzone.h | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index c60df9257cc7..150c6049f961 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -828,10 +828,7 @@ static inline int is_highmem_idx(enum zone_type idx)
 static inline int is_highmem(struct zone *zone)
 {
 #ifdef CONFIG_HIGHMEM
-	int zone_off = (char *)zone - (char *)zone->zone_pgdat->node_zones;
-	return zone_off == ZONE_HIGHMEM * sizeof(*zone) ||
-	       (zone_off == ZONE_MOVABLE * sizeof(*zone) &&
-		zone_movable_is_highmem());
+	return is_highmem_idx(zone_idx(zone));
 #else
 	return 0;
 #endif
-- 
cgit v1.2.3


From 1aa8aea535977f0e0b398f39d052e7befff81da6 Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Thu, 19 May 2016 17:12:00 -0700
Subject: mm: uninline page_mapped()

It's huge.  Uninlining it saves 206 bytes per callsite.  Shaves 4924
bytes from the x86_64 allmodconfig vmlinux.

[akpm@linux-foundation.org: coding-style fixes]
Cc: Steve Capper <steve.capper@arm.com>
Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm.h | 21 +--------------------
 1 file changed, 1 insertion(+), 20 deletions(-)

(limited to 'include')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 5b375133c695..9c2852cabf01 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1032,26 +1032,7 @@ static inline pgoff_t page_file_index(struct page *page)
 	return page->index;
 }
 
-/*
- * Return true if this page is mapped into pagetables.
- * For compound page it returns true if any subpage of compound page is mapped.
- */
-static inline bool page_mapped(struct page *page)
-{
-	int i;
-	if (likely(!PageCompound(page)))
-		return atomic_read(&page->_mapcount) >= 0;
-	page = compound_head(page);
-	if (atomic_read(compound_mapcount_ptr(page)) >= 0)
-		return true;
-	if (PageHuge(page))
-		return false;
-	for (i = 0; i < hpage_nr_pages(page); i++) {
-		if (atomic_read(&page[i]._mapcount) >= 0)
-			return true;
-	}
-	return false;
-}
+bool page_mapped(struct page *page);
 
 /*
  * Return true only if the page has been allocated with
-- 
cgit v1.2.3


From ca707239e8a7958ffb1c31737d41cae1a674c938 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd@google.com>
Date: Thu, 19 May 2016 17:12:35 -0700
Subject: mm: update_lru_size warn and reset bad lru_size

Though debug kernels have a VM_BUG_ON to help protect from misaccounting
lru_size, non-debug kernels are liable to wrap it around: and then the
vast unsigned long size draws page reclaim into a loop of repeatedly
doing nothing on an empty list, without even a cond_resched().

That soft lockup looks confusingly like an over-busy reclaim scenario,
with lots of contention on the lru_lock in shrink_inactive_list(): yet
has a totally different origin.

Help differentiate with a custom warning in
mem_cgroup_update_lru_size(), even in non-debug kernels; and reset the
size to avoid the lockup.  But the particular bug which suggested this
change was mine alone, and since fixed.

Make it a WARN_ONCE: the first occurrence is the most informative, a
flurry may follow, yet even when rate-limited little more is learnt.

Signed-off-by: Hugh Dickins <hughd@google.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Andres Lagar-Cavilla <andreslc@google.com>
Cc: Yang Shi <yang.shi@linaro.org>
Cc: Ning Qu <quning@gmail.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Andres Lagar-Cavilla <andreslc@google.com>
Cc: Konstantin Khlebnikov <koct9i@gmail.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm_inline.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index 712e8c37a200..d8cea81ab1ac 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -35,8 +35,8 @@ static __always_inline void del_page_from_lru_list(struct page *page,
 				struct lruvec *lruvec, enum lru_list lru)
 {
 	int nr_pages = hpage_nr_pages(page);
-	mem_cgroup_update_lru_size(lruvec, lru, -nr_pages);
 	list_del(&page->lru);
+	mem_cgroup_update_lru_size(lruvec, lru, -nr_pages);
 	__mod_zone_page_state(lruvec_zone(lruvec), NR_LRU_BASE + lru, -nr_pages);
 }
 
-- 
cgit v1.2.3


From 9d5e6a9f22311b00a20ff9b072760ad3e73f0d99 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd@google.com>
Date: Thu, 19 May 2016 17:12:38 -0700
Subject: mm: update_lru_size do the __mod_zone_page_state

Konstantin Khlebnikov pointed out (nearly four years ago, when lumpy
reclaim was removed) that lru_size can be updated by -nr_taken once per
call to isolate_lru_pages(), instead of page by page.

Update it inside isolate_lru_pages(), or at its two callsites? I chose
to update it at the callsites, rearranging and grouping the updates by
nr_taken and nr_scanned together in both.

With one exception, mem_cgroup_update_lru_size(,lru,) is then used where
__mod_zone_page_state(,NR_LRU_BASE+lru,) is used; and we shall be adding
some more calls in a future commit.  Make the code a little smaller and
simpler by incorporating stat update in lru_size update.

The exception was move_active_pages_to_lru(), which aggregated the
pgmoved stat update separately from the individual lru_size updates; but
I still think this a simplification worth making.

However, the __mod_zone_page_state is not peculiar to mem_cgroups: so
better use the name update_lru_size, calls mem_cgroup_update_lru_size
when CONFIG_MEMCG.

Signed-off-by: Hugh Dickins <hughd@google.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Andres Lagar-Cavilla <andreslc@google.com>
Cc: Yang Shi <yang.shi@linaro.org>
Cc: Ning Qu <quning@gmail.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Konstantin Khlebnikov <koct9i@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/memcontrol.h |  6 ------
 include/linux/mm_inline.h  | 24 ++++++++++++++++++------
 2 files changed, 18 insertions(+), 12 deletions(-)

(limited to 'include')

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 1191d79aa495..94da96738df3 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -658,12 +658,6 @@ mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)
 	return 0;
 }
 
-static inline void
-mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
-			      int increment)
-{
-}
-
 static inline unsigned long
 mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
 			     int nid, unsigned int lru_mask)
diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h
index d8cea81ab1ac..5bd29ba4f174 100644
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -22,22 +22,34 @@ static inline int page_is_file_cache(struct page *page)
 	return !PageSwapBacked(page);
 }
 
+static __always_inline void __update_lru_size(struct lruvec *lruvec,
+				enum lru_list lru, int nr_pages)
+{
+	__mod_zone_page_state(lruvec_zone(lruvec), NR_LRU_BASE + lru, nr_pages);
+}
+
+static __always_inline void update_lru_size(struct lruvec *lruvec,
+				enum lru_list lru, int nr_pages)
+{
+#ifdef CONFIG_MEMCG
+	mem_cgroup_update_lru_size(lruvec, lru, nr_pages);
+#else
+	__update_lru_size(lruvec, lru, nr_pages);
+#endif
+}
+
 static __always_inline void add_page_to_lru_list(struct page *page,
 				struct lruvec *lruvec, enum lru_list lru)
 {
-	int nr_pages = hpage_nr_pages(page);
-	mem_cgroup_update_lru_size(lruvec, lru, nr_pages);
+	update_lru_size(lruvec, lru, hpage_nr_pages(page));
 	list_add(&page->lru, &lruvec->lists[lru]);
-	__mod_zone_page_state(lruvec_zone(lruvec), NR_LRU_BASE + lru, nr_pages);
 }
 
 static __always_inline void del_page_from_lru_list(struct page *page,
 				struct lruvec *lruvec, enum lru_list lru)
 {
-	int nr_pages = hpage_nr_pages(page);
 	list_del(&page->lru);
-	mem_cgroup_update_lru_size(lruvec, lru, -nr_pages);
-	__mod_zone_page_state(lruvec_zone(lruvec), NR_LRU_BASE + lru, -nr_pages);
+	update_lru_size(lruvec, lru, -hpage_nr_pages(page));
 }
 
 /**
-- 
cgit v1.2.3


From 75edd345e8ede51bc8f00672feff5d622f2b3af6 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd@google.com>
Date: Thu, 19 May 2016 17:12:44 -0700
Subject: tmpfs: preliminary minor tidyups

Make a few cleanups in mm/shmem.c, before going on to complicate it.

shmem_alloc_page() will become more complicated: we can't afford to to
have that complication duplicated between a CONFIG_NUMA version and a
!CONFIG_NUMA version, so rearrange the #ifdef'ery there to yield a
single shmem_swapin() and a single shmem_alloc_page().

Yes, it's a shame to inflict the horrid pseudo-vma on non-NUMA
configurations, but eliminating it is a larger cleanup: I have an
alloc_pages_mpol() patchset not yet ready - mpol handling is subtle and
bug-prone, and changed yet again since my last version.

Move __SetPageLocked, __SetPageSwapBacked from shmem_getpage_gfp() to
shmem_alloc_page(): that SwapBacked flag will be useful in future, to
help to distinguish different cases appropriately.

And the SGP_DIRTY variant of SGP_CACHE is hard to understand and of
little use (IIRC it dates back to when shmem_getpage() returned the page
unlocked): kill it and do the necessary in shmem_file_read_iter().

But an arm64 build then complained that info may be uninitialized (where
shmem_getpage_gfp() deletes a freshly alloced page beyond eof), and
advancing to an "sgp <= SGP_CACHE" test jogged it back to reality.

Signed-off-by: Hugh Dickins <hughd@google.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Andres Lagar-Cavilla <andreslc@google.com>
Cc: Yang Shi <yang.shi@linaro.org>
Cc: Ning Qu <quning@gmail.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Konstantin Khlebnikov <koct9i@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mempolicy.h | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'include')

diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index 6978a99e571f..4429d255c8ab 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -228,6 +228,12 @@ static inline void mpol_free_shared_policy(struct shared_policy *p)
 {
 }
 
+static inline struct mempolicy *
+mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
+{
+	return NULL;
+}
+
 #define vma_policy(vma) NULL
 
 static inline int
-- 
cgit v1.2.3


From 52b6f46bc163eef17ecba4cd552beeafe2b24453 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd@google.com>
Date: Thu, 19 May 2016 17:12:50 -0700
Subject: mm: /proc/sys/vm/stat_refresh to force vmstat update

Provide /proc/sys/vm/stat_refresh to force an immediate update of
per-cpu into global vmstats: useful to avoid a sleep(2) or whatever
before checking counts when testing.  Originally added to work around a
bug which left counts stranded indefinitely on a cpu going idle (an
inaccuracy magnified when small below-batch numbers represent "huge"
amounts of memory), but I believe that bug is now fixed: nonetheless,
this is still a useful knob.

Its schedule_on_each_cpu() is probably too expensive just to fold into
reading /proc/meminfo itself: give this mode 0600 to prevent abuse.
Allow a write or a read to do the same: nothing to read, but "grep -h
Shmem /proc/sys/vm/stat_refresh /proc/meminfo" is convenient.  Oh, and
since global_page_state() itself is careful to disguise any underflow as
0, hack in an "Invalid argument" and pr_warn() if a counter is negative
after the refresh - this helped to fix a misaccounting of
NR_ISOLATED_FILE in my migration code.

But on recent kernels, I find that NR_ALLOC_BATCH and NR_PAGES_SCANNED
often go negative some of the time.  I have not yet worked out why, but
have no evidence that it's actually harmful.  Punt for the moment by
just ignoring the anomaly on those.

Signed-off-by: Hugh Dickins <hughd@google.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Andres Lagar-Cavilla <andreslc@google.com>
Cc: Yang Shi <yang.shi@linaro.org>
Cc: Ning Qu <quning@gmail.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Andres Lagar-Cavilla <andreslc@google.com>
Cc: Konstantin Khlebnikov <koct9i@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/vmstat.h | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'include')

diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index 73fae8c4a5fb..02fce415b3d9 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -193,6 +193,10 @@ void quiet_vmstat(void);
 void cpu_vm_stats_fold(int cpu);
 void refresh_zone_stat_thresholds(void);
 
+struct ctl_table;
+int vmstat_refresh(struct ctl_table *, int write,
+		   void __user *buffer, size_t *lenp, loff_t *ppos);
+
 void drain_zonestat(struct zone *zone, struct per_cpu_pageset *);
 
 int calculate_pressure_threshold(struct zone *zone);
-- 
cgit v1.2.3


From bf8616d5fa179d6c755f06726567c6d63c6fbbc7 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd@google.com>
Date: Thu, 19 May 2016 17:12:54 -0700
Subject: huge mm: move_huge_pmd does not need new_vma

Remove move_huge_pmd()'s redundant new_vma arg: all it was used for was
a VM_NOHUGEPAGE check on new_vma flags, but the new_vma is cloned from
the old vma, so a trans_huge_pmd in the new_vma will be as acceptable as
it was in the old vma, alignment and size permitting.

Signed-off-by: Hugh Dickins <hughd@google.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Andres Lagar-Cavilla <andreslc@google.com>
Cc: Yang Shi <yang.shi@linaro.org>
Cc: Ning Qu <quning@gmail.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Andres Lagar-Cavilla <andreslc@google.com>
Cc: Konstantin Khlebnikov <koct9i@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/huge_mm.h | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'include')

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index d7b9e5346fba..419fb9e03447 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -28,9 +28,7 @@ extern int zap_huge_pmd(struct mmu_gather *tlb,
 extern int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
 			unsigned long addr, unsigned long end,
 			unsigned char *vec);
-extern bool move_huge_pmd(struct vm_area_struct *vma,
-			 struct vm_area_struct *new_vma,
-			 unsigned long old_addr,
+extern bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
 			 unsigned long new_addr, unsigned long old_end,
 			 pmd_t *old_pmd, pmd_t *new_pmd);
 extern int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
-- 
cgit v1.2.3


From fd8cfd3000191cb7f5b9ea8640bd46181f6b4b74 Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hughd@google.com>
Date: Thu, 19 May 2016 17:13:00 -0700
Subject: arch: fix has_transparent_hugepage()

I've just discovered that the useful-sounding has_transparent_hugepage()
is actually an architecture-dependent minefield: on some arches it only
builds if CONFIG_TRANSPARENT_HUGEPAGE=y, on others it's also there when
not, but on some of those (arm and arm64) it then gives the wrong
answer; and on mips alone it's marked __init, which would crash if
called later (but so far it has not been called later).

Straighten this out: make it available to all configs, with a sensible
default in asm-generic/pgtable.h, removing its definitions from those
arches (arc, arm, arm64, sparc, tile) which are served by the default,
adding #define has_transparent_hugepage has_transparent_hugepage to
those (mips, powerpc, s390, x86) which need to override the default at
runtime, and removing the __init from mips (but maybe that kind of code
should be avoided after init: set a static variable the first time it's
called).

Signed-off-by: Hugh Dickins <hughd@google.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Andres Lagar-Cavilla <andreslc@google.com>
Cc: Yang Shi <yang.shi@linaro.org>
Cc: Ning Qu <quning@gmail.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Konstantin Khlebnikov <koct9i@gmail.com>
Acked-by: David S. Miller <davem@davemloft.net>
Acked-by: Vineet Gupta <vgupta@synopsys.com>		[arch/arc]
Acked-by: Gerald Schaefer <gerald.schaefer@de.ibm.com>	[arch/s390]
Acked-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/asm-generic/pgtable.h | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'include')

diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h
index 9401f4819891..d4458b6dbfb4 100644
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -806,4 +806,12 @@ static inline int pmd_clear_huge(pmd_t *pmd)
 #define io_remap_pfn_range remap_pfn_range
 #endif
 
+#ifndef has_transparent_hugepage
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+#define has_transparent_hugepage() 1
+#else
+#define has_transparent_hugepage() 0
+#endif
+#endif
+
 #endif /* _ASM_GENERIC_PGTABLE_H */
-- 
cgit v1.2.3


From 3ef22dfff2390e75b379f9715388a852aa56e0d5 Mon Sep 17 00:00:00 2001
From: Michal Hocko <mhocko@suse.com>
Date: Thu, 19 May 2016 17:13:12 -0700
Subject: oom, oom_reaper: try to reap tasks which skip regular OOM killer path

If either the current task is already killed or PF_EXITING or a selected
task is PF_EXITING then the oom killer is suppressed and so is the oom
reaper.  This patch adds try_oom_reaper which checks the given task and
queues it for the oom reaper if that is safe to be done meaning that the
task doesn't share the mm with an alive process.

This might help to release the memory pressure while the task tries to
exit.

[akpm@linux-foundation.org: fix nommu build]
Signed-off-by: Michal Hocko <mhocko@suse.com>
Cc: Raushaniya Maksudova <rmaksudova@parallels.com>
Cc: Michael S. Tsirkin <mst@redhat.com>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: Daniel Vetter <daniel.vetter@intel.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/oom.h | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'include')

diff --git a/include/linux/oom.h b/include/linux/oom.h
index 628a43242a34..83b9c39bd8b7 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -72,6 +72,14 @@ static inline bool oom_task_origin(const struct task_struct *p)
 
 extern void mark_oom_victim(struct task_struct *tsk);
 
+#ifdef CONFIG_MMU
+extern void try_oom_reaper(struct task_struct *tsk);
+#else
+static inline void try_oom_reaper(struct task_struct *tsk)
+{
+}
+#endif
+
 extern unsigned long oom_badness(struct task_struct *p,
 		struct mem_cgroup *memcg, const nodemask_t *nodemask,
 		unsigned long totalpages);
-- 
cgit v1.2.3


From 175145748d00794369317070dd19ce12dd816241 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@techsingularity.net>
Date: Thu, 19 May 2016 17:13:21 -0700
Subject: mm, page_alloc: use new PageAnonHead helper in the free page fast
 path

The PageAnon check always checks for compound_head but this is a
relatively expensive check if the caller already knows the page is a
head page.  This patch creates a helper and uses it in the page free
path which only operates on head pages.

With this patch and "Only check PageCompound for high-order pages", the
performance difference on a page allocator microbenchmark is;

                                             4.6.0-rc2                  4.6.0-rc2
                                               vanilla           nocompound-v1r20
  Min      alloc-odr0-1               425.00 (  0.00%)           417.00 (  1.88%)
  Min      alloc-odr0-2               313.00 (  0.00%)           308.00 (  1.60%)
  Min      alloc-odr0-4               257.00 (  0.00%)           253.00 (  1.56%)
  Min      alloc-odr0-8               224.00 (  0.00%)           221.00 (  1.34%)
  Min      alloc-odr0-16              208.00 (  0.00%)           205.00 (  1.44%)
  Min      alloc-odr0-32              199.00 (  0.00%)           199.00 (  0.00%)
  Min      alloc-odr0-64              195.00 (  0.00%)           193.00 (  1.03%)
  Min      alloc-odr0-128             192.00 (  0.00%)           191.00 (  0.52%)
  Min      alloc-odr0-256             204.00 (  0.00%)           200.00 (  1.96%)
  Min      alloc-odr0-512             213.00 (  0.00%)           212.00 (  0.47%)
  Min      alloc-odr0-1024            219.00 (  0.00%)           219.00 (  0.00%)
  Min      alloc-odr0-2048            225.00 (  0.00%)           225.00 (  0.00%)
  Min      alloc-odr0-4096            230.00 (  0.00%)           231.00 ( -0.43%)
  Min      alloc-odr0-8192            235.00 (  0.00%)           234.00 (  0.43%)
  Min      alloc-odr0-16384           235.00 (  0.00%)           234.00 (  0.43%)
  Min      free-odr0-1                215.00 (  0.00%)           191.00 ( 11.16%)
  Min      free-odr0-2                152.00 (  0.00%)           136.00 ( 10.53%)
  Min      free-odr0-4                119.00 (  0.00%)           107.00 ( 10.08%)
  Min      free-odr0-8                106.00 (  0.00%)            96.00 (  9.43%)
  Min      free-odr0-16                97.00 (  0.00%)            87.00 ( 10.31%)
  Min      free-odr0-32                91.00 (  0.00%)            83.00 (  8.79%)
  Min      free-odr0-64                89.00 (  0.00%)            81.00 (  8.99%)
  Min      free-odr0-128               88.00 (  0.00%)            80.00 (  9.09%)
  Min      free-odr0-256              106.00 (  0.00%)            95.00 ( 10.38%)
  Min      free-odr0-512              116.00 (  0.00%)           111.00 (  4.31%)
  Min      free-odr0-1024             125.00 (  0.00%)           118.00 (  5.60%)
  Min      free-odr0-2048             133.00 (  0.00%)           126.00 (  5.26%)
  Min      free-odr0-4096             136.00 (  0.00%)           130.00 (  4.41%)
  Min      free-odr0-8192             138.00 (  0.00%)           130.00 (  5.80%)
  Min      free-odr0-16384            137.00 (  0.00%)           130.00 (  5.11%)

There is a sizable boost to the free allocator performance.  While there
is an apparent boost on the allocation side, it's likely a co-incidence
or due to the patches slightly reducing cache footprint.

Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/page-flags.h | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 6b052aa7b5b7..a61e06e5fbce 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -371,10 +371,15 @@ PAGEFLAG(Idle, idle, PF_ANY)
 #define PAGE_MAPPING_KSM	2
 #define PAGE_MAPPING_FLAGS	(PAGE_MAPPING_ANON | PAGE_MAPPING_KSM)
 
+static __always_inline int PageAnonHead(struct page *page)
+{
+	return ((unsigned long)page->mapping & PAGE_MAPPING_ANON) != 0;
+}
+
 static __always_inline int PageAnon(struct page *page)
 {
 	page = compound_head(page);
-	return ((unsigned long)page->mapping & PAGE_MAPPING_ANON) != 0;
+	return PageAnonHead(page);
 }
 
 #ifdef CONFIG_KSM
-- 
cgit v1.2.3


From 060e74173f292fb3e0398b3dca8765568d195ff1 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@techsingularity.net>
Date: Thu, 19 May 2016 17:13:27 -0700
Subject: mm, page_alloc: inline zone_statistics

zone_statistics has one call-site but it's a public function.  Make it
static and inline.

The performance difference on a page allocator microbenchmark is;

                                             4.6.0-rc2                  4.6.0-rc2
                                      statbranch-v1r20           statinline-v1r20
  Min      alloc-odr0-1               419.00 (  0.00%)           412.00 (  1.67%)
  Min      alloc-odr0-2               305.00 (  0.00%)           301.00 (  1.31%)
  Min      alloc-odr0-4               250.00 (  0.00%)           247.00 (  1.20%)
  Min      alloc-odr0-8               219.00 (  0.00%)           215.00 (  1.83%)
  Min      alloc-odr0-16              203.00 (  0.00%)           199.00 (  1.97%)
  Min      alloc-odr0-32              195.00 (  0.00%)           191.00 (  2.05%)
  Min      alloc-odr0-64              191.00 (  0.00%)           187.00 (  2.09%)
  Min      alloc-odr0-128             189.00 (  0.00%)           185.00 (  2.12%)
  Min      alloc-odr0-256             198.00 (  0.00%)           193.00 (  2.53%)
  Min      alloc-odr0-512             210.00 (  0.00%)           207.00 (  1.43%)
  Min      alloc-odr0-1024            216.00 (  0.00%)           213.00 (  1.39%)
  Min      alloc-odr0-2048            221.00 (  0.00%)           220.00 (  0.45%)
  Min      alloc-odr0-4096            227.00 (  0.00%)           226.00 (  0.44%)
  Min      alloc-odr0-8192            232.00 (  0.00%)           229.00 (  1.29%)
  Min      alloc-odr0-16384           232.00 (  0.00%)           229.00 (  1.29%)

Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/vmstat.h | 2 --
 1 file changed, 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index 02fce415b3d9..d2da8e053210 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -163,12 +163,10 @@ static inline unsigned long zone_page_state_snapshot(struct zone *zone,
 #ifdef CONFIG_NUMA
 
 extern unsigned long node_page_state(int node, enum zone_stat_item item);
-extern void zone_statistics(struct zone *, struct zone *, gfp_t gfp);
 
 #else
 
 #define node_page_state(node, item) global_page_state(item)
-#define zone_statistics(_zl, _z, gfp) do { } while (0)
 
 #endif /* CONFIG_NUMA */
 
-- 
cgit v1.2.3


From 682a3385e7734fa3abbd504cbeb5fe91793f1827 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@techsingularity.net>
Date: Thu, 19 May 2016 17:13:30 -0700
Subject: mm, page_alloc: inline the fast path of the zonelist iterator

The page allocator iterates through a zonelist for zones that match the
addressing limitations and nodemask of the caller but many allocations
will not be restricted.  Despite this, there is always functional call
overhead which builds up.

This patch inlines the optimistic basic case and only calls the iterator
function for the complex case.  A hindrance was the fact that
cpuset_current_mems_allowed is used in the fastpath as the allowed
nodemask even though all nodes are allowed on most systems.  The patch
handles this by only considering cpuset_current_mems_allowed if a cpuset
exists.  As well as being faster in the fast-path, this removes some
junk in the slowpath.

The performance difference on a page allocator microbenchmark is;

                                             4.6.0-rc2                  4.6.0-rc2
                                      statinline-v1r20              optiter-v1r20
  Min      alloc-odr0-1               412.00 (  0.00%)           382.00 (  7.28%)
  Min      alloc-odr0-2               301.00 (  0.00%)           282.00 (  6.31%)
  Min      alloc-odr0-4               247.00 (  0.00%)           233.00 (  5.67%)
  Min      alloc-odr0-8               215.00 (  0.00%)           203.00 (  5.58%)
  Min      alloc-odr0-16              199.00 (  0.00%)           188.00 (  5.53%)
  Min      alloc-odr0-32              191.00 (  0.00%)           182.00 (  4.71%)
  Min      alloc-odr0-64              187.00 (  0.00%)           177.00 (  5.35%)
  Min      alloc-odr0-128             185.00 (  0.00%)           175.00 (  5.41%)
  Min      alloc-odr0-256             193.00 (  0.00%)           184.00 (  4.66%)
  Min      alloc-odr0-512             207.00 (  0.00%)           197.00 (  4.83%)
  Min      alloc-odr0-1024            213.00 (  0.00%)           203.00 (  4.69%)
  Min      alloc-odr0-2048            220.00 (  0.00%)           209.00 (  5.00%)
  Min      alloc-odr0-4096            226.00 (  0.00%)           214.00 (  5.31%)
  Min      alloc-odr0-8192            229.00 (  0.00%)           218.00 (  4.80%)
  Min      alloc-odr0-16384           229.00 (  0.00%)           219.00 (  4.37%)

perf indicated that next_zones_zonelist disappeared in the profile and
__next_zones_zonelist did not appear.  This is expected as the
micro-benchmark would hit the inlined fast-path every time.

Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mmzone.h | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

(limited to 'include')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 150c6049f961..cfcd7723edb6 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -919,6 +919,10 @@ static inline int zonelist_node_idx(struct zoneref *zoneref)
 #endif /* CONFIG_NUMA */
 }
 
+struct zoneref *__next_zones_zonelist(struct zoneref *z,
+					enum zone_type highest_zoneidx,
+					nodemask_t *nodes);
+
 /**
  * next_zones_zonelist - Returns the next zone at or below highest_zoneidx within the allowed nodemask using a cursor within a zonelist as a starting point
  * @z - The cursor used as a starting point for the search
@@ -931,9 +935,14 @@ static inline int zonelist_node_idx(struct zoneref *zoneref)
  * being examined. It should be advanced by one before calling
  * next_zones_zonelist again.
  */
-struct zoneref *next_zones_zonelist(struct zoneref *z,
+static __always_inline struct zoneref *next_zones_zonelist(struct zoneref *z,
 					enum zone_type highest_zoneidx,
-					nodemask_t *nodes);
+					nodemask_t *nodes)
+{
+	if (likely(!nodes && zonelist_zone_idx(z) <= highest_zoneidx))
+		return z;
+	return __next_zones_zonelist(z, highest_zoneidx, nodes);
+}
 
 /**
  * first_zones_zonelist - Returns the first zone at or below highest_zoneidx within the allowed nodemask in a zonelist
-- 
cgit v1.2.3


From c603844bdcb5238980de8d58b393f52d7729d651 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@techsingularity.net>
Date: Thu, 19 May 2016 17:13:38 -0700
Subject: mm, page_alloc: convert alloc_flags to unsigned

alloc_flags is a bitmask of flags but it is signed which does not
necessarily generate the best code depending on the compiler.  Even
without an impact, it makes more sense that this be unsigned.

Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/compaction.h | 6 +++---
 include/linux/mmzone.h     | 3 ++-
 2 files changed, 5 insertions(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/compaction.h b/include/linux/compaction.h
index d7c8de583a23..242b660f64e6 100644
--- a/include/linux/compaction.h
+++ b/include/linux/compaction.h
@@ -39,12 +39,12 @@ extern int sysctl_compact_unevictable_allowed;
 
 extern int fragmentation_index(struct zone *zone, unsigned int order);
 extern unsigned long try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
-			int alloc_flags, const struct alloc_context *ac,
-			enum migrate_mode mode, int *contended);
+		unsigned int alloc_flags, const struct alloc_context *ac,
+		enum migrate_mode mode, int *contended);
 extern void compact_pgdat(pg_data_t *pgdat, int order);
 extern void reset_isolation_suitable(pg_data_t *pgdat);
 extern unsigned long compaction_suitable(struct zone *zone, int order,
-					int alloc_flags, int classzone_idx);
+		unsigned int alloc_flags, int classzone_idx);
 
 extern void defer_compaction(struct zone *zone, int order);
 extern bool compaction_deferred(struct zone *zone, int order);
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index cfcd7723edb6..327f0fa1e1ce 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -747,7 +747,8 @@ extern struct mutex zonelists_mutex;
 void build_all_zonelists(pg_data_t *pgdat, struct zone *zone);
 void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx);
 bool zone_watermark_ok(struct zone *z, unsigned int order,
-		unsigned long mark, int classzone_idx, int alloc_flags);
+		unsigned long mark, int classzone_idx,
+		unsigned int alloc_flags);
 bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
 		unsigned long mark, int classzone_idx);
 enum memmap_context {
-- 
cgit v1.2.3


From 09940a4f1e816abe3248fa0d185fc0e7f54c8c12 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@techsingularity.net>
Date: Thu, 19 May 2016 17:13:53 -0700
Subject: mm, page_alloc: simplify last cpupid reset

The current reset unnecessarily clears flags and makes pointless
calculations.

Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mm.h | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'include')

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 9c2852cabf01..2b97be1147ec 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -850,10 +850,7 @@ extern int page_cpupid_xchg_last(struct page *page, int cpupid);
 
 static inline void page_cpupid_reset_last(struct page *page)
 {
-	int cpupid = (1 << LAST_CPUPID_SHIFT) - 1;
-
-	page->flags &= ~(LAST_CPUPID_MASK << LAST_CPUPID_PGSHIFT);
-	page->flags |= (cpupid & LAST_CPUPID_MASK) << LAST_CPUPID_PGSHIFT;
+	page->flags |= LAST_CPUPID_MASK << LAST_CPUPID_PGSHIFT;
 }
 #endif /* LAST_CPUPID_NOT_IN_PAGE_FLAGS */
 #else /* !CONFIG_NUMA_BALANCING */
-- 
cgit v1.2.3


From c33d6c06f60f710f0305ae792773e1c2560e1e51 Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@techsingularity.net>
Date: Thu, 19 May 2016 17:14:10 -0700
Subject: mm, page_alloc: avoid looking up the first zone in a zonelist twice

The allocator fast path looks up the first usable zone in a zonelist and
then get_page_from_freelist does the same job in the zonelist iterator.
This patch preserves the necessary information.

                                             4.6.0-rc2                  4.6.0-rc2
                                        fastmark-v1r20             initonce-v1r20
  Min      alloc-odr0-1               364.00 (  0.00%)           359.00 (  1.37%)
  Min      alloc-odr0-2               262.00 (  0.00%)           260.00 (  0.76%)
  Min      alloc-odr0-4               214.00 (  0.00%)           214.00 (  0.00%)
  Min      alloc-odr0-8               186.00 (  0.00%)           186.00 (  0.00%)
  Min      alloc-odr0-16              173.00 (  0.00%)           173.00 (  0.00%)
  Min      alloc-odr0-32              165.00 (  0.00%)           165.00 (  0.00%)
  Min      alloc-odr0-64              161.00 (  0.00%)           162.00 ( -0.62%)
  Min      alloc-odr0-128             159.00 (  0.00%)           161.00 ( -1.26%)
  Min      alloc-odr0-256             168.00 (  0.00%)           170.00 ( -1.19%)
  Min      alloc-odr0-512             180.00 (  0.00%)           181.00 ( -0.56%)
  Min      alloc-odr0-1024            190.00 (  0.00%)           190.00 (  0.00%)
  Min      alloc-odr0-2048            196.00 (  0.00%)           196.00 (  0.00%)
  Min      alloc-odr0-4096            202.00 (  0.00%)           202.00 (  0.00%)
  Min      alloc-odr0-8192            206.00 (  0.00%)           205.00 (  0.49%)
  Min      alloc-odr0-16384           206.00 (  0.00%)           205.00 (  0.49%)

The benefit is negligible and the results are within the noise but each
cycle counts.

Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mmzone.h | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

(limited to 'include')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 327f0fa1e1ce..4b28d2f8125e 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -959,13 +959,10 @@ static __always_inline struct zoneref *next_zones_zonelist(struct zoneref *z,
  */
 static inline struct zoneref *first_zones_zonelist(struct zonelist *zonelist,
 					enum zone_type highest_zoneidx,
-					nodemask_t *nodes,
-					struct zone **zone)
+					nodemask_t *nodes)
 {
-	struct zoneref *z = next_zones_zonelist(zonelist->_zonerefs,
+	return next_zones_zonelist(zonelist->_zonerefs,
 							highest_zoneidx, nodes);
-	*zone = zonelist_zone(z);
-	return z;
 }
 
 /**
@@ -980,10 +977,17 @@ static inline struct zoneref *first_zones_zonelist(struct zonelist *zonelist,
  * within a given nodemask
  */
 #define for_each_zone_zonelist_nodemask(zone, z, zlist, highidx, nodemask) \
-	for (z = first_zones_zonelist(zlist, highidx, nodemask, &zone);	\
+	for (z = first_zones_zonelist(zlist, highidx, nodemask), zone = zonelist_zone(z);	\
 		zone;							\
 		z = next_zones_zonelist(++z, highidx, nodemask),	\
-			zone = zonelist_zone(z))			\
+			zone = zonelist_zone(z))
+
+#define for_next_zone_zonelist_nodemask(zone, z, zlist, highidx, nodemask) \
+	for (zone = z->zone;	\
+		zone;							\
+		z = next_zones_zonelist(++z, highidx, nodemask),	\
+			zone = zonelist_zone(z))
+
 
 /**
  * for_each_zone_zonelist - helper macro to iterate over valid zones in a zonelist at or below a given zone index
-- 
cgit v1.2.3


From 0b423ca22f95a867f789aab1fe57ee4e378df43b Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@techsingularity.net>
Date: Thu, 19 May 2016 17:14:27 -0700
Subject: mm, page_alloc: inline pageblock lookup in page free fast paths

The function call overhead of get_pfnblock_flags_mask() is measurable in
the page free paths.  This patch uses an inlined version that is faster.

Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mmzone.h | 7 -------
 1 file changed, 7 deletions(-)

(limited to 'include')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 4b28d2f8125e..c60db2096fd8 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -85,13 +85,6 @@ extern int page_group_by_mobility_disabled;
 	get_pfnblock_flags_mask(page, page_to_pfn(page),		\
 			PB_migrate_end, MIGRATETYPE_MASK)
 
-static inline int get_pfnblock_migratetype(struct page *page, unsigned long pfn)
-{
-	BUILD_BUG_ON(PB_migrate_end - PB_migrate != 2);
-	return get_pfnblock_flags_mask(page, pfn, PB_migrate_end,
-					MIGRATETYPE_MASK);
-}
-
 struct free_area {
 	struct list_head	free_list[MIGRATE_TYPES];
 	unsigned long		nr_free;
-- 
cgit v1.2.3


From 002f290627c27068087f6204baec7a334e5a3b48 Mon Sep 17 00:00:00 2001
From: Vlastimil Babka <vbabka@suse.cz>
Date: Thu, 19 May 2016 17:14:30 -0700
Subject: cpuset: use static key better and convert to new API

An important function for cpusets is cpuset_node_allowed(), which
optimizes on the fact if there's a single root CPU set, it must be
trivially allowed.  But the check "nr_cpusets() <= 1" doesn't use the
cpusets_enabled_key static key the right way where static keys eliminate
branching overhead with jump labels.

This patch converts it so that static key is used properly.  It's also
switched to the new static key API and the checking functions are
converted to return bool instead of int.  We also provide a new variant
__cpuset_zone_allowed() which expects that the static key check was
already done and they key was enabled.  This is needed for
get_page_from_freelist() where we want to also avoid the relatively
slower check when ALLOC_CPUSET is not set in alloc_flags.

The impact on the page allocator microbenchmark is less than expected
but the cleanup in itself is worthwhile.

                                             4.6.0-rc2                  4.6.0-rc2
                                       multcheck-v1r20               cpuset-v1r20
  Min      alloc-odr0-1               348.00 (  0.00%)           348.00 (  0.00%)
  Min      alloc-odr0-2               254.00 (  0.00%)           254.00 (  0.00%)
  Min      alloc-odr0-4               213.00 (  0.00%)           213.00 (  0.00%)
  Min      alloc-odr0-8               186.00 (  0.00%)           183.00 (  1.61%)
  Min      alloc-odr0-16              173.00 (  0.00%)           171.00 (  1.16%)
  Min      alloc-odr0-32              166.00 (  0.00%)           163.00 (  1.81%)
  Min      alloc-odr0-64              162.00 (  0.00%)           159.00 (  1.85%)
  Min      alloc-odr0-128             160.00 (  0.00%)           157.00 (  1.88%)
  Min      alloc-odr0-256             169.00 (  0.00%)           166.00 (  1.78%)
  Min      alloc-odr0-512             180.00 (  0.00%)           180.00 (  0.00%)
  Min      alloc-odr0-1024            188.00 (  0.00%)           187.00 (  0.53%)
  Min      alloc-odr0-2048            194.00 (  0.00%)           193.00 (  0.52%)
  Min      alloc-odr0-4096            199.00 (  0.00%)           198.00 (  0.50%)
  Min      alloc-odr0-8192            202.00 (  0.00%)           201.00 (  0.50%)
  Min      alloc-odr0-16384           203.00 (  0.00%)           202.00 (  0.49%)

Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Acked-by: Zefan Li <lizefan@huawei.com>
Cc: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/cpuset.h | 42 ++++++++++++++++++++++++++++--------------
 1 file changed, 28 insertions(+), 14 deletions(-)

(limited to 'include')

diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index 85a868ccb493..bfc204e70338 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -16,26 +16,26 @@
 
 #ifdef CONFIG_CPUSETS
 
-extern struct static_key cpusets_enabled_key;
+extern struct static_key_false cpusets_enabled_key;
 static inline bool cpusets_enabled(void)
 {
-	return static_key_false(&cpusets_enabled_key);
+	return static_branch_unlikely(&cpusets_enabled_key);
 }
 
 static inline int nr_cpusets(void)
 {
 	/* jump label reference count + the top-level cpuset */
-	return static_key_count(&cpusets_enabled_key) + 1;
+	return static_key_count(&cpusets_enabled_key.key) + 1;
 }
 
 static inline void cpuset_inc(void)
 {
-	static_key_slow_inc(&cpusets_enabled_key);
+	static_branch_inc(&cpusets_enabled_key);
 }
 
 static inline void cpuset_dec(void)
 {
-	static_key_slow_dec(&cpusets_enabled_key);
+	static_branch_dec(&cpusets_enabled_key);
 }
 
 extern int cpuset_init(void);
@@ -48,16 +48,25 @@ extern nodemask_t cpuset_mems_allowed(struct task_struct *p);
 void cpuset_init_current_mems_allowed(void);
 int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask);
 
-extern int __cpuset_node_allowed(int node, gfp_t gfp_mask);
+extern bool __cpuset_node_allowed(int node, gfp_t gfp_mask);
 
-static inline int cpuset_node_allowed(int node, gfp_t gfp_mask)
+static inline bool cpuset_node_allowed(int node, gfp_t gfp_mask)
 {
-	return nr_cpusets() <= 1 || __cpuset_node_allowed(node, gfp_mask);
+	if (cpusets_enabled())
+		return __cpuset_node_allowed(node, gfp_mask);
+	return true;
 }
 
-static inline int cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
+static inline bool __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
 {
-	return cpuset_node_allowed(zone_to_nid(z), gfp_mask);
+	return __cpuset_node_allowed(zone_to_nid(z), gfp_mask);
+}
+
+static inline bool cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
+{
+	if (cpusets_enabled())
+		return __cpuset_zone_allowed(z, gfp_mask);
+	return true;
 }
 
 extern int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
@@ -172,14 +181,19 @@ static inline int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
 	return 1;
 }
 
-static inline int cpuset_node_allowed(int node, gfp_t gfp_mask)
+static inline bool cpuset_node_allowed(int node, gfp_t gfp_mask)
 {
-	return 1;
+	return true;
 }
 
-static inline int cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
+static inline bool __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
 {
-	return 1;
+	return true;
+}
+
+static inline bool cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
+{
+	return true;
 }
 
 static inline int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
-- 
cgit v1.2.3