From d50112edde1d0c621520e53747044009f11c656b Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Wed, 15 Nov 2017 17:32:18 -0800 Subject: slab, slub, slob: add slab_flags_t Add sparse-checked slab_flags_t for struct kmem_cache::flags (SLAB_POISON, etc). SLAB is bloated temporarily by switching to "unsigned long", but only temporarily. Link: http://lkml.kernel.org/r/20171021100225.GA22428@avx2 Signed-off-by: Alexey Dobriyan Acked-by: Pekka Enberg Cc: Christoph Lameter Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kasan.h | 4 ++-- include/linux/kmemleak.h | 8 +++---- include/linux/slab.h | 60 +++++++++++++++++++++++++++++------------------- include/linux/slab_def.h | 2 +- include/linux/slub_def.h | 2 +- include/linux/types.h | 1 + include/net/sock.h | 2 +- 7 files changed, 47 insertions(+), 32 deletions(-) (limited to 'include') diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 5017269e3f04..e3eb834c9a35 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -46,7 +46,7 @@ void kasan_alloc_pages(struct page *page, unsigned int order); void kasan_free_pages(struct page *page, unsigned int order); void kasan_cache_create(struct kmem_cache *cache, size_t *size, - unsigned long *flags); + slab_flags_t *flags); void kasan_cache_shrink(struct kmem_cache *cache); void kasan_cache_shutdown(struct kmem_cache *cache); @@ -95,7 +95,7 @@ static inline void kasan_free_pages(struct page *page, unsigned int order) {} static inline void kasan_cache_create(struct kmem_cache *cache, size_t *size, - unsigned long *flags) {} + slab_flags_t *flags) {} static inline void kasan_cache_shrink(struct kmem_cache *cache) {} static inline void kasan_cache_shutdown(struct kmem_cache *cache) {} diff --git a/include/linux/kmemleak.h b/include/linux/kmemleak.h index 590343f6c1b1..5ac416e2d339 100644 --- a/include/linux/kmemleak.h +++ b/include/linux/kmemleak.h @@ -48,14 +48,14 @@ extern void kmemleak_not_leak_phys(phys_addr_t phys) __ref; extern void kmemleak_ignore_phys(phys_addr_t phys) __ref; static inline void kmemleak_alloc_recursive(const void *ptr, size_t size, - int min_count, unsigned long flags, + int min_count, slab_flags_t flags, gfp_t gfp) { if (!(flags & SLAB_NOLEAKTRACE)) kmemleak_alloc(ptr, size, min_count, gfp); } -static inline void kmemleak_free_recursive(const void *ptr, unsigned long flags) +static inline void kmemleak_free_recursive(const void *ptr, slab_flags_t flags) { if (!(flags & SLAB_NOLEAKTRACE)) kmemleak_free(ptr); @@ -76,7 +76,7 @@ static inline void kmemleak_alloc(const void *ptr, size_t size, int min_count, { } static inline void kmemleak_alloc_recursive(const void *ptr, size_t size, - int min_count, unsigned long flags, + int min_count, slab_flags_t flags, gfp_t gfp) { } @@ -94,7 +94,7 @@ static inline void kmemleak_free(const void *ptr) static inline void kmemleak_free_part(const void *ptr, size_t size) { } -static inline void kmemleak_free_recursive(const void *ptr, unsigned long flags) +static inline void kmemleak_free_recursive(const void *ptr, slab_flags_t flags) { } static inline void kmemleak_free_percpu(const void __percpu *ptr) diff --git a/include/linux/slab.h b/include/linux/slab.h index af5aa65c7c18..0c4c579f52ed 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -21,13 +21,20 @@ * Flags to pass to kmem_cache_create(). * The ones marked DEBUG are only valid if CONFIG_DEBUG_SLAB is set. */ -#define SLAB_CONSISTENCY_CHECKS 0x00000100UL /* DEBUG: Perform (expensive) checks on alloc/free */ -#define SLAB_RED_ZONE 0x00000400UL /* DEBUG: Red zone objs in a cache */ -#define SLAB_POISON 0x00000800UL /* DEBUG: Poison objects */ -#define SLAB_HWCACHE_ALIGN 0x00002000UL /* Align objs on cache lines */ -#define SLAB_CACHE_DMA 0x00004000UL /* Use GFP_DMA memory */ -#define SLAB_STORE_USER 0x00010000UL /* DEBUG: Store the last owner for bug hunting */ -#define SLAB_PANIC 0x00040000UL /* Panic if kmem_cache_create() fails */ +/* DEBUG: Perform (expensive) checks on alloc/free */ +#define SLAB_CONSISTENCY_CHECKS ((slab_flags_t __force)0x00000100UL) +/* DEBUG: Red zone objs in a cache */ +#define SLAB_RED_ZONE ((slab_flags_t __force)0x00000400UL) +/* DEBUG: Poison objects */ +#define SLAB_POISON ((slab_flags_t __force)0x00000800UL) +/* Align objs on cache lines */ +#define SLAB_HWCACHE_ALIGN ((slab_flags_t __force)0x00002000UL) +/* Use GFP_DMA memory */ +#define SLAB_CACHE_DMA ((slab_flags_t __force)0x00004000UL) +/* DEBUG: Store the last owner for bug hunting */ +#define SLAB_STORE_USER ((slab_flags_t __force)0x00010000UL) +/* Panic if kmem_cache_create() fails */ +#define SLAB_PANIC ((slab_flags_t __force)0x00040000UL) /* * SLAB_TYPESAFE_BY_RCU - **WARNING** READ THIS! * @@ -65,44 +72,51 @@ * * Note that SLAB_TYPESAFE_BY_RCU was originally named SLAB_DESTROY_BY_RCU. */ -#define SLAB_TYPESAFE_BY_RCU 0x00080000UL /* Defer freeing slabs to RCU */ -#define SLAB_MEM_SPREAD 0x00100000UL /* Spread some memory over cpuset */ -#define SLAB_TRACE 0x00200000UL /* Trace allocations and frees */ +/* Defer freeing slabs to RCU */ +#define SLAB_TYPESAFE_BY_RCU ((slab_flags_t __force)0x00080000UL) +/* Spread some memory over cpuset */ +#define SLAB_MEM_SPREAD ((slab_flags_t __force)0x00100000UL) +/* Trace allocations and frees */ +#define SLAB_TRACE ((slab_flags_t __force)0x00200000UL) /* Flag to prevent checks on free */ #ifdef CONFIG_DEBUG_OBJECTS -# define SLAB_DEBUG_OBJECTS 0x00400000UL +# define SLAB_DEBUG_OBJECTS ((slab_flags_t __force)0x00400000UL) #else -# define SLAB_DEBUG_OBJECTS 0x00000000UL +# define SLAB_DEBUG_OBJECTS ((slab_flags_t __force)0x00000000UL) #endif -#define SLAB_NOLEAKTRACE 0x00800000UL /* Avoid kmemleak tracing */ +/* Avoid kmemleak tracing */ +#define SLAB_NOLEAKTRACE ((slab_flags_t __force)0x00800000UL) /* Don't track use of uninitialized memory */ #ifdef CONFIG_KMEMCHECK -# define SLAB_NOTRACK 0x01000000UL +# define SLAB_NOTRACK ((slab_flags_t __force)0x01000000UL) #else -# define SLAB_NOTRACK 0x00000000UL +# define SLAB_NOTRACK ((slab_flags_t __force)0x00000000UL) #endif +/* Fault injection mark */ #ifdef CONFIG_FAILSLAB -# define SLAB_FAILSLAB 0x02000000UL /* Fault injection mark */ +# define SLAB_FAILSLAB ((slab_flags_t __force)0x02000000UL) #else -# define SLAB_FAILSLAB 0x00000000UL +# define SLAB_FAILSLAB ((slab_flags_t __force)0x00000000UL) #endif +/* Account to memcg */ #if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB) -# define SLAB_ACCOUNT 0x04000000UL /* Account to memcg */ +# define SLAB_ACCOUNT ((slab_flags_t __force)0x04000000UL) #else -# define SLAB_ACCOUNT 0x00000000UL +# define SLAB_ACCOUNT ((slab_flags_t __force)0x00000000UL) #endif #ifdef CONFIG_KASAN -#define SLAB_KASAN 0x08000000UL +#define SLAB_KASAN ((slab_flags_t __force)0x08000000UL) #else -#define SLAB_KASAN 0x00000000UL +#define SLAB_KASAN ((slab_flags_t __force)0x00000000UL) #endif /* The following flags affect the page allocator grouping pages by mobility */ -#define SLAB_RECLAIM_ACCOUNT 0x00020000UL /* Objects are reclaimable */ +/* Objects are reclaimable */ +#define SLAB_RECLAIM_ACCOUNT ((slab_flags_t __force)0x00020000UL) #define SLAB_TEMPORARY SLAB_RECLAIM_ACCOUNT /* Objects are short-lived */ /* * ZERO_SIZE_PTR will be returned for zero sized kmalloc requests. @@ -128,7 +142,7 @@ void __init kmem_cache_init(void); bool slab_is_available(void); struct kmem_cache *kmem_cache_create(const char *, size_t, size_t, - unsigned long, + slab_flags_t, void (*)(void *)); void kmem_cache_destroy(struct kmem_cache *); int kmem_cache_shrink(struct kmem_cache *); diff --git a/include/linux/slab_def.h b/include/linux/slab_def.h index 8f7d2b1656d2..072e46e9e1d5 100644 --- a/include/linux/slab_def.h +++ b/include/linux/slab_def.h @@ -20,7 +20,7 @@ struct kmem_cache { struct reciprocal_value reciprocal_buffer_size; /* 2) touched by every alloc & free from the backend */ - unsigned int flags; /* constant flags */ + slab_flags_t flags; /* constant flags */ unsigned int num; /* # of objs per slab */ /* 3) cache_grow/shrink */ diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index 39fa09bcde23..0adae162dc8f 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -82,7 +82,7 @@ struct kmem_cache_order_objects { struct kmem_cache { struct kmem_cache_cpu __percpu *cpu_slab; /* Used for retriving partial slabs etc */ - unsigned long flags; + slab_flags_t flags; unsigned long min_partial; int size; /* The size of an object including meta data */ int object_size; /* The size of an object without meta data */ diff --git a/include/linux/types.h b/include/linux/types.h index 34fce54e4f1b..732b52c2eae4 100644 --- a/include/linux/types.h +++ b/include/linux/types.h @@ -156,6 +156,7 @@ typedef u32 dma_addr_t; #endif typedef unsigned __bitwise gfp_t; +typedef unsigned long __bitwise slab_flags_t; typedef unsigned __bitwise fmode_t; #ifdef CONFIG_PHYS_ADDR_T_64BIT diff --git a/include/net/sock.h b/include/net/sock.h index a6b9a8d1a6df..c577286dbffb 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1105,7 +1105,7 @@ struct proto { struct kmem_cache *slab; unsigned int obj_size; - int slab_flags; + slab_flags_t slab_flags; struct percpu_counter *orphan_count; -- cgit v1.2.3 From 4fd0b46e898791009b03b2fdd6510044fa8730a6 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Wed, 15 Nov 2017 17:32:21 -0800 Subject: slab, slub, slob: convert slab_flags_t to 32-bit struct kmem_cache::flags is "unsigned long" which is unnecessary on 64-bit as no flags are defined in the higher bits. Switch the field to 32-bit and save some space on x86_64 until such flags appear: add/remove: 0/0 grow/shrink: 0/107 up/down: 0/-657 (-657) function old new delta sysfs_slab_add 720 719 -1 ... check_object 699 676 -23 [akpm@linux-foundation.org: fix printk warning] Link: http://lkml.kernel.org/r/20171021100635.GA8287@avx2 Signed-off-by: Alexey Dobriyan Acked-by: Pekka Enberg Cc: Christoph Lameter Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slab.h | 44 ++++++++++++++++++++++---------------------- include/linux/types.h | 2 +- 2 files changed, 23 insertions(+), 23 deletions(-) (limited to 'include') diff --git a/include/linux/slab.h b/include/linux/slab.h index 0c4c579f52ed..f37cb93768ab 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -22,19 +22,19 @@ * The ones marked DEBUG are only valid if CONFIG_DEBUG_SLAB is set. */ /* DEBUG: Perform (expensive) checks on alloc/free */ -#define SLAB_CONSISTENCY_CHECKS ((slab_flags_t __force)0x00000100UL) +#define SLAB_CONSISTENCY_CHECKS ((slab_flags_t __force)0x00000100U) /* DEBUG: Red zone objs in a cache */ -#define SLAB_RED_ZONE ((slab_flags_t __force)0x00000400UL) +#define SLAB_RED_ZONE ((slab_flags_t __force)0x00000400U) /* DEBUG: Poison objects */ -#define SLAB_POISON ((slab_flags_t __force)0x00000800UL) +#define SLAB_POISON ((slab_flags_t __force)0x00000800U) /* Align objs on cache lines */ -#define SLAB_HWCACHE_ALIGN ((slab_flags_t __force)0x00002000UL) +#define SLAB_HWCACHE_ALIGN ((slab_flags_t __force)0x00002000U) /* Use GFP_DMA memory */ -#define SLAB_CACHE_DMA ((slab_flags_t __force)0x00004000UL) +#define SLAB_CACHE_DMA ((slab_flags_t __force)0x00004000U) /* DEBUG: Store the last owner for bug hunting */ -#define SLAB_STORE_USER ((slab_flags_t __force)0x00010000UL) +#define SLAB_STORE_USER ((slab_flags_t __force)0x00010000U) /* Panic if kmem_cache_create() fails */ -#define SLAB_PANIC ((slab_flags_t __force)0x00040000UL) +#define SLAB_PANIC ((slab_flags_t __force)0x00040000U) /* * SLAB_TYPESAFE_BY_RCU - **WARNING** READ THIS! * @@ -73,50 +73,50 @@ * Note that SLAB_TYPESAFE_BY_RCU was originally named SLAB_DESTROY_BY_RCU. */ /* Defer freeing slabs to RCU */ -#define SLAB_TYPESAFE_BY_RCU ((slab_flags_t __force)0x00080000UL) +#define SLAB_TYPESAFE_BY_RCU ((slab_flags_t __force)0x00080000U) /* Spread some memory over cpuset */ -#define SLAB_MEM_SPREAD ((slab_flags_t __force)0x00100000UL) +#define SLAB_MEM_SPREAD ((slab_flags_t __force)0x00100000U) /* Trace allocations and frees */ -#define SLAB_TRACE ((slab_flags_t __force)0x00200000UL) +#define SLAB_TRACE ((slab_flags_t __force)0x00200000U) /* Flag to prevent checks on free */ #ifdef CONFIG_DEBUG_OBJECTS -# define SLAB_DEBUG_OBJECTS ((slab_flags_t __force)0x00400000UL) +# define SLAB_DEBUG_OBJECTS ((slab_flags_t __force)0x00400000U) #else -# define SLAB_DEBUG_OBJECTS ((slab_flags_t __force)0x00000000UL) +# define SLAB_DEBUG_OBJECTS 0 #endif /* Avoid kmemleak tracing */ -#define SLAB_NOLEAKTRACE ((slab_flags_t __force)0x00800000UL) +#define SLAB_NOLEAKTRACE ((slab_flags_t __force)0x00800000U) /* Don't track use of uninitialized memory */ #ifdef CONFIG_KMEMCHECK -# define SLAB_NOTRACK ((slab_flags_t __force)0x01000000UL) +# define SLAB_NOTRACK ((slab_flags_t __force)0x01000000U) #else -# define SLAB_NOTRACK ((slab_flags_t __force)0x00000000UL) +# define SLAB_NOTRACK 0 #endif /* Fault injection mark */ #ifdef CONFIG_FAILSLAB -# define SLAB_FAILSLAB ((slab_flags_t __force)0x02000000UL) +# define SLAB_FAILSLAB ((slab_flags_t __force)0x02000000U) #else -# define SLAB_FAILSLAB ((slab_flags_t __force)0x00000000UL) +# define SLAB_FAILSLAB 0 #endif /* Account to memcg */ #if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB) -# define SLAB_ACCOUNT ((slab_flags_t __force)0x04000000UL) +# define SLAB_ACCOUNT ((slab_flags_t __force)0x04000000U) #else -# define SLAB_ACCOUNT ((slab_flags_t __force)0x00000000UL) +# define SLAB_ACCOUNT 0 #endif #ifdef CONFIG_KASAN -#define SLAB_KASAN ((slab_flags_t __force)0x08000000UL) +#define SLAB_KASAN ((slab_flags_t __force)0x08000000U) #else -#define SLAB_KASAN ((slab_flags_t __force)0x00000000UL) +#define SLAB_KASAN 0 #endif /* The following flags affect the page allocator grouping pages by mobility */ /* Objects are reclaimable */ -#define SLAB_RECLAIM_ACCOUNT ((slab_flags_t __force)0x00020000UL) +#define SLAB_RECLAIM_ACCOUNT ((slab_flags_t __force)0x00020000U) #define SLAB_TEMPORARY SLAB_RECLAIM_ACCOUNT /* Objects are short-lived */ /* * ZERO_SIZE_PTR will be returned for zero sized kmalloc requests. diff --git a/include/linux/types.h b/include/linux/types.h index 732b52c2eae4..c94d59ef96cc 100644 --- a/include/linux/types.h +++ b/include/linux/types.h @@ -156,7 +156,7 @@ typedef u32 dma_addr_t; #endif typedef unsigned __bitwise gfp_t; -typedef unsigned long __bitwise slab_flags_t; +typedef unsigned __bitwise slab_flags_t; typedef unsigned __bitwise fmode_t; #ifdef CONFIG_PHYS_ADDR_T_64BIT -- cgit v1.2.3 From 5799b255c491d853b73fb9e0e1760210315d06cd Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Wed, 15 Nov 2017 17:32:29 -0800 Subject: include/linux/slab.h: add kmalloc_array_node() and kcalloc_node() Patch series "Add kmalloc_array_node() and kcalloc_node()". Our current memeory allocation routines suffer form an API imbalance, for one we have kmalloc_array() and kcalloc() which check for overflows in size multiplication and we have kmalloc_node() and kzalloc_node() which allow for memory allocation on a certain NUMA node but don't check for eventual overflows. This patch (of 6): We have kmalloc_array() and kcalloc() wrappers on top of kmalloc() which ensure us overflow free multiplication for the size of a memory allocation but these implementations are not NUMA-aware. Likewise we have kmalloc_node() which is a NUMA-aware version of kmalloc() but the implementation is not aware of any possible overflows in eventual size calculations. Introduce a combination of the two above cases to have a NUMA-node aware version of kmalloc_array() and kcalloc(). Link: http://lkml.kernel.org/r/20170927082038.3782-2-jthumshirn@suse.de Signed-off-by: Johannes Thumshirn Acked-by: Vlastimil Babka Cc: Christoph Hellwig Cc: Christoph Lameter Cc: Damien Le Moal Cc: David Rientjes Cc: "David S. Miller" Cc: Doug Ledford Cc: Hal Rosenstock Cc: Jens Axboe Cc: Joonsoo Kim Cc: Mike Marciniszyn Cc: Pekka Enberg Cc: Santosh Shilimkar Cc: Sean Hefty Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slab.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'include') diff --git a/include/linux/slab.h b/include/linux/slab.h index f37cb93768ab..ebddfca9e24b 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -650,6 +650,22 @@ extern void *__kmalloc_track_caller(size_t, gfp_t, unsigned long); #define kmalloc_track_caller(size, flags) \ __kmalloc_track_caller(size, flags, _RET_IP_) +static inline void *kmalloc_array_node(size_t n, size_t size, gfp_t flags, + int node) +{ + if (size != 0 && n > SIZE_MAX / size) + return NULL; + if (__builtin_constant_p(n) && __builtin_constant_p(size)) + return kmalloc_node(n * size, flags, node); + return __kmalloc_node(n * size, flags, node); +} + +static inline void *kcalloc_node(size_t n, size_t size, gfp_t flags, int node) +{ + return kmalloc_array_node(n, size, flags | __GFP_ZERO, node); +} + + #ifdef CONFIG_NUMA extern void *__kmalloc_node_track_caller(size_t, gfp_t, int, unsigned long); #define kmalloc_node_track_caller(size, flags, node) \ -- cgit v1.2.3 From 41710443f790b5f7f5305eba99dacce88e259f4c Mon Sep 17 00:00:00 2001 From: Changbin Du Date: Wed, 15 Nov 2017 17:32:53 -0800 Subject: mm: update comments for struct page.mapping struct page.mapping can be NULL or points to one object of type address_space, anon_vma or KSM private structure. Link: http://lkml.kernel.org/r/1506485067-15954-1-git-send-email-changbin.du@intel.com Signed-off-by: Changbin Du Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index c85f11dafd56..d1b8e8f97fc2 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -48,8 +48,10 @@ struct page { * inode address_space, or NULL. * If page mapped as anonymous * memory, low bit is set, and - * it points to anon_vma object: - * see PAGE_MAPPING_ANON below. + * it points to anon_vma object + * or KSM private structure. See + * PAGE_MAPPING_ANON and + * PAGE_MAPPING_KSM. */ void *s_mem; /* slab first object */ atomic_t compound_mapcount; /* first tail page */ -- cgit v1.2.3 From 23c47d2ada9f96731492a67b28c0072715075baa Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Wed, 15 Nov 2017 17:33:00 -0800 Subject: bdi: introduce BDI_CAP_SYNCHRONOUS_IO As discussed at https://lkml.kernel.org/r/<20170728165604.10455-1-ross.zwisler@linux.intel.com> someday we will remove rw_page(). If so, we need something to detect such super-fast storage on which synchronous IO operations like the current rw_page are always a win. Introduces BDI_CAP_SYNCHRONOUS_IO to indicate such devices. With it, we could use various optimization techniques. Link: http://lkml.kernel.org/r/1505886205-9671-3-git-send-email-minchan@kernel.org Signed-off-by: Minchan Kim Cc: Christoph Hellwig Cc: Dan Williams Cc: Ross Zwisler Cc: Hugh Dickins Cc: Ilya Dryomov Cc: Jens Axboe Cc: Sergey Senozhatsky Cc: Huang Ying Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/backing-dev.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index f41ca8486e02..7360e79e9a2f 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -122,6 +122,8 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio); * BDI_CAP_STRICTLIMIT: Keep number of dirty pages below bdi threshold. * * BDI_CAP_CGROUP_WRITEBACK: Supports cgroup-aware writeback. + * BDI_CAP_SYNCHRONOUS_IO: Device is so fast that asynchronous IO would be + * inefficient. */ #define BDI_CAP_NO_ACCT_DIRTY 0x00000001 #define BDI_CAP_NO_WRITEBACK 0x00000002 @@ -129,6 +131,7 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio); #define BDI_CAP_STABLE_WRITES 0x00000008 #define BDI_CAP_STRICTLIMIT 0x00000010 #define BDI_CAP_CGROUP_WRITEBACK 0x00000020 +#define BDI_CAP_SYNCHRONOUS_IO 0x00000040 #define BDI_CAP_NO_ACCT_AND_WRITEBACK \ (BDI_CAP_NO_WRITEBACK | BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_ACCT_WB) @@ -174,6 +177,11 @@ static inline int wb_congested(struct bdi_writeback *wb, int cong_bits) long congestion_wait(int sync, long timeout); long wait_iff_congested(struct pglist_data *pgdat, int sync, long timeout); +static inline bool bdi_cap_synchronous_io(struct backing_dev_info *bdi) +{ + return bdi->capabilities & BDI_CAP_SYNCHRONOUS_IO; +} + static inline bool bdi_cap_stable_pages_required(struct backing_dev_info *bdi) { return bdi->capabilities & BDI_CAP_STABLE_WRITES; -- cgit v1.2.3 From 539a6fea7fdcade532bd3e77be2862a683f8f0c9 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Wed, 15 Nov 2017 17:33:04 -0800 Subject: mm, swap: introduce SWP_SYNCHRONOUS_IO If rw-page based fast storage is used for swap devices, we need to detect it to enhance swap IO operations. This patch is preparation for optimizing of swap-in operation with next patch. Link: http://lkml.kernel.org/r/1505886205-9671-4-git-send-email-minchan@kernel.org Signed-off-by: Minchan Kim Cc: Hugh Dickins Cc: Christoph Hellwig Cc: Dan Williams Cc: Ilya Dryomov Cc: Jens Axboe Cc: Ross Zwisler Cc: Sergey Senozhatsky Cc: Huang Ying Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/swap.h b/include/linux/swap.h index f02fb5db8914..933d7c0c3542 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -171,8 +171,9 @@ enum { SWP_AREA_DISCARD = (1 << 8), /* single-time swap area discards */ SWP_PAGE_DISCARD = (1 << 9), /* freed swap page-cluster discards */ SWP_STABLE_WRITES = (1 << 10), /* no overwrite PG_writeback pages */ + SWP_SYNCHRONOUS_IO = (1 << 11), /* synchronous IO is efficient */ /* add others here before... */ - SWP_SCANNING = (1 << 11), /* refcount in scan_swap_map */ + SWP_SCANNING = (1 << 12), /* refcount in scan_swap_map */ }; #define SWAP_CLUSTER_MAX 32UL -- cgit v1.2.3 From 0bcac06f27d7528591c27ac2b093ccd71c5d0168 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Wed, 15 Nov 2017 17:33:07 -0800 Subject: mm, swap: skip swapcache for swapin of synchronous device With fast swap storage, the platforms want to use swap more aggressively and swap-in is crucial to application latency. The rw_page() based synchronous devices like zram, pmem and btt are such fast storage. When I profile swapin performance with zram lz4 decompress test, S/W overhead is more than 70%. Maybe, it would be bigger in nvdimm. This patch aims to reduce swap-in latency by skipping swapcache if the swap device is synchronous device like rw_page based device. It enhances 45% my swapin test(5G sequential swapin, no readahead, from 2.41sec to 1.64sec). Link: http://lkml.kernel.org/r/1505886205-9671-5-git-send-email-minchan@kernel.org Signed-off-by: Minchan Kim Cc: Dan Williams Cc: Ross Zwisler Cc: Hugh Dickins Cc: Christoph Hellwig Cc: Ilya Dryomov Cc: Jens Axboe Cc: Sergey Senozhatsky Cc: Huang Ying Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include') diff --git a/include/linux/swap.h b/include/linux/swap.h index 933d7c0c3542..32c06f028c7b 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -466,6 +466,7 @@ extern int page_swapcount(struct page *); extern int __swp_swapcount(swp_entry_t entry); extern int swp_swapcount(swp_entry_t entry); extern struct swap_info_struct *page_swap_info(struct page *); +extern struct swap_info_struct *swp_swap_info(swp_entry_t entry); extern bool reuse_swap_page(struct page *, int *); extern int try_to_free_swap(struct page *); struct backing_dev_info; @@ -474,6 +475,16 @@ extern void exit_swap_address_space(unsigned int type); #else /* CONFIG_SWAP */ +static inline int swap_readpage(struct page *page, bool do_poll) +{ + return 0; +} + +static inline struct swap_info_struct *swp_swap_info(swp_entry_t entry) +{ + return NULL; +} + #define swap_address_space(entry) (NULL) #define get_nr_swap_pages() 0L #define total_swap_pages 0L -- cgit v1.2.3 From aa8d22a11da933dbf880b4933b58931f4aefe91c Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Wed, 15 Nov 2017 17:33:11 -0800 Subject: mm: swap: SWP_SYNCHRONOUS_IO: skip swapcache only if swapped page has no other reference When SWP_SYNCHRONOUS_IO swapped-in pages are shared by several processes, it can cause unnecessary memory wastage by skipping swap cache. Because, with swapin fault by read, they could share a page if the page were in swap cache. Thus, it avoids allocating same content new pages. This patch makes the swapcache skipping work only if the swap pte is non-sharable. [akpm@linux-foundation.org: coding-style fixes] Link: http://lkml.kernel.org/r/1507620825-5537-1-git-send-email-minchan@kernel.org Signed-off-by: Minchan Kim Cc: Dan Williams Cc: Ross Zwisler Cc: Hugh Dickins Cc: Christoph Hellwig Cc: Ilya Dryomov Cc: Jens Axboe Cc: Sergey Senozhatsky Cc: Huang Ying Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/swap.h b/include/linux/swap.h index 32c06f028c7b..8b8a6f965785 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -463,6 +463,7 @@ extern unsigned int count_swap_pages(int, int); extern sector_t map_swap_page(struct page *, struct block_device **); extern sector_t swapdev_block(int, pgoff_t); extern int page_swapcount(struct page *); +extern int __swap_count(struct swap_info_struct *si, swp_entry_t entry); extern int __swp_swapcount(swp_entry_t entry); extern int swp_swapcount(swp_entry_t entry); extern struct swap_info_struct *page_swap_info(struct page *); @@ -589,6 +590,11 @@ static inline int page_swapcount(struct page *page) return 0; } +static inline int __swap_count(struct swap_info_struct *si, swp_entry_t entry) +{ + return 0; +} + static inline int __swp_swapcount(swp_entry_t entry) { return 0; -- cgit v1.2.3 From 4da2ce250f986060750fcc5b29112914e31803ba Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 15 Nov 2017 17:33:26 -0800 Subject: mm: distinguish CMA and MOVABLE isolation in has_unmovable_pages() Joonsoo has noticed that "mm: drop migrate type checks from has_unmovable_pages" would break CMA allocator because it relies on has_unmovable_pages returning false even for CMA pageblocks which in fact don't have to be movable: alloc_contig_range start_isolate_page_range set_migratetype_isolate has_unmovable_pages This is a result of the code sharing between CMA and memory hotplug while each one has a different idea of what has_unmovable_pages should return. This is unfortunate but fixing it properly would require a lot of code duplication. Fix the issue by introducing the requested migrate type argument and special case MIGRATE_CMA case where CMA page blocks are handled properly. This will work for memory hotplug because it requires MIGRATE_MOVABLE. Link: http://lkml.kernel.org/r/20171019122118.y6cndierwl2vnguj@dhcp22.suse.cz Signed-off-by: Michal Hocko Reported-by: Joonsoo Kim Tested-by: Stefan Wahren Tested-by: Ran Wang Cc: Michael Ellerman Cc: Vlastimil Babka Cc: Igor Mammedov Cc: KAMEZAWA Hiroyuki Cc: Reza Arbab Cc: Vitaly Kuznetsov Cc: Xishi Qiu Cc: Yasuaki Ishimatsu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page-isolation.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/page-isolation.h b/include/linux/page-isolation.h index 05a04e603686..cdad58bbfd8b 100644 --- a/include/linux/page-isolation.h +++ b/include/linux/page-isolation.h @@ -31,7 +31,7 @@ static inline bool is_migrate_isolate(int migratetype) #endif bool has_unmovable_pages(struct zone *zone, struct page *page, int count, - bool skip_hwpoisoned_pages); + int migratetype, bool skip_hwpoisoned_pages); void set_pageblock_migratetype(struct page *page, int migratetype); int move_freepages_block(struct zone *zone, struct page *page, int migratetype, int *num_movable); -- cgit v1.2.3 From 66e8b438bd5c75498cfe915c4219049eaebcb869 Mon Sep 17 00:00:00 2001 From: Gioh Kim Date: Wed, 15 Nov 2017 17:33:42 -0800 Subject: mm/memblock.c: make the index explicit argument of for_each_memblock_type for_each_memblock_type macro function relies on idx variable defined in the caller context. Silent macro arguments are almost always wrong thing to do. They make code harder to read and easier to get wrong. Let's use an explicit iterator parameter for for_each_memblock_type and make the code more obious. This patch is a mere cleanup and it shouldn't introduce any functional change. Link: http://lkml.kernel.org/r/20170913133029.28911-1-gi-oh.kim@profitbricks.com Signed-off-by: Gioh Kim Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memblock.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/memblock.h b/include/linux/memblock.h index bae11c7e7bf3..ce0e5634c2f9 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -389,10 +389,10 @@ static inline unsigned long memblock_region_reserved_end_pfn(const struct memblo region < (memblock.memblock_type.regions + memblock.memblock_type.cnt); \ region++) -#define for_each_memblock_type(memblock_type, rgn) \ - for (idx = 0, rgn = &memblock_type->regions[0]; \ - idx < memblock_type->cnt; \ - idx++, rgn = &memblock_type->regions[idx]) +#define for_each_memblock_type(i, memblock_type, rgn) \ + for (i = 0, rgn = &memblock_type->regions[0]; \ + i < memblock_type->cnt; \ + i++, rgn = &memblock_type->regions[i]) #ifdef CONFIG_MEMTEST extern void early_memtest(phys_addr_t start, phys_addr_t end); -- cgit v1.2.3 From 0bea803e9e6bf3836d5368a6426c30a8c0e5eab5 Mon Sep 17 00:00:00 2001 From: Ralph Campbell Date: Wed, 15 Nov 2017 17:34:00 -0800 Subject: mm/hmm: constify hmm_devmem_page_get_drvdata() parameter MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Constify pointer parameter to avoid issue when use from code that only has const struct page pointer to use in the first place. Link: http://lkml.kernel.org/r/1506972774-10191-1-git-send-email-jglisse@redhat.com Signed-off-by: Ralph Campbell Signed-off-by: Jérôme Glisse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hmm.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/hmm.h b/include/linux/hmm.h index 96e69979f84d..325017ad9311 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -471,9 +471,9 @@ static inline void hmm_devmem_page_set_drvdata(struct page *page, * @page: pointer to struct page * Return: driver data value */ -static inline unsigned long hmm_devmem_page_get_drvdata(struct page *page) +static inline unsigned long hmm_devmem_page_get_drvdata(const struct page *page) { - unsigned long *drvdata = (unsigned long *)&page->pgmap; + const unsigned long *drvdata = (const unsigned long *)&page->pgmap; return drvdata[1]; } -- cgit v1.2.3 From 0f10851ea475e08896ee5d9a2036d1bb46a8f3a4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Glisse?= Date: Wed, 15 Nov 2017 17:34:07 -0800 Subject: mm/mmu_notifier: avoid double notification when it is useless MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch only affects users of mmu_notifier->invalidate_range callback which are device drivers related to ATS/PASID, CAPI, IOMMUv2, SVM ... and it is an optimization for those users. Everyone else is unaffected by it. When clearing a pte/pmd we are given a choice to notify the event under the page table lock (notify version of *_clear_flush helpers do call the mmu_notifier_invalidate_range). But that notification is not necessary in all cases. This patch removes almost all cases where it is useless to have a call to mmu_notifier_invalidate_range before mmu_notifier_invalidate_range_end. It also adds documentation in all those cases explaining why. Below is a more in depth analysis of why this is fine to do this: For secondary TLB (non CPU TLB) like IOMMU TLB or device TLB (when device use thing like ATS/PASID to get the IOMMU to walk the CPU page table to access a process virtual address space). There is only 2 cases when you need to notify those secondary TLB while holding page table lock when clearing a pte/pmd: A) page backing address is free before mmu_notifier_invalidate_range_end B) a page table entry is updated to point to a new page (COW, write fault on zero page, __replace_page(), ...) Case A is obvious you do not want to take the risk for the device to write to a page that might now be used by something completely different. Case B is more subtle. For correctness it requires the following sequence to happen: - take page table lock - clear page table entry and notify (pmd/pte_huge_clear_flush_notify()) - set page table entry to point to new page If clearing the page table entry is not followed by a notify before setting the new pte/pmd value then you can break memory model like C11 or C++11 for the device. Consider the following scenario (device use a feature similar to ATS/ PASID): Two address addrA and addrB such that |addrA - addrB| >= PAGE_SIZE we assume they are write protected for COW (other case of B apply too). [Time N] ----------------------------------------------------------------- CPU-thread-0 {try to write to addrA} CPU-thread-1 {try to write to addrB} CPU-thread-2 {} CPU-thread-3 {} DEV-thread-0 {read addrA and populate device TLB} DEV-thread-2 {read addrB and populate device TLB} [Time N+1] --------------------------------------------------------------- CPU-thread-0 {COW_step0: {mmu_notifier_invalidate_range_start(addrA)}} CPU-thread-1 {COW_step0: {mmu_notifier_invalidate_range_start(addrB)}} CPU-thread-2 {} CPU-thread-3 {} DEV-thread-0 {} DEV-thread-2 {} [Time N+2] --------------------------------------------------------------- CPU-thread-0 {COW_step1: {update page table point to new page for addrA}} CPU-thread-1 {COW_step1: {update page table point to new page for addrB}} CPU-thread-2 {} CPU-thread-3 {} DEV-thread-0 {} DEV-thread-2 {} [Time N+3] --------------------------------------------------------------- CPU-thread-0 {preempted} CPU-thread-1 {preempted} CPU-thread-2 {write to addrA which is a write to new page} CPU-thread-3 {} DEV-thread-0 {} DEV-thread-2 {} [Time N+3] --------------------------------------------------------------- CPU-thread-0 {preempted} CPU-thread-1 {preempted} CPU-thread-2 {} CPU-thread-3 {write to addrB which is a write to new page} DEV-thread-0 {} DEV-thread-2 {} [Time N+4] --------------------------------------------------------------- CPU-thread-0 {preempted} CPU-thread-1 {COW_step3: {mmu_notifier_invalidate_range_end(addrB)}} CPU-thread-2 {} CPU-thread-3 {} DEV-thread-0 {} DEV-thread-2 {} [Time N+5] --------------------------------------------------------------- CPU-thread-0 {preempted} CPU-thread-1 {} CPU-thread-2 {} CPU-thread-3 {} DEV-thread-0 {read addrA from old page} DEV-thread-2 {read addrB from new page} So here because at time N+2 the clear page table entry was not pair with a notification to invalidate the secondary TLB, the device see the new value for addrB before seing the new value for addrA. This break total memory ordering for the device. When changing a pte to write protect or to point to a new write protected page with same content (KSM) it is ok to delay invalidate_range callback to mmu_notifier_invalidate_range_end() outside the page table lock. This is true even if the thread doing page table update is preempted right after releasing page table lock before calling mmu_notifier_invalidate_range_end Thanks to Andrea for thinking of a problematic scenario for COW. [jglisse@redhat.com: v2] Link: http://lkml.kernel.org/r/20171017031003.7481-2-jglisse@redhat.com Link: http://lkml.kernel.org/r/20170901173011.10745-1-jglisse@redhat.com Signed-off-by: Jérôme Glisse Cc: Andrea Arcangeli Cc: Nadav Amit Cc: Joerg Roedel Cc: Suravee Suthikulpanit Cc: David Woodhouse Cc: Alistair Popple Cc: Michael Ellerman Cc: Benjamin Herrenschmidt Cc: Stephen Rothwell Cc: Andrew Donnellan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmu_notifier.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index 2cf1c3c807f6..130831718e95 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -156,7 +156,8 @@ struct mmu_notifier_ops { * shared page-tables, it not necessary to implement the * invalidate_range_start()/end() notifiers, as * invalidate_range() alread catches the points in time when an - * external TLB range needs to be flushed. + * external TLB range needs to be flushed. For more in depth + * discussion on this see Documentation/vm/mmu_notifier.txt * * The invalidate_range() function is called under the ptl * spin-lock and not allowed to sleep. -- cgit v1.2.3 From 4645b9fe84bf4878f04c7959a75df7c3c2d1bbb9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Glisse?= Date: Wed, 15 Nov 2017 17:34:11 -0800 Subject: mm/mmu_notifier: avoid call to invalidate_range() in range_end() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is an optimization patch that only affect mmu_notifier users which rely on the invalidate_range() callback. This patch avoids calling that callback twice in a row from inside __mmu_notifier_invalidate_range_end Existing pattern (before this patch): mmu_notifier_invalidate_range_start() pte/pmd/pud_clear_flush_notify() mmu_notifier_invalidate_range() mmu_notifier_invalidate_range_end() mmu_notifier_invalidate_range() New pattern (after this patch): mmu_notifier_invalidate_range_start() pte/pmd/pud_clear_flush_notify() mmu_notifier_invalidate_range() mmu_notifier_invalidate_range_only_end() We call the invalidate_range callback after clearing the page table under the page table lock and we skip the call to invalidate_range inside the __mmu_notifier_invalidate_range_end() function. Idea from Andrea Arcangeli Link: http://lkml.kernel.org/r/20171017031003.7481-3-jglisse@redhat.com Signed-off-by: Jérôme Glisse Cc: Andrea Arcangeli Cc: Joerg Roedel Cc: Suravee Suthikulpanit Cc: David Woodhouse Cc: Alistair Popple Cc: Michael Ellerman Cc: Benjamin Herrenschmidt Cc: Stephen Rothwell Cc: Andrew Donnellan Cc: Nadav Amit Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmu_notifier.h | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index 130831718e95..b25dc9db19fc 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -214,7 +214,8 @@ extern void __mmu_notifier_change_pte(struct mm_struct *mm, extern void __mmu_notifier_invalidate_range_start(struct mm_struct *mm, unsigned long start, unsigned long end); extern void __mmu_notifier_invalidate_range_end(struct mm_struct *mm, - unsigned long start, unsigned long end); + unsigned long start, unsigned long end, + bool only_end); extern void __mmu_notifier_invalidate_range(struct mm_struct *mm, unsigned long start, unsigned long end); @@ -268,7 +269,14 @@ static inline void mmu_notifier_invalidate_range_end(struct mm_struct *mm, unsigned long start, unsigned long end) { if (mm_has_notifiers(mm)) - __mmu_notifier_invalidate_range_end(mm, start, end); + __mmu_notifier_invalidate_range_end(mm, start, end, false); +} + +static inline void mmu_notifier_invalidate_range_only_end(struct mm_struct *mm, + unsigned long start, unsigned long end) +{ + if (mm_has_notifiers(mm)) + __mmu_notifier_invalidate_range_end(mm, start, end, true); } static inline void mmu_notifier_invalidate_range(struct mm_struct *mm, @@ -439,6 +447,11 @@ static inline void mmu_notifier_invalidate_range_end(struct mm_struct *mm, { } +static inline void mmu_notifier_invalidate_range_only_end(struct mm_struct *mm, + unsigned long start, unsigned long end) +{ +} + static inline void mmu_notifier_invalidate_range(struct mm_struct *mm, unsigned long start, unsigned long end) { -- cgit v1.2.3 From 3a50d14d0df5776e002a8683a290c87eeac93a21 Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Wed, 15 Nov 2017 17:34:15 -0800 Subject: mm: remove unused pgdat->inactive_ratio Since commit 59dc76b0d4df ("mm: vmscan: reduce size of inactive file list") 'pgdat->inactive_ratio' is not used, except for printing "node_inactive_ratio: 0" in /proc/zoneinfo output. Remove it. Link: http://lkml.kernel.org/r/20171003152611.27483-1-aryabinin@virtuozzo.com Signed-off-by: Andrey Ryabinin Reviewed-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index a507f43ad221..39ccaa9c428b 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -712,12 +712,6 @@ typedef struct pglist_data { /* Fields commonly accessed by the page reclaim scanner */ struct lruvec lruvec; - /* - * The target ratio of ACTIVE_ANON to INACTIVE_ANON pages on - * this node's LRU. Maintained by the pageout code. - */ - unsigned int inactive_ratio; - unsigned long flags; ZONE_PADDING(_pad2_) -- cgit v1.2.3 From 8745808fda84c638e45cc860c8fb600bf4b0a2a6 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 15 Nov 2017 17:34:22 -0800 Subject: mm, arch: remove empty_bad_page* empty_bad_page() and empty_bad_pte_table() seem to be relics from old days which is not used by any code for a long time. I have tried to find when exactly but this is not really all that straightforward due to many code movements - traces disappear around 2.4 times. Anyway no code really references neither empty_bad_page nor empty_bad_pte_table. We only allocate the storage which is not used by anybody so remove them. Link: http://lkml.kernel.org/r/20171004150045.30755-1-mhocko@kernel.org Signed-off-by: Michal Hocko Acked-by: Ralf Baechle Acked-by: Ingo Molnar Cc: Yoshinori Sato Cc: David Howells Cc: Rich Felker Cc: Jeff Dike Cc: Richard Weinberger Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page-flags.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 584b14c774c1..3ec44e27aa9d 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -18,7 +18,7 @@ * Various page->flags bits: * * PG_reserved is set for special pages, which can never be swapped out. Some - * of them might not even exist (eg empty_bad_page)... + * of them might not even exist... * * The PG_private bitflag is set on pagecache pages if they contain filesystem * specific data (which is normally at page->private). It can be used by -- cgit v1.2.3 From 72b045aecdd856b083521f2a963705b4c2e59680 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 15 Nov 2017 17:34:33 -0800 Subject: mm: implement find_get_pages_range_tag() Patch series "Ranged pagevec tagged lookup", v3. In this series I provide a ranged variant of pagevec_lookup_tag() and use it in places where it makes sense. This series removes some common code and it also has a potential for speeding up some operations similarly as for pagevec_lookup_range() (but for now I can think of only artificial cases where this happens). This patch (of 16): Implement a variant of find_get_pages_tag() that stops iterating at given index. Lots of users of this function (through pagevec_lookup()) actually want a range lookup and all of them are currently open-coding this. Also create corresponding pagevec_lookup_range_tag() function. Link: http://lkml.kernel.org/r/20171009151359.31984-2-jack@suse.cz Signed-off-by: Jan Kara Reviewed-by: Daniel Jordan Cc: Bob Peterson Cc: Chao Yu Cc: David Howells Cc: David Sterba Cc: Ilya Dryomov Cc: Jaegeuk Kim Cc: Ryusuke Konishi Cc: Steve French Cc: "Theodore Ts'o" Cc: "Yan, Zheng" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pagemap.h | 12 ++++++++++-- include/linux/pagevec.h | 11 +++++++++-- 2 files changed, 19 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index e08b5339023c..956d6a80e126 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -366,8 +366,16 @@ static inline unsigned find_get_pages(struct address_space *mapping, } unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t start, unsigned int nr_pages, struct page **pages); -unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index, - int tag, unsigned int nr_pages, struct page **pages); +unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index, + pgoff_t end, int tag, unsigned int nr_pages, + struct page **pages); +static inline unsigned find_get_pages_tag(struct address_space *mapping, + pgoff_t *index, int tag, unsigned int nr_pages, + struct page **pages) +{ + return find_get_pages_range_tag(mapping, index, (pgoff_t)-1, tag, + nr_pages, pages); +} unsigned find_get_entries_tag(struct address_space *mapping, pgoff_t start, int tag, unsigned int nr_entries, struct page **entries, pgoff_t *indices); diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h index 2636c0c0f279..afc718f586f8 100644 --- a/include/linux/pagevec.h +++ b/include/linux/pagevec.h @@ -38,9 +38,16 @@ static inline unsigned pagevec_lookup(struct pagevec *pvec, return pagevec_lookup_range(pvec, mapping, start, (pgoff_t)-1); } -unsigned pagevec_lookup_tag(struct pagevec *pvec, +unsigned pagevec_lookup_range_tag(struct pagevec *pvec, + struct address_space *mapping, pgoff_t *index, pgoff_t end, + int tag, unsigned nr_pages); +static inline unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping, pgoff_t *index, int tag, - unsigned nr_pages); + unsigned nr_pages) +{ + return pagevec_lookup_range_tag(pvec, mapping, index, (pgoff_t)-1, tag, + nr_pages); +} static inline void pagevec_init(struct pagevec *pvec, int cold) { -- cgit v1.2.3 From 93d3b7140ad379885849ad2674b4290c9e8273da Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 15 Nov 2017 17:35:12 -0800 Subject: mm: add variant of pagevec_lookup_range_tag() taking number of pages Currently pagevec_lookup_range_tag() takes number of pages to look up but most users don't need this. Create a new function pagevec_lookup_range_nr_tag() that takes maximum number of pages to lookup for Ceph which wants this functionality so that we can drop nr_pages argument from pagevec_lookup_range_tag(). Link: http://lkml.kernel.org/r/20171009151359.31984-13-jack@suse.cz Signed-off-by: Jan Kara Reviewed-by: Daniel Jordan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pagevec.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h index afc718f586f8..87a2dfd62f5e 100644 --- a/include/linux/pagevec.h +++ b/include/linux/pagevec.h @@ -41,6 +41,9 @@ static inline unsigned pagevec_lookup(struct pagevec *pvec, unsigned pagevec_lookup_range_tag(struct pagevec *pvec, struct address_space *mapping, pgoff_t *index, pgoff_t end, int tag, unsigned nr_pages); +unsigned pagevec_lookup_range_nr_tag(struct pagevec *pvec, + struct address_space *mapping, pgoff_t *index, pgoff_t end, + int tag, unsigned max_pages); static inline unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping, pgoff_t *index, int tag, unsigned nr_pages) -- cgit v1.2.3 From 67fd707f468142d0f689a6240044bb45c1913003 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 15 Nov 2017 17:35:19 -0800 Subject: mm: remove nr_pages argument from pagevec_lookup_{,range}_tag() All users of pagevec_lookup() and pagevec_lookup_range() now pass PAGEVEC_SIZE as a desired number of pages. Just drop the argument. Link: http://lkml.kernel.org/r/20171009151359.31984-15-jack@suse.cz Signed-off-by: Jan Kara Reviewed-by: Daniel Jordan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pagevec.h | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h index 87a2dfd62f5e..4f56e0ad9d00 100644 --- a/include/linux/pagevec.h +++ b/include/linux/pagevec.h @@ -40,16 +40,14 @@ static inline unsigned pagevec_lookup(struct pagevec *pvec, unsigned pagevec_lookup_range_tag(struct pagevec *pvec, struct address_space *mapping, pgoff_t *index, pgoff_t end, - int tag, unsigned nr_pages); + int tag); unsigned pagevec_lookup_range_nr_tag(struct pagevec *pvec, struct address_space *mapping, pgoff_t *index, pgoff_t end, int tag, unsigned max_pages); static inline unsigned pagevec_lookup_tag(struct pagevec *pvec, - struct address_space *mapping, pgoff_t *index, int tag, - unsigned nr_pages) + struct address_space *mapping, pgoff_t *index, int tag) { - return pagevec_lookup_range_tag(pvec, mapping, index, (pgoff_t)-1, tag, - nr_pages); + return pagevec_lookup_range_tag(pvec, mapping, index, (pgoff_t)-1, tag); } static inline void pagevec_init(struct pagevec *pvec, int cold) -- cgit v1.2.3 From b4e98d9ac775907cc53fb08fcb6776deb7694e30 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 15 Nov 2017 17:35:33 -0800 Subject: mm: account pud page tables On a machine with 5-level paging support a process can allocate significant amount of memory and stay unnoticed by oom-killer and memory cgroup. The trick is to allocate a lot of PUD page tables. We don't account PUD page tables, only PMD and PTE. We already addressed the same issue for PMD page tables, see commit dc6c9a35b66b ("mm: account pmd page tables to the process"). Introduction of 5-level paging brings the same issue for PUD page tables. The patch expands accounting to PUD level. [kirill.shutemov@linux.intel.com: s/pmd_t/pud_t/] Link: http://lkml.kernel.org/r/20171004074305.x35eh5u7ybbt5kar@black.fi.intel.com [heiko.carstens@de.ibm.com: s390/mm: fix pud table accounting] Link: http://lkml.kernel.org/r/20171103090551.18231-1-heiko.carstens@de.ibm.com Link: http://lkml.kernel.org/r/20171002080427.3320-1-kirill.shutemov@linux.intel.com Signed-off-by: Kirill A. Shutemov Signed-off-by: Heiko Carstens Acked-by: Rik van Riel Acked-by: Michal Hocko Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 36 +++++++++++++++++++++++++++++++++--- include/linux/mm_types.h | 3 +++ 2 files changed, 36 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 91b46f99b4d2..9af86f39d928 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1599,14 +1599,44 @@ static inline int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd, int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address); #endif -#ifdef __PAGETABLE_PUD_FOLDED +#if defined(__PAGETABLE_PUD_FOLDED) || !defined(CONFIG_MMU) static inline int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address) { return 0; } + +static inline unsigned long mm_nr_puds(const struct mm_struct *mm) +{ + return 0; +} + +static inline void mm_nr_puds_init(struct mm_struct *mm) {} +static inline void mm_inc_nr_puds(struct mm_struct *mm) {} +static inline void mm_dec_nr_puds(struct mm_struct *mm) {} + #else int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address); + +static inline void mm_nr_puds_init(struct mm_struct *mm) +{ + atomic_long_set(&mm->nr_puds, 0); +} + +static inline unsigned long mm_nr_puds(const struct mm_struct *mm) +{ + return atomic_long_read(&mm->nr_puds); +} + +static inline void mm_inc_nr_puds(struct mm_struct *mm) +{ + atomic_long_inc(&mm->nr_puds); +} + +static inline void mm_dec_nr_puds(struct mm_struct *mm) +{ + atomic_long_dec(&mm->nr_puds); +} #endif #if defined(__PAGETABLE_PMD_FOLDED) || !defined(CONFIG_MMU) @@ -1618,7 +1648,7 @@ static inline int __pmd_alloc(struct mm_struct *mm, pud_t *pud, static inline void mm_nr_pmds_init(struct mm_struct *mm) {} -static inline unsigned long mm_nr_pmds(struct mm_struct *mm) +static inline unsigned long mm_nr_pmds(const struct mm_struct *mm) { return 0; } @@ -1634,7 +1664,7 @@ static inline void mm_nr_pmds_init(struct mm_struct *mm) atomic_long_set(&mm->nr_pmds, 0); } -static inline unsigned long mm_nr_pmds(struct mm_struct *mm) +static inline unsigned long mm_nr_pmds(const struct mm_struct *mm) { return atomic_long_read(&mm->nr_pmds); } diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index d1b8e8f97fc2..e9e561e02d22 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -404,6 +404,9 @@ struct mm_struct { atomic_long_t nr_ptes; /* PTE page table pages */ #if CONFIG_PGTABLE_LEVELS > 2 atomic_long_t nr_pmds; /* PMD page table pages */ +#endif +#if CONFIG_PGTABLE_LEVELS > 3 + atomic_long_t nr_puds; /* PUD page table pages */ #endif int map_count; /* number of VMAs */ -- cgit v1.2.3 From c4812909f5d5a9b7f1c85a2d95be388a066cda52 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 15 Nov 2017 17:35:37 -0800 Subject: mm: introduce wrappers to access mm->nr_ptes Let's add wrappers for ->nr_ptes with the same interface as for nr_pmd and nr_pud. The patch also makes nr_ptes accounting dependent onto CONFIG_MMU. Page table accounting doesn't make sense if you don't have page tables. It's preparation for consolidation of page-table counters in mm_struct. Link: http://lkml.kernel.org/r/20171006100651.44742-1-kirill.shutemov@linux.intel.com Signed-off-by: Kirill A. Shutemov Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 32 ++++++++++++++++++++++++++++++++ include/linux/mm_types.h | 2 ++ 2 files changed, 34 insertions(+) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 9af86f39d928..2ca799f0d762 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1680,6 +1680,38 @@ static inline void mm_dec_nr_pmds(struct mm_struct *mm) } #endif +#ifdef CONFIG_MMU +static inline void mm_nr_ptes_init(struct mm_struct *mm) +{ + atomic_long_set(&mm->nr_ptes, 0); +} + +static inline unsigned long mm_nr_ptes(const struct mm_struct *mm) +{ + return atomic_long_read(&mm->nr_ptes); +} + +static inline void mm_inc_nr_ptes(struct mm_struct *mm) +{ + atomic_long_inc(&mm->nr_ptes); +} + +static inline void mm_dec_nr_ptes(struct mm_struct *mm) +{ + atomic_long_dec(&mm->nr_ptes); +} +#else +static inline void mm_nr_ptes_init(struct mm_struct *mm) {} + +static inline unsigned long mm_nr_ptes(const struct mm_struct *mm) +{ + return 0; +} + +static inline void mm_inc_nr_ptes(struct mm_struct *mm) {} +static inline void mm_dec_nr_ptes(struct mm_struct *mm) {} +#endif + int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address); int __pte_alloc_kernel(pmd_t *pmd, unsigned long address); diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index e9e561e02d22..e42048020664 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -401,7 +401,9 @@ struct mm_struct { */ atomic_t mm_count; +#ifdef CONFIG_MMU atomic_long_t nr_ptes; /* PTE page table pages */ +#endif #if CONFIG_PGTABLE_LEVELS > 2 atomic_long_t nr_pmds; /* PMD page table pages */ #endif -- cgit v1.2.3 From af5b0f6a09e42c9f4fa87735f2a366748767b686 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 15 Nov 2017 17:35:40 -0800 Subject: mm: consolidate page table accounting Currently, we account page tables separately for each page table level, but that's redundant -- we only make use of total memory allocated to page tables for oom_badness calculation. We also provide the information to userspace, but it has dubious value there too. This patch switches page table accounting to single counter. mm->pgtables_bytes is now used to account all page table levels. We use bytes, because page table size for different levels of page table tree may be different. The change has user-visible effect: we don't have VmPMD and VmPUD reported in /proc/[pid]/status. Not sure if anybody uses them. (As alternative, we can always report 0 kB for them.) OOM-killer report is also slightly changed: we now report pgtables_bytes instead of nr_ptes, nr_pmd, nr_puds. Apart from reducing number of counters per-mm, the benefit is that we now calculate oom_badness() more correctly for machines which have different size of page tables depending on level or where page tables are less than a page in size. The only downside can be debuggability because we do not know which page table level could leak. But I do not remember many bugs that would be caught by separate counters so I wouldn't lose sleep over this. [akpm@linux-foundation.org: fix mm/huge_memory.c] Link: http://lkml.kernel.org/r/20171006100651.44742-2-kirill.shutemov@linux.intel.com Signed-off-by: Kirill A. Shutemov Acked-by: Michal Hocko [kirill.shutemov@linux.intel.com: fix build] Link: http://lkml.kernel.org/r/20171016150113.ikfxy3e7zzfvsr4w@black.fi.intel.com Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 58 ++++++++++-------------------------------------- include/linux/mm_types.h | 8 +------ 2 files changed, 13 insertions(+), 53 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 2ca799f0d762..7c1e82a1aa77 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1605,37 +1605,20 @@ static inline int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, { return 0; } - -static inline unsigned long mm_nr_puds(const struct mm_struct *mm) -{ - return 0; -} - -static inline void mm_nr_puds_init(struct mm_struct *mm) {} static inline void mm_inc_nr_puds(struct mm_struct *mm) {} static inline void mm_dec_nr_puds(struct mm_struct *mm) {} #else int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address); -static inline void mm_nr_puds_init(struct mm_struct *mm) -{ - atomic_long_set(&mm->nr_puds, 0); -} - -static inline unsigned long mm_nr_puds(const struct mm_struct *mm) -{ - return atomic_long_read(&mm->nr_puds); -} - static inline void mm_inc_nr_puds(struct mm_struct *mm) { - atomic_long_inc(&mm->nr_puds); + atomic_long_add(PTRS_PER_PUD * sizeof(pud_t), &mm->pgtables_bytes); } static inline void mm_dec_nr_puds(struct mm_struct *mm) { - atomic_long_dec(&mm->nr_puds); + atomic_long_sub(PTRS_PER_PUD * sizeof(pud_t), &mm->pgtables_bytes); } #endif @@ -1646,64 +1629,47 @@ static inline int __pmd_alloc(struct mm_str