From 58cb65487e92b47448d00a711c9f5922137d5678 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 9 Oct 2014 15:25:54 -0700 Subject: proc/maps: make vm_is_stack() logic namespace-friendly - Rename vm_is_stack() to task_of_stack() and change it to return "struct task_struct *" rather than the global (and thus wrong in general) pid_t. - Add the new pid_of_stack() helper which calls task_of_stack() and uses the right namespace to report the correct pid_t. Unfortunately we need to define this helper twice, in task_mmu.c and in task_nommu.c. perhaps it makes sense to add fs/proc/util.c and move at least pid_of_stack/task_of_stack there to avoid the code duplication. - Change show_map_vma() and show_numa_map() to use the new helper. Signed-off-by: Oleg Nesterov Cc: Alexander Viro Cc: Cyrill Gorcunov Cc: "Eric W. Biederman" Cc: Greg Ungerer Cc: "Kirill A. Shutemov" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 0f4196a0bc20..28df70774b81 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1247,8 +1247,8 @@ static inline int stack_guard_page_end(struct vm_area_struct *vma, !vma_growsup(vma->vm_next, addr); } -extern pid_t -vm_is_stack(struct task_struct *task, struct vm_area_struct *vma, int in_group); +extern struct task_struct *task_of_stack(struct task_struct *task, + struct vm_area_struct *vma, bool in_group); extern unsigned long move_page_tables(struct vm_area_struct *vma, unsigned long old_addr, struct vm_area_struct *new_vma, -- cgit v1.2.3 From 07f361b2bee38896df8be17d8c3f8af3f3610606 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Thu, 9 Oct 2014 15:26:00 -0700 Subject: mm/slab_common: move kmem_cache definition to internal header We don't need to keep kmem_cache definition in include/linux/slab.h if we don't need to inline kmem_cache_size(). According to my code inspection, this function is only called at lc_create() in lib/lru_cache.c which may be called at initialization phase of something, so we don't need to inline it. Therfore, move it to slab_common.c and move kmem_cache definition to internal header. After this change, we can change kmem_cache definition easily without full kernel build. For instance, we can turn on/off CONFIG_SLUB_STATS without full kernel build. [akpm@linux-foundation.org: export kmem_cache_size() to modules] [rdunlap@infradead.org: add header files to fix kmemcheck.c build errors] Signed-off-by: Joonsoo Kim Acked-by: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Zhang Yanfei Signed-off-by: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slab.h | 42 +----------------------------------------- 1 file changed, 1 insertion(+), 41 deletions(-) (limited to 'include') diff --git a/include/linux/slab.h b/include/linux/slab.h index 1d9abb7d22a0..9062e4ad1787 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -158,31 +158,6 @@ size_t ksize(const void *); #define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long) #endif -#ifdef CONFIG_SLOB -/* - * Common fields provided in kmem_cache by all slab allocators - * This struct is either used directly by the allocator (SLOB) - * or the allocator must include definitions for all fields - * provided in kmem_cache_common in their definition of kmem_cache. - * - * Once we can do anonymous structs (C11 standard) we could put a - * anonymous struct definition in these allocators so that the - * separate allocations in the kmem_cache structure of SLAB and - * SLUB is no longer needed. - */ -struct kmem_cache { - unsigned int object_size;/* The original size of the object */ - unsigned int size; /* The aligned/padded/added on size */ - unsigned int align; /* Alignment as calculated */ - unsigned long flags; /* Active flags on the slab */ - const char *name; /* Slab name for sysfs */ - int refcount; /* Use counter */ - void (*ctor)(void *); /* Called on object slot creation */ - struct list_head list; /* List of all slab caches on the system */ -}; - -#endif /* CONFIG_SLOB */ - /* * Kmalloc array related definitions */ @@ -363,14 +338,6 @@ kmem_cache_alloc_node_trace(struct kmem_cache *s, } #endif /* CONFIG_TRACING */ -#ifdef CONFIG_SLAB -#include -#endif - -#ifdef CONFIG_SLUB -#include -#endif - extern void *kmalloc_order(size_t size, gfp_t flags, unsigned int order); #ifdef CONFIG_TRACING @@ -650,14 +617,7 @@ static inline void *kzalloc_node(size_t size, gfp_t flags, int node) return kmalloc_node(size, flags | __GFP_ZERO, node); } -/* - * Determine the size of a slab object - */ -static inline unsigned int kmem_cache_size(struct kmem_cache *s) -{ - return s->object_size; -} - +unsigned int kmem_cache_size(struct kmem_cache *s); void __init kmem_cache_init_late(void); #endif /* _LINUX_SLAB_H */ -- cgit v1.2.3 From 61f47105a2c9c60e950ca808b7560f776f9bfa31 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Thu, 9 Oct 2014 15:26:02 -0700 Subject: mm/sl[ao]b: always track caller in kmalloc_(node_)track_caller() Now, we track caller if tracing or slab debugging is enabled. If they are disabled, we could save one argument passing overhead by calling __kmalloc(_node)(). But, I think that it would be marginal. Furthermore, default slab allocator, SLUB, doesn't use this technique so I think that it's okay to change this situation. After this change, we can turn on/off CONFIG_DEBUG_SLAB without full kernel build and remove some complicated '#if' defintion. It looks more benefitial to me. Signed-off-by: Joonsoo Kim Acked-by: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Zhang Yanfei Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slab.h | 22 ---------------------- 1 file changed, 22 deletions(-) (limited to 'include') diff --git a/include/linux/slab.h b/include/linux/slab.h index 9062e4ad1787..c265bec6a57d 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -549,37 +549,15 @@ static inline void *kcalloc(size_t n, size_t size, gfp_t flags) * allocator where we care about the real place the memory allocation * request comes from. */ -#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_SLUB) || \ - (defined(CONFIG_SLAB) && defined(CONFIG_TRACING)) || \ - (defined(CONFIG_SLOB) && defined(CONFIG_TRACING)) extern void *__kmalloc_track_caller(size_t, gfp_t, unsigned long); #define kmalloc_track_caller(size, flags) \ __kmalloc_track_caller(size, flags, _RET_IP_) -#else -#define kmalloc_track_caller(size, flags) \ - __kmalloc(size, flags) -#endif /* DEBUG_SLAB */ #ifdef CONFIG_NUMA -/* - * kmalloc_node_track_caller is a special version of kmalloc_node that - * records the calling function of the routine calling it for slab leak - * tracking instead of just the calling function (confusing, eh?). - * It's useful when the call to kmalloc_node comes from a widely-used - * standard allocator where we care about the real place the memory - * allocation request comes from. - */ -#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_SLUB) || \ - (defined(CONFIG_SLAB) && defined(CONFIG_TRACING)) || \ - (defined(CONFIG_SLOB) && defined(CONFIG_TRACING)) extern void *__kmalloc_node_track_caller(size_t, gfp_t, int, unsigned long); #define kmalloc_node_track_caller(size, flags, node) \ __kmalloc_node_track_caller(size, flags, node, \ _RET_IP_) -#else -#define kmalloc_node_track_caller(size, flags, node) \ - __kmalloc_node(size, flags, node) -#endif #else /* CONFIG_NUMA */ -- cgit v1.2.3 From ad2c8144418c6a81cefe65379fd47bbe8344cef2 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Thu, 9 Oct 2014 15:26:13 -0700 Subject: topology: add support for node_to_mem_node() to determine the fallback node Anton noticed (http://www.spinics.net/lists/linux-mm/msg67489.html) that on ppc LPARs with memoryless nodes, a large amount of memory was consumed by slabs and was marked unreclaimable. He tracked it down to slab deactivations in the SLUB core when we allocate remotely, leading to poor efficiency always when memoryless nodes are present. After much discussion, Joonsoo provided a few patches that help significantly. They don't resolve the problem altogether: - memory hotplug still needs testing, that is when a memoryless node becomes memory-ful, we want to dtrt - there are other reasons for going off-node than memoryless nodes, e.g., fully exhausted local nodes Neither case is resolved with this series, but I don't think that should block their acceptance, as they can be explored/resolved with follow-on patches. The series consists of: [1/3] topology: add support for node_to_mem_node() to determine the fallback node [2/3] slub: fallback to node_to_mem_node() node if allocating on memoryless node - Joonsoo's patches to cache the nearest node with memory for each NUMA node [3/3] Partial revert of 81c98869faa5 (""kthread: ensure locality of task_struct allocations") - At Tejun's request, keep the knowledge of memoryless node fallback to the allocator core. This patch (of 3): We need to determine the fallback node in slub allocator if the allocation target node is memoryless node. Without it, the SLUB wrongly select the node which has no memory and can't use a partial slab, because of node mismatch. Introduced function, node_to_mem_node(X), will return a node Y with memory that has the nearest distance. If X is memoryless node, it will return nearest distance node, but, if X is normal node, it will return itself. We will use this function in following patch to determine the fallback node. Signed-off-by: Joonsoo Kim Signed-off-by: Nishanth Aravamudan Cc: David Rientjes Cc: Han Pingtian Cc: Pekka Enberg Cc: Paul Mackerras Cc: Benjamin Herrenschmidt Cc: Michael Ellerman Cc: Anton Blanchard Cc: Christoph Lameter Cc: Wanpeng Li Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/topology.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include') diff --git a/include/linux/topology.h b/include/linux/topology.h index dda6ee521e74..909b6e43b694 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h @@ -119,11 +119,20 @@ static inline int numa_node_id(void) * Use the accessor functions set_numa_mem(), numa_mem_id() and cpu_to_mem(). */ DECLARE_PER_CPU(int, _numa_mem_); +extern int _node_numa_mem_[MAX_NUMNODES]; #ifndef set_numa_mem static inline void set_numa_mem(int node) { this_cpu_write(_numa_mem_, node); + _node_numa_mem_[numa_node_id()] = node; +} +#endif + +#ifndef node_to_mem_node +static inline int node_to_mem_node(int node) +{ + return _node_numa_mem_[node]; } #endif @@ -146,6 +155,7 @@ static inline int cpu_to_mem(int cpu) static inline void set_cpu_numa_mem(int cpu, int node) { per_cpu(_numa_mem_, cpu) = node; + _node_numa_mem_[cpu_to_node(cpu)] = node; } #endif @@ -159,6 +169,13 @@ static inline int numa_mem_id(void) } #endif +#ifndef node_to_mem_node +static inline int node_to_mem_node(int node) +{ + return node; +} +#endif + #ifndef cpu_to_mem static inline int cpu_to_mem(int cpu) { -- cgit v1.2.3 From bf0dea23a9c094ae869a88bb694fbe966671bf6d Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Thu, 9 Oct 2014 15:26:27 -0700 Subject: mm/slab: use percpu allocator for cpu cache Because of chicken and egg problem, initialization of SLAB is really complicated. We need to allocate cpu cache through SLAB to make the kmem_cache work, but before initialization of kmem_cache, allocation through SLAB is impossible. On the other hand, SLUB does initialization in a more simple way. It uses percpu allocator to allocate cpu cache so there is no chicken and egg problem. So, this patch try to use percpu allocator in SLAB. This simplifies the initialization step in SLAB so that we could maintain SLAB code more easily. In my testing there is no performance difference. This implementation relies on percpu allocator. Because percpu allocator uses vmalloc address space, vmalloc address space could be exhausted by this change on many cpu system with *32 bit* kernel. This implementation can cover 1024 cpus in worst case by following calculation. Worst: 1024 cpus * 4 bytes for pointer * 300 kmem_caches * 120 objects per cpu_cache = 140 MB Normal: 1024 cpus * 4 bytes for pointer * 150 kmem_caches(slab merge) * 80 objects per cpu_cache = 46 MB Signed-off-by: Joonsoo Kim Acked-by: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Jeremiah Mahler Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slab_def.h | 20 +++----------------- 1 file changed, 3 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/include/linux/slab_def.h b/include/linux/slab_def.h index 8235dfbb3b05..b869d1662ba3 100644 --- a/include/linux/slab_def.h +++ b/include/linux/slab_def.h @@ -8,6 +8,8 @@ */ struct kmem_cache { + struct array_cache __percpu *cpu_cache; + /* 1) Cache tunables. Protected by slab_mutex */ unsigned int batchcount; unsigned int limit; @@ -71,23 +73,7 @@ struct kmem_cache { struct memcg_cache_params *memcg_params; #endif -/* 6) per-cpu/per-node data, touched during every alloc/free */ - /* - * We put array[] at the end of kmem_cache, because we want to size - * this array to nr_cpu_ids slots instead of NR_CPUS - * (see kmem_cache_init()) - * We still use [NR_CPUS] and not [1] or [0] because cache_cache - * is statically defined, so we reserve the max number of cpus. - * - * We also need to guarantee that the list is able to accomodate a - * pointer for each node since "nodelists" uses the remainder of - * available pointers. - */ - struct kmem_cache_node **node; - struct array_cache *array[NR_CPUS + MAX_NUMNODES]; - /* - * Do not add fields after array[] - */ + struct kmem_cache_node *node[MAX_NUMNODES]; }; #endif /* _LINUX_SLAB_DEF_H */ -- cgit v1.2.3 From ed2f240094f900833ac06f533ab8bbcf0a1e8199 Mon Sep 17 00:00:00 2001 From: Zhang Zhen Date: Thu, 9 Oct 2014 15:26:31 -0700 Subject: memory-hotplug: add sysfs valid_zones attribute Currently memory-hotplug has two limits: 1. If the memory block is in ZONE_NORMAL, you can change it to ZONE_MOVABLE, but this memory block must be adjacent to ZONE_MOVABLE. 2. If the memory block is in ZONE_MOVABLE, you can change it to ZONE_NORMAL, but this memory block must be adjacent to ZONE_NORMAL. With this patch, we can easy to know a memory block can be onlined to which zone, and don't need to know the above two limits. Updated the related Documentation. [akpm@linux-foundation.org: use conventional comment layout] [akpm@linux-foundation.org: fix build with CONFIG_MEMORY_HOTREMOVE=n] [akpm@linux-foundation.org: remove unused local zone_prev] Signed-off-by: Zhang Zhen Cc: Dave Hansen Cc: David Rientjes Cc: Toshi Kani Cc: Yasuaki Ishimatsu Cc: Naoya Horiguchi Cc: Wang Nan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memory_hotplug.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index d9524c49d767..8f1a41951df9 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -84,6 +84,7 @@ extern int zone_grow_waitqueues(struct zone *zone, unsigned long nr_pages); extern int add_one_highpage(struct page *page, int pfn, int bad_ppro); /* VM interface that may be used by firmware interface */ extern int online_pages(unsigned long, unsigned long, int); +extern int test_pages_in_a_zone(unsigned long, unsigned long); extern void __offline_isolated_pages(unsigned long, unsigned long); typedef void (*online_page_callback_t)(struct page *page); -- cgit v1.2.3 From 6a33979d5bd7521497121c5ae4435d7003115a0f Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 9 Oct 2014 15:26:33 -0700 Subject: mm: remove misleading ARCH_USES_NUMA_PROT_NONE ARCH_USES_NUMA_PROT_NONE was defined for architectures that implemented _PAGE_NUMA using _PROT_NONE. This saved using an additional PTE bit and relied on the fact that PROT_NONE vmas were skipped by the NUMA hinting fault scanner. This was found to be conceptually confusing with a lot of implicit assumptions and it was asked that an alternative be found. Commit c46a7c81 "x86: define _PAGE_NUMA by reusing software bits on the PMD and PTE levels" redefined _PAGE_NUMA on x86 to be one of the swap PTE bits and shrunk the maximum possible swap size but it did not go far enough. There are no architectures that reuse _PROT_NONE as _PROT_NUMA but the relics still exist. This patch removes ARCH_USES_NUMA_PROT_NONE and removes some unnecessary duplication in powerpc vs the generic implementation by defining the types the core NUMA helpers expected to exist from x86 with their ppc64 equivalent. This necessitated that a PTE bit mask be created that identified the bits that distinguish present from NUMA pte entries but it is expected this will only differ between arches based on _PAGE_PROTNONE. The naming for the generic helpers was taken from x86 originally but ppc64 has types that are equivalent for the purposes of the helper so they are mapped instead of duplicating code. Signed-off-by: Mel Gorman Cc: Hugh Dickins Cc: "Kirill A. Shutemov" Cc: Rik van Riel Cc: Johannes Weiner Cc: Cyrill Gorcunov Reviewed-by: Aneesh Kumar K.V Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/pgtable.h | 27 +++++++++------------------ 1 file changed, 9 insertions(+), 18 deletions(-) (limited to 'include') diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index 53b2acc38213..281870f56450 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -660,11 +660,12 @@ static inline int pmd_trans_unstable(pmd_t *pmd) } #ifdef CONFIG_NUMA_BALANCING -#ifdef CONFIG_ARCH_USES_NUMA_PROT_NONE /* - * _PAGE_NUMA works identical to _PAGE_PROTNONE (it's actually the - * same bit too). It's set only when _PAGE_PRESET is not set and it's - * never set if _PAGE_PRESENT is set. + * _PAGE_NUMA distinguishes between an unmapped page table entry, an entry that + * is protected for PROT_NONE and a NUMA hinting fault entry. If the + * architecture defines __PAGE_PROTNONE then it should take that into account + * but those that do not can rely on the fact that the NUMA hinting scanner + * skips inaccessible VMAs. * * pte/pmd_present() returns true if pte/pmd_numa returns true. Page * fault triggers on those regions if pte/pmd_numa returns true @@ -673,16 +674,14 @@ static inline int pmd_trans_unstable(pmd_t *pmd) #ifndef pte_numa static inline int pte_numa(pte_t pte) { - return (pte_flags(pte) & - (_PAGE_NUMA|_PAGE_PROTNONE|_PAGE_PRESENT)) == _PAGE_NUMA; + return ptenuma_flags(pte) == _PAGE_NUMA; } #endif #ifndef pmd_numa static inline int pmd_numa(pmd_t pmd) { - return (pmd_flags(pmd) & - (_PAGE_NUMA|_PAGE_PROTNONE|_PAGE_PRESENT)) == _PAGE_NUMA; + return pmdnuma_flags(pmd) == _PAGE_NUMA; } #endif @@ -722,6 +721,8 @@ static inline pte_t pte_mknuma(pte_t pte) { pteval_t val = pte_val(pte); + VM_BUG_ON(!(val & _PAGE_PRESENT)); + val &= ~_PAGE_PRESENT; val |= _PAGE_NUMA; @@ -765,16 +766,6 @@ static inline void pmdp_set_numa(struct mm_struct *mm, unsigned long addr, } #endif #else -extern int pte_numa(pte_t pte); -extern int pmd_numa(pmd_t pmd); -extern pte_t pte_mknonnuma(pte_t pte); -extern pmd_t pmd_mknonnuma(pmd_t pmd); -extern pte_t pte_mknuma(pte_t pte); -extern pmd_t pmd_mknuma(pmd_t pmd); -extern void ptep_set_numa(struct mm_struct *mm, unsigned long addr, pte_t *ptep); -extern void pmdp_set_numa(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp); -#endif /* CONFIG_ARCH_USES_NUMA_PROT_NONE */ -#else static inline int pmd_numa(pmd_t pmd) { return 0; -- cgit v1.2.3 From 505e3be6c082489a32a88e042f930d047b6415bc Mon Sep 17 00:00:00 2001 From: Laura Abbott Date: Thu, 9 Oct 2014 15:26:35 -0700 Subject: lib/genalloc.c: add power aligned algorithm One of the more common algorithms used for allocation is to align the start address of the allocation to the order of size requested. Add this as an algorithm option for genalloc. Signed-off-by: Laura Abbott Acked-by: Will Deacon Acked-by: Olof Johansson Reviewed-by: Catalin Marinas Cc: Arnd Bergmann Cc: David Riley Cc: Ritesh Harjain Cc: Russell King Cc: Thierry Reding Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/genalloc.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/genalloc.h b/include/linux/genalloc.h index 1c2fdaa2ffc3..3cd0934d62ba 100644 --- a/include/linux/genalloc.h +++ b/include/linux/genalloc.h @@ -110,6 +110,10 @@ extern void gen_pool_set_algo(struct gen_pool *pool, genpool_algo_t algo, extern unsigned long gen_pool_first_fit(unsigned long *map, unsigned long size, unsigned long start, unsigned int nr, void *data); +extern unsigned long gen_pool_first_fit_order_align(unsigned long *map, + unsigned long size, unsigned long start, unsigned int nr, + void *data); + extern unsigned long gen_pool_best_fit(unsigned long *map, unsigned long size, unsigned long start, unsigned int nr, void *data); -- cgit v1.2.3 From 9efb3a421d55d30b65fb0dbee05108d15c6c55f7 Mon Sep 17 00:00:00 2001 From: Laura Abbott Date: Thu, 9 Oct 2014 15:26:38 -0700 Subject: lib/genalloc.c: add genpool range check function After allocating an address from a particular genpool, there is no good way to verify if that address actually belongs to a genpool. Introduce addr_in_gen_pool which will return if an address plus size falls completely within the genpool range. Signed-off-by: Laura Abbott Acked-by: Will Deacon Reviewed-by: Olof Johansson Reviewed-by: Catalin Marinas Cc: Arnd Bergmann Cc: David Riley Cc: Ritesh Harjain Cc: Russell King Cc: Thierry Reding Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/genalloc.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/genalloc.h b/include/linux/genalloc.h index 3cd0934d62ba..1ccaab44abcc 100644 --- a/include/linux/genalloc.h +++ b/include/linux/genalloc.h @@ -121,6 +121,9 @@ extern struct gen_pool *devm_gen_pool_create(struct device *dev, int min_alloc_order, int nid); extern struct gen_pool *dev_get_gen_pool(struct device *dev); +bool addr_in_gen_pool(struct gen_pool *pool, unsigned long start, + size_t size); + #ifdef CONFIG_OF extern struct gen_pool *of_get_named_gen_pool(struct device_node *np, const char *propname, int index); -- cgit v1.2.3 From 513510ddba9650fc7da456eefeb0ead7632324f6 Mon Sep 17 00:00:00 2001 From: Laura Abbott Date: Thu, 9 Oct 2014 15:26:40 -0700 Subject: common: dma-mapping: introduce common remapping functions For architectures without coherent DMA, memory for DMA may need to be remapped with coherent attributes. Factor out the the remapping code from arm and put it in a common location to reduce code duplication. As part of this, the arm APIs are now migrated away from ioremap_page_range to the common APIs which use map_vm_area for remapping. This should be an equivalent change and using map_vm_area is more correct as ioremap_page_range is intended to bring in io addresses into the cpu space and not regular kernel managed memory. Signed-off-by: Laura Abbott Reviewed-by: Catalin Marinas Cc: Arnd Bergmann Cc: David Riley Cc: Olof Johansson Cc: Ritesh Harjain Cc: Russell King Cc: Thierry Reding Cc: Will Deacon Cc: James Hogan Cc: Laura Abbott Cc: Mitchel Humpherys Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/dma-mapping-common.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include') diff --git a/include/asm-generic/dma-mapping-common.h b/include/asm-generic/dma-mapping-common.h index de8bf89940f8..a9fd248f5d48 100644 --- a/include/asm-generic/dma-mapping-common.h +++ b/include/asm-generic/dma-mapping-common.h @@ -179,6 +179,15 @@ dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, extern int dma_common_mmap(struct device *dev, struct vm_area_struct *vma, void *cpu_addr, dma_addr_t dma_addr, size_t size); +void *dma_common_contiguous_remap(struct page *page, size_t size, + unsigned long vm_flags, + pgprot_t prot, const void *caller); + +void *dma_common_pages_remap(struct page **pages, size_t size, + unsigned long vm_flags, pgprot_t prot, + const void *caller); +void dma_common_free_remap(void *cpu_addr, size_t size, unsigned long vm_flags); + /** * dma_mmap_attrs - map a coherent DMA allocation into user space * @dev: valid struct device pointer, or NULL for ISA and EISA-like devices -- cgit v1.2.3 From 53853e2d2bfb748a8b5aa2fd1de15699266865e0 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 9 Oct 2014 15:27:02 -0700 Subject: mm, compaction: defer each zone individually instead of preferred zone When direct sync compaction is often unsuccessful, it may become deferred for some time to avoid further useless attempts, both sync and async. Successful high-order allocations un-defer compaction, while further unsuccessful compaction attempts prolong the compaction deferred period. Currently the checking and setting deferred status is performed only on the preferred zone of the allocation that invoked direct compaction. But compaction itself is attempted on all eligible zones in the zonelist, so the behavior is suboptimal and may lead both to scenarios where 1) compaction is attempted uselessly, or 2) where it's not attempted despite good chances of succeeding, as shown on the examples below: 1) A direct compaction with Normal preferred zone failed and set deferred compaction for the Normal zone. Another unrelated direct compaction with DMA32 as preferred zone will attempt to compact DMA32 zone even though the first compaction attempt also included DMA32 zone. In another scenario, compaction with Normal preferred zone failed to compact Normal zone, but succeeded in the DMA32 zone, so it will not defer compaction. In the next attempt, it will try Normal zone which will fail again, instead of skipping Normal zone and trying DMA32 directly. 2) Kswapd will balance DMA32 zone and reset defer status based on watermarks looking good. A direct compaction with preferred Normal zone will skip compaction of all zones including DMA32 because Normal was still deferred. The allocation might have succeeded in DMA32, but won't. This patch makes compaction deferring work on individual zone basis instead of preferred zone. For each zone, it checks compaction_deferred() to decide if the zone should be skipped. If watermarks fail after compacting the zone, defer_compaction() is called. The zone where watermarks passed can still be deferred when the allocation attempt is unsuccessful. When allocation is successful, compaction_defer_reset() is called for the zone containing the allocated page. This approach should approximate calling defer_compaction() only on zones where compaction was attempted and did not yield allocated page. There might be corner cases but that is inevitable as long as the decision to stop compacting dues not guarantee that a page will be allocated. Due to a new COMPACT_DEFERRED return value, some functions relying implicitly on COMPACT_SKIPPED = 0 had to be updated, with comments made more accurate. The did_some_progress output parameter of __alloc_pages_direct_compact() is removed completely, as the caller actually does not use it after compaction sets it - it is only considered when direct reclaim sets it. During testing on a two-node machine with a single very small Normal zone on node 1, this patch has improved success rates in stress-highalloc mmtests benchmark. The success here were previously made worse by commit 3a025760fc15 ("mm: page_alloc: spill to remote nodes before waking kswapd") as kswapd was no longer resetting often enough the deferred compaction for the Normal zone, and DMA32 zones on both nodes were thus not considered for compaction. On different machine, success rates were improved with __GFP_NO_KSWAPD allocations. [akpm@linux-foundation.org: fix CONFIG_COMPACTION=n build] Signed-off-by: Vlastimil Babka Acked-by: Minchan Kim Reviewed-by: Zhang Yanfei Acked-by: Mel Gorman Cc: Joonsoo Kim Cc: Michal Nazarewicz Cc: Naoya Horiguchi Cc: Christoph Lameter Cc: Rik van Riel Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compaction.h | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/compaction.h b/include/linux/compaction.h index 01e3132820da..b2e4c92d0445 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -2,14 +2,16 @@ #define _LINUX_COMPACTION_H /* Return values for compact_zone() and try_to_compact_pages() */ +/* compaction didn't start as it was deferred due to past failures */ +#define COMPACT_DEFERRED 0 /* compaction didn't start as it was not possible or direct reclaim was more suitable */ -#define COMPACT_SKIPPED 0 +#define COMPACT_SKIPPED 1 /* compaction should continue to another pageblock */ -#define COMPACT_CONTINUE 1 +#define COMPACT_CONTINUE 2 /* direct compaction partially compacted a zone and there are suitable pages */ -#define COMPACT_PARTIAL 2 +#define COMPACT_PARTIAL 3 /* The full zone was compacted */ -#define COMPACT_COMPLETE 3 +#define COMPACT_COMPLETE 4 #ifdef CONFIG_COMPACTION extern int sysctl_compact_memory; @@ -22,7 +24,8 @@ extern int sysctl_extfrag_handler(struct ctl_table *table, int write, extern int fragmentation_index(struct zone *zone, unsigned int order); extern unsigned long try_to_compact_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask, nodemask_t *mask, - enum migrate_mode mode, bool *contended); + enum migrate_mode mode, bool *contended, + struct zone **candidate_zone); extern void compact_pgdat(pg_data_t *pgdat, int order); extern void reset_isolation_suitable(pg_data_t *pgdat); extern unsigned long compaction_suitable(struct zone *zone, int order); @@ -91,7 +94,8 @@ static inline bool compaction_restarting(struct zone *zone, int order) #else static inline unsigned long try_to_compact_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask, nodemask_t *nodemask, - enum migrate_mode mode, bool *contended) + enum migrate_mode mode, bool *contended, + struct zone **candidate_zone) { return COMPACT_CONTINUE; } -- cgit v1.2.3 From 1f9efdef4f3f1d2a073e524113fd0038af636f2b Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 9 Oct 2014 15:27:14 -0700 Subject: mm, compaction: khugepaged should not give up due to need_resched() Async compaction aborts when it detects zone lock contention or need_resched() is true. David Rientjes has reported that in practice, most direct async compactions for THP allocation abort due to need_resched(). This means that a second direct compaction is never attempted, which might be OK for a page fault, but khugepaged is intended to attempt a sync compaction in such case and in these cases it won't. This patch replaces "bool contended" in compact_control with an int that distinguishes between aborting due to need_resched() and aborting due to lock contention. This allows propagating the abort through all compaction functions as before, but passing the abort reason up to __alloc_pages_slowpath() which decides when to continue with direct reclaim and another compaction attempt. Another problem is that try_to_compact_pages() did not act upon the reported contention (both need_resched() or lock contention) immediately and would proceed with another zone from the zonelist. When need_resched() is true, that means initializing another zone compaction, only to check again need_resched() in isolate_migratepages() and aborting. For zone lock contention, the unintended consequence is that the lock contended status reported back to the allocator is detrmined from the last zone where compaction was attempted, which is rather arbitrary. This patch fixes the problem in the following way: - async compaction of a zone aborting due to need_resched() or fatal signal pending means that further zones should not be tried. We report COMPACT_CONTENDED_SCHED to the allocator. - aborting zone compaction due to lock contention means we can still try another zone, since it has different set of locks. We report back COMPACT_CONTENDED_LOCK only if *all* zones where compaction was attempted, it was aborted due to lock contention. As a result of these fixes, khugepaged will proceed with second sync compaction as intended, when the preceding async compaction aborted due to need_resched(). Page fault compactions aborting due to need_resched() will spare some cycles previously wasted by initializing another zone compaction only to abort again. Lock contention will be reported only when compaction in all zones aborted due to lock contention, and therefore it's not a good idea to try again after reclaim. In stress-highalloc from mmtests configured to use __GFP_NO_KSWAPD, this has improved number of THP collapse allocations by 10%, which shows positive effect on khugepaged. The benchmark's success rates are unchanged as it is not recognized as khugepaged. Numbers of compact_stall and compact_fail events have however decreased by 20%, with compact_success still a bit improved, which is good. With benchmark configured not to use __GFP_NO_KSWAPD, there is 6% improvement in THP collapse allocations, and only slight improvement in stalls and failures. [akpm@linux-foundation.org: fix warnings] Reported-by: David Rientjes Signed-off-by: Vlastimil Babka Cc: Minchan Kim Acked-by: Mel Gorman Cc: Joonsoo Kim Cc: Michal Nazarewicz Cc: Naoya Horiguchi Cc: Christoph Lameter Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compaction.h | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/compaction.h b/include/linux/compaction.h index b2e4c92d0445..60bdf8dc02a3 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -13,6 +13,14 @@ /* The full zone was compacted */ #define COMPACT_COMPLETE 4 +/* Used to signal whether compaction detected need_sched() or lock contention */ +/* No contention detected */ +#define COMPACT_CONTENDED_NONE 0 +/* Either need_sched() was true or fatal signal pending */ +#define COMPACT_CONTENDED_SCHED 1 +/* Zone lock or lru_lock was contended in async compaction */ +#define COMPACT_CONTENDED_LOCK 2 + #ifdef CONFIG_COMPACTION extern int sysctl_compact_memory; extern int sysctl_compaction_handler(struct ctl_table *table, int write, @@ -24,7 +32,7 @@ extern int sysctl_extfrag_handler(struct ctl_table *table, int write, extern int fragmentation_index(struct zone *zone, unsigned int order); extern unsigned long try_to_compact_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask, nodemask_t *mask, - enum migrate_mode mode, bool *contended, + enum migrate_mode mode, int *contended, struct zone **candidate_zone); extern void compact_pgdat(pg_data_t *pgdat, int order); extern void reset_isolation_suitable(pg_data_t *pgdat); @@ -94,7 +102,7 @@ static inline bool compaction_restarting(struct zone *zone, int order) #else static inline unsigned long try_to_compact_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask, nodemask_t *nodemask, - enum migrate_mode mode, bool *contended, + enum migrate_mode mode, int *contended, struct zone **candidate_zone) { return COMPACT_CONTINUE; -- cgit v1.2.3 From 43e7a34d265e884b7cf34f9b05e6f2e0c05bf120 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Thu, 9 Oct 2014 15:27:25 -0700 Subject: mm: rename allocflags_to_migratetype for clarity The page allocator has gfp flags (like __GFP_WAIT) and alloc flags (like ALLOC_CPUSET) that have separate semantics. The function allocflags_to_migratetype() actually takes gfp flags, not alloc flags, and returns a migratetype. Rename it to gfpflags_to_migratetype(). Signed-off-by: David Rientjes Signed-off-by: Vlastimil Babka Reviewed-by: Zhang Yanfei Reviewed-by: Naoya Horiguchi Acked-by: Minchan Kim Acked-by: Mel Gorman Cc: Joonsoo Kim Cc: Michal Nazarewicz Cc: Christoph Lameter Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 5e7219dc0fae..41b30fd4d041 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -156,7 +156,7 @@ struct vm_area_struct; #define GFP_DMA32 __GFP_DMA32 /* Convert GFP flags to their corresponding migrate type */ -static inline int allocflags_to_migratetype(gfp_t gfp_flags) +static inline int gfpflags_to_migratetype(const gfp_t gfp_flags) { WARN_ON((gfp_flags & GFP_MOVABLE_MASK) == GFP_MOVABLE_MASK); -- cgit v1.2.3 From 9c5990240e076ae564cccbd921868cd08f6daaa5 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Thu, 9 Oct 2014 15:27:29 -0700 Subject: mm: introduce check_data_rlimit helper To eliminate code duplication lets introduce check_data_rlimit helper which we will use in brk() and prctl() syscalls. Signed-off-by: Cyrill Gorcunov Cc: Kees Cook Cc: Tejun Heo Cc: Andrew Vagin Cc: Eric W. Biederman Cc: H. Peter Anvin Acked-by: Serge Hallyn Cc: Pavel Emelyanov Cc: Vasiliy Kulikov Cc: KAMEZAWA Hiroyuki Cc: Michael Kerrisk Cc: Julien Tinnes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 28df70774b81..4d814aa97785 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -18,6 +18,7 @@ #include #include #include +#include struct mempolicy; struct anon_vma; @@ -1780,6 +1781,20 @@ extern struct vm_area_struct *copy_vma(struct vm_area_struct **, bool *need_rmap_locks); extern void exit_mmap(struct mm_struct *); +static inline int check_data_rlimit(unsigned long rlim, + unsigned long new, + unsigned long start, + unsigned long end_data, + unsigned long start_data) +{ + if (rlim < RLIM_INFINITY) { + if (((new - start) + (end_data - start_data)) > rlim) + return -ENOSPC; + } + + return 0; +} + extern int mm_take_all_locks(struct mm_struct *mm); extern void mm_drop_all_locks(struct mm_struct *mm); -- cgit v1.2.3 From f606b77f1a9e362451aca8f81d8f36a3a112139e Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Thu, 9 Oct 2014 15:27:37 -0700 Subject: prctl: PR_SET_MM -- introduce PR_SET_MM_MAP operation During development of c/r we've noticed that in case if we need to support user namespaces we face a problem with capabilities in prctl(PR_SET_MM, ...) call, in particular once new user namespace is created capable(CAP_SYS_RESOURCE) no longer passes. A approach is to eliminate CAP_SYS_RESOURCE check but pass all new values in one bundle, which would allow the kernel to make more intensive test for sanity of values and same time allow us to support checkpoint/restore of user namespaces. Thus a new command PR_SET_MM_MAP introduced. It takes a pointer of prctl_mm_map structure which carries all the members to be updated. prctl(PR_SET_MM, PR_SET_MM_MAP, struct prctl_mm_map *, size) struct prctl_mm_map { __u64 start_code; __u64 end_code; __u64 start_data; __u64 end_data; __u64 start_brk; __u64 brk; __u64 start_stack; __u64 arg_start; __u64 arg_end; __u64 env_start; __u64 env_end; __u64 *auxv; __u32 auxv_size; __u32 exe_fd; }; All members except @exe_fd correspond ones of struct mm_struct. To figure out which available values these members may take here are meanings of the members. - start_code, end_code: represent bounds of executable code area - start_data, end_data: represent bounds of data area - start_brk, brk: used to calculate bounds for brk() syscall - start_stack: used when accounting space needed for command line arguments, environment and shmat() syscall - arg_start, arg_end, env_start, env_end: represent memory area supplied for command line arguments and environment variables - auxv, auxv_size: carries auxiliary vector, Elf format specifics - exe_fd: file descriptor number for executable link (/proc/self/exe) Thus we apply the following requirements to the values 1) Any member except @auxv, @auxv_size, @exe_fd is rather an address in user space thus it must be laying inside [mmap_min_addr, mmap_max_addr) interval. 2) While @[start|end]_code and @[start|end]_data may point to an nonexisting VMAs (say a program maps own new .text and .data segments during execution) the rest of members should belong to VMA which must exist. 3) Addresses must be ordered, ie @start_ member must not be greater or equal to appropriate @end_ member. 4) As in regular Elf loading procedure we require that @start_brk and @brk be greater than @end_data. 5) If RLIMIT_DATA rlimit is set to non-infinity new values should not exceed existing limit. Same applies to RLIMIT_STACK. 6) Auxiliary vector size must not exceed existing one (which is predefined as AT_VECTOR_SIZE and depends on architecture). 7) File descriptor passed in @exe_file should be pointing to executable file (because we use existing prctl_set_mm_exe_file_locked helper it ensures that the file we are going to use as exe link has all required permission granted). Now about where these members are involved inside kernel code: - @start_code and @end_code are used in /proc/$pid/[stat|statm] output; - @start_data and @end_data are used in /proc/$pid/[stat|statm] output, also they are considered if there enough space for brk() syscall result if RLIMIT_DATA is set; - @start_brk shown in /proc/$pid/stat output and accounted in brk() syscall if RLIMIT_DATA is set; also this member is tested to find a symbolic name of mmap event for perf system (we choose if event is generated for "heap" area); one more aplication is selinux -- we test if a process has PROCESS__EXECHEAP permission if trying to make heap area being executable with mprotect() syscall; - @brk is a current value for brk() syscall which lays inside heap area, it's shown in /proc/$pid/stat. When syscall brk() succesfully provides new memory area to a user space upon brk() completion the mm::brk is updated to carry new value; Both @start_brk and @brk are actively used in /proc/$pid/maps and /proc/$pid/smaps output to find a symbolic name "heap" for VMA being scanned; - @start_stack is printed out in /proc/$pid/stat and used to find a symbolic name "stack" for task and threads in /proc/$pid/maps and /proc/$pid/smaps output, and as the same as with @start_brk -- perf system uses it for event naming. Also kernel treat this member as a start address of where to map vDSO pages and to check if there is enough space for shmat() syscall; - @arg_start, @arg_end, @env_start and @env_end are printed out in /proc/$pid/stat. Another access to the data these members represent is to read /proc/$pid/environ or /proc/$pid/cmdline. Any attempt to read these areas kernel tests with access_process_vm helper so a user must have enough rights for this action; - @auxv and @auxv_size may be read from /proc/$pid/auxv. Strictly speaking kernel doesn't care much about which exactly data is sitting there because it is solely for userspace; - @exe_fd is referred from /proc/$pid/exe and when generating coredump. We uses prctl_set_mm_exe_file_locked helper to update this member, so exe-file link modification remains one-shot action. Still note that updating exe-file link now doesn't require sys-resource capability anymore, after all there is no much profit in preventing setup own file link (there are a number of ways to execute own code -- ptrace, ld-preload, so that the only reliable way to find which exactly code is executed is to inspect running program memory). Still we require the caller to be at least user-namespace root user. I believe the old interface should be deprecated and ripped off in a couple of kernel releases if no one against. To test if new interface is implemented in the kernel one can pass PR_SET_MM_MAP_SIZE opcode and the kernel returns the size of currently supported struct prctl_mm_map. [akpm@linux-foundation.org: fix 80-col wordwrap in macro definitions] Signed-off-by: Cyrill Gorcunov Cc: Kees Cook Cc: Tejun Heo Acked-by: Andrew Vagin Tested-by: Andrew Vagin Cc: Eric W. Biederman Cc: H. Peter Anvin Acked-by: Serge Hallyn Cc: Pavel Emelyanov Cc: Vasiliy Kulikov Cc: KAMEZAWA Hiroyuki Cc: Michael Kerrisk Cc: Julien Tinnes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/uapi/linux/prctl.h | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h index 58afc04c107e..513df75d0fc9 100644 --- a/include/uapi/linux/prctl.h +++ b/include/uapi/linux/prctl.h @@ -1,6 +1,8 @@ #ifndef _LINUX_PRCTL_H #define _LINUX_PRCTL_H +#include + /* Values to pass as first argument to prctl() */ #define PR_SET_PDEATHSIG 1 /* Second arg is a signal */ @@ -119,6 +121,31 @@ # define PR_SET_MM_ENV_END 11 # define PR_SET_MM_AUXV 12 # define PR_SET_MM_EXE_FILE 13 +# define PR_SET_MM_MAP 14 +# define PR_SET_MM_MAP_SIZE 15 + +/* + * This structure provides new memory descriptor + * map which mostly modifies /proc/pid/stat[m] + * output for a task. This mostly done in a + * sake of checkpoint/restore functionality. + */ +struct prctl_mm_map { + __u64 start_code; /* code section bounds */ + __u64 end_code; + __u64 start_data; /* data section bounds */ + __u64 end_data; + __u64 start_brk; /* heap for brk() syscall */ + __u64 brk; + __u64 start_stack; /* stack starts at */ + __u64 arg_start; /* command line arguments bounds */ + __u64 arg_end; + __u64 env_start; /* environment variables bounds */ + __u64 env_end; + __u64 *auxv; /* auxiliary vector */ + __u32 auxv_size; /* vector size */ + __u32 exe_fd; /* /proc/$pid/exe link file */ +}; /* * Set specific pid that is allowed to ptrace the current task. -- cgit v1.2.3 From 1f13ae399c58af5a05b5cee61da864e1f4071de4 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 9 Oct 2014 15:27:39 -0700 Subject: mm: remove noisy remainder of the scan_unevictable interface The deprecation warnings for the scan_unevictable interface triggers by scripts doing `sysctl -a | grep something else'. This is annoying and not helpful. The interface has been defunct since 264e56d8247e ("mm: disable user interface to manually rescue unevictable pages"), which was in 2011, and there haven't been any reports of usecases for it, only reports that the deprecation warnings are annying. It's unlikely that anybody is using this interface specifically at this point, so remove it. Signed-off-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 16 ---------------- 1 file changed, 16 deletions(-) (limited to 'include') diff --git a/include/linux/swap.h b/include/linux/swap.h index 1b72060f093a..ea4f926e6b9b 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -354,22 +354,6 @@ static inline int zone_reclaim(struct zone *z, gfp_t mask, unsigned int order) extern int page_evictable(struct page *page); extern void check_move_unevictable_pages(struct page **, int nr_pages); -extern unsigned long scan_unevictable_pages; -extern int scan_unevictable_handler(struct ctl_table *, int, - void __user *, size_t *, loff_t *); -#ifdef CONFIG_NUMA -extern int scan_unevictable_register_node(struct node *node); -extern void scan_unevictable_unregister_node(struct node *node); -#else -static inline int scan_unevictable_register_node(struct node *node) -{ - return 0; -} -static inline void scan_unevictable_unregister_node(struct node *node) -{ -} -#endif - extern int kswapd_run(int nid); extern void kswapd_stop(int nid); #ifdef CONFIG_MEMCG -- cgit v1.2.3 From 6b6482bbf64ef6f6dbc8b52f7a7cf88a0498bd51 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 9 Oct 2014 15:27:48 -0700 Subject: mempolicy: remove the "task" arg of vma_policy_mof() and simplify it 1. vma_policy_mof(task) is simply not safe unless task == current, it can race with do_exit()->mpol_put(). Remove this arg and update its single caller. 2. vma can not be NULL, remove this check and simplify the code. Signed-off-by: Oleg Nesterov Cc: KAMEZAWA Hiroyuki Cc: David Rientjes Cc: KOSAKI Motohiro Cc: Alexander Viro Cc: Cyrill Gorcunov Cc: "Eric W. Biederman" Cc: "Kirill A. Shutemov" Cc: Peter Zijlstra Cc: Hugh Dickins Cc: Andi Kleen Cc: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mempolicy.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index f230a978e6ba..5e4bfcedd2ce 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -136,7 +136,7 @@ struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp, struct mempolicy *get_vma_policy(struct task_struct *tsk, struct vm_area_struct *vma, unsigned long addr); -bool vma_policy_mof(struct task_struct *task, struct vm_area_struct *vma); +bool vma_policy_mof(struct vm_area_struct *vma); extern void numa_default_policy(void); extern void numa_policy_init(void); -- cgit v1.2.3 From 74d2c3a05cc6c1eef2d7236a9919036ed85ddaaf Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 9 Oct 2014 15:27:50 -0700 Subject: mempolicy: introduce __get_vma_policy(), export get_task_policy() Extract the code which looks for vma's policy from get_vma_policy() into the new helper, __get_vma_policy(). Export get_task_policy(). Signed-off-by: Oleg Nesterov Cc: KAMEZAWA Hiroyuki Cc: David Rientjes Cc: KOSAKI Motohiro Cc: Alexander Viro Cc: Cyrill Gorcunov Cc: "Eric W. Biederman" Cc: "Kirill A. Shutemov" Cc: Peter Zijlstra Cc: Hugh Dickins Cc: Andi Kleen Cc: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mempolicy.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index 5e4bfcedd2ce..e1abe249892a 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -134,6 +134,9 @@ void mpol_free_shared_policy(struct shared_policy *p); struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx); +struct mempolicy *get_task_policy(struct task_struct *p); +struct mempolicy *__get_vma_policy(struct vm_area_struct *vma, + unsigned long addr); struct mempolicy *get_vma_policy(struct task_struct *tsk, struct vm_area_struct *vma, unsigned long addr); bool vma_policy_mof(struct vm_area_struct *vma); -- cgit v1.2.3 From dd6eecb917938c1b7e505a83df307b3476e7c8bd Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 9 Oct 2014 15:27:57 -0700 Subject: mempolicy: unexport get_vma_policy() and remove its "task" arg - get_vma_policy(task) is not safe if task != current, remove this argument. - get_vma_policy() no longer has callers outside of mempolicy.c, make it static. Signed-off-by: Oleg Nesterov Cc: KAMEZAWA Hiroyuki Cc: David Rientjes Cc: KOSAKI Motohiro Cc: Alexander Viro Cc: Cyrill Gorcunov Cc: "Eric W. Biederman" Cc: "Kirill A. Shutemov" Cc: Peter Zijlstra Cc: Hugh Dickins Cc: Andi Kleen Cc: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mempolicy.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index e1abe249892a..3d385c81c153 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -137,8 +137,6 @@ struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp, struct mempolicy *get_task_policy(struct task_struct *p); struct mempolicy *__get_vma_policy(struct vm_area_struct *vma, unsigned long addr); -struct mempolicy *get_vma_policy(struct task_struct *tsk, - struct vm_area_struct *vma, unsigned long addr); bool vma_policy_mof(struct vm_area_struct *vma); extern void numa_default_policy(void); -- cgit v1.2.3 From 1c93923cc264105418e6ead149c76bd88302eff4 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 9 Oct 2014 15:27:59 -0700 Subject: include/linux/migrate.h: remove migrate_page #define This is designed to avoid a few ifdefs in .c files but it's obnoxious because it can cause unsuspecting "migrate_page" symbols to get turned into "NULL". Just nuke it and use the ifdefs. Cc: Konstantin Khlebnikov Cc: Rafael Aquini Cc: Andrey Ryabinin Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/migrate.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'include') diff --git a/include/linux/migrate.h b/include/linux/migrate.h index a2901c414664..b66fd10f4b93 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -82,9 +82,6 @@ static inline int migrate_huge_page_move_mapping(struct address_space *mapping, return -ENOSYS; } -/* Possible settings for the migrate_page() method in address_operations */ -#define migrate_page NULL - #endif /* CONFIG_MIGRATION */ #ifdef CONFIG_NUMA_BALANCING -- cgit v1.2.3 From 0bf55139782db1fa96af66e37cc84afde18443ef Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Thu, 9 Oct 2014 15:28:06 -0700 Subject: mm: introduce dump_vma Introduce a helper to dump information about a VMA, this also makes dump_page_flags more generic and re-uses that so the output looks very similar to dump_page: [ 61.903437] vma ffff88070f88be00 start 00007fff25970000 end 00007fff25992000 [ 61.903437] next ffff88070facd600 prev ffff88070face400 mm ffff88070fade000 [ 61.903437] prot 8000000000000025 anon_vma ffff88070fa1e200 vm_ops (null) [ 61.903437] pgoff 7ffffffdd file (null) private_data (null) [ 61.909129] flags: 0x100173(read|write|mayread|maywrite|mayexec|growsdown|account) [akpm@linux-foundation.org: make dump_vma() require CONFIG_DEBUG_VM] [swarren@nvidia.com: fix dump_vma() compilation] Signed-off-by: Sasha Levin Reviewed-by: Naoya Horiguchi Cc: Kirill A. Shutemov Cc: Konstantin Khlebnikov Cc: Rik van Riel Cc: Mel Gorman Cc: Michal Hocko Cc: Hugh Dickins Cc: Vlastimil Babka Cc: Michel Lespinasse Cc: Minchan Kim Signed-off-by: Stephen Warren Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmdebug.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/mmdebug.h b/include/linux/mmdebug.h index 2f348d02f640..dfb93333fc62 100644 --- a/include/linux/mmdebug.h +++ b/include/linux/mmdebug.h @@ -4,10 +4,12 @@ #include struct page; +struct vm_area_struct; extern void dump_page(struct page *page, const char *reason); extern void dump_page_badflags(struct page *page, const char *reason, unsigned long badflags); +void dump_vma(const struct vm_area_struct *vma); #ifdef CONFIG_DEBUG_VM #define VM_BUG_ON(cond) BUG_ON(cond) -- cgit v1.2.3 From fa3759ccd5651c4235f572302d58c8ec9ddf1c4b Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Thu, 9 Oct 2014 15:28:08 -0700 Subject: mm: introduce VM_BUG_ON_VMA Very similar to VM_BUG_ON_PAGE but dumps VMA information instead. Signed-off-by: Sasha Levin Reviewed-by: Naoya Horiguchi Cc: Kirill A. Shutemov Cc: Konstantin Khlebnikov Cc: Rik van Riel Cc: Mel Gorman Cc: Michal Hocko Cc: Hugh Dickins Cc: Vlastimil Babka Cc: Michel Lespinasse Cc: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmdebug.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include') diff --git a/include/linux/mmdebug.h b/include/linux/mmdebug.h index dfb93333fc62..569e4c8d0ebb 100644 --- a/include/linux/mmdebug.h +++ b/include/linux/mmdebug.h @@ -20,12 +20,20 @@ void dump_vma(const struct vm_area_struct *vma); BUG(); \ } \ } while (0) +#define VM_BUG_ON_VMA(cond, vma) \ + do { \ + if (unlikely(cond)) { \ + dump_vma(vma); \ + BUG(); \ + } \ + } while (0) #define VM_WARN_ON(cond) WARN_ON(cond) #define VM_WARN_ON_ONCE(cond) WARN_ON_ONCE(cond) #define VM_WARN_ONCE(cond, format...) WARN_ONCE(cond, format) #else #define VM_BUG_ON(cond) BUILD_BUG_ON_INVALID(cond) #define VM_BUG_ON_PAGE(cond, page) VM_BUG_ON(cond) +#define VM_BUG_ON_VMA(cond, vma) VM_BUG_ON(cond) #define VM_WARN_ON(cond) BUILD_BUG_ON_INVALID(cond) #define VM_WARN_ON_ONCE(cond) BUILD_BUG_ON_INVALID(cond) #define VM_WARN_ONCE(cond, format...) BUILD_BUG_ON_INVALID(cond) -- cgit v1.2.3 From 81d1b09c6be66afac7d41ee52279d9bccbce56d8 Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Thu, 9 Oct 2014 15:28:10 -0700 Subject: mm: convert a few VM_BUG_ON callers to VM_BUG_ON_VMA Trivially convert a few VM_BUG_ON calls to VM_BUG_ON_VMA to extract more information when they trigger. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Sasha Levin Reviewed-by: Naoya Horiguchi Cc: Kirill A. Shutemov Cc: Konstantin Khlebnikov Cc: Rik van Riel Cc: Mel Gorman Cc: Michal Hocko Cc: Hugh Dickins Cc: Vlastimil Babka Cc: Michel Lespinasse Cc: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/huge_mm.h | 2 +- include/linux/rmap.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 63579cb8d3dc..ad9051bab267 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -132,7 +132,7 @@ extern int __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma, static inline int pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma, spinlock_t **ptl) { - VM_BUG_ON(!