From 4b094b7851bf4bf551ad456195d3f26e1c03bd74 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 3 Feb 2020 17:33:55 -0800 Subject: mm/page_alloc.c: initialize memmap of unavailable memory directly Let's make sure that all memory holes are actually marked PageReserved(), that page_to_pfn() produces reliable results, and that these pages are not detected as "mmap" pages due to the mapcount. E.g., booting a x86-64 QEMU guest with 4160 MB: [ 0.010585] Early memory node ranges [ 0.010586] node 0: [mem 0x0000000000001000-0x000000000009efff] [ 0.010588] node 0: [mem 0x0000000000100000-0x00000000bffdefff] [ 0.010589] node 0: [mem 0x0000000100000000-0x0000000143ffffff] max_pfn is 0x144000. Before this change: [root@localhost ~]# ./page-types -r -a 0x144000, flags page-count MB symbolic-flags long-symbolic-flags 0x0000000000000800 16384 64 ___________M_______________________________ mmap total 16384 64 After this change: [root@localhost ~]# ./page-types -r -a 0x144000, flags page-count MB symbolic-flags long-symbolic-flags 0x0000000100000000 16384 64 ___________________________r_______________ reserved total 16384 64 IOW, especially the unavailable physical memory ("memory hole") in the last section would not get properly marked PageReserved() and is indicated to be "mmap" memory. Drop the trace of that function from include/linux/mm.h - nobody else needs it, and rename it accordingly. Note: The fake zone/node might not be covered by the zone/node span. This is not an urgent issue (for now, we had the same node/zone due to the zeroing). We'll need a clean way to mark memory holes (e.g., using a page type PageHole() if possible or a fake ZONE_INVALID) and eventually stop marking these memory holes PageReserved(). Link: http://lkml.kernel.org/r/20191211163201.17179-4-david@redhat.com Signed-off-by: David Hildenbrand Cc: Oscar Salvador Cc: Michal Hocko Cc: Dan Williams Cc: Alexey Dobriyan Cc: Bob Picco Cc: Daniel Jordan Cc: Michal Hocko Cc: Naoya Horiguchi Cc: Pavel Tatashin Cc: Stephen Rothwell Cc: Steven Sistare Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 73a044ed6981..080f8ac8bfb7 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2182,12 +2182,6 @@ extern int __meminit __early_pfn_to_nid(unsigned long pfn, struct mminit_pfnnid_cache *state); #endif -#if !defined(CONFIG_FLAT_NODE_MEM_MAP) -void zero_resv_unavail(void); -#else -static inline void zero_resv_unavail(void) {} -#endif - extern void set_dma_reserve(unsigned long new_dma_reserve); extern void memmap_init_zone(unsigned long, int, unsigned long, unsigned long, enum memmap_context, struct vmem_altmap *); -- cgit v1.2.3 From 4c6058814ec4460c25111e29452ef596acdcd61b Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 3 Feb 2020 17:34:02 -0800 Subject: mm: factor out next_present_section_nr() Let's move it to the header and use the shorter variant from mm/page_alloc.c (the original one will also check "__highest_present_section_nr + 1", which is not necessary). While at it, make the section_nr in next_pfn() const. In next_pfn(), we now return section_nr_to_pfn(-1) instead of -1 once we exceed __highest_present_section_nr, which doesn't make a difference in the caller as it is big enough (>= all sane end_pfn). Link: http://lkml.kernel.org/r/20200113144035.10848-3-david@redhat.com Signed-off-by: David Hildenbrand Acked-by: Kirill A. Shutemov Cc: Baoquan He Cc: Dan Williams Cc: "Jin, Zhi" Cc: "Kirill A. Shutemov" Cc: Mel Gorman Cc: Michal Hocko Cc: Michal Hocko Cc: Oscar Salvador Cc: Pavel Tatashin Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index c2bc309d1634..462f6873905a 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1379,6 +1379,16 @@ static inline int pfn_present(unsigned long pfn) return present_section(__nr_to_section(pfn_to_section_nr(pfn))); } +static inline unsigned long next_present_section_nr(unsigned long section_nr) +{ + while (++section_nr <= __highest_present_section_nr) { + if (present_section_nr(section_nr)) + return section_nr; + } + + return -1; +} + /* * These are _only_ used during initialisation, therefore they * can use __initdata ... They could have names to indicate -- cgit v1.2.3 From 92917998849eea951707c8fea2dc3007bb2ad2cd Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 3 Feb 2020 17:34:26 -0800 Subject: mm/memory_hotplug: drop valid_start/valid_end from test_pages_in_a_zone() The callers are only interested in the actual zone, they don't care about boundaries. Return the zone instead to simplify. Link: http://lkml.kernel.org/r/20200110183308.11849-1-david@redhat.com Signed-off-by: David Hildenbrand Cc: Greg Kroah-Hartman Cc: "Rafael J. Wysocki" Cc: Michal Hocko Cc: Oscar Salvador Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memory_hotplug.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index ffa6ad12d84a..f4d59155f3d4 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -96,8 +96,8 @@ extern int add_one_highpage(struct page *page, int pfn, int bad_ppro); /* VM interface that may be used by firmware interface */ extern int online_pages(unsigned long pfn, unsigned long nr_pages, int online_type, int nid); -extern int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn, - unsigned long *valid_start, unsigned long *valid_end); +extern struct zone *test_pages_in_a_zone(unsigned long start_pfn, + unsigned long end_pfn); extern unsigned long __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn); -- cgit v1.2.3 From 1c948715a159d0d02c1e1c9228327ba3c408795c Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 3 Feb 2020 17:34:58 -0800 Subject: mm: remove __krealloc Since 5.5-rc1 the last user of this function is gone, so remove the functionality. See commit 2ad9d7747c10 ("netfilter: conntrack: free extension area immediately") for details. Link: http://lkml.kernel.org/r/20191212223442.22141-1-fw@strlen.de Signed-off-by: Florian Westphal Acked-by: Andrew Morton Acked-by: David Rientjes Reviewed-by: David Hildenbrand Cc: Christoph Lameter Cc: Pekka Enberg Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slab.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/slab.h b/include/linux/slab.h index 877a95c6a2d2..03a389358562 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -184,7 +184,6 @@ void memcg_deactivate_kmem_caches(struct mem_cgroup *, struct mem_cgroup *); /* * Common kmalloc functions provided by all allocators */ -void * __must_check __krealloc(const void *, size_t, gfp_t); void * __must_check krealloc(const void *, size_t, gfp_t); void kfree(const void *); void kzfree(const void *); -- cgit v1.2.3 From 3afc423632a194d7d6afef34e4bb98f804cd071d Mon Sep 17 00:00:00 2001 From: Steven Price Date: Mon, 3 Feb 2020 17:35:45 -0800 Subject: mm: pagewalk: add p4d_entry() and pgd_entry() pgd_entry() and pud_entry() were removed by commit 0b1fbfe50006c410 ("mm/pagewalk: remove pgd_entry() and pud_entry()") because there were no users. We're about to add users so reintroduce them, along with p4d_entry() as we now have 5 levels of tables. Note that commit a00cc7d9dd93d66a ("mm, x86: add support for PUD-sized transparent hugepages") already re-added pud_entry() but with different semantics to the other callbacks. This commit reverts the semantics back to match the other callbacks. To support hmm.c which now uses the new semantics of pud_entry() a new member ('action') of struct mm_walk is added which allows the callbacks to either descend (ACTION_SUBTREE, the default), skip (ACTION_CONTINUE) or repeat the callback (ACTION_AGAIN). hmm.c is then updated to call pud_trans_huge_lock() itself and make use of the splitting/retry logic of the core code. After this change pud_entry() is called for all entries, not just transparent huge pages. [arnd@arndb.de: fix unused variable warning] Link: http://lkml.kernel.org/r/20200107204607.1533842-1-arnd@arndb.de Link: http://lkml.kernel.org/r/20191218162402.45610-12-steven.price@arm.com Signed-off-by: Steven Price Signed-off-by: Arnd Bergmann Cc: Albert Ou Cc: Alexandre Ghiti Cc: Andy Lutomirski Cc: Ard Biesheuvel Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Dave Hansen Cc: David S. Miller Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: James Hogan Cc: James Morse Cc: Jerome Glisse Cc: "Liang, Kan" Cc: Mark Rutland Cc: Michael Ellerman Cc: Paul Burton Cc: Paul Mackerras Cc: Paul Walmsley Cc: Peter Zijlstra Cc: Ralf Baechle Cc: Russell King Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Will Deacon Cc: Zong Li Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pagewalk.h | 34 ++++++++++++++++++++++++++++------ 1 file changed, 28 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pagewalk.h b/include/linux/pagewalk.h index 6ec82e92c87f..aa6a0b63964e 100644 --- a/include/linux/pagewalk.h +++ b/include/linux/pagewalk.h @@ -8,15 +8,15 @@ struct mm_walk; /** * mm_walk_ops - callbacks for walk_page_range - * @pud_entry: if set, called for each non-empty PUD (2nd-level) entry - * this handler should only handle pud_trans_huge() puds. - * the pmd_entry or pte_entry callbacks will be used for - * regular PUDs. - * @pmd_entry: if set, called for each non-empty PMD (3rd-level) entry + * @pgd_entry: if set, called for each non-empty PGD (top-level) entry + * @p4d_entry: if set, called for each non-empty P4D entry + * @pud_entry: if set, called for each non-empty PUD entry + * @pmd_entry: if set, called for each non-empty PMD entry * this handler is required to be able to handle * pmd_trans_huge() pmds. They may simply choose to * split_huge_page() instead of handling it explicitly. - * @pte_entry: if set, called for each non-empty PTE (4th-level) entry + * @pte_entry: if set, called for each non-empty PTE (lowest-level) + * entry * @pte_hole: if set, called for each hole at all levels * @hugetlb_entry: if set, called for each hugetlb entry * @test_walk: caller specific callback function to determine whether @@ -27,8 +27,15 @@ struct mm_walk; * @pre_vma: if set, called before starting walk on a non-null vma. * @post_vma: if set, called after a walk on a non-null vma, provided * that @pre_vma and the vma walk succeeded. + * + * p?d_entry callbacks are called even if those levels are folded on a + * particular architecture/configuration. */ struct mm_walk_ops { + int (*pgd_entry)(pgd_t *pgd, unsigned long addr, + unsigned long next, struct mm_walk *walk); + int (*p4d_entry)(p4d_t *p4d, unsigned long addr, + unsigned long next, struct mm_walk *walk); int (*pud_entry)(pud_t *pud, unsigned long addr, unsigned long next, struct mm_walk *walk); int (*pmd_entry)(pmd_t *pmd, unsigned long addr, @@ -47,11 +54,25 @@ struct mm_walk_ops { void (*post_vma)(struct mm_walk *walk); }; +/* + * Action for pud_entry / pmd_entry callbacks. + * ACTION_SUBTREE is the default + */ +enum page_walk_action { + /* Descend to next level, splitting huge pages if needed and possible */ + ACTION_SUBTREE = 0, + /* Continue to next entry at this level (ignoring any subtree) */ + ACTION_CONTINUE = 1, + /* Call again for this entry */ + ACTION_AGAIN = 2 +}; + /** * mm_walk - walk_page_range data * @ops: operation to call during the walk * @mm: mm_struct representing the target process of page table walk * @vma: vma currently walked (NULL if walking outside vmas) + * @action: next action to perform (see enum page_walk_action) * @private: private data for callbacks' usage * * (see the comment on walk_page_range() for more details) @@ -60,6 +81,7 @@ struct mm_walk { const struct mm_walk_ops *ops; struct mm_struct *mm; struct vm_area_struct *vma; + enum page_walk_action action; void *private; }; -- cgit v1.2.3 From 488ae6a2b933cb538b5d91b1c0a3420188d28771 Mon Sep 17 00:00:00 2001 From: Steven Price Date: Mon, 3 Feb 2020 17:35:50 -0800 Subject: mm: pagewalk: allow walking without vma Since 48684a65b4e3: "mm: pagewalk: fix misbehavior of walk_page_range for vma(VM_PFNMAP)", page_table_walk() will report any kernel area as a hole, because it lacks a vma. This means each arch has re-implemented page table walking when needed, for example in the per-arch ptdump walker. Remove the requirement to have a vma in the generic code and add a new function walk_page_range_novma() which ignores the VMAs and simply walks the page tables. Link: http://lkml.kernel.org/r/20191218162402.45610-13-steven.price@arm.com Signed-off-by: Steven Price Cc: Albert Ou Cc: Alexandre Ghiti Cc: Andy Lutomirski Cc: Ard Biesheuvel Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Dave Hansen Cc: David S. Miller Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: James Hogan Cc: James Morse Cc: Jerome Glisse Cc: "Liang, Kan" Cc: Mark Rutland Cc: Michael Ellerman Cc: Paul Burton Cc: Paul Mackerras Cc: Paul Walmsley Cc: Peter Zijlstra Cc: Ralf Baechle Cc: Russell King Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Will Deacon Cc: Zong Li Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pagewalk.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pagewalk.h b/include/linux/pagewalk.h index aa6a0b63964e..d5d07f7a9c14 100644 --- a/include/linux/pagewalk.h +++ b/include/linux/pagewalk.h @@ -73,6 +73,7 @@ enum page_walk_action { * @mm: mm_struct representing the target process of page table walk * @vma: vma currently walked (NULL if walking outside vmas) * @action: next action to perform (see enum page_walk_action) + * @no_vma: walk ignoring vmas (vma will always be NULL) * @private: private data for callbacks' usage * * (see the comment on walk_page_range() for more details) @@ -82,12 +83,16 @@ struct mm_walk { struct mm_struct *mm; struct vm_area_struct *vma; enum page_walk_action action; + bool no_vma; void *private; }; int walk_page_range(struct mm_struct *mm, unsigned long start, unsigned long end, const struct mm_walk_ops *ops, void *private); +int walk_page_range_novma(struct mm_struct *mm, unsigned long start, + unsigned long end, const struct mm_walk_ops *ops, + void *private); int walk_page_vma(struct vm_area_struct *vma, const struct mm_walk_ops *ops, void *private); int walk_page_mapping(struct address_space *mapping, pgoff_t first_index, -- cgit v1.2.3 From b7a16c7ad790d0ecb44dcb08a6a75d0d0455ab5f Mon Sep 17 00:00:00 2001 From: Steven Price Date: Mon, 3 Feb 2020 17:36:03 -0800 Subject: mm: pagewalk: add 'depth' parameter to pte_hole The pte_hole() callback is called at multiple levels of the page tables. Code dumping the kernel page tables needs to know what at what depth the missing entry is. Add this is an extra parameter to pte_hole(). When the depth isn't know (e.g. processing a vma) then -1 is passed. The depth that is reported is the actual level where the entry is missing (ignoring any folding that is in place), i.e. any levels where PTRS_PER_P?D is set to 1 are ignored. Note that depth starts at 0 for a PGD so that PUD/PMD/PTE retain their natural numbers as levels 2/3/4. Link: http://lkml.kernel.org/r/20191218162402.45610-16-steven.price@arm.com Signed-off-by: Steven Price Tested-by: Zong Li Cc: Albert Ou Cc: Alexandre Ghiti Cc: Andy Lutomirski Cc: Ard Biesheuvel Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Dave Hansen Cc: David S. Miller Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: James Hogan Cc: James Morse Cc: Jerome Glisse Cc: "Liang, Kan" Cc: Mark Rutland Cc: Michael Ellerman Cc: Paul Burton Cc: Paul Mackerras Cc: Paul Walmsley Cc: Peter Zijlstra Cc: Ralf Baechle Cc: Russell King Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pagewalk.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pagewalk.h b/include/linux/pagewalk.h index d5d07f7a9c14..745a654c6ea7 100644 --- a/include/linux/pagewalk.h +++ b/include/linux/pagewalk.h @@ -17,7 +17,10 @@ struct mm_walk; * split_huge_page() instead of handling it explicitly. * @pte_entry: if set, called for each non-empty PTE (lowest-level) * entry - * @pte_hole: if set, called for each hole at all levels + * @pte_hole: if set, called for each hole at all levels, + * depth is -1 if not known, 0:PGD, 1:P4D, 2:PUD, 3:PMD + * 4:PTE. Any folded depths (where PTRS_PER_P?D is equal + * to 1) are skipped. * @hugetlb_entry: if set, called for each hugetlb entry * @test_walk: caller specific callback function to determine whether * we walk over the current vma or not. Returning 0 means @@ -43,7 +46,7 @@ struct mm_walk_ops { int (*pte_entry)(pte_t *pte, unsigned long addr, unsigned long next, struct mm_walk *walk); int (*pte_hole)(unsigned long addr, unsigned long next, - struct mm_walk *walk); + int depth, struct mm_walk *walk); int (*hugetlb_entry)(pte_t *pte, unsigned long hmask, unsigned long addr, unsigned long next, struct mm_walk *walk); -- cgit v1.2.3 From 30d621f6723b1c98a142861f7a52849d286bc7fa Mon Sep 17 00:00:00 2001 From: Steven Price Date: Mon, 3 Feb 2020 17:36:20 -0800 Subject: mm: add generic ptdump Add a generic version of page table dumping that architectures can opt-in to. Link: http://lkml.kernel.org/r/20191218162402.45610-20-steven.price@arm.com Signed-off-by: Steven Price Cc: Albert Ou Cc: Alexandre Ghiti Cc: Andy Lutomirski Cc: Ard Biesheuvel Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Dave Hansen Cc: David S. Miller Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: James Hogan Cc: James Morse Cc: Jerome Glisse Cc: "Liang, Kan" Cc: Mark Rutland Cc: Michael Ellerman Cc: Paul Burton Cc: Paul Mackerras Cc: Paul Walmsley Cc: Peter Zijlstra Cc: Ralf Baechle Cc: Russell King Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Will Deacon Cc: Zong Li Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/ptdump.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 include/linux/ptdump.h (limited to 'include/linux') diff --git a/include/linux/ptdump.h b/include/linux/ptdump.h new file mode 100644 index 000000000000..a0fb8dd2be97 --- /dev/null +++ b/include/linux/ptdump.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _LINUX_PTDUMP_H +#define _LINUX_PTDUMP_H + +#include + +struct ptdump_range { + unsigned long start; + unsigned long end; +}; + +struct ptdump_state { + void (*note_page)(struct ptdump_state *st, unsigned long addr, + int level, unsigned long val); + const struct ptdump_range *range; +}; + +void ptdump_walk_pgd(struct ptdump_state *st, struct mm_struct *mm); + +#endif /* _LINUX_PTDUMP_H */ -- cgit v1.2.3 From f8f0d0b6fa203bfa363d30f34f6fecce9e5cc2f7 Mon Sep 17 00:00:00 2001 From: Steven Price Date: Mon, 3 Feb 2020 17:36:38 -0800 Subject: mm: ptdump: reduce level numbers by 1 in note_page() Rather than having to increment the 'depth' number by 1 in ptdump_hole(), let's change the meaning of 'level' in note_page() since that makes the code simplier. Note that for x86, the level numbers were previously increased by 1 in commit 45dcd2091363 ("x86/mm/dump_pagetables: Fix printout of p4d level") and the comment "Bit 7 has a different meaning" was not updated, so this change also makes the code match the comment again. Link: http://lkml.kernel.org/r/20191218162402.45610-24-steven.price@arm.com Signed-off-by: Steven Price Reviewed-by: Catalin Marinas Cc: Albert Ou Cc: Alexandre Ghiti Cc: Andy Lutomirski Cc: Ard Biesheuvel Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Christian Borntraeger Cc: Dave Hansen Cc: David S. Miller Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: James Hogan Cc: James Morse Cc: Jerome Glisse Cc: "Liang, Kan" Cc: Mark Rutland Cc: Michael Ellerman Cc: Paul Burton Cc: Paul Mackerras Cc: Paul Walmsley Cc: Peter Zijlstra Cc: Ralf Baechle Cc: Russell King Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Will Deacon Cc: Zong Li Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/ptdump.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/ptdump.h b/include/linux/ptdump.h index a0fb8dd2be97..b28f3f2acf90 100644 --- a/include/linux/ptdump.h +++ b/include/linux/ptdump.h @@ -11,6 +11,7 @@ struct ptdump_range { }; struct ptdump_state { + /* level is 0:PGD to 4:PTE, or -1 if unknown */ void (*note_page)(struct ptdump_state *st, unsigned long addr, int level, unsigned long val); const struct ptdump_range *range; -- cgit v1.2.3 From e47690d756a760579141560ded06ec1020dd85e8 Mon Sep 17 00:00:00 2001 From: Steven Price Date: Mon, 3 Feb 2020 17:36:42 -0800 Subject: x86: mm: avoid allocating struct mm_struct on the stack struct mm_struct is quite large (~1664 bytes) and so allocating on the stack may cause problems as the kernel stack size is small. Since ptdump_walk_pgd_level_core() was only allocating the structure so that it could modify the pgd argument we can instead introduce a pgd override in struct mm_walk and pass this down the call stack to where it is needed. Since the correct mm_struct is now being passed down, it is now also unnecessary to take the mmap_sem semaphore because ptdump_walk_pgd() will now take the semaphore on the real mm. [steven.price@arm.com: restore missed arm64 changes] Link: http://lkml.kernel.org/r/20200108145710.34314-1-steven.price@arm.com Link: http://lkml.kernel.org/r/20200108145710.34314-1-steven.price@arm.com Signed-off-by: Steven Price Reported-by: Stephen Rothwell Cc: Catalin Marinas Cc: Albert Ou Cc: Alexandre Ghiti Cc: Andy Lutomirski Cc: Ard Biesheuvel Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Christian Borntraeger Cc: Dave Hansen Cc: David S. Miller Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: James Hogan Cc: James Morse Cc: Jerome Glisse Cc: "Liang, Kan" Cc: Mark Rutland Cc: Michael Ellerman Cc: Paul Burton Cc: Paul Mackerras Cc: Paul Walmsley Cc: Peter Zijlstra Cc: Ralf Baechle Cc: Russell King Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vineet Gupta Cc: Will Deacon Cc: Zong Li Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pagewalk.h | 3 +++ include/linux/ptdump.h | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pagewalk.h b/include/linux/pagewalk.h index 745a654c6ea7..b1cb6b753abb 100644 --- a/include/linux/pagewalk.h +++ b/include/linux/pagewalk.h @@ -74,6 +74,7 @@ enum page_walk_action { * mm_walk - walk_page_range data * @ops: operation to call during the walk * @mm: mm_struct representing the target process of page table walk + * @pgd: pointer to PGD; only valid with no_vma (otherwise set to NULL) * @vma: vma currently walked (NULL if walking outside vmas) * @action: next action to perform (see enum page_walk_action) * @no_vma: walk ignoring vmas (vma will always be NULL) @@ -84,6 +85,7 @@ enum page_walk_action { struct mm_walk { const struct mm_walk_ops *ops; struct mm_struct *mm; + pgd_t *pgd; struct vm_area_struct *vma; enum page_walk_action action; bool no_vma; @@ -95,6 +97,7 @@ int walk_page_range(struct mm_struct *mm, unsigned long start, void *private); int walk_page_range_novma(struct mm_struct *mm, unsigned long start, unsigned long end, const struct mm_walk_ops *ops, + pgd_t *pgd, void *private); int walk_page_vma(struct vm_area_struct *vma, const struct mm_walk_ops *ops, void *private); diff --git a/include/linux/ptdump.h b/include/linux/ptdump.h index b28f3f2acf90..a67065c403c3 100644 --- a/include/linux/ptdump.h +++ b/include/linux/ptdump.h @@ -17,6 +17,6 @@ struct ptdump_state { const struct ptdump_range *range; }; -void ptdump_walk_pgd(struct ptdump_state *st, struct mm_struct *mm); +void ptdump_walk_pgd(struct ptdump_state *st, struct mm_struct *mm, pgd_t *pgd); #endif /* _LINUX_PTDUMP_H */ -- cgit v1.2.3 From d56c0d45f0e27f814e87a1676b6bdccccbc252e9 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Mon, 3 Feb 2020 17:37:14 -0800 Subject: proc: decouple proc from VFS with "struct proc_ops" Currently core /proc code uses "struct file_operations" for custom hooks, however, VFS doesn't directly call them. Every time VFS expands file_operations hook set, /proc code bloats for no reason. Introduce "struct proc_ops" which contains only those hooks which /proc allows to call into (open, release, read, write, ioctl, mmap, poll). It doesn't contain module pointer as well. Save ~184 bytes per usage: add/remove: 26/26 grow/shrink: 1/4 up/down: 1922/-6674 (-4752) Function old new delta sysvipc_proc_ops - 72 +72 ... config_gz_proc_ops - 72 +72 proc_get_inode 289 339 +50 proc_reg_get_unmapped_area 110 107 -3 close_pdeo 227 224 -3 proc_reg_open 289 284 -5 proc_create_data 60 53 -7 rt_cpu_seq_fops 256 - -256 ... default_affinity_proc_fops 256 - -256 Total: Before=5430095, After=5425343, chg -0.09% Link: http://lkml.kernel.org/r/20191225172228.GA13378@avx2 Signed-off-by: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/proc_fs.h | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h index 0640be56dcbd..3dfa92633af3 100644 --- a/include/linux/proc_fs.h +++ b/include/linux/proc_fs.h @@ -12,6 +12,21 @@ struct proc_dir_entry; struct seq_file; struct seq_operations; +struct proc_ops { + int (*proc_open)(struct inode *, struct file *); + ssize_t (*proc_read)(struct file *, char __user *, size_t, loff_t *); + ssize_t (*proc_write)(struct file *, const char __user *, size_t, loff_t *); + loff_t (*proc_lseek)(struct file *, loff_t, int); + int (*proc_release)(struct inode *, struct file *); + __poll_t (*proc_poll)(struct file *, struct poll_table_struct *); + long (*proc_ioctl)(struct file *, unsigned int, unsigned long); +#ifdef CONFIG_COMPAT + long (*proc_compat_ioctl)(struct file *, unsigned int, unsigned long); +#endif + int (*proc_mmap)(struct file *, struct vm_area_struct *); + unsigned long (*proc_get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); +}; + #ifdef CONFIG_PROC_FS typedef int (*proc_write_t)(struct file *, char *, size_t); @@ -43,10 +58,10 @@ struct proc_dir_entry *proc_create_single_data(const char *name, umode_t mode, extern struct proc_dir_entry *proc_create_data(const char *, umode_t, struct proc_dir_entry *, - const struct file_operations *, + const struct proc_ops *, void *); -struct proc_dir_entry *proc_create(const char *name, umode_t mode, struct proc_dir_entry *parent, const struct file_operations *proc_fops); +struct proc_dir_entry *proc_create(const char *name, umode_t mode, struct proc_dir_entry *parent, const struct proc_ops *proc_ops); extern void proc_set_size(struct proc_dir_entry *, loff_t); extern void proc_set_user(struct proc_dir_entry *, kuid_t, kgid_t); extern void *PDE_DATA(const struct inode *); @@ -108,8 +123,8 @@ static inline struct proc_dir_entry *proc_mkdir_mode(const char *name, #define proc_create_seq(name, mode, parent, ops) ({NULL;}) #define proc_create_single(name, mode, parent, show) ({NULL;}) #define proc_create_single_data(name, mode, parent, show, data) ({NULL;}) -#define proc_create(name, mode, parent, proc_fops) ({NULL;}) -#define proc_create_data(name, mode, parent, proc_fops, data) ({NULL;}) +#define proc_create(name, mode, parent, proc_ops) ({NULL;}) +#define proc_create_data(name, mode, parent, proc_ops, data) ({NULL;}) static inline void proc_set_size(struct proc_dir_entry *de, loff_t size) {} static inline void proc_set_user(struct proc_dir_entry *de, kuid_t uid, kgid_t gid) {} -- cgit v1.2.3 From 97a32539b9568bb653683349e5a76d02ff3c3e2c Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Mon, 3 Feb 2020 17:37:17 -0800 Subject: proc: convert everything to "struct proc_ops" The most notable change is DEFINE_SHOW_ATTRIBUTE macro split in seq_file.h. Conversion rule is: llseek => proc_lseek unlocked_ioctl => proc_ioctl xxx => proc_xxx delete ".owner = THIS_MODULE" line [akpm@linux-foundation.org: fix drivers/isdn/capi/kcapi_proc.c] [sfr@canb.auug.org.au: fix kernel/sched/psi.c] Link: http://lkml.kernel.org/r/20200122180545.36222f50@canb.auug.org.au Link: http://lkml.kernel.org/r/20191225172546.GB13378@avx2 Signed-off-by: Alexey Dobriyan Signed-off-by: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/seq_file.h | 13 +++++++++++++ include/linux/sunrpc/stats.h | 4 ++-- 2 files changed, 15 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h index 5998e1f4ff06..770c2bf3aa43 100644 --- a/include/linux/seq_file.h +++ b/include/linux/seq_file.h @@ -160,6 +160,19 @@ static const struct file_operations __name ## _fops = { \ .release = single_release, \ } +#define DEFINE_PROC_SHOW_ATTRIBUTE(__name) \ +static int __name ## _open(struct inode *inode, struct file *file) \ +{ \ + return single_open(file, __name ## _show, inode->i_private); \ +} \ + \ +static const struct proc_ops __name ## _proc_ops = { \ + .proc_open = __name ## _open, \ + .proc_read = seq_read, \ + .proc_lseek = seq_lseek, \ + .proc_release = single_release, \ +} + static inline struct user_namespace *seq_user_ns(struct seq_file *seq) { #ifdef CONFIG_USER_NS diff --git a/include/linux/sunrpc/stats.h b/include/linux/sunrpc/stats.h index 84b92b4ad1c0..d94d4f410507 100644 --- a/include/linux/sunrpc/stats.h +++ b/include/linux/sunrpc/stats.h @@ -63,7 +63,7 @@ struct proc_dir_entry * rpc_proc_register(struct net *,struct rpc_stat *); void rpc_proc_unregister(struct net *,const char *); void rpc_proc_zero(const struct rpc_program *); struct proc_dir_entry * svc_proc_register(struct net *, struct svc_stat *, - const struct file_operations *); + const struct proc_ops *); void svc_proc_unregister(struct net *, const char *); void svc_seq_show(struct seq_file *, @@ -75,7 +75,7 @@ static inline void rpc_proc_unregister(struct net *net, const char *p) {} static inline void rpc_proc_zero(const struct rpc_program *p) {} static inline struct proc_dir_entry *svc_proc_register(struct net *net, struct svc_stat *s, - const struct file_operations *f) { return NULL; } + const struct proc_ops *proc_ops) { return NULL; } static inline void svc_proc_unregister(struct net *net, const char *p) {} static inline void svc_seq_show(struct seq_file *seq, -- cgit v1.2.3 From 0bee0cece2a6a71ccc347fdc1d46cf638cd5fd1c Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Mon, 3 Feb 2020 17:37:20 -0800 Subject: lib/string: add strnchrnul() Patch series "lib: rework bitmap_parse", v5. Similarl to the recently revisited bitmap_parselist(), bitmap_parse() is ineffective and overcomplicated. This series reworks it, aligns its interface with bitmap_parselist() and makes it simpler to use. The series also adds a test for the function and fixes usage of it in cpumask_parse() according to the new design - drops the calculating of length of an input string. bitmap_parse() takes the array of numbers to be put into the map in the BE order which is reversed to the natural LE order for bitmaps. For example, to construct bitmap containing a bit on the position 42, we have to put a line '400,0'. Current implementation reads chunk one by one from the beginning ('400' before '0') and makes bitmap shift after each successful parse. It makes the complexity of the whole process as O(n^2). We can do it in reverse direction ('0' before '400') and avoid shifting, but it requires reverse parsing helpers. This patch (of 7): New function works like strchrnul() with a length limited string. Link: http://lkml.kernel.org/r/20200102043031.30357-2-yury.norov@gmail.com Signed-off-by: Yury Norov Reviewed-by: Andy Shevchenko Cc: Rasmus Villemoes Cc: Amritha Nambiar Cc: Willem de Bruijn Cc: Kees Cook Cc: Matthew Wilcox Cc: "Tobin C . Harding" Cc: Will Deacon Cc: Miklos Szeredi Cc: Vineet Gupta Cc: Chris Wilson Cc: Arnaldo Carvalho de Melo Cc: Steffen Klassert Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/string.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/string.h b/include/linux/string.h index 02894e417565..6dfbb2efa815 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -62,6 +62,7 @@ extern char * strchr(const char *,int); #ifndef __HAVE_ARCH_STRCHRNUL extern char * strchrnul(const char *,int); #endif +extern char * strnchrnul(const char *, size_t, int); #ifndef __HAVE_ARCH_STRNCHR extern char * strnchr(const char *, size_t, int); #endif -- cgit v1.2.3 From 0bddc1bd05d6973fee9303005abab6567f743ea7 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Mon, 3 Feb 2020 17:37:24 -0800 Subject: bitops: more BITS_TO_* macros Introduce BITS_TO_U64, BITS_TO_U32 and BITS_TO_BYTES as they are handy in the following patches (BITS_TO_U32 specifically). Reimplement tools/ version of the macros according to the kernel implementation. Also fix indentation for BITS_PER_TYPE definition. Link: http://lkml.kernel.org/r/20200102043031.30357-3-yury.norov@gmail.com Signed-off-by: Yury Norov Reviewed-by: Andy Shevchenko Cc: Amritha Nambiar Cc: Arnaldo Carvalho de Melo Cc: Chris Wilson Cc: Kees Cook Cc: Matthew Wilcox Cc: Miklos Szeredi Cc: Rasmus Villemoes Cc: Steffen Klassert Cc: "Tobin C . Harding" Cc: Vineet Gupta Cc: Will Deacon Cc: Willem de Bruijn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/bitops.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bitops.h b/include/linux/bitops.h index 6c7c4133c25c..47f54b459c26 100644 --- a/include/linux/bitops.h +++ b/include/linux/bitops.h @@ -11,8 +11,10 @@ # define aligned_byte_mask(n) (~0xffUL << (BITS_PER_LONG - 8 - 8*(n))) #endif -#define BITS_PER_TYPE(type) (sizeof(type) * BITS_PER_BYTE) +#define BITS_PER_TYPE(type) (sizeof(type) * BITS_PER_BYTE) #define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, BITS_PER_TYPE(long)) +#define BITS_TO_U64(nr) DIV_ROUND_UP(nr, BITS_PER_TYPE(u64)) +#define BITS_TO_U32(nr) DIV_ROUND_UP(nr, BITS_PER_TYPE(u32)) #define BITS_TO_BYTES(nr) DIV_ROUND_UP(nr, BITS_PER_TYPE(char)) extern unsigned int __sw_hweight8(unsigned int w); -- cgit v1.2.3 From 2d6261583be005a91e4933aa53bbd678ef98e4c4 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Mon, 3 Feb 2020 17:37:34 -0800 Subject: lib: rework bitmap_parse() bitmap_parse() is ineffective and full of opaque variables and opencoded parts. It leads to hard understanding and usage of it. This rework includes: - remove bitmap_shift_left() call from the cycle. Now it makes the complexity of the algorithm as O(nbits^2). In the suggested approach the input string is parsed in reverse direction, so no shifts needed; - relax requirement on a single comma and no white spaces between chunks. It is considered useful in scripting, and it aligns with bitmap_parselist(); - split bitmap_parse() to small readable helpers; - make an explicit calculation of the end of input line at the beginning, so users of the bitmap_parse() won't bother doing this. Link: http://lkml.kernel.org/r/20200102043031.30357-6-yury.norov@gmail.com Signed-off-by: Yury Norov Cc: Amritha Nambiar Cc: Andy Shevchenko Cc: Arnaldo Carvalho de Melo Cc: Chris Wilson Cc: Kees Cook Cc: Matthew Wilcox Cc: Miklos Szeredi Cc: Rasmus Villemoes Cc: Steffen Klassert Cc: "Tobin C . Harding" Cc: Vineet Gupta Cc: Will Deacon Cc: Willem de Bruijn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/bitmap.h | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/bitmap.h b/include/linux/bitmap.h index 80ad521116d7..e52ceb1a73d3 100644 --- a/include/linux/bitmap.h +++ b/include/linux/bitmap.h @@ -186,7 +186,7 @@ bitmap_find_next_zero_area(unsigned long *map, align_mask, 0); } -extern int __bitmap_parse(const char *buf, unsigned int buflen, int is_user, +extern int bitmap_parse(const char *buf, unsigned int buflen, unsigned long *dst, int nbits); extern int bitmap_parse_user(const char __user *ubuf, unsigned int ulen, unsigned long *dst, int nbits); @@ -454,12 +454,6 @@ static inline void bitmap_replace(unsigned long *dst, __bitmap_replace(dst, old, new, mask, nbits); } -static inline int bitmap_parse(const char *buf, unsigned int buflen, - unsigned long *maskp, int nmaskbits) -{ - return __bitmap_parse(buf, buflen, 0, maskp, nmaskbits); -} - static inline void bitmap_next_clear_region(unsigned long *bitmap, unsigned int *rs, unsigned int *re, unsigned int end) -- cgit v1.2.3 From 190535f7cf50f2d6d6e603715201c58cd6ec696b Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Mon, 3 Feb 2020 17:37:41 -0800 Subject: include/linux/cpumask.h: don't calculate length of the input string New design of inner bitmap_parse() allows to avoid calculating the size of a null-terminated string. Link: http://lkml.kernel.org/r/20200102043031.30357-8-yury.norov@gmail.com Signed-off-by: Yury Norov Reviewed-by: Andy Shevchenko Cc: Amritha Nambiar Cc: Arnaldo Carvalho de Melo Cc: Chris Wilson Cc: Kees Cook Cc: Matthew Wilcox Cc: Miklos Szeredi Cc: Rasmus Villemoes Cc: Steffen Klassert Cc: "Tobin C . Harding" Cc: Vineet Gupta Cc: Will Deacon Cc: Willem de Bruijn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cpumask.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index 78a73eba64dd..d5cc88514aee 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -663,9 +663,7 @@ static inline int cpumask_parselist_user(const char __user *buf, int len, */ static inline int cpumask_parse(const char *buf, struct cpumask *dstp) { - unsigned int len = strchrnul(buf, '\n') - buf; - - return bitmap_parse(buf, len, cpumask_bits(dstp), nr_cpumask_bits); + return bitmap_parse(buf, UINT_MAX, cpumask_bits(dstp), nr_cpumask_bits); } /** -- cgit v1.2.3