From a19621ed4e0ac9e1d296d07dc8ff8c27d5c03b43 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 15 May 2024 15:07:06 +0800 Subject: mm: add folio_alloc_mpol() Patch series "mm: convert to folio_alloc_mpol()". This patch (of 4): This adds a new folio_alloc_mpol() like folio_alloc() but allocate folio according to NUMA mempolicy. Link: https://lkml.kernel.org/r/20240515070709.78529-1-wangkefeng.wang@huawei.com Link: https://lkml.kernel.org/r/20240515070709.78529-2-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Cc: Hugh Dickins Signed-off-by: Andrew Morton --- include/linux/gfp.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 7f9691d375f0..f53f76e0b17e 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -303,6 +303,8 @@ struct page *alloc_pages_noprof(gfp_t gfp, unsigned int order); struct page *alloc_pages_mpol_noprof(gfp_t gfp, unsigned int order, struct mempolicy *mpol, pgoff_t ilx, int nid); struct folio *folio_alloc_noprof(gfp_t gfp, unsigned int order); +struct folio *folio_alloc_mpol_noprof(gfp_t gfp, unsigned int order, + struct mempolicy *mpol, pgoff_t ilx, int nid); struct folio *vma_alloc_folio_noprof(gfp_t gfp, int order, struct vm_area_struct *vma, unsigned long addr, bool hugepage); #else @@ -319,6 +321,11 @@ static inline struct folio *folio_alloc_noprof(gfp_t gfp, unsigned int order) { return __folio_alloc_node(gfp, order, numa_node_id()); } +static inline struct folio *folio_alloc_mpol_noprof(gfp_t gfp, unsigned int order, + struct mempolicy *mpol, pgoff_t ilx, int nid) +{ + return folio_alloc_noprof(gfp, order); +} #define vma_alloc_folio_noprof(gfp, order, vma, addr, hugepage) \ folio_alloc_noprof(gfp, order) #endif @@ -326,6 +333,7 @@ static inline struct folio *folio_alloc_noprof(gfp_t gfp, unsigned int order) #define alloc_pages(...) alloc_hooks(alloc_pages_noprof(__VA_ARGS__)) #define alloc_pages_mpol(...) alloc_hooks(alloc_pages_mpol_noprof(__VA_ARGS__)) #define folio_alloc(...) alloc_hooks(folio_alloc_noprof(__VA_ARGS__)) +#define folio_alloc_mpol(...) alloc_hooks(folio_alloc_mpol_noprof(__VA_ARGS__)) #define vma_alloc_folio(...) alloc_hooks(vma_alloc_folio_noprof(__VA_ARGS__)) #define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0) -- cgit v1.2.3 From 7f83bf14603ef41a44dc907594d749a283e22c37 Mon Sep 17 00:00:00 2001 From: Ran Xiaokai Date: Wed, 15 May 2024 10:47:54 +0800 Subject: mm/huge_memory: mark racy access onhuge_anon_orders_always huge_anon_orders_always is accessed lockless, it is better to use the READ_ONCE() wrapper. This is not fixing any visible bug, hopefully this can cease some KCSAN complains in the future. Also do that for huge_anon_orders_madvise. Link: https://lkml.kernel.org/r/20240515104754889HqrahFPePOIE1UlANHVAh@zte.com.cn Signed-off-by: Ran Xiaokai Acked-by: David Hildenbrand Reviewed-by: Lu Zhongjun Reviewed-by: xu xin Cc: Yang Yang Cc: Matthew Wilcox (Oracle) Cc: Yang Shi Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 2aa986a5cd1b..088d66a54643 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -134,8 +134,8 @@ static inline bool hugepage_flags_enabled(void) * So we don't need to look at huge_anon_orders_inherit. */ return hugepage_global_enabled() || - huge_anon_orders_always || - huge_anon_orders_madvise; + READ_ONCE(huge_anon_orders_always) || + READ_ONCE(huge_anon_orders_madvise); } static inline int highest_order(unsigned long orders) -- cgit v1.2.3 From 564a2ee9f9f66ceef135b7208cff705d48ad76c4 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Wed, 22 May 2024 01:58:51 +0800 Subject: mm: remove page_file_offset and folio_file_pos These two helpers were useful for mixed usage of swap cache and page cache, which help retrieve the corresponding file or swap device offset of a page or folio. They were introduced in commit f981c5950fa8 ("mm: methods for teaching filesystems about PG_swapcache pages") and used in commit d56b4ddf7781 ("nfs: teach the NFS client how to treat PG_swapcache pages"), suppose to be used with direct_IO for swap over fs. But after commit e1209d3a7a67 ("mm: introduce ->swap_rw and use it for reads from SWP_FS_OPS swap-space"), swap with direct_IO is no more, and swap cache mapping is never exposed to fs. Now we have dropped all users of page_file_offset and folio_file_pos, so they can be deleted. Link: https://lkml.kernel.org/r/20240521175854.96038-10-ryncsn@gmail.com Signed-off-by: Kairui Song Reviewed-by: "Huang, Ying" Cc: Anna Schumaker Cc: Barry Song Cc: Chao Yu Cc: Chris Li Cc: David Hildenbrand Cc: David Howells Cc: Hugh Dickins Cc: Ilya Dryomov Cc: Jaegeuk Kim Cc: Jeff Layton Cc: Marc Dionne Cc: Matthew Wilcox (Oracle) Cc: Minchan Kim Cc: NeilBrown Cc: Ryan Roberts Cc: Ryusuke Konishi Cc: Trond Myklebust Cc: Xiubo Li Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- include/linux/pagemap.h | 17 ----------------- 1 file changed, 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 59f1df0cde5a..78f2cd8c73ff 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -932,11 +932,6 @@ static inline loff_t page_offset(struct page *page) return ((loff_t)page->index) << PAGE_SHIFT; } -static inline loff_t page_file_offset(struct page *page) -{ - return ((loff_t)page_index(page)) << PAGE_SHIFT; -} - /** * folio_pos - Returns the byte position of this folio in its file. * @folio: The folio. @@ -946,18 +941,6 @@ static inline loff_t folio_pos(struct folio *folio) return page_offset(&folio->page); } -/** - * folio_file_pos - Returns the byte position of this folio in its file. - * @folio: The folio. - * - * This differs from folio_pos() for folios which belong to a swap file. - * NFS is the only filesystem today which needs to use folio_file_pos(). - */ -static inline loff_t folio_file_pos(struct folio *folio) -{ - return page_file_offset(&folio->page); -} - /* * Get the offset in PAGE_SIZE (even for hugetlb folios). */ -- cgit v1.2.3 From 05b0c7edad9b8a5ccf1b46b01e1b96fcd10b50d8 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Wed, 22 May 2024 01:58:52 +0800 Subject: mm: drop page_index and simplify folio_index There are two helpers for retrieving the index within address space for mixed usage of swap cache and page cache: - page_index - folio_index This commit drops page_index, as we have eliminated all users, and converts folio_index's helper __page_file_index to use folio to avoid the page conversion. Link: https://lkml.kernel.org/r/20240521175854.96038-11-ryncsn@gmail.com Signed-off-by: Kairui Song Reviewed-by: "Huang, Ying" Cc: Anna Schumaker Cc: Barry Song Cc: Chao Yu Cc: Chris Li Cc: David Hildenbrand Cc: David Howells Cc: Hugh Dickins Cc: Ilya Dryomov Cc: Jaegeuk Kim Cc: Jeff Layton Cc: Marc Dionne Cc: Matthew Wilcox (Oracle) Cc: Minchan Kim Cc: NeilBrown Cc: Ryan Roberts Cc: Ryusuke Konishi Cc: Trond Myklebust Cc: Xiubo Li Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- include/linux/mm.h | 13 ------------- include/linux/pagemap.h | 8 ++++---- 2 files changed, 4 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index eb7c96d24ac0..99174fac3d47 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2295,19 +2295,6 @@ static inline void *folio_address(const struct folio *folio) return page_address(&folio->page); } -extern pgoff_t __page_file_index(struct page *page); - -/* - * Return the pagecache index of the passed page. Regular pagecache pages - * use ->index whereas swapcache pages use swp_offset(->private) - */ -static inline pgoff_t page_index(struct page *page) -{ - if (unlikely(PageSwapCache(page))) - return __page_file_index(page); - return page->index; -} - /* * Return true only if the page has been allocated with * ALLOC_NO_WATERMARKS and the low watermark was not diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 78f2cd8c73ff..1586b5c21e81 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -792,7 +792,7 @@ static inline struct page *grab_cache_page_nowait(struct address_space *mapping, mapping_gfp_mask(mapping)); } -#define swapcache_index(folio) __page_file_index(&(folio)->page) +extern pgoff_t __folio_swap_cache_index(struct folio *folio); /** * folio_index - File index of a folio. @@ -807,9 +807,9 @@ static inline struct page *grab_cache_page_nowait(struct address_space *mapping, */ static inline pgoff_t folio_index(struct folio *folio) { - if (unlikely(folio_test_swapcache(folio))) - return swapcache_index(folio); - return folio->index; + if (unlikely(folio_test_swapcache(folio))) + return __folio_swap_cache_index(folio); + return folio->index; } /** -- cgit v1.2.3 From 63818aaf0da8c036d600424ac961620dcdc13ce2 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Mon, 20 May 2024 15:44:07 -0700 Subject: mm/hugetlb: remove {Set,Clear}Hpage macros All users have been converted to use the folio version of these macros, we can safely remove the page based interface. Link: https://lkml.kernel.org/r/20240520224407.110062-1-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Reviewed-by: Matthew Wilcox (Oracle) Reviewed-by: David Hildenbrand Reviewed-by: Oscar Salvador Cc: Muchun Song Cc: Peter Xu Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 2b3c3a404769..15a58f69782c 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -625,18 +625,14 @@ static __always_inline \ void folio_set_hugetlb_##flname(struct folio *folio) \ { void *private = &folio->private; \ set_bit(HPG_##flname, private); \ - } \ -static inline void SetHPage##uname(struct page *page) \ - { set_bit(HPG_##flname, &(page->private)); } + } #define CLEARHPAGEFLAG(uname, flname) \ static __always_inline \ void folio_clear_hugetlb_##flname(struct folio *folio) \ { void *private = &folio->private; \ clear_bit(HPG_##flname, private); \ - } \ -static inline void ClearHPage##uname(struct page *page) \ - { clear_bit(HPG_##flname, &(page->private)); } + } #else #define TESTHPAGEFLAG(uname, flname) \ static inline bool \ @@ -648,15 +644,11 @@ static inline int HPage##uname(struct page *page) \ #define SETHPAGEFLAG(uname, flname) \ static inline void \ folio_set_hugetlb_##flname(struct folio *folio) \ - { } \ -static inline void SetHPage##uname(struct page *page) \ { } #define CLEARHPAGEFLAG(uname, flname) \ static inline void \ folio_clear_hugetlb_##flname(struct folio *folio) \ - { } \ -static inline void ClearHPage##uname(struct page *page) \ { } #endif -- cgit v1.2.3 From 6ad28e7e52e2bae76b1d3eb4466999d04d50db92 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Wed, 22 May 2024 14:57:13 +0200 Subject: mm/rmap: sanity check that zeropages are not passed to RMAP Using insert_page() we might have previously ended up passing the zeropage into rmap code. Make sure that won't happen again. Note that we won't check the huge zeropage for now, which might still end up in RMAP code. Link: https://lkml.kernel.org/r/20240522125713.775114-4-david@redhat.com Signed-off-by: David Hildenbrand Cc: Dan Williams Cc: Vincent Donnefort Signed-off-by: Andrew Morton --- include/linux/rmap.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 7229b9baf20d..5cb0d419a1d7 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -200,6 +200,9 @@ static inline void __folio_rmap_sanity_checks(struct folio *folio, /* hugetlb folios are handled separately. */ VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio); + /* When (un)mapping zeropages, we should never touch ref+mapcount. */ + VM_WARN_ON_FOLIO(is_zero_folio(folio), folio); + /* * TODO: we get driver-allocated folios that have nothing to do with * the rmap using vm_insert_page(); therefore, we cannot assume that -- cgit v1.2.3 From 23b1b44e6c61295084284aa7d87db863a7802b92 Mon Sep 17 00:00:00 2001 From: Bang Li Date: Wed, 22 May 2024 14:12:02 +0800 Subject: mm: add update_mmu_tlb_range() Patch series "Add update_mmu_tlb_range() to simplify code", v4. This series of commits mainly adds the update_mmu_tlb_range() to batch update tlb in an address range and implement update_mmu_tlb() using update_mmu_tlb_range(). After commit 19eaf44954df ("mm: thp: support allocation of anonymous multi-size THP"), We may need to batch update tlb of a certain address range by calling update_mmu_tlb() in a loop. Using the update_mmu_tlb_range(), we can simplify the code and possibly reduce the execution of some unnecessary code in some architectures. This patch (of 3): Add update_mmu_tlb_range(), we can batch update tlb of an address range. Link: https://lkml.kernel.org/r/20240522061204.117421-1-libang.li@antgroup.com Link: https://lkml.kernel.org/r/20240522061204.117421-2-libang.li@antgroup.com Signed-off-by: Bang Li Acked-by: David Hildenbrand Cc: Chris Zankel Cc: Huacai Chen Cc: Lance Yang Cc: Max Filippov Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Ryan Roberts Cc: Thomas Bogendoerfer Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 18019f037bae..17d1caee39ab 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -729,6 +729,13 @@ static inline void clear_full_ptes(struct mm_struct *mm, unsigned long addr, * fault. This function updates TLB only, do nothing with cache or others. * It is the difference with function update_mmu_cache. */ +#ifndef update_mmu_tlb_range +static inline void update_mmu_tlb_range(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep, unsigned int nr) +{ +} +#endif + #ifndef __HAVE_ARCH_UPDATE_MMU_TLB static inline void update_mmu_tlb(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) -- cgit v1.2.3 From 8f65aa32239f1c3f11b7a25bd5921223bafc5fed Mon Sep 17 00:00:00 2001 From: Bang Li Date: Wed, 22 May 2024 14:12:03 +0800 Subject: mm: implement update_mmu_tlb() using update_mmu_tlb_range() Let's make update_mmu_tlb() simply a generic wrapper around update_mmu_tlb_range(). Only the latter can now be overridden by the architecture. We can now remove __HAVE_ARCH_UPDATE_MMU_TLB as well. Link: https://lkml.kernel.org/r/20240522061204.117421-3-libang.li@antgroup.com Signed-off-by: Bang Li Acked-by: David Hildenbrand Cc: Chris Zankel Cc: Huacai Chen Cc: Lance Yang Cc: Max Filippov Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Ryan Roberts Cc: Thomas Bogendoerfer Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 17d1caee39ab..117b807e3f89 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -736,13 +736,11 @@ static inline void update_mmu_tlb_range(struct vm_area_struct *vma, } #endif -#ifndef __HAVE_ARCH_UPDATE_MMU_TLB static inline void update_mmu_tlb(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) { + update_mmu_tlb_range(vma, address, ptep, 1); } -#define __HAVE_ARCH_UPDATE_MMU_TLB -#endif /* * Some architectures may be able to avoid expensive synchronization -- cgit v1.2.3 From b8b9488d50b7150bd4830dfff487e8d4ef6589ba Mon Sep 17 00:00:00 2001 From: Jane Chu Date: Fri, 24 May 2024 15:53:04 -0600 Subject: mm/memory-failure: improve memory failure action_result messages Added two explicit MF_MSG messages describing failure in get_hwpoison_page. Attemped to document the definition of various action names, and made a few adjustment to the action_result() calls. Link: https://lkml.kernel.org/r/20240524215306.2705454-4-jane.chu@oracle.com Signed-off-by: Jane Chu Reviewed-by: Oscar Salvador Acked-by: Miaohe Lin Cc: Naoya Horiguchi Cc: Oscar Salvador Signed-off-by: Andrew Morton --- include/linux/mm.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 99174fac3d47..f4bf94ca99e3 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -4096,6 +4096,7 @@ enum mf_action_page_type { MF_MSG_DIFFERENT_COMPOUND, MF_MSG_HUGE, MF_MSG_FREE_HUGE, + MF_MSG_GET_HWPOISON, MF_MSG_UNMAP_FAILED, MF_MSG_DIRTY_SWAPCACHE, MF_MSG_CLEAN_SWAPCACHE, @@ -4109,6 +4110,7 @@ enum mf_action_page_type { MF_MSG_BUDDY, MF_MSG_DAX, MF_MSG_UNSPLIT_THP, + MF_MSG_ALREADY_POISONED, MF_MSG_UNKNOWN, }; -- cgit v1.2.3 From 188f87f2648b13f5de17d5e068f18d317e0c1f98 Mon Sep 17 00:00:00 2001 From: Eric Chanudet Date: Wed, 22 May 2024 16:38:01 -0400 Subject: mm/mm_init: use node's number of cpus in deferred_page_init_max_threads x86_64 is already using the node's cpu as maximum threads. Make that the default for all archs setting DEFERRED_STRUCT_PAGE_INIT. This returns to the behavior prior making the function arch-specific with commit ecd096506922 ("mm: make deferred init's max threads arch-specific"). Setting DEFERRED_STRUCT_PAGE_INIT and testing on a few arm64 platforms shows faster deferred_init_memmap completions: | | x13s | SA8775p-ride | Ampere R137-P31 | Ampere HR330 | | | Metal, 32GB | VM, 36GB | VM, 58GB | Metal, 128GB | | | 8cpus | 8cpus | 8cpus | 32cpus | |---------|-------------|--------------|-----------------|--------------| | threads | ms (%) | ms (%) | ms (%) | ms (%) | |---------|-------------|--------------|-----------------|--------------| | 1 | 108 (0%) | 72 (0%) | 224 (0%) | 324 (0%) | | cpus | 24 (-77%) | 36 (-50%) | 40 (-82%) | 56 (-82%) | Michael Ellerman reported: : On a machine here (1TB, 40 cores, 4KB pages) the existing code gives: : : [ 0.500124] node 2 deferred pages initialised in 210ms : [ 0.515790] node 3 deferred pages initialised in 230ms : [ 0.516061] node 0 deferred pages initialised in 230ms : [ 0.516522] node 7 deferred pages initialised in 230ms : [ 0.516672] node 4 deferred pages initialised in 230ms : [ 0.516798] node 6 deferred pages initialised in 230ms : [ 0.517051] node 5 deferred pages initialised in 230ms : [ 0.523887] node 1 deferred pages initialised in 240ms : : vs with the patch: : : [ 0.379613] node 0 deferred pages initialised in 90ms : [ 0.380388] node 1 deferred pages initialised in 90ms : [ 0.380540] node 4 deferred pages initialised in 100ms : [ 0.390239] node 6 deferred pages initialised in 100ms : [ 0.390249] node 2 deferred pages initialised in 100ms : [ 0.390786] node 3 deferred pages initialised in 110ms : [ 0.396721] node 5 deferred pages initialised in 110ms : [ 0.397095] node 7 deferred pages initialised in 110ms : : Which is a nice speedup. [echanude@redhat.com: v3] Link: https://lkml.kernel.org/r/20240528185455.643227-4-echanude@redhat.com Link: https://lkml.kernel.org/r/20240522203758.626932-4-echanude@redhat.com Signed-off-by: Eric Chanudet Tested-by: Michael Ellerman (powerpc) Reviewed-by: Baoquan He Acked-by: Alexander Gordeev Acked-by: Mike Rapoport (IBM) Cc: Andy Lutomirski Cc: Borislav Petkov (AMD) Cc: Dave Hansen Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Nicholas Piggin Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Andrew Morton --- include/linux/memblock.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memblock.h b/include/linux/memblock.h index e2082240586d..40c62aca36ec 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -335,8 +335,6 @@ void __next_mem_pfn_range_in_zone(u64 *idx, struct zone *zone, for (; i != U64_MAX; \ __next_mem_pfn_range_in_zone(&i, zone, p_start, p_end)) -int __init deferred_page_init_max_threads(const struct cpumask *node_cpumask); - #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ /** -- cgit v1.2.3 From ffc3c8a631eeadd0de63605cecfb6ba544245352 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Fri, 24 May 2024 09:49:50 +0800 Subject: mm: memcontrol: remove page_memcg() The page_memcg() only called by mod_memcg_page_state(), so squash it to cleanup page_memcg(). Link: https://lkml.kernel.org/r/20240524014950.187805-1-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Acked-by: Michal Hocko Cc: Johannes Weiner Cc: Matthew Wilcox Cc: Muchun Song Cc: Roman Gushchin Cc: Shakeel Butt Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 030d34e9d117..3d1599146afe 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -443,11 +443,6 @@ static inline struct mem_cgroup *folio_memcg(struct folio *folio) return __folio_memcg(folio); } -static inline struct mem_cgroup *page_memcg(struct page *page) -{ - return folio_memcg(page_folio(page)); -} - /** * folio_memcg_rcu - Locklessly get the memory cgroup associated with a folio. * @folio: Pointer to the folio. @@ -1014,7 +1009,7 @@ static inline void mod_memcg_page_state(struct page *page, return; rcu_read_lock(); - memcg = page_memcg(page); + memcg = folio_memcg(page_folio(page)); if (memcg) mod_memcg_state(memcg, idx, val); rcu_read_unlock(); @@ -1133,11 +1128,6 @@ static inline struct mem_cgroup *folio_memcg(struct folio *folio) return NULL; } -static inline struct mem_cgroup *page_memcg(struct page *page) -{ - return NULL; -} - static inline struct mem_cgroup *folio_memcg_rcu(struct folio *folio) { WARN_ON_ONCE(!rcu_read_lock_held()); @@ -1636,7 +1626,7 @@ static inline void unlock_page_lruvec_irqrestore(struct lruvec *lruvec, spin_unlock_irqrestore(&lruvec->lru_lock, flags); } -/* Test requires a stable page->memcg binding, see page_memcg() */ +/* Test requires a stable folio->memcg binding, see folio_memcg() */ static inline bool folio_matches_lruvec(struct folio *folio, struct lruvec *lruvec) { -- cgit v1.2.3 From 06668257a355a05483c188e35a18c80471d06987 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 24 May 2024 19:18:10 +0100 Subject: mm: remove page_mapping() All callers are now converted, delete this compatibility wrapper. Also fix up some comments which referred to page_mapping. Link: https://lkml.kernel.org/r/20240423225552.4113447-7-willy@infradead.org Link: https://lkml.kernel.org/r/20240524181813.698813-1-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: David Hildenbrand Cc: Eric Biggers Cc: Sidhartha Kumar Signed-off-by: Andrew Morton --- include/linux/buffer_head.h | 2 +- include/linux/page-flags.h | 23 ++++++++++++----------- include/linux/pagemap.h | 1 - 3 files changed, 13 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index e022e40b099e..14acf1bbe0ce 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -53,7 +53,7 @@ typedef void (bh_end_io_t)(struct buffer_head *bh, int uptodate); * filesystem and block layers. Nowadays the basic I/O unit * is the bio, and buffer_heads are used for extracting block * mappings (via a get_block_t call), for tracking state within - * a page (via a page_mapping) and for wrapping bio submission + * a folio (via a folio_mapping) and for wrapping bio submission * for backward compatibility reasons (e.g. submit_bh). */ struct buffer_head { diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index b9e914e1face..813eea2efe26 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -655,27 +655,28 @@ PAGEFLAG_FALSE(VmemmapSelfHosted, vmemmap_self_hosted) #endif /* - * On an anonymous page mapped into a user virtual memory area, - * page->mapping points to its anon_vma, not to a struct address_space; + * On an anonymous folio mapped into a user virtual memory area, + * folio->mapping points to its anon_vma, not to a struct address_space; * with the PAGE_MAPPING_ANON bit set to distinguish it. See rmap.h. * * On an anonymous page in a VM_MERGEABLE area, if CONFIG_KSM is enabled, * the PAGE_MAPPING_MOVABLE bit may be set along with the PAGE_MAPPING_ANON - * bit; and then page->mapping points, not to an anon_vma, but to a private + * bit; and then folio->mapping points, not to an anon_vma, but to a private * structure which KSM associates with that merged page. See ksm.h. * * PAGE_MAPPING_KSM without PAGE_MAPPING_ANON is used for non-lru movable - * page and then page->mapping points to a struct movable_operations. + * page and then folio->mapping points to a struct movable_operations. * - * Please note that, confusingly, "page_mapping" refers to the inode - * address_space which maps the page from disk; whereas "page_mapped" - * refers to user virtual address space into which the page is mapped. + * Please note that, confusingly, "folio_mapping" refers to the inode + * address_space which maps the folio from disk; whereas "folio_mapped" + * refers to user virtual address space into which the folio is mapped. * * For slab pages, since slab reuses the bits in struct page to store its - * internal states, the page->mapping does not exist as such, nor do these - * flags below. So in order to avoid testing non-existent bits, please - * make sure that PageSlab(page) actually evaluates to false before calling - * the following functions (e.g., PageAnon). See mm/slab.h. + * internal states, the folio->mapping does not exist as such, nor do + * these flags below. So in order to avoid testing non-existent bits, + * please make sure that folio_test_slab(folio) actually evaluates to + * false before calling the following functions (e.g., folio_test_anon). + * See mm/slab.h. */ #define PAGE_MAPPING_ANON 0x1 #define PAGE_MAPPING_MOVABLE 0x2 diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 1586b5c21e81..e37e16ebff7a 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -426,7 +426,6 @@ static inline void filemap_nr_thps_dec(struct address_space *mapping) #endif } -struct address_space *page_mapping(struct page *); struct address_space *folio_mapping(struct folio *); struct address_space *swapcache_mapping(struct folio *); -- cgit v1.2.3 From 1df07243483c1a32c8f2f93b06dc52a583a4dd1c Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Fri, 24 May 2024 13:36:18 +0800 Subject: rmap: remove DEFINE_PAGE_VMA_WALK() This are no users since commit 40d707f33db5 ("mm/ksm: use folio in write_protect_page"), so remove DEFINE_PAGE_VMA_WALK(). Link: https://lkml.kernel.org/r/20240524053618.208895-1-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Reviewed-by: David Hildenbrand Cc: Alex Shi Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- include/linux/rmap.h | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 5cb0d419a1d7..bb53e5920b88 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -684,16 +684,6 @@ struct page_vma_mapped_walk { unsigned int flags; }; -#define DEFINE_PAGE_VMA_WALK(name, _page, _vma, _address, _flags) \ - struct page_vma_mapped_walk name = { \ - .pfn = page_to_pfn(_page), \ - .nr_pages = compound_nr(_page), \ - .pgoff = page_to_pgoff(_page), \ - .vma = _vma, \ - .address = _address, \ - .flags = _flags, \ - } - #define DEFINE_FOLIO_VMA_WALK(name, _folio, _vma, _address, _flags) \ struct page_vma_mapped_walk name = { \ .pfn = folio_pfn(_folio), \ -- cgit v1.2.3 From 940d6683c79950b21b3762124eabfa9b2f6fee96 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Fri, 24 May 2024 13:28:42 +0800 Subject: mm: migrate: remove migrate_folio_extra() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit migrate_folio_extra() is only called in migrate.c now, convert it a static function and take a new src_private argument which could be shared by migrate_folio() and filemap_migrate_folio() to simplify code a bit. Link: https://lkml.kernel.org/r/20240524052843.182275-5-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Reviewed-by: Jane Chu Cc: Alistair Popple Cc: Benjamin LaHaise Cc: David Hildenbrand Cc: Hugh Dickins Cc: Jérôme Glisse Cc: Jiaqi Yan Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Cc: Muchun Song Cc: Naoya Horiguchi Cc: Tony Luck Cc: Vishal Moola (Oracle) Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/migrate.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 2ce13e8a309b..517f70b70620 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -63,8 +63,6 @@ extern const char *migrate_reason_names[MR_TYPES]; #ifdef CONFIG_MIGRATION void putback_movable_pages(struct list_head *l); -int migrate_folio_extra(struct address_space *mapping, struct folio *dst, - struct folio *src, enum migrate_mode mode, int extra_count); int migrate_folio(struct address_space *mapping, struct folio *dst, struct folio *src, enum migrate_mode mode); int migrate_pages(struct list_head *l, new_folio_t new, free_folio_t free, -- cgit v1.2.3 From 906632843d00a4a42072b19c423b30a9e7adef3c Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Fri, 24 May 2024 13:28:43 +0800 Subject: mm: remove MIGRATE_SYNC_NO_COPY mode MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 2916ecc0f9d4 ("mm/migrate: new migrate mode MIGRATE_SYNC_NO_COPY") introduce a new MIGRATE_SYNC_NO_COPY mode to allow to offload the copy to a device DMA engine, which is only used __migrate_device_pages() to decide whether or not copy the old page, and the MIGRATE_SYNC_NO_COPY mode only set in hmm, as the MIGRATE_SYNC_NO_COPY set is removed by previous cleanup, it seems that we could remove the unnecessary MIGRATE_SYNC_NO_COPY. Link: https://lkml.kernel.org/r/20240524052843.182275-6-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Reviewed-by: Jane Chu Cc: Alistair Popple Cc: Benjamin LaHaise Cc: David Hildenbrand Cc: Hugh Dickins Cc: Jérôme Glisse Cc: Jiaqi Yan Cc: Matthew Wilcox (Oracle) Cc: Miaohe Lin Cc: Muchun Song Cc: Naoya Horiguchi Cc: Tony Luck Cc: Vishal Moola (Oracle) Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/migrate_mode.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/migrate_mode.h b/include/linux/migrate_mode.h index f37cc03f9369..9fb482bb7323 100644 --- a/include/linux/migrate_mode.h +++ b/include/linux/migrate_mode.h @@ -7,16 +7,11 @@ * on most operations but not ->writepage as the potential stall time * is too significant * MIGRATE_SYNC will block when migrating pages - * MIGRATE_SYNC_NO_COPY will block when migrating pages but will not copy pages - * with the CPU. Instead, page copy happens outside the migratepage() - * callback and is likely using a DMA engine. See migrate_vma() and HMM - * (mm/hmm.c) for users of this mode. */ enum migrate_mode { MIGRATE_ASYNC, MIGRATE_SYNC_LIGHT, MIGRATE_SYNC, - MIGRATE_SYNC_NO_COPY, }; enum migrate_reason { -- cgit v1.2.3 From ebfba0045176cb013f49cb3e5bd9f0b16eba203c Mon Sep 17 00:00:00 2001 From: Chuanhua Han Date: Wed, 29 May 2024 20:28:19 +1200 Subject: mm: swap: introduce swap_free_nr() for batched swap_free() Patch series "large folios swap-in: handle refault cases first", v5. This patchset is extracted from the large folio swapin series[1], primarily addressing the handling of scenarios involving large folios in the swap cache. Currently, it is particularly focused on addressing the refaulting of mTHP, which is still undergoing reclamation. This approach aims to streamline code review and expedite the integration of this segment into the MM tree. It relies on Ryan's swap-out series[2], leveraging the helper function swap_pte_batch() introduced by that series. Presently, do_swap_page only encounters a large folio in the swap cache before the large folio is released by vmscan. However, the code should remain equally useful once we support large folio swap-in via swapin_readahead(). This approach can effectively reduce page faults and eliminate most redundant checks and early exits for MTE restoration in recent MTE patchset[3]. The large folio swap-in for SWP_SYNCHRONOUS_IO and swapin_readahead() will be split into separate patch sets and sent at a later time. [1] https://lore.kernel.org/linux-mm/20240304081348.197341-1-21cnbao@gmail.com/ [2] https://lore.kernel.org/linux-mm/20240408183946.2991168-1-ryan.roberts@arm.com/ [3] https://lore.kernel.org/linux-mm/20240322114136.61386-1-21cnbao@gmail.com/ This patch (of 6): While swapping in a large folio, we need to free swaps related to the whole folio. To avoid frequently acquiring and releasing swap locks, it is better to introduce an API for batched free. Furthermore, this new function, swap_free_nr(), is designed to efficiently handle various scenarios for releasing a specified number, nr, of swap entries. Link: https://lkml.kernel.org/r/20240529082824.150954-1-21cnbao@gmail.com Link: https://lkml.kernel.org/r/20240529082824.150954-2-21cnbao@gmail.com Signed-off-by: Chuanhua Han Co-developed-by: Barry Song Signed-off-by: Barry Song Reviewed-by: Ryan Roberts Acked-by: Chris Li Reviewed-by: "Huang, Ying" Cc: Baolin Wang Cc: David Hildenbrand Cc: Gao Xiang Cc: Hugh Dickins Cc: Johannes Weiner Cc: Kairui Song Cc: Matthew Wilcox (Oracle) Cc: Suren Baghdasaryan Cc: Yosry Ahmed Cc: Yu Zhao Cc: Zi Yan Cc: Andreas Larsson Cc: Christoph Hellwig Cc: "David S. Miller" Cc: Khalid Aziz Cc: Len Brown Cc: Pavel Machek Cc: "Rafael J. Wysocki" Signed-off-by: Andrew Morton --- include/linux/swap.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index bd450023b9a4..0f41fe49c9dc 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -478,6 +478,7 @@ extern void swap_shmem_alloc(swp_entry_t); extern int swap_duplicate(swp_entry_t); extern int swapcache_prepare(swp_entry_t); extern void swap_free(swp_entry_t); +extern void swap_free_nr(swp_entry_t entry, int nr_pages); extern void swapcache_free_entries(swp_entry_t *entries, int n); extern void free_swap_and_cache_nr(swp_entry_t entry, int nr); int swap_type_of(dev_t device, sector_t offset); @@ -559,6 +560,10 @@ static inline void swap_free(swp_entry_t swp) { } +static inline void swap_free_nr(swp_entry_t entry, int nr_pages) +{ +} + static inline void put_swap_folio(struct folio *folio, swp_entry_t swp) { } -- cgit v1.2.3 From 54f7a49c20ebb5189980c53e6e66709d22bee572 Mon Sep 17 00:00:00 2001 From: Barry Song Date: Wed, 29 May 2024 20:28:20 +1200 Subject: mm: remove the implementation of swap_free() and always use swap_free_nr() To streamline maintenance efforts, we propose removing the implementation of swap_free(). Instead, we can simply invoke swap_free_nr() with nr set to 1. swap_free_nr() is designed with a bitmap consisting of only one long, resulting in overhead that can be ignored for cases where nr equals 1. A prime candidate for leveraging swap_free_nr() lies within kernel/power/swap.c. Implementing this change facilitates the adoption of batch processing for hibernation. Link: https://lkml.kernel.org/r/20240529082824.150954-3-21cnbao@gmail.com Signed-off-by: Barry Song Suggested-by: "Huang, Ying" Reviewed-by: "Huang, Ying" Acked-by: Chris Li Reviewed-by: Ryan Roberts Cc: "Rafael J. Wysocki" Cc: Pavel Machek Cc: Len Brown Cc: Hugh Dickins Cc: Christoph Hellwig Cc: Andreas Larsson Cc: Baolin Wang Cc: Chuanhua Han Cc: David Hildenbrand Cc: "David S. Miller" Cc: Gao Xiang Cc: Johannes Weiner Cc: Kairui Song Cc: Khalid Aziz Cc: Matthew Wilcox (Oracle) Cc: Suren Baghdasaryan Cc: Yosry Ahmed Cc: Yu Zhao Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/swap.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index 0f41fe49c9dc..d33ce740b695 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -477,7 +477,6 @@ extern int add_swap_count_continuation(swp_entry_t, gfp_t); extern void swap_shmem_alloc(swp_entry_t); extern int swap_duplicate(swp_entry_t); extern int swapcache_prepare(swp_entry_t); -extern void swap_free(swp_entry_t); extern void swap_free_nr(swp_entry_t entry, int nr_pages); extern void swapcache_free_entries(swp_entry_t *entries, int n); extern void free_swap_and_cache_nr(swp_entry_t entry, int nr); @@ -556,10 +555,6 @@ static inline int swapcache_prepare(swp_entry_t swp) return 0; } -static inline void swap_free(swp_entry_t swp) -{ -} - static inline void swap_free_nr(swp_entry_t entry, int nr_pages) { } @@ -608,6 +603,11 @@ static inline void free_swap_and_cache(swp_entry_t entry) free_swap_and_cache_nr(entry, 1); } +static inline void swap_free(swp_entry_t entry) +{ + swap_free_nr(entry, 1); +} + #ifdef CONFIG_MEMCG static inline int mem_cgroup_swappiness(struct mem_cgroup *memcg) { -- cgit v1.2.3 From 29f252cdc293f4a50b5d3dcbed53701d8444614d Mon Sep 17 00:00:00 2001 From: Barry Song Date: Wed, 29 May 2024 20:28:22 +1200 Subject: mm: introduce arch_do_swap_page_nr() which allows restore metadata for nr pages Should do_swap_page() have the capability to directly map a large folio, metadata restoration becomes necessary for a specified number of pages denoted as nr. It's important to highlight that metadata restoration is solely required by the SPARC platform, which, however, does not enable THP_SWAP. Consequently, in the present kernel configuration, there exists no practical scenario where users necessitate the restoration of nr metadata. Platforms implementing THP_SWAP might invoke this function with nr values exceeding 1, subsequent to do_swap_page() successfully mapping an entire large folio. Nonetheless, their arch_do_swap_page_nr() functions remain empty. Link: https://lkml.kernel.org/r/20240529082824.150954-5-21cnbao@gmail.com Signed-off-by: Barry Song Reviewed-by: Ryan Roberts Reviewed-by: Khalid Aziz Cc: "David S. Miller" Cc: Andreas Larsson Cc: Baolin Wang Cc: Chris Li Cc: Christoph Hellwig Cc: Chuanhua Han Cc: David Hildenbrand Cc: Gao Xiang Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Johannes Weiner Cc: Kairui Song Cc: Len Brown Cc: Matthew Wilcox (Oracle) Cc: Pavel Machek Cc: "Rafael J. Wysocki" Cc: Suren Baghdasaryan Cc: Yosry Ahmed Cc: Yu Zhao Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/pgtable.h | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 117b807e3f89..2f32eaccf0b9 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -1089,6 +1089,15 @@ static inline int pgd_same(pgd_t pgd_a, pgd_t pgd_b) }) #ifndef __HAVE_ARCH_DO_SWAP_PAGE +static inline void arch_do_swap_page_nr(struct mm_struct *mm, + struct vm_area_struct *vma, + unsigned long addr, + pte_t pte, pte_t oldpte, + int nr) +{ + +} +#else /* * Some architectures support metadata associated with a page. When a * page is being swapped out, this metadata must be saved so it can be @@ -1097,12 +1106,17 @@ static inline int pgd_same(pgd_t pgd_a, pgd_t pgd_b) * page as metadata for the page. arch_do_swap_page() can restore this * metadata when a page is swapped back in. */ -static inline void arch_do_swap_page(struct mm_struct *mm, - struct vm_area_struct *vma, - unsigned long addr, - pte_t pte, pte_t oldpte) -{ - +static inline void arch_do_swap_page_nr(struct mm_struct *mm, + struct vm_area_struct *vma, + unsigned long addr, + pte_t pte, pte_t oldpte, + int nr) +{ + for (int i = 0; i < nr; i++) { + arch_do_swap_page(vma->vm_mm, vma, addr + i * PAGE_SIZE, + pte_advance_pfn(pte, i), + pte_advance_pfn(oldpte, i)); + } } #endif -- cgit v1.2.3 From 16540dae959d862909716d5c854e9b3aee285609 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Thu, 30 May 2024 10:14:27 -0700 Subject: mm/hugetlb: mm/memory_hotplug: use a folio in scan_movable_pages() By using a folio in scan_movable_pages() we convert the last user of the page-based hugetlb information macro functions to the folio version. After this conversion, we can safely remove the page-based definitions from include/linux/hugetlb.h. Link: https://lkml.kernel.org/r/20240530171427.242018-1-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Acked-by: David Hildenbrand Cc: Matthew Wilcox (Oracle) Cc: Muchun Song Cc: Oscar Salvador Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 15a58f69782c..279aca379b95 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -616,9 +616,7 @@ static __always_inline \ bool folio_test_hugetlb_##flname(struct folio *folio) \ { void *private = &folio->private; \ return test_bit(HPG_##flname, private); \ - } \ -static inline int HPage##uname(struct page *page) \ - { return test_bit(HPG_##flname, &(page->private)); } + } #define SETHPAGEFLAG(uname, flname) \ static __always_inline \ @@ -637,8 +635,6 @@ void folio_clear_hugetlb_##flname(struct folio *folio) \ #define TESTHPAGEFLAG(uname, flname) \ static inline bool \ folio_test_hugetlb_##flname(struct folio *folio) \ - { return 0; } \ -static inline int HPage##uname(struct page *page) \ { return 0; } #define SETHPAGEFLAG(uname, flname) \ -- cgit v1.2.3 From fefc6e6631ff43427e81f08c8e49f7787ff0213a Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Tue, 28 May 2024 09:40:50 -0700 Subject: memcg: rearrange fields of mem_cgroup_per_node Kernel test robot reported [1] performance regression for will-it-scale test suite's page_fault2 test case for the commit 70a64b7919cb ("memcg: dynamically allocate lruvec_stats"). After inspection it seems like the commit has unintentionally introduced false cache sharing. After the commit the fields of mem_cgroup_per_node which get read on the performance critical path share the cacheline with the fields which get updated often on LRU page allocations or deallocations. This has caused contention on that cacheline and the workloads which manipulates a lot of LRU pages are regressed as reported by the test report. The solution is to rearrange the fields of mem_cgroup_per_node such that the false sharing is eliminated. Let's move all the read only pointers at the start of the struct, followed by memcg-v1 only fields and at the end fields which get updated often. Experiment setup: Ran fallocate1, fallocate2, page_fault1, page_fault2 and page_fault3 from the will-it-scale test suite inside a three level memcg with /tmp mounted as tmpfs on two different machines, one a single numa node and the other one, two node machine. $ ./[testcase]_processes -t $NR_CPUS -s 50 Results for single node, 52 CPU machine: Testcase base with-patch fallocate1 1031081 1431291 (38.80 %) fallocate2 1029993 1421421 (38.00 %) page_fault1 2269440 3405788 (50.07 %) page_fault2 2375799 3572868 (50.30 %) page_fault3 28641143 28673950 ( 0.11 %) Results for dual node, 80 CPU machine: Testcase base with-patch fallocate1 2976288 3641185 (22.33 %) fallocate2 2979366 3638181 (22.11 %) page_fault1 6221790 7748245 (24.53 %) page_fault2 6482854 7847698 (21.05 %) page_fault3 28804324 28991870 ( 0.65 %) Link: https://lkml.kernel.org/r/20240528164050.2625718-1-shakeel.butt@linux.dev Fixes: 70a64b7919cb ("memcg: dynamically allocate lruvec_stats") Signed-off-by: Shakeel Butt Reported-by: kernel test robot Reviewed-by: Yosry Ahmed Reviewed-by: Roman Gushchin Cc: Feng Tang Cc: "Huang, Ying" Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Cc: Yin Fengwei Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 3d1599146afe..7403dd5926eb 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -96,23 +96,29 @@ struct mem_cgroup_reclaim_iter { * per-node information in memory controller. */ struct mem_cgroup_per_node { - struct lruvec lruvec; + /* Keep the read-only fields at the start */ + struct mem_cgroup *memcg; /* Back pointer, we cannot */ + /* use container_of */ struct lruvec_stats_percpu __percpu *lruvec_stats_percpu; struct lruvec_stats *lruvec_stats; - - unsigned long lru_zone_size[MAX_NR_ZONES][NR_LRU_LISTS]; - - struct mem_cgroup_reclaim_iter iter; - struct shrinker_info __rcu *shrinker_info; + /* + * Memcg-v1 only stuff in middle as buffer between read mostly fields + * and update often fields to avoid false sharing. Once v1 stuff is + * moved in a separate struct, an explicit padding is needed. + */ + struct rb_node tree_node; /* RB tree node */ unsigned long usage_in_excess;/* Set to the value by which */ /* the soft limit is exceeded*/ bool on_tree; - struct mem_cgroup *memcg; /* Back pointer, we cannot */ - /* use container_of */ + + /* Fields which get updated often at the end. */ + struct lruvec lruvec; + unsigned long lru_zone_size[MAX_NR_ZONES][NR_LRU_LISTS]; + struct mem_cgroup_reclaim_iter iter; }; struct mem_cgroup_threshold { -- cgit v1.2.3 From 21664442be1bf79094dd9d21b8833acbfbd80ea7 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Tue, 28 May 2024 16:43:13 +0200 Subject: percpu: add __this_cpu_try_cmpxchg() Add __this_cpu_try_cmpxchg() version of the percpu op. Link: https://lkml.kernel.org/r/20240528144345.5980-1-ubizjak@gmail.com Signed-off-by: Uros Bizjak Reviewed-by: Uladzislau Rezki (Sony) Acked-by: Dennis Zhou Cc: Christoph Hellwig Cc: Lorenzo Stoakes Cc: Tejun Heo Cc: Christoph Lameter Signed-off-by: Andrew Morton --- include/linux/percpu-defs.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/percpu-defs.h b/include/linux/percpu-defs.h index ec3573119923..8efce7414fad 100644 --- a/include/linux/percpu-defs.h +++ b/include/linux/percpu-defs.h @@ -475,6 +475,12 @@ do { \ raw_cpu_cmpxchg(pcp, oval, nval); \ }) +#define __this_cpu_try_cmpxchg(pcp, ovalp, nval) \ +({ \ + __this_cpu_preempt_check("try_cmpxchg"); \ + raw_cpu_try_cmpxchg(pcp, ovalp, nval); \ +}) + #define __this_cpu_sub(pcp, val) __this_cpu_add(pcp, -(typeof(pcp))(val)) #define __this_cpu_inc(pcp) __this_cpu_add(pcp, 1) #define __this_cpu_dec(pcp) __this_cpu_sub(pcp, 1) -- cgit v1.2.3 From 4b98995530b77a97912230d8e1564ba7738db19c Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Tue, 11 Jun 2024 18:11:07 +0800 Subject: mm: shmem: add multi-size THP sysfs interface for anonymous shmem To support the use of mTHP with anonymous shmem, add a new sysfs interface 'shmem_enabled' in the '/sys/kernel/mm/transparent_hugepage/hugepages-kB/' directory for each mTHP to control whether shmem is enabled for that mTHP, with a value similar to the top level 'shmem_enabled', which can be set to: "always", "inherit (to inherit the top level setting)", "within_size", "advise", "never". An 'inherit' option is added to ensure compatibility with these global settings, and the options 'force' and 'deny' are dropped, which are rather testing artifacts from the old ages. By default, PMD-sized hugepages have enabled="inherit" and all other hugepage sizes have enabled="never" for '/sys/kernel/mm/transparent_hugepage/hugepages-xxkB/shmem_enabled'. In addition, if top level value is 'force', then only PMD-sized hugepages have enabled="inherit", otherwise configuration will be failed and vice versa. That means now we will avoid using non-PMD sized THP to override the global huge allocation. [baolin.wang@linux.alibaba.com: fix transhuge.rst indentation] Link: https://lkml.kernel.org/r/b189d815-998b-4dfd-ba89-218ff51313f8@linux.alibaba.com [akpm@linux-foundation.org: reflow transhuge.rst addition to 80 cols] [baolin.wang@linux.alibaba.com: move huge_shmem_orders_lock under CONFIG_SYSFS] Link: https://lkml.kernel.org/r/eb34da66-7f12-44f3-a39e-2bcc90c33354@linux.alibaba.com [akpm@linux-foundation.org: huge_memory.c needs mm_types.h] Link: https://lkml.kernel.org/r/ffddfa8b3cb4266ff963099ab78cfd7184c57ac7.1718090413.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Cc: Barry Song Cc: Daniel Gomez Cc: David Hildenbrand Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Kefeng Wang Cc: Lance Yang Cc: Pankaj Raghav Cc: Ryan Roberts Cc: Yang Shi Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 088d66a54643..6a1edd752968 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -6,6 +6,7 @@ #include #include /* only for vma_is_dax() */ +#include vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf); int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, @@ -63,6 +64,7 @@ ssize_t single_hugepage_flag_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf, enum transparent_hugepage_flag flag); extern struct kobj_attribute shmem_enabled_attr; +extern struct kobj_attribute thpsize_shmem_enabled_attr; /* * Mask of all large folio orders supported for anonymous THP; all orders up to @@ -265,6 +267,14 @@ unsigned long thp_vma_allowable_orders(struct vm_area_struct *vma, return __thp_vma_allowable_orders(vma, vm_flags, tva_flags, orders); } +struct thpsize { + struct kobject kobj; + struct list_head node; + int order; +}; + +#define to_thpsize(kobj) container_of(kobj, struct thpsize, kobj) + enum mthp_stat_item { MTHP_STAT_ANON_FAULT_ALLOC, MTHP_STAT_ANON_FAULT_FALLBACK, -- cgit v1.2.3 From e7a2ab7b3bb5d87f99f2ea3d4481d52fc5ceb52d Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Tue, 11 Jun 2024 18:11:08 +0800 Subject: mm: shmem: add mTHP support for anonymous shmem Commit 19eaf44954df adds multi-size THP (mTHP) for anonymous pages, that can allow THP to be configured through the sysfs interface located at '/sys/kernel/mm/transparent_hugepage/hugepage-XXkb/enabled'. However, the anonymous shmem will ignore the anonymous mTHP rule configured through the sysfs interface, and can only use the PMD-mapped THP, that is not reasonable. Users expect to apply the mTHP rule for all anonymous pages, including the anonymous shmem, in order to enjoy the benefits of mTHP. For example, lower latency than PMD-mapped THP, smaller memory bloat than PMD-mapped THP, contiguous PTEs on ARM architecture to reduce TLB miss etc. In addition, the mTHP interfaces can be extended to support all shmem/tmpfs scenarios in the future, especially for the shmem mmap() case. The primary strategy is similar to supporting anonymous mTHP. Introduce a new interface '/mm/transparent_hugepage/hugepage-XXkb/shmem_enabled', which can have almost the same values as the top-level '/sys/kernel/mm/transparent_hugepage/shmem_enabled', with adding a new additional "inherit" option and dropping the testing options 'force' and 'deny'. By default all sizes will be set to "never" except PMD size, which is set to "inherit". This ensures backward compatibility with the anonymous shmem enabled of the top level, meanwhile also allows independent control of anonymous shmem enabled for each mTHP. Link: https://lkml.kernel.org/r/65796c1e72e51e15f3410195b5c2d5b6c160d411.1718090413.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Cc: Barry Song Cc: Daniel Gomez Cc: David Hildenbrand Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Kefeng Wang Cc: Lance Yang Cc: Pankaj Raghav Cc: Ryan Roberts Cc: Yang Shi Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 6a1edd752968..53b7a137f460 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -560,6 +560,16 @@ static inline bool thp_migration_supported(void) { return false; } + +static inline int highest_order(unsigned long orders) +{ + return 0; +} + +static inline int next_order(unsigned long *orders, int prev) +{ + return 0; +} #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ static inline int split_folio_to_list_to_order(struct folio *folio, -- cgit v1.2.3 From 66f44583f9b617d74ffa2487e75a9c3adf344ddb Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Tue, 11 Jun 2024 18:11:10 +0800 Subject: mm: shmem: add mTHP counters for anonymous shmem Add mTHP counters for anonymous shmem. [baolin.wang@linux.alibaba.com: update Documentation/admin-guide/mm/transhuge.rst] Link: https://lkml.kernel.org/r/d86e2e7f-4141-432b-b2ba-c6691f36ef0b@linux.alibaba.com Link: https://lkml.kernel.org/r/4fd9e467d49ae4a747e428bcd821c7d13125ae67.1718090413.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Reviewed-by: Lance Yang Cc: Barry Song Cc: Daniel Gomez Cc: David Hildenbrand Cc: "Huang, Ying" Cc: Hugh Dickins Cc: Kefeng Wang Cc: Pankaj Raghav Cc: Ryan Roberts Cc: Yang Shi Cc: Zi Yan Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 53b7a137f460..7ad41de5eaea 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -281,6 +281,9 @@ enum mthp_stat_item { MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE, MTHP_STAT_SWPOUT, MTHP_STAT_SWPOUT_FALLBACK, + MTHP_STAT_FILE_ALLOC, + MTHP_STAT_FILE_FALLBACK, + MTHP_STAT_FILE_FALLBACK_CHARGE, __MTHP_STAT_COUNT }; -- cgit v1.2.3 From cdd9a571b7d8465353fe2e2d8d1edd8a58d82021 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 7 Jun 2024 14:23:56 +0200 Subject: fs/proc: move page_mapcount() to fs/proc/internal.h ... and rename it to folio_precise_page_mapcount(). fs/proc is the last remaining user, and that should stay that way. While at it, cleanup kpagecount_read() a bit: there are still some legacy leftovers -- when the interface was introduced it returned the page refcount, but was changed briefly afterwards to return the page mapcount. Further, some simple folio conversion. Once we stop using the per-page mapcounts of large folios, all folio_precise_page_mapcount() users will have to implement an alternative way to achieve what they are trying to achieve, possibly in a less precise way. [dan.carpenter@linaro.org: fix uninitialized variable in pagemap_pmd_range()] Link: https://lkml.kernel.org/r/9d6eaba7-92f8-4a70-8765-38a519680a87@moroto.mountain Link: https://lkml.kernel.org/r/20240607122357.115423-6-david@redhat.com Signed-off-by: David Hildenbrand Signed-off-by: Dan Carpenter Cc: Alexey Dobriyan Cc: Jonathan Corbet Cc: Kirill A. Shutemov Cc: Lance Yang Cc: Oscar Salvador Signed-off-by: Andrew Morton --- include/linux/mm.h | 27 +-------------------------- 1 file changed, 1 insertion(+), 26 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index f4bf94ca99e3..7f864d5f52b7 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1202,8 +1202,7 @@ static inline int is_vmalloc_or_module_addr(const void *x) /* * How many times the entire folio is mapped as a single unit (eg by a * PMD or PUD entry). This is probably not what you want, except for - * debugging purposes - it does not include PTE-mapped sub-pages; look - * at folio_mapcount() or page_mapcount() instead. + * debugging purposes or implementation of other core folio_*() primitives. */ static inline int folio_entire_mapcount(const struct folio *folio) { @@ -1221,30 +1220,6 @@ static inline void page_mapcount_reset(struct page *page) atomic_set(&(page)->_mapcount, -1); } -/** - * page_mapcount() - Number of times this precise page is mapped. - * @page: The page. - * - * The number of times this page is mapped. If this page is part of - * a large folio, it includes the number of times this page is mapped - * as part of that folio. - * - * Will report 0 for pages which cannot be mapped into userspace, eg - * slab, page tables and similar. - */ -static inline int page_mapcount(struct page *page) -{ - int mapcount = atomic