From 5cec4eb7fad6fb1e9a3dd8403b558d1eff7490ff Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Fri, 26 Jan 2024 16:19:44 +0800 Subject: mm and cache_info: remove unnecessary CPU cache info update For each CPU hotplug event, we will update per-CPU data slice size and corresponding PCP configuration for every online CPU to make the implementation simple. But, Kyle reported that this takes tens seconds during boot on a machine with 34 zones and 3840 CPUs. So, in this patch, for each CPU hotplug event, we only update per-CPU data slice size and corresponding PCP configuration for the CPUs that share caches with the hotplugged CPU. With the patch, the system boot time reduces 67 seconds on the machine. Link: https://lkml.kernel.org/r/20240126081944.414520-1-ying.huang@intel.com Fixes: 362d37a106dd ("mm, pcp: reduce lock contention for draining high-order pages") Signed-off-by: "Huang, Ying" Originally-by: Kyle Meyer Reported-and-tested-by: Kyle Meyer Cc: Sudeep Holla Cc: Mel Gorman Signed-off-by: Andrew Morton --- mm/page_alloc.c | 39 ++++++++++++++++++--------------------- 1 file changed, 18 insertions(+), 21 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 150d4f23b010..9faca05d124e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5572,37 +5572,34 @@ static void zone_pcp_update(struct zone *zone, int cpu_online) mutex_unlock(&pcp_batch_high_lock); } -static void zone_pcp_update_cacheinfo(struct zone *zone) +static void zone_pcp_update_cacheinfo(struct zone *zone, unsigned int cpu) { - int cpu; struct per_cpu_pages *pcp; struct cpu_cacheinfo *cci; - for_each_online_cpu(cpu) { - pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu); - cci = get_cpu_cacheinfo(cpu); - /* - * If data cache slice of CPU is large enough, "pcp->batch" - * pages can be preserved in PCP before draining PCP for - * consecutive high-order pages freeing without allocation. - * This can reduce zone lock contention without hurting - * cache-hot pages sharing. - */ - spin_lock(&pcp->lock); - if ((cci->per_cpu_data_slice_size >> PAGE_SHIFT) > 3 * pcp->batch) - pcp->flags |= PCPF_FREE_HIGH_BATCH; - else - pcp->flags &= ~PCPF_FREE_HIGH_BATCH; - spin_unlock(&pcp->lock); - } + pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu); + cci = get_cpu_cacheinfo(cpu); + /* + * If data cache slice of CPU is large enough, "pcp->batch" + * pages can be preserved in PCP before draining PCP for + * consecutive high-order pages freeing without allocation. + * This can reduce zone lock contention without hurting + * cache-hot pages sharing. + */ + spin_lock(&pcp->lock); + if ((cci->per_cpu_data_slice_size >> PAGE_SHIFT) > 3 * pcp->batch) + pcp->flags |= PCPF_FREE_HIGH_BATCH; + else + pcp->flags &= ~PCPF_FREE_HIGH_BATCH; + spin_unlock(&pcp->lock); } -void setup_pcp_cacheinfo(void) +void setup_pcp_cacheinfo(unsigned int cpu) { struct zone *zone; for_each_populated_zone(zone) - zone_pcp_update_cacheinfo(zone); + zone_pcp_update_cacheinfo(zone, cpu); } /* -- cgit v1.2.3 From 5267fe5d092e80a83740e5a1f6d5638d88ac7309 Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Tue, 20 Feb 2024 13:32:17 -0500 Subject: mm/page_alloc: remove unused fpi_flags in free_pages_prepare() Patch series "Enable >0 order folio memory compaction", v7. This patchset enables >0 order folio memory compaction, which is one of the prerequisitions for large folio support[1]. I am aware of that split free pages is necessary for folio migration in compaction, since if >0 order free pages are never split and no order-0 free page is scanned, compaction will end prematurely due to migration returns -ENOMEM. Free page split becomes a must instead of an optimization. lkp ncompare results (on a 8-CPU (Intel Xeon E5-2650 v4 @2.20GHz) 16G VM) for default LRU (-no-mglru) and CONFIG_LRU_GEN are shown at the bottom, copied from V3[4]. In sum, most of vm-scalability applications do not see performance change, and the others see ~4% to ~26% performance boost under default LRU and ~2% to ~6% performance boost under CONFIG_LRU_GEN. Overview === To support >0 order folio compaction, the patchset changes how free pages used for migration are kept during compaction. Free pages used to be split into order-0 pages that are post allocation processed (i.e., PageBuddy flag cleared, page order stored in page->private is zeroed, and page reference is set to 1). Now all free pages are kept in a NR_PAGE_ORDER array of page lists based on their order without post allocation process. When migrate_pages() asks for a new page, one of the free pages, based on the requested page order, is then processed and given out. And THP <2MB would need this feature. [1] https://lore.kernel.org/linux-mm/f8d47176-03a8-99bf-a813-b5942830fd73@arm.com/ [2] https://lore.kernel.org/linux-mm/20231113170157.280181-1-zi.yan@sent.com/ [3] https://lore.kernel.org/linux-mm/20240123034636.1095672-1-zi.yan@sent.com/ [4] https://lore.kernel.org/linux-mm/20240202161554.565023-1-zi.yan@sent.com/ [5] https://lore.kernel.org/linux-mm/20240212163510.859822-1-zi.yan@sent.com/ [6] https://lore.kernel.org/linux-mm/20240214220420.1229173-1-zi.yan@sent.com/ [7] https://lore.kernel.org/linux-mm/20240216170432.1268753-1-zi.yan@sent.com/ This patch (of 4): Commit 0a54864f8dfb ("kasan: remove PG_skip_kasan_poison flag") removes the use of fpi_flags in should_skip_kasan_poison() and fpi_flags is only passed to should_skip_kasan_poison() in free_pages_prepare(). Remove the unused parameter. Link: https://lkml.kernel.org/r/20240220183220.1451315-1-zi.yan@sent.com Link: https://lkml.kernel.org/r/20240220183220.1451315-2-zi.yan@sent.com Signed-off-by: Zi Yan Reviewed-by: Vlastimil Babka Reviewed-by: David Hildenbrand Cc: Adam Manzanares Cc: Baolin Wang Cc: "Huang, Ying" Cc: Johannes Weiner Cc: Kemeng Shi Cc: Kirill A. Shutemov Cc: Luis Chamberlain Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Ryan Roberts Cc: Vishal Moola (Oracle) Cc: Yin Fengwei Cc: Yu Zhao Signed-off-by: Andrew Morton --- mm/page_alloc.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 9faca05d124e..dc59fb225cbf 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1061,7 +1061,7 @@ out: * on-demand allocation and then freed again before the deferred pages * initialization is done, but this is not likely to happen. */ -static inline bool should_skip_kasan_poison(struct page *page, fpi_t fpi_flags) +static inline bool should_skip_kasan_poison(struct page *page) { if (IS_ENABLED(CONFIG_KASAN_GENERIC)) return deferred_pages_enabled(); @@ -1081,10 +1081,10 @@ static void kernel_init_pages(struct page *page, int numpages) } static __always_inline bool free_pages_prepare(struct page *page, - unsigned int order, fpi_t fpi_flags) + unsigned int order) { int bad = 0; - bool skip_kasan_poison = should_skip_kasan_poison(page, fpi_flags); + bool skip_kasan_poison = should_skip_kasan_poison(page); bool init = want_init_on_free(); bool compound = PageCompound(page); @@ -1266,7 +1266,7 @@ static void __free_pages_ok(struct page *page, unsigned int order, unsigned long pfn = page_to_pfn(page); struct zone *zone = page_zone(page); - if (!free_pages_prepare(page, order, fpi_flags)) + if (!free_pages_prepare(page, order)) return; /* @@ -2343,7 +2343,7 @@ static bool free_unref_page_prepare(struct page *page, unsigned long pfn, { int migratetype; - if (!free_pages_prepare(page, order, FPI_NONE)) + if (!free_pages_prepare(page, order)) return false; migratetype = get_pfnblock_migratetype(page, pfn); -- cgit v1.2.3 From 733aea0b3a7bba0451dfc19322665de13a5b7af4 Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Tue, 20 Feb 2024 13:32:19 -0500 Subject: mm/compaction: add support for >0 order folio memory compaction. Before last commit, memory compaction only migrates order-0 folios and skips >0 order folios. Last commit splits all >0 order folios during compaction. This commit migrates >0 order folios during compaction by keeping isolated free pages at their original size without splitting them into order-0 pages and using them directly during migration process. What is different from the prior implementation: 1. All isolated free pages are kept in a NR_PAGE_ORDERS array of page lists, where each page list stores free pages in the same order. 2. All free pages are not post_alloc_hook() processed nor buddy pages, although their orders are stored in first page's private like buddy pages. 3. During migration, in new page allocation time (i.e., in compaction_alloc()), free pages are then processed by post_alloc_hook(). When migration fails and a new page is returned (i.e., in compaction_free()), free pages are restored by reversing the post_alloc_hook() operations using newly added free_pages_prepare_fpi_none(). Step 3 is done for a latter optimization that splitting and/or merging free pages during compaction becomes easier. Note: without splitting free pages, compaction can end prematurely due to migration will return -ENOMEM even if there is free pages. This happens when no order-0 free page exist and compaction_alloc() return NULL. Link: https://lkml.kernel.org/r/20240220183220.1451315-4-zi.yan@sent.com Signed-off-by: Zi Yan Reviewed-by: Baolin Wang Reviewed-by: Vlastimil Babka Tested-by: Baolin Wang Tested-by: Yu Zhao Cc: Adam Manzanares Cc: David Hildenbrand Cc: Huang Ying Cc: Johannes Weiner Cc: Kemeng Shi Cc: Kirill A. Shutemov Cc: Luis Chamberlain Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Ryan Roberts Cc: Vishal Moola (Oracle) Cc: Vlastimil Babka Cc: Yin Fengwei Signed-off-by: Andrew Morton --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index dc59fb225cbf..51e13aa605ec 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1080,7 +1080,7 @@ static void kernel_init_pages(struct page *page, int numpages) kasan_enable_current(); } -static __always_inline bool free_pages_prepare(struct page *page, +__always_inline bool free_pages_prepare(struct page *page, unsigned int order) { int bad = 0; -- cgit v1.2.3 From 5bb1421422fa8955e741097e6371a1d3d0a9e54e Mon Sep 17 00:00:00 2001 From: Hao Ge Date: Wed, 21 Feb 2024 15:32:27 +0800 Subject: mm/page_alloc: make bad_range() return bool bad_range() can return bool, so let us change it. Link: https://lkml.kernel.org/r/20240221073227.276234-1-gehao@kylinos.cn Signed-off-by: Hao Ge Signed-off-by: Andrew Morton --- mm/page_alloc.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 51e13aa605ec..a9581cdf9649 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -464,19 +464,19 @@ static int page_outside_zone_boundaries(struct zone *zone, struct page *page) /* * Temporary debugging check for pages not lying within a given zone. */ -static int __maybe_unused bad_range(struct zone *zone, struct page *page) +static bool __maybe_unused bad_range(struct zone *zone, struct page *page) { if (page_outside_zone_boundaries(zone, page)) - return 1; + return true; if (zone != page_zone(page)) - return 1; + return true; - return 0; + return false; } #else -static inline int __maybe_unused bad_range(struct zone *zone, struct page *page) +static inline bool __maybe_unused bad_range(struct zone *zone, struct page *page) { - return 0; + return false; } #endif -- cgit v1.2.3 From 77c7a095644efb388ac412d1affcbde75302e52c Mon Sep 17 00:00:00 2001 From: Hao Ge Date: Thu, 22 Feb 2024 17:19:32 +0800 Subject: mm/page_alloc: make check_new_page() return bool Make check_new_page() return bool like check_new_pages() Link: https://lkml.kernel.org/r/20240222091932.54799-1-gehao@kylinos.cn Signed-off-by: Hao Ge Reviewed-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/page_alloc.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a9581cdf9649..5ab921e3d7d1 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1422,14 +1422,14 @@ static void check_new_page_bad(struct page *page) /* * This page is about to be returned from the page allocator */ -static int check_new_page(struct page *page) +static bool check_new_page(struct page *page) { if (likely(page_expected_state(page, PAGE_FLAGS_CHECK_AT_PREP|__PG_HWPOISON))) - return 0; + return false; check_new_page_bad(page); - return 1; + return true; } static inline bool check_new_pages(struct page *page, unsigned int order) -- cgit v1.2.3 From 502003bb76b83649cd4ff7f701987ac5cf43bc4b Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Mon, 26 Feb 2024 15:55:29 -0500 Subject: mm/memcg: use order instead of nr in split_page_memcg() We do not have non power of two pages, using nr is error prone if nr is not power-of-two. Use page order instead. Link: https://lkml.kernel.org/r/20240226205534.1603748-4-zi.yan@sent.com Signed-off-by: Zi Yan Acked-by: David Hildenbrand Cc: Hugh Dickins Cc: Kirill A. Shutemov Cc: Luis Chamberlain Cc: "Matthew Wilcox (Oracle)" Cc: Michal Koutny Cc: Roman Gushchin Cc: Ryan Roberts Cc: Yang Shi Cc: Yu Zhao Cc: Zach O'Keefe Signed-off-by: Andrew Morton --- mm/page_alloc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5ab921e3d7d1..44ffe1304878 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2617,7 +2617,7 @@ void split_page(struct page *page, unsigned int order) for (i = 1; i < (1 << order); i++) set_page_refcounted(page + i); split_page_owner(page, 1 << order); - split_page_memcg(page, 1 << order); + split_page_memcg(page, order); } EXPORT_SYMBOL_GPL(split_page); @@ -4802,7 +4802,7 @@ static void *make_alloc_exact(unsigned long addr, unsigned int order, struct page *last = page + nr; split_page_owner(page, 1 << order); - split_page_memcg(page, 1 << order); + split_page_memcg(page, order); while (page < --last) set_page_refcounted(last); -- cgit v1.2.3 From 9a581c12cddb06696fe4811239934fcde57ceb91 Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Mon, 26 Feb 2024 15:55:30 -0500 Subject: mm/page_owner: use order instead of nr in split_page_owner() We do not have non power of two pages, using nr is error prone if nr is not power-of-two. Use page order instead. Link: https://lkml.kernel.org/r/20240226205534.1603748-5-zi.yan@sent.com Signed-off-by: Zi Yan Acked-by: David Hildenbrand Cc: Hugh Dickins Cc: Kirill A. Shutemov Cc: Luis Chamberlain Cc: "Matthew Wilcox (Oracle)" Cc: Michal Koutny Cc: Roman Gushchin Cc: Ryan Roberts Cc: Yang Shi Cc: Yu Zhao Cc: Zach O'Keefe Signed-off-by: Andrew Morton --- mm/page_alloc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 44ffe1304878..a9d1af34f4d6 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2616,7 +2616,7 @@ void split_page(struct page *page, unsigned int order) for (i = 1; i < (1 << order); i++) set_page_refcounted(page + i); - split_page_owner(page, 1 << order); + split_page_owner(page, order); split_page_memcg(page, order); } EXPORT_SYMBOL_GPL(split_page); @@ -4801,7 +4801,7 @@ static void *make_alloc_exact(unsigned long addr, unsigned int order, struct page *page = virt_to_page((void *)addr); struct page *last = page + nr; - split_page_owner(page, 1 << order); + split_page_owner(page, order); split_page_memcg(page, order); while (page < --last) set_page_refcounted(last); -- cgit v1.2.3 From b8791381d7edae3706edde207f52d9e483ed400c Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Mon, 26 Feb 2024 15:55:31 -0500 Subject: mm: memcg: make memcg huge page split support any order split It sets memcg information for the pages after the split. A new parameter new_order is added to tell the order of subpages in the new page, always 0 for now. It prepares for upcoming changes to support split huge page to any lower order. Link: https://lkml.kernel.org/r/20240226205534.1603748-6-zi.yan@sent.com Signed-off-by: Zi Yan Acked-by: David Hildenbrand Cc: Hugh Dickins Cc: Kirill A. Shutemov Cc: Luis Chamberlain Cc: "Matthew Wilcox (Oracle)" Cc: Michal Koutny Cc: Roman Gushchin Cc: Ryan Roberts Cc: Yang Shi Cc: Yu Zhao Cc: Zach O'Keefe Signed-off-by: Andrew Morton --- mm/page_alloc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a9d1af34f4d6..3b3dae66d49d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2617,7 +2617,7 @@ void split_page(struct page *page, unsigned int order) for (i = 1; i < (1 << order); i++) set_page_refcounted(page + i); split_page_owner(page, order); - split_page_memcg(page, order); + split_page_memcg(page, order, 0); } EXPORT_SYMBOL_GPL(split_page); @@ -4802,7 +4802,7 @@ static void *make_alloc_exact(unsigned long addr, unsigned int order, struct page *last = page + nr; split_page_owner(page, order); - split_page_memcg(page, order); + split_page_memcg(page, order, 0); while (page < --last) set_page_refcounted(last); -- cgit v1.2.3 From 46d44d09d24c5b451d6ab8f0fca5a40f651e3837 Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Mon, 26 Feb 2024 15:55:32 -0500 Subject: mm: page_owner: add support for splitting to any order in split page_owner It adds a new_order parameter to set new page order in page owner. It prepares for upcoming changes to support split huge page to any lower order. Link: https://lkml.kernel.org/r/20240226205534.1603748-7-zi.yan@sent.com Signed-off-by: Zi Yan Cc: David Hildenbrand Acked-by: David Hildenbrand Cc: Kirill A. Shutemov Cc: Luis Chamberlain Cc: "Matthew Wilcox (Oracle)" Cc: Michal Koutny Cc: Roman Gushchin Cc: Ryan Roberts Cc: Yang Shi Cc: Yu Zhao Cc: Zach O'Keefe Signed-off-by: Andrew Morton --- mm/page_alloc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 3b3dae66d49d..ff1f159251df 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2616,7 +2616,7 @@ void split_page(struct page *page, unsigned int order) for (i = 1; i < (1 << order); i++) set_page_refcounted(page + i); - split_page_owner(page, order); + split_page_owner(page, order, 0); split_page_memcg(page, order, 0); } EXPORT_SYMBOL_GPL(split_page); @@ -4801,7 +4801,7 @@ static void *make_alloc_exact(unsigned long addr, unsigned int order, struct page *page = virt_to_page((void *)addr); struct page *last = page + nr; - split_page_owner(page, order); + split_page_owner(page, order, 0); split_page_memcg(page, order, 0); while (page < --last) set_page_refcounted(last); -- cgit v1.2.3 From 7c76d92253dbb7c53ba03a4cd6639113cd1f7d3a Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 27 Feb 2024 17:42:36 +0000 Subject: mm: convert free_unref_page_list() to use folios Most of its callees are not yet ready to accept a folio, but we know all of the pages passed in are actually folios because they're linked through ->lru. Link: https://lkml.kernel.org/r/20240227174254.710559-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Ryan Roberts Cc: David Hildenbrand Cc: Mel Gorman Signed-off-by: Andrew Morton --- mm/page_alloc.c | 38 ++++++++++++++++++++------------------ 1 file changed, 20 insertions(+), 18 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ff1f159251df..20d4ba095ad2 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2520,17 +2520,17 @@ void free_unref_page(struct page *page, unsigned int order) void free_unref_page_list(struct list_head *list) { unsigned long __maybe_unused UP_flags; - struct page *page, *next; + struct folio *folio, *next; struct per_cpu_pages *pcp = NULL; struct zone *locked_zone = NULL; int batch_count = 0; int migratetype; /* Prepare pages for freeing */ - list_for_each_entry_safe(page, next, list, lru) { - unsigned long pfn = page_to_pfn(page); - if (!free_unref_page_prepare(page, pfn, 0)) { - list_del(&page->lru); + list_for_each_entry_safe(folio, next, list, lru) { + unsigned long pfn = folio_pfn(folio); + if (!free_unref_page_prepare(&folio->page, pfn, 0)) { + list_del(&folio->lru); continue; } @@ -2538,24 +2538,25 @@ void free_unref_page_list(struct list_head *list) * Free isolated pages directly to the allocator, see * comment in free_unref_page. */ - migratetype = get_pcppage_migratetype(page); + migratetype = get_pcppage_migratetype(&folio->page); if (unlikely(is_migrate_isolate(migratetype))) { - list_del(&page->lru); - free_one_page(page_zone(page), page, pfn, 0, migratetype, FPI_NONE); + list_del(&folio->lru); + free_one_page(folio_zone(folio), &folio->page, pfn, + 0, migratetype, FPI_NONE); continue; } } - list_for_each_entry_safe(page, next, list, lru) { - struct zone *zone = page_zone(page); + list_for_each_entry_safe(folio, next, list, lru) { + struct zone *zone = folio_zone(folio); - list_del(&page->lru); - migratetype = get_pcppage_migratetype(page); + list_del(&folio->lru); + migratetype = get_pcppage_migratetype(&folio->page); /* * Either different zone requiring a different pcp lock or * excessive lock hold times when freeing a large list of - * pages. + * folios. */ if (zone != locked_zone || batch_count == SWAP_CLUSTER_MAX) { if (pcp) { @@ -2566,15 +2567,16 @@ void free_unref_page_list(struct list_head *list) batch_count = 0; /* - * trylock is necessary as pages may be getting freed + * trylock is necessary as folios may be getting freed * from IRQ or SoftIRQ context after an IO completion. */ pcp_trylock_prepare(UP_flags); pcp = pcp_spin_trylock(zone->per_cpu_pageset); if (unlikely(!pcp)) { pcp_trylock_finish(UP_flags); - free_one_page(zone, page, page_to_pfn(page), - 0, migratetype, FPI_NONE); + free_one_page(zone, &folio->page, + folio_pfn(folio), 0, + migratetype, FPI_NONE); locked_zone = NULL; continue; } @@ -2588,8 +2590,8 @@ void free_unref_page_list(struct list_head *list) if (unlikely(migratetype >= MIGRATE_PCPTYPES)) migratetype = MIGRATE_MOVABLE; - trace_mm_page_free_batched(page); - free_unref_page_commit(zone, pcp, page, migratetype, 0); + trace_mm_page_free_batched(&folio->page); + free_unref_page_commit(zone, pcp, &folio->page, migratetype, 0); batch_count++; } -- cgit v1.2.3 From 90491d87dd46a4c843dae775b9e72c91624c5a7b Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 27 Feb 2024 17:42:37 +0000 Subject: mm: add free_unref_folios() Iterate over a folio_batch rather than a linked list. This is easier for the CPU to prefetch and has a batch count naturally built in so we don't need to track it. Again, this lowers the maximum lock hold time from 32 folios to 15, but I do not expect this to have a significant effect. Link: https://lkml.kernel.org/r/20240227174254.710559-4-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: David Hildenbrand Cc: Mel Gorman Cc: Ryan Roberts Signed-off-by: Andrew Morton --- mm/page_alloc.c | 59 +++++++++++++++++++++++++++++++++++---------------------- 1 file changed, 36 insertions(+), 23 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 20d4ba095ad2..31d97322feea 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include #include @@ -2515,57 +2516,51 @@ void free_unref_page(struct page *page, unsigned int order) } /* - * Free a list of 0-order pages + * Free a batch of 0-order pages */ -void free_unref_page_list(struct list_head *list) +void free_unref_folios(struct folio_batch *folios) { unsigned long __maybe_unused UP_flags; - struct folio *folio, *next; struct per_cpu_pages *pcp = NULL; struct zone *locked_zone = NULL; - int batch_count = 0; - int migratetype; + int i, j, migratetype; - /* Prepare pages for freeing */ - list_for_each_entry_safe(folio, next, list, lru) { + /* Prepare folios for freeing */ + for (i = 0, j = 0; i < folios->nr; i++) { + struct folio *folio = folios->folios[i]; unsigned long pfn = folio_pfn(folio); - if (!free_unref_page_prepare(&folio->page, pfn, 0)) { - list_del(&folio->lru); + if (!free_unref_page_prepare(&folio->page, pfn, 0)) continue; - } /* - * Free isolated pages directly to the allocator, see + * Free isolated folios directly to the allocator, see * comment in free_unref_page. */ migratetype = get_pcppage_migratetype(&folio->page); if (unlikely(is_migrate_isolate(migratetype))) { - list_del(&folio->lru); free_one_page(folio_zone(folio), &folio->page, pfn, 0, migratetype, FPI_NONE); continue; } + if (j != i) + folios->folios[j] = folio; + j++; } + folios->nr = j; - list_for_each_entry_safe(folio, next, list, lru) { + for (i = 0; i < folios->nr; i++) { + struct folio *folio = folios->folios[i]; struct zone *zone = folio_zone(folio); - list_del(&folio->lru); migratetype = get_pcppage_migratetype(&folio->page); - /* - * Either different zone requiring a different pcp lock or - * excessive lock hold times when freeing a large list of - * folios. - */ - if (zone != locked_zone || batch_count == SWAP_CLUSTER_MAX) { + /* Different zone requires a different pcp lock */ + if (zone != locked_zone) { if (pcp) { pcp_spin_unlock(pcp); pcp_trylock_finish(UP_flags); } - batch_count = 0; - /* * trylock is necessary as folios may be getting freed * from IRQ or SoftIRQ context after an IO completion. @@ -2592,13 +2587,31 @@ void free_unref_page_list(struct list_head *list) trace_mm_page_free_batched(&folio->page); free_unref_page_commit(zone, pcp, &folio->page, migratetype, 0); - batch_count++; } if (pcp) { pcp_spin_unlock(pcp); pcp_trylock_finish(UP_flags); } + folio_batch_reinit(folios); +} + +void free_unref_page_list(struct list_head *list) +{ + struct folio_batch fbatch; + + folio_batch_init(&fbatch); + while (!list_empty(list)) { + struct folio *folio = list_first_entry(list, struct folio, lru); + + list_del(&folio->lru); + if (folio_batch_add(&fbatch, folio) > 0) + continue; + free_unref_folios(&fbatch); + } + + if (fbatch.nr) + free_unref_folios(&fbatch); } /* -- cgit v1.2.3 From 31b2ff82aefb33ce92496a1becddd6ce51060db2 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 27 Feb 2024 17:42:43 +0000 Subject: mm: handle large folios in free_unref_folios() Call folio_undo_large_rmappable() if needed. free_unref_page_prepare() destroys the ability to call folio_order(), so stash the order in folio->private for the benefit of the second loop. Link: https://lkml.kernel.org/r/20240227174254.710559-10-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: David Hildenbrand Cc: Mel Gorman Cc: Ryan Roberts Signed-off-by: Andrew Morton --- mm/page_alloc.c | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 31d97322feea..025ad1a7df7b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2516,7 +2516,7 @@ void free_unref_page(struct page *page, unsigned int order) } /* - * Free a batch of 0-order pages + * Free a batch of folios */ void free_unref_folios(struct folio_batch *folios) { @@ -2529,19 +2529,25 @@ void free_unref_folios(struct folio_batch *folios) for (i = 0, j = 0; i < folios->nr; i++) { struct folio *folio = folios->folios[i]; unsigned long pfn = folio_pfn(folio); - if (!free_unref_page_prepare(&folio->page, pfn, 0)) + unsigned int order = folio_order(folio); + + if (order > 0 && folio_test_large_rmappable(folio)) + folio_undo_large_rmappable(folio); + if (!free_unref_page_prepare(&folio->page, pfn, order)) continue; /* - * Free isolated folios directly to the allocator, see - * comment in free_unref_page. + * Free isolated folios and orders not handled on the PCP + * directly to the allocator, see comment in free_unref_page. */ migratetype = get_pcppage_migratetype(&folio->page); - if (unlikely(is_migrate_isolate(migratetype))) { + if (!pcp_allowed_order(order) || + is_migrate_isolate(migratetype)) { free_one_page(folio_zone(folio), &folio->page, pfn, - 0, migratetype, FPI_NONE); + order, migratetype, FPI_NONE); continue; } + folio->private = (void *)(unsigned long)order; if (j != i) folios->folios[j] = folio; j++; @@ -2551,7 +2557,9 @@ void free_unref_folios(struct folio_batch *folios) for (i = 0; i < folios->nr; i++) { struct folio *folio = folios->folios[i]; struct zone *zone = folio_zone(folio); + unsigned int order = (unsigned long)folio->private; + folio->private = NULL; migratetype = get_pcppage_migratetype(&folio->page); /* Different zone requires a different pcp lock */ @@ -2570,7 +2578,7 @@ void free_unref_folios(struct folio_batch *folios) if (unlikely(!pcp)) { pcp_trylock_finish(UP_flags); free_one_page(zone, &folio->page, - folio_pfn(folio), 0, + folio_pfn(folio), order, migratetype, FPI_NONE); locked_zone = NULL; continue; @@ -2586,7 +2594,8 @@ void free_unref_folios(struct folio_batch *folios) migratetype = MIGRATE_MOVABLE; trace_mm_page_free_batched(&folio->page); - free_unref_page_commit(zone, pcp, &folio->page, migratetype, 0); + free_unref_page_commit(zone, pcp, &folio->page, migratetype, + order); } if (pcp) { -- cgit v1.2.3 From 8b7b0a5eee22e3cd0468944d0720120c36340a2b Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 27 Feb 2024 17:42:48 +0000 Subject: mm: remove free_unref_page_list() All callers now use free_unref_folios() so we can delete this function. Link: https://lkml.kernel.org/r/20240227174254.710559-15-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Ryan Roberts Cc: David Hildenbrand Cc: Mel Gorman Signed-off-by: Andrew Morton --- mm/page_alloc.c | 18 ------------------ 1 file changed, 18 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 025ad1a7df7b..7873e9375802 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2605,24 +2605,6 @@ void free_unref_folios(struct folio_batch *folios) folio_batch_reinit(folios); } -void free_unref_page_list(struct list_head *list) -{ - struct folio_batch fbatch; - - folio_batch_init(&fbatch); - while (!list_empty(list)) { - struct folio *folio = list_first_entry(list, struct folio, lru); - - list_del(&folio->lru); - if (folio_batch_add(&fbatch, folio) > 0) - continue; - free_unref_folios(&fbatch); - } - - if (fbatch.nr) - free_unref_folios(&fbatch); -} - /* * split_page takes a non-compound higher-order page, and splits it into * n (1< Date: Wed, 28 Feb 2024 05:11:17 +0000 Subject: mm: add alloc_contig_migrate_range allocation statistics alloc_contig_migrate_range has every information to be able to understand big contiguous allocation latency. For example, how many pages are migrated, how many times they were needed to unmap from page tables. This patch adds the trace event to collect the allocation statistics. In the field, it was quite useful to understand CMA allocation latency. [akpm@linux-foundation.org: a/trace_mm_alloc_config_migrate_range_info_enabled/trace_mm_alloc_contig_migrate_range_info_enabled] Link: https://lkml.kernel.org/r/20240228051127.2859472-1-richardycc@google.com Signed-off-by: Richard Chang Reviewed-by: Steven Rostedt (Google) Cc: "Masami Hiramatsu (Google)" Cc: Mathieu Desnoyers Cc: Minchan Kim Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- mm/page_alloc.c | 32 +++++++++++++++++++++++++++----- 1 file changed, 27 insertions(+), 5 deletions(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 7873e9375802..29c982542dcc 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6222,9 +6222,14 @@ static void alloc_contig_dump_pages(struct list_head *page_list) } } -/* [start, end) must belong to a single zone. */ +/* + * [start, end) must belong to a single zone. + * @migratetype: using migratetype to filter the type of migration in + * trace_mm_alloc_contig_migrate_range_info. + */ int __alloc_contig_migrate_range(struct compact_control *cc, - unsigned long start, unsigned long end) + unsigned long start, unsigned long end, + int migratetype) { /* This function is based on compact_zone() from compaction.c. */ unsigned int nr_reclaimed; @@ -6235,6 +6240,10 @@ int __alloc_contig_migrate_range(struct compact_control *cc, .nid = zone_to_nid(cc->zone), .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL, }; + struct page *page; + unsigned long total_mapped = 0; + unsigned long total_migrated = 0; + unsigned long total_reclaimed = 0; lru_cache_disable(); @@ -6260,9 +6269,18 @@ int __alloc_contig_migrate_range(struct compact_control *cc, &cc->migratepages); cc->nr_migratepages -= nr_reclaimed; + if (trace_mm_alloc_contig_migrate_range_info_enabled()) { + total_reclaimed += nr_reclaimed; + list_for_each_entry(page, &cc->migratepages, lru) + total_mapped += page_mapcount(page); + } + ret = migrate_pages(&cc->migratepages, alloc_migration_target, NULL, (unsigned long)&mtc, cc->mode, MR_CONTIG_RANGE, NULL); + if (trace_mm_alloc_contig_migrate_range_info_enabled() && !ret) + total_migrated += cc->nr_migratepages; + /* * On -ENOMEM, migrate_pages() bails out right away. It is pointless * to retry again over this error, so do the same here. @@ -6276,9 +6294,13 @@ int __alloc_contig_migrate_range(struct compact_control *cc, if (!(cc->gfp_mask & __GFP_NOWARN) && ret == -EBUSY) alloc_contig_dump_pages(&cc->migratepages); putback_movable_pages(&cc->migratepages); - return ret; } - return 0; + + trace_mm_alloc_contig_migrate_range_info(start, end, migratetype, + total_migrated, + total_reclaimed, + total_mapped); + return (ret < 0) ? ret : 0; } /** @@ -6358,7 +6380,7 @@ int alloc_contig_range(unsigned long start, unsigned long end, * allocated. So, if we fall through be sure to clear ret so that * -EBUSY is not accidentally used or returned to caller. */ - ret = __alloc_contig_migrate_range(&cc, start, end); + ret = __alloc_contig_migrate_range(&cc, start, end, migratetype); if (ret && ret != -EBUSY) goto done; ret = 0; -- cgit v1.2.3 From 72741db6836b4fc3c810e33a53f7cb9cf2cd48a7 Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Wed, 28 Feb 2024 23:49:12 +0100 Subject: mm: page_alloc: use div64_ul() instead of do_div() Fixes Coccinelle/coccicheck warning reported by do_div.cocci. Compared to do_div(), div64_ul() does not implicitly cast the divisor and does not unnecessarily calculate the remainder. Link: https://lkml.kernel.org/r/20240228224911.1164-2-thorsten.blum@toblux.com Signed-off-by: Thorsten Blum Signed-off-by: Andrew Morton --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm/page_alloc.c') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 29c982542dcc..cc7f9b322193 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5848,7 +5848,7 @@ static void __setup_per_zone_wmarks(void) spin_lock_irqsave(&zone->lock, flags); tmp = (u64)pages_min * zone_managed_pages(zone); - do_div(tmp, lowmem_pages); + tmp = div64_ul(tmp, lowmem_pages); if (is_highmem(zone) || zone_idx(zone) == ZONE_MOVABLE) { /* * __GFP_HIGH and PF_MEMALLOC allocations usually don't -- cgit v1.2.3