From 525c30304928ff0efee4dfab8319a9d4f254ab46 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Sat, 18 May 2024 16:00:04 +0200 Subject: mm/hugetlb: constify ctl_table arguments of utility functions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The sysctl core is preparing to only expose instances of struct ctl_table as "const". This will also affect the ctl_table argument of sysctl handlers. As the function prototype of all sysctl handlers throughout the tree needs to stay consistent that change will be done in one commit. To reduce the size of that final commit, switch utility functions which are not bound by "typedef proc_handler" to "const struct ctl_table". No functional change. Link: https://lkml.kernel.org/r/20240518-sysctl-const-handler-hugetlb-v1-1-47e34e2871b2@weissschuh.net Signed-off-by: Thomas Weißschuh Reviewed-by: Muchun Song Cc: Joel Granados Cc: Luis Chamberlain Signed-off-by: Andrew Morton --- mm/hugetlb.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index f35abff8be60..04f8d0ac069c 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -4911,7 +4911,7 @@ static unsigned int allowed_mems_nr(struct hstate *h) } #ifdef CONFIG_SYSCTL -static int proc_hugetlb_doulongvec_minmax(struct ctl_table *table, int write, +static int proc_hugetlb_doulongvec_minmax(const struct ctl_table *table, int write, void *buffer, size_t *length, loff_t *ppos, unsigned long *out) { @@ -4928,7 +4928,7 @@ static int proc_hugetlb_doulongvec_minmax(struct ctl_table *table, int write, } static int hugetlb_sysctl_handler_common(bool obey_mempolicy, - struct ctl_table *table, int write, + const struct ctl_table *table, int write, void *buffer, size_t *length, loff_t *ppos) { struct hstate *h = &default_hstate; -- cgit v1.2.3 From 0ba5e806e14e97a4dd34e21ae2994693bcdd0406 Mon Sep 17 00:00:00 2001 From: Illia Ostapyshyn Date: Fri, 17 May 2024 11:13:48 +0200 Subject: mm/vmscan: update stale references to shrink_page_list Commit 49fd9b6df54e ("mm/vmscan: fix a lot of comments") renamed shrink_page_list() to shrink_folio_list(). Fix up the remaining references to the old name in comments and documentation. Link: https://lkml.kernel.org/r/20240517091348.1185566-1-illia@yshyn.com Signed-off-by: Illia Ostapyshyn Cc: Jonathan Corbet Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- Documentation/mm/unevictable-lru.rst | 10 +++++----- mm/memory.c | 2 +- mm/swap_state.c | 2 +- mm/truncate.c | 2 +- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/Documentation/mm/unevictable-lru.rst b/Documentation/mm/unevictable-lru.rst index b6a07a26b10d..2feb2ed51ae2 100644 --- a/Documentation/mm/unevictable-lru.rst +++ b/Documentation/mm/unevictable-lru.rst @@ -191,13 +191,13 @@ have become evictable again (via munlock() for example) and have been "rescued" from the unevictable list. However, there may be situations where we decide, for the sake of expediency, to leave an unevictable folio on one of the regular active/inactive LRU lists for vmscan to deal with. vmscan checks for such -folios in all of the shrink_{active|inactive|page}_list() functions and will +folios in all of the shrink_{active|inactive|folio}_list() functions and will "cull" such folios that it encounters: that is, it diverts those folios to the unevictable list for the memory cgroup and node being scanned. There may be situations where a folio is mapped into a VM_LOCKED VMA, but the folio does not have the mlocked flag set. 
Such folios will make -it all the way to shrink_active_list() or shrink_page_list() where they +it all the way to shrink_active_list() or shrink_folio_list() where they will be detected when vmscan walks the reverse map in folio_referenced() or try_to_unmap(). The folio is culled to the unevictable list when it is released by the shrinker. @@ -269,7 +269,7 @@ the LRU. Such pages can be "noticed" by memory management in several places: (4) in the fault path and when a VM_LOCKED stack segment is expanded; or - (5) as mentioned above, in vmscan:shrink_page_list() when attempting to + (5) as mentioned above, in vmscan:shrink_folio_list() when attempting to reclaim a page in a VM_LOCKED VMA by folio_referenced() or try_to_unmap(). mlocked pages become unlocked and rescued from the unevictable list when: @@ -548,12 +548,12 @@ Some examples of these unevictable pages on the LRU lists are: (3) pages still mapped into VM_LOCKED VMAs, which should be marked mlocked, but events left mlock_count too low, so they were munlocked too early. -vmscan's shrink_inactive_list() and shrink_page_list() also divert obviously +vmscan's shrink_inactive_list() and shrink_folio_list() also divert obviously unevictable pages found on the inactive lists to the appropriate memory cgroup and node unevictable list. rmap's folio_referenced_one(), called via vmscan's shrink_active_list() or -shrink_page_list(), and rmap's try_to_unmap_one() called via shrink_page_list(), +shrink_folio_list(), and rmap's try_to_unmap_one() called via shrink_folio_list(), check for (3) pages still mapped into VM_LOCKED VMAs, and call mlock_vma_folio() to correct them. Such pages are culled to the unevictable list when released by the shrinker. diff --git a/mm/memory.c b/mm/memory.c index d10e616d7389..2ba8ccdd5a85 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4541,7 +4541,7 @@ static vm_fault_t __do_fault(struct vm_fault *vmf) * lock_page(B) * lock_page(B) * pte_alloc_one - * shrink_page_list + * shrink_folio_list * wait_on_page_writeback(A) * SetPageWriteback(B) * unlock_page(B) diff --git a/mm/swap_state.c b/mm/swap_state.c index 642c30d8376c..6498491e3ad8 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -28,7 +28,7 @@ /* * swapper_space is a fiction, retained to simplify the path through - * vmscan's shrink_page_list. + * vmscan's shrink_folio_list. */ static const struct address_space_operations swap_aops = { .writepage = swap_writepage, diff --git a/mm/truncate.c b/mm/truncate.c index e99085bf3d34..5ce62a939e55 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -554,7 +554,7 @@ EXPORT_SYMBOL(invalidate_mapping_pages); * This is like mapping_evict_folio(), except it ignores the folio's * refcount. We do this because invalidate_inode_pages2() needs stronger * invalidation guarantees, and cannot afford to leave folios behind because - * shrink_page_list() has a temp ref on them, or because they're transiently + * shrink_folio_list() has a temp ref on them, or because they're transiently * sitting in the folio_add_lru() caches. */ static int invalidate_complete_folio2(struct address_space *mapping, -- cgit v1.2.3 From 6584a14a377d087b91eeb53feb40a971cca51d6d Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Thu, 16 May 2024 10:10:35 +0200 Subject: mm/hugetlb: drop node_alloc_noretry from alloc_fresh_hugetlb_folio Since commit d67e32f26713 ("hugetlb: restructure pool allocations"), the parameter node_alloc_noretry from alloc_fresh_hugetlb_folio() is not used, so drop it. 
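For readers skimming the diff below, the net effect at a call site is just the dropped trailing argument (an illustrative fragment, not part of the patch; h, gfp_mask, nid and nmask stand for a caller's existing locals):

	/* before: callers had to pass a noretry mask they never used */
	folio = alloc_fresh_hugetlb_folio(h, gfp_mask, nid, nmask, NULL);

	/* after: the wrapper passes NULL down to __alloc_fresh_hugetlb_folio() itself */
	folio = alloc_fresh_hugetlb_folio(h, gfp_mask, nid, nmask);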
Link: https://lkml.kernel.org/r/20240516081035.5651-1-osalvador@suse.de
Signed-off-by: Oscar Salvador
Reviewed-by: Sidhartha Kumar
Reviewed-by: Anshuman Khandual
Reviewed-by: Vishal Moola (Oracle)
Reviewed-by: Muchun Song
Cc: Oscar Salvador
Cc: Peter Xu
Signed-off-by: Andrew Morton
---
 mm/hugetlb.c | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 04f8d0ac069c..3518321f6598 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2289,13 +2289,11 @@ static struct folio *only_alloc_fresh_hugetlb_folio(struct hstate *h,
  * pages is zero.
  */
 static struct folio *alloc_fresh_hugetlb_folio(struct hstate *h,
-		gfp_t gfp_mask, int nid, nodemask_t *nmask,
-		nodemask_t *node_alloc_noretry)
+		gfp_t gfp_mask, int nid, nodemask_t *nmask)
 {
 	struct folio *folio;
 
-	folio = __alloc_fresh_hugetlb_folio(h, gfp_mask, nid, nmask,
-					    node_alloc_noretry);
+	folio = __alloc_fresh_hugetlb_folio(h, gfp_mask, nid, nmask, NULL);
 	if (!folio)
 		return NULL;
 
@@ -2513,7 +2511,7 @@ static struct folio *alloc_surplus_hugetlb_folio(struct hstate *h,
 		goto out_unlock;
 	spin_unlock_irq(&hugetlb_lock);
 
-	folio = alloc_fresh_hugetlb_folio(h, gfp_mask, nid, nmask, NULL);
+	folio = alloc_fresh_hugetlb_folio(h, gfp_mask, nid, nmask);
 	if (!folio)
 		return NULL;
 
@@ -2549,7 +2547,7 @@ static struct folio *alloc_migrate_hugetlb_folio(struct hstate *h, gfp_t gfp_mas
 	if (hstate_is_gigantic(h))
 		return NULL;
 
-	folio = alloc_fresh_hugetlb_folio(h, gfp_mask, nid, nmask, NULL);
+	folio = alloc_fresh_hugetlb_folio(h, gfp_mask, nid, nmask);
 	if (!folio)
 		return NULL;
 
@@ -3474,7 +3472,7 @@ static void __init hugetlb_hstate_alloc_pages_onenode(struct hstate *h, int nid)
 		gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
 
 		folio = alloc_fresh_hugetlb_folio(h, gfp_mask, nid,
-				&node_states[N_MEMORY], NULL);
+				&node_states[N_MEMORY]);
 		if (!folio)
 			break;
 		free_huge_folio(folio); /* free it into the hugepage allocator */
--
cgit v1.2.3

From a19621ed4e0ac9e1d296d07dc8ff8c27d5c03b43 Mon Sep 17 00:00:00 2001
From: Kefeng Wang
Date: Wed, 15 May 2024 15:07:06 +0800
Subject: mm: add folio_alloc_mpol()

Patch series "mm: convert to folio_alloc_mpol()".

This patch (of 4):

This adds a new folio_alloc_mpol(), which is like folio_alloc() but
allocates the folio according to a NUMA mempolicy.
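A typical caller, modeled on the conversions later in this series (a sketch only; get_vma_policy() and mpol_cond_put() are the existing mempolicy helpers):

	struct mempolicy *pol;
	struct folio *folio;
	pgoff_t ilx;

	/* look up the policy and interleave index for this VMA/offset */
	pol = get_vma_policy(vma, addr, order, &ilx);
	folio = folio_alloc_mpol(gfp, order, pol, ilx, numa_node_id());
	mpol_cond_put(pol);

Note that folio_alloc_mpol_noprof() adds __GFP_COMP internally, so callers pass plain gfp flags.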
Link: https://lkml.kernel.org/r/20240515070709.78529-1-wangkefeng.wang@huawei.com
Link: https://lkml.kernel.org/r/20240515070709.78529-2-wangkefeng.wang@huawei.com
Signed-off-by: Kefeng Wang
Cc: Hugh Dickins
Signed-off-by: Andrew Morton
---
 include/linux/gfp.h | 8 ++++++++
 mm/mempolicy.c      | 7 +++++++
 2 files changed, 15 insertions(+)

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 7f9691d375f0..f53f76e0b17e 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -303,6 +303,8 @@ struct page *alloc_pages_noprof(gfp_t gfp, unsigned int order);
 struct page *alloc_pages_mpol_noprof(gfp_t gfp, unsigned int order,
 		struct mempolicy *mpol, pgoff_t ilx, int nid);
 struct folio *folio_alloc_noprof(gfp_t gfp, unsigned int order);
+struct folio *folio_alloc_mpol_noprof(gfp_t gfp, unsigned int order,
+		struct mempolicy *mpol, pgoff_t ilx, int nid);
 struct folio *vma_alloc_folio_noprof(gfp_t gfp, int order, struct vm_area_struct *vma,
 		unsigned long addr, bool hugepage);
 #else
@@ -319,6 +321,11 @@ static inline struct folio *folio_alloc_noprof(gfp_t gfp, unsigned int order)
 {
 	return __folio_alloc_node(gfp, order, numa_node_id());
 }
+static inline struct folio *folio_alloc_mpol_noprof(gfp_t gfp, unsigned int order,
+		struct mempolicy *mpol, pgoff_t ilx, int nid)
+{
+	return folio_alloc_noprof(gfp, order);
+}
 #define vma_alloc_folio_noprof(gfp, order, vma, addr, hugepage) \
 	folio_alloc_noprof(gfp, order)
 #endif
@@ -326,6 +333,7 @@ static inline struct folio *folio_alloc_noprof(gfp_t gfp, unsigned int order)
 #define alloc_pages(...)		alloc_hooks(alloc_pages_noprof(__VA_ARGS__))
 #define alloc_pages_mpol(...)		alloc_hooks(alloc_pages_mpol_noprof(__VA_ARGS__))
 #define folio_alloc(...)		alloc_hooks(folio_alloc_noprof(__VA_ARGS__))
+#define folio_alloc_mpol(...)		alloc_hooks(folio_alloc_mpol_noprof(__VA_ARGS__))
 #define vma_alloc_folio(...)		alloc_hooks(vma_alloc_folio_noprof(__VA_ARGS__))
 
 #define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0)
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index aec756ae5637..69c431ef15d5 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -2277,6 +2277,13 @@ struct page *alloc_pages_mpol_noprof(gfp_t gfp, unsigned int order,
 	return page;
 }
 
+struct folio *folio_alloc_mpol_noprof(gfp_t gfp, unsigned int order,
+		struct mempolicy *pol, pgoff_t ilx, int nid)
+{
+	return page_rmappable_folio(alloc_pages_mpol_noprof(gfp | __GFP_COMP,
+							order, pol, ilx, nid));
+}
+
 /**
  * vma_alloc_folio - Allocate a folio for a VMA.
  * @gfp: GFP flags.
--
cgit v1.2.3

From 3174d70cf694c7c1c506fecffdefa0d26a78cf60 Mon Sep 17 00:00:00 2001
From: Kefeng Wang
Date: Wed, 15 May 2024 15:07:07 +0800
Subject: mm: mempolicy: use folio_alloc_mpol_noprof() in vma_alloc_folio_noprof()

Convert vma_alloc_folio_noprof() to use folio_alloc_mpol_noprof(), making
it use folios throughout.
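Callers are unaffected by the conversion; a hypothetical user, e.g. in a fault path, still looks like:

	/* illustrative fragment; vma and vmf stand for a caller's locals */
	folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, vmf->address, false);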
Link: https://lkml.kernel.org/r/20240515070709.78529-3-wangkefeng.wang@huawei.com
Signed-off-by: Kefeng Wang
Cc: Hugh Dickins
Signed-off-by: Andrew Morton
---
 mm/mempolicy.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 69c431ef15d5..205d129c6744 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -2305,13 +2305,12 @@ struct folio *vma_alloc_folio_noprof(gfp_t gfp, int order, struct vm_area_struct
 {
 	struct mempolicy *pol;
 	pgoff_t ilx;
-	struct page *page;
+	struct folio *folio;
 
 	pol = get_vma_policy(vma, addr, order, &ilx);
-	page = alloc_pages_mpol_noprof(gfp | __GFP_COMP, order,
-				       pol, ilx, numa_node_id());
+	folio = folio_alloc_mpol_noprof(gfp, order, pol, ilx, numa_node_id());
 	mpol_cond_put(pol);
-	return page_rmappable_folio(page);
+	return folio;
 }
 EXPORT_SYMBOL(vma_alloc_folio_noprof);
 
--
cgit v1.2.3

From 1d9cb7852bae9884eef413c213f82158ea84bebf Mon Sep 17 00:00:00 2001
From: Kefeng Wang
Date: Wed, 15 May 2024 15:07:08 +0800
Subject: mm: mempolicy: use folio_alloc_mpol() in alloc_migration_target_by_mpol()

Convert alloc_migration_target_by_mpol() to use folio_alloc_mpol(), making
it use folios throughout.

Link: https://lkml.kernel.org/r/20240515070709.78529-4-wangkefeng.wang@huawei.com
Signed-off-by: Kefeng Wang
Cc: Hugh Dickins
Signed-off-by: Andrew Morton
---
 mm/mempolicy.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 205d129c6744..f73acb01ad45 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1211,7 +1211,6 @@ static struct folio *alloc_migration_target_by_mpol(struct folio *src,
 	struct migration_mpol *mmpol = (struct migration_mpol *)private;
 	struct mempolicy *pol = mmpol->pol;
 	pgoff_t ilx = mmpol->ilx;
-	struct page *page;
 	unsigned int order;
 	int nid = numa_node_id();
 	gfp_t gfp;
@@ -1235,8 +1234,7 @@ static struct folio *alloc_migration_target_by_mpol(struct folio *src,
 	else
 		gfp = GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL | __GFP_COMP;
 
-	page = alloc_pages_mpol(gfp, order, pol, ilx, nid);
-	return page_rmappable_folio(page);
+	return folio_alloc_mpol(gfp, order, pol, ilx, nid);
 }
 #else
 
--
cgit v1.2.3

From 6f775463d0027733caebfb75d62ec7c4f807f834 Mon Sep 17 00:00:00 2001
From: Kefeng Wang
Date: Wed, 15 May 2024 15:07:09 +0800
Subject: mm: shmem: use folio_alloc_mpol() in shmem_alloc_folio()

Let's change shmem_alloc_folio() to take an order and use the
folio_alloc_mpol() helper, then use it directly for both normal and large
folios to clean up the code.
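The huge and order-0 paths then collapse into one helper; as the diff below shows, the two call patterns become simply:

	folio = shmem_alloc_folio(gfp, HPAGE_PMD_ORDER, info, index);	/* THP */
	folio = shmem_alloc_folio(gfp, 0, info, index);			/* order-0 */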
Link: https://lkml.kernel.org/r/20240515070709.78529-5-wangkefeng.wang@huawei.com
Signed-off-by: Kefeng Wang
Cc: Hugh Dickins
Signed-off-by: Andrew Morton
---
 mm/shmem.c | 32 +++++++++-----------------------
 1 file changed, 9 insertions(+), 23 deletions(-)

diff --git a/mm/shmem.c b/mm/shmem.c
index a8b181a63402..b2dce4103f0e 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1603,32 +1603,18 @@ static gfp_t limit_gfp_mask(gfp_t huge_gfp, gfp_t limit_gfp)
 	return result;
 }
 
-static struct folio *shmem_alloc_hugefolio(gfp_t gfp,
+static struct folio *shmem_alloc_folio(gfp_t gfp, int order,
 		struct shmem_inode_info *info, pgoff_t index)
 {
 	struct mempolicy *mpol;
 	pgoff_t ilx;
-	struct page *page;
-
-	mpol = shmem_get_pgoff_policy(info, index, HPAGE_PMD_ORDER, &ilx);
-	page = alloc_pages_mpol(gfp, HPAGE_PMD_ORDER, mpol, ilx, numa_node_id());
-	mpol_cond_put(mpol);
-
-	return page_rmappable_folio(page);
-}
-
-static struct folio *shmem_alloc_folio(gfp_t gfp,
-		struct shmem_inode_info *info, pgoff_t index)
-{
-	struct mempolicy *mpol;
-	pgoff_t ilx;
-	struct page *page;
+	struct folio *folio;
 
-	mpol = shmem_get_pgoff_policy(info, index, 0, &ilx);
-	page = alloc_pages_mpol(gfp, 0, mpol, ilx, numa_node_id());
+	mpol = shmem_get_pgoff_policy(info, index, order, &ilx);
+	folio = folio_alloc_mpol(gfp, order, mpol, ilx, numa_node_id());
 	mpol_cond_put(mpol);
 
-	return (struct folio *)page;
+	return folio;
 }
 
 static struct folio *shmem_alloc_and_add_folio(gfp_t gfp,
@@ -1660,12 +1646,12 @@ static struct folio *shmem_alloc_and_add_folio(gfp_t gfp,
 					index + HPAGE_PMD_NR - 1, XA_PRESENT))
 			return ERR_PTR(-E2BIG);
 
-		folio = shmem_alloc_hugefolio(gfp, info, index);
+		folio = shmem_alloc_folio(gfp, HPAGE_PMD_ORDER, info, index);
 		if (!folio)
 			count_vm_event(THP_FILE_FALLBACK);
 	} else {
 		pages = 1;
-		folio = shmem_alloc_folio(gfp, info, index);
+		folio = shmem_alloc_folio(gfp, 0, info, index);
 	}
 	if (!folio)
 		return ERR_PTR(-ENOMEM);
@@ -1765,7 +1751,7 @@ static int shmem_replace_folio(struct folio **foliop, gfp_t gfp,
 	 */
 	gfp &= ~GFP_CONSTRAINT_MASK;
 	VM_BUG_ON_FOLIO(folio_test_large(old), old);
-	new = shmem_alloc_folio(gfp, info, index);
+	new = shmem_alloc_folio(gfp, 0, info, index);
 	if (!new)
 		return -ENOMEM;
 
@@ -2633,7 +2619,7 @@ int shmem_mfill_atomic_pte(pmd_t *dst_pmd,
 
 	if (!*foliop) {
 		ret = -ENOMEM;
-		folio = shmem_alloc_folio(gfp, info, pgoff);
+		folio = shmem_alloc_folio(gfp, 0, info, pgoff);
 		if (!folio)
 			goto out_unacct_blocks;
 
--
cgit v1.2.3

From 7f83bf14603ef41a44dc907594d749a283e22c37 Mon Sep 17 00:00:00 2001
From: Ran Xiaokai
Date: Wed, 15 May 2024 10:47:54 +0800
Subject: mm/huge_memory: mark racy access on huge_anon_orders_always

huge_anon_orders_always is accessed locklessly, so it is better to use the
READ_ONCE() wrapper.  This is not fixing any visible bug, but hopefully it
can quiet some KCSAN complaints in the future.
Also do that for huge_anon_orders_madvise.

Link: https://lkml.kernel.org/r/20240515104754889HqrahFPePOIE1UlANHVAh@zte.com.cn
Signed-off-by: Ran Xiaokai
Acked-by: David Hildenbrand
Reviewed-by: Lu Zhongjun
Reviewed-by: xu xin
Cc: Yang Yang
Cc: Matthew Wilcox (Oracle)
Cc: Yang Shi
Cc: Zi Yan
Signed-off-by: Andrew Morton
---
 include/linux/huge_mm.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 2aa986a5cd1b..088d66a54643 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -134,8 +134,8 @@ static inline bool hugepage_flags_enabled(void)
 	 * So we don't need to look at huge_anon_orders_inherit.
	 */
 	return hugepage_global_enabled() ||
-	       huge_anon_orders_always ||
-	       huge_anon_orders_madvise;
+	       READ_ONCE(huge_anon_orders_always) ||
+	       READ_ONCE(huge_anon_orders_madvise);
 }
 
 static inline int highest_order(unsigned long orders)
--
cgit v1.2.3

From b82b530740b960eb949813043c4187c254039ac1 Mon Sep 17 00:00:00 2001
From: Johannes Weiner
Date: Tue, 14 May 2024 16:26:41 -0400
Subject: mm: vmscan: restore incremental cgroup iteration

Currently, reclaim always walks the entire cgroup tree in order to ensure
fairness between groups.  While overreclaim is limited in shrink_lruvec(),
many of our systems have a sizable number of active groups, and an even
bigger number of idle cgroups with cache left behind by previous jobs; the
mere act of walking all these cgroups can impose significant latency on
direct reclaimers.

In the past, we've used a save-and-restore iterator that enabled
incremental tree walks over multiple reclaim invocations.  This ensured
fairness, while keeping the work of individual reclaimers small.  However,
in edge cases with a lot of reclaim concurrency, individual reclaimers
would sometimes not see enough of the cgroup tree to make forward progress
and (prematurely) declare OOM.  Consequently we switched to comprehensive
walks in 1ba6fc9af35b ("mm: vmscan: do not share cgroup iteration between
reclaimers").

To address the latency problem without bringing back the premature OOM
issue, reinstate the shared iteration, but with a restart condition to do
the full walk in the OOM case - similar to what we do for memory.low
enforcement and active page protection.

In the worst case, we do one more full tree walk before declaring OOM.
But the vast majority of direct reclaim scans can then finish much
quicker, while fairness across the tree is maintained:

- Before this patch, we observed that direct reclaim always takes more
  than 100us and most direct reclaim time is spent in reclaim cycles
  lasting between 1ms and 1 second.  Almost 40% of direct reclaim time
  was spent on reclaim cycles exceeding 100ms.

- With this patch, almost all page reclaim cycles last less than 10ms,
  and a good amount of direct page reclaim finishes in under 100us.  No
  page reclaim cycles lasting over 100ms were observed anymore.

The shared iterator state is maintained inside the target cgroup, so fair
and incremental walks are performed during both global reclaim and cgroup
limit reclaim of complex subtrees.
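Schematically, the save-and-restore iteration being reinstated looks like this (a distilled sketch of the shrink_node_memcgs() change in the diff below; shrink_one() and done_enough() are placeholders for the real scan logic):

	struct mem_cgroup_reclaim_cookie reclaim = { .pgdat = pgdat };
	struct mem_cgroup *memcg;

	/* passing a cookie resumes the walk where the last reclaimer stopped */
	memcg = mem_cgroup_iter(target_memcg, NULL, &reclaim);
	while (memcg) {
		shrink_one(memcg);
		if (done_enough()) {
			/* drop the reference but keep the shared cursor */
			mem_cgroup_iter_break(target_memcg, memcg);
			break;
		}
		memcg = mem_cgroup_iter(target_memcg, memcg, &reclaim);
	}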
Link: https://lkml.kernel.org/r/20240514202641.2821494-1-hannes@cmpxchg.org
Signed-off-by: Johannes Weiner
Signed-off-by: Rik van Riel
Reported-by: Rik van Riel
Reviewed-by: Shakeel Butt
Reviewed-by: Roman Gushchin
Cc: Facebook Kernel Team
Cc: Michal Hocko
Cc: Rik van Riel
Signed-off-by: Andrew Morton
---
 mm/vmscan.c | 42 ++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 40 insertions(+), 2 deletions(-)

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 2e34de9cd0d4..a72864b4b620 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -128,6 +128,9 @@ struct scan_control {
 	unsigned int memcg_low_reclaim:1;
 	unsigned int memcg_low_skipped:1;
 
+	/* Shared cgroup tree walk failed, rescan the whole tree */
+	unsigned int memcg_full_walk:1;
+
 	unsigned int hibernation_mode:1;
 
 	/* One of the zones is ready for compaction */
@@ -5845,9 +5848,25 @@ static inline bool should_continue_reclaim(struct pglist_data *pgdat,
 static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc)
 {
 	struct mem_cgroup *target_memcg = sc->target_mem_cgroup;
+	struct mem_cgroup_reclaim_cookie reclaim = {
+		.pgdat = pgdat,
+	};
+	struct mem_cgroup_reclaim_cookie *partial = &reclaim;
 	struct mem_cgroup *memcg;
 
-	memcg = mem_cgroup_iter(target_memcg, NULL, NULL);
+	/*
+	 * In most cases, direct reclaimers can do partial walks
+	 * through the cgroup tree, using an iterator state that
+	 * persists across invocations. This strikes a balance between
+	 * fairness and allocation latency.
+	 *
+	 * For kswapd, reliable forward progress is more important
+	 * than a quick return to idle. Always do full walks.
+	 */
+	if (current_is_kswapd() || sc->memcg_full_walk)
+		partial = NULL;
+
+	memcg = mem_cgroup_iter(target_memcg, NULL, partial);
 	do {
 		struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
 		unsigned long reclaimed;
@@ -5897,7 +5916,12 @@ static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc)
 			   sc->nr_scanned - scanned,
 			   sc->nr_reclaimed - reclaimed);
 
-	} while ((memcg = mem_cgroup_iter(target_memcg, memcg, NULL)));
+		/* If partial walks are allowed, bail once goal is reached */
+		if (partial && sc->nr_reclaimed >= sc->nr_to_reclaim) {
+			mem_cgroup_iter_break(target_memcg, memcg);
+			break;
+		}
+	} while ((memcg = mem_cgroup_iter(target_memcg, memcg, partial)));
 }
 
 static void shrink_node(pg_data_t *pgdat, struct scan_control *sc)
@@ -6270,6 +6294,20 @@ retry:
 	if (sc->compaction_ready)
 		return 1;
 
+	/*
+	 * In most cases, direct reclaimers can do partial walks
+	 * through the cgroup tree to meet the reclaim goal while
+	 * keeping latency low. Since the iterator state is shared
+	 * among all direct reclaim invocations (to retain fairness
+	 * among cgroups), though, high concurrency can result in
+	 * individual threads not seeing enough cgroups to make
+	 * meaningful forward progress. Avoid false OOMs in this case.
+	 */
+	if (!sc->memcg_full_walk) {
+		sc->memcg_full_walk = 1;
+		goto retry;
+	}
+
 	/*
 	 * We make inactive:active ratio decisions based on the node's
 	 * composition of memory, but a restrictive reclaim_idx or a
--
cgit v1.2.3

From 462966dc7d701b5c251f280e1c90e8fd301f11e3 Mon Sep 17 00:00:00 2001
From: Shakeel Butt
Date: Wed, 29 May 2024 08:49:11 -0700
Subject: mm: vmscan: reset sc->priority on retry

Commit 6be5e186fd65 ("mm: vmscan: restore incremental cgroup iteration")
added a retry heuristic that iterates all the cgroups before returning an
unsuccessful reclaim, but it neglected to reset sc->priority.  Let's fix
it.
Link: https://lkml.kernel.org/r/20240529154911.3008025-1-shakeel.butt@linux.dev Fixes: 6be5e186fd65 ("mm: vmscan: restore incremental cgroup iteration") Signed-off-by: Shakeel Butt Reported-by: syzbot+17416257cb95200cba44@syzkaller.appspotmail.com Tested-by: syzbot+17416257cb95200cba44@syzkaller.appspotmail.com Reviewed-by: Roman Gushchin Cc: Johannes Weiner Cc: Michal Hocko Cc: Rik van Riel Signed-off-by: Andrew Morton --- mm/vmscan.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/vmscan.c b/mm/vmscan.c index a72864b4b620..d1a87ceef0dc 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -6304,6 +6304,7 @@ retry: * meaningful forward progress. Avoid false OOMs in this case. */ if (!sc->memcg_full_walk) { + sc->priority = initial_priority; sc->memcg_full_walk = 1; goto retry; } -- cgit v1.2.3 From 7c0c629be5189b86acfd4a960c90e20eface9d84 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Tue, 14 May 2024 20:52:47 +0800 Subject: writeback: factor out wb_bg_dirty_limits to remove repeated code Patch series "Add helper functions to remove repeated code and improve readability of cgroup writeback", v2. This series adds a lot of helpers to remove repeated code between domain and wb; dirty limit and dirty background; global domain and wb domain. The helpers also improve readability. More details can be found in the respective patches. A simple domain hierarchy is tested: global domain (> 20G) | cgroup domain1(10G) | wb1 | fio Test steps: /* make it easy to observe */ echo 300000 > /proc/sys/vm/dirty_expire_centisecs echo 3000 > /proc/sys/vm/dirty_writeback_centisecs /* create cgroup domain */ cd /sys/fs/cgroup echo "+memory +io" > cgroup.subtree_control mkdir group1 cd group1 echo 10G > memory.high echo 10G > memory.max echo $$ > cgroup.procs mkfs.ext4 -F /dev/vdb mount /dev/vdb /bdi1/ /* run fio to generate dirty pages */ fio -name test -filename=/bdi1/file -size=xxx -ioengine=libaio -bs=4K \ -iodepth=1 -rw=write -direct=0 --time_based -runtime=600 -invalidate=0 When fio size is 1G, the wb is in freerun state and dirty pages are only written back when dirty inode is expired after 30 seconds. When fio size is 2G, the dirty pages keep being written back and bandwidth of fio is limited. This patch (of 8): Similar to wb_dirty_limits which calculates dirty and thresh of wb, wb_bg_dirty_limits calculates background dirty and background thresh of wb. With wb_bg_dirty_limits, we could remove repeated code in wb_over_bg_thresh. Link: https://lkml.kernel.org/r/20240514125254.142203-1-shikemeng@huaweicloud.com Link: https://lkml.kernel.org/r/20240514125254.142203-2-shikemeng@huaweicloud.com Signed-off-by: Kemeng Shi Acked-by: Tejun Heo Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/page-writeback.c | 35 +++++++++++++++++++---------------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 12c9297ed4a7..4e5d933fd0c8 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2087,6 +2087,21 @@ void balance_dirty_pages_ratelimited(struct address_space *mapping) } EXPORT_SYMBOL(balance_dirty_pages_ratelimited); +/* + * Similar to wb_dirty_limits, wb_bg_dirty_limits also calculates dirty + * and thresh, but it's for background writeback. 
+ */ +static void wb_bg_dirty_limits(struct dirty_throttle_control *dtc) +{ + struct bdi_writeback *wb = dtc->wb; + + dtc->wb_bg_thresh = __wb_calc_thresh(dtc, dtc->bg_thresh); + if (dtc->wb_bg_thresh < 2 * wb_stat_error()) + dtc->wb_dirty = wb_stat_sum(wb, WB_RECLAIMABLE); + else + dtc->wb_dirty = wb_stat(wb, WB_RECLAIMABLE); +} + /** * wb_over_bg_thresh - does @wb need to be written back? * @wb: bdi_writeback of interest @@ -2103,8 +2118,6 @@ bool wb_over_bg_thresh(struct bdi_writeback *wb) struct dirty_throttle_control * const gdtc = &gdtc_stor; struct dirty_throttle_control * const mdtc = mdtc_valid(&mdtc_stor) ? &mdtc_stor : NULL; - unsigned long reclaimable; - unsigned long thresh; /* * Similar to balance_dirty_pages() but ignores pages being written @@ -2117,13 +2130,8 @@ bool wb_over_bg_thresh(struct bdi_writeback *wb) if (gdtc->dirty > gdtc->bg_thresh) return true; - thresh = __wb_calc_thresh(gdtc, gdtc->bg_thresh); - if (thresh < 2 * wb_stat_error()) - reclaimable = wb_stat_sum(wb, WB_RECLAIMABLE); - else - reclaimable = wb_stat(wb, WB_RECLAIMABLE); - - if (reclaimable > thresh) + wb_bg_dirty_limits(gdtc); + if (gdtc->wb_dirty > gdtc->wb_bg_thresh) return true; if (mdtc) { @@ -2137,13 +2145,8 @@ bool wb_over_bg_thresh(struct bdi_writeback *wb) if (mdtc->dirty > mdtc->bg_thresh) return true; - thresh = __wb_calc_thresh(mdtc, mdtc->bg_thresh); - if (thresh < 2 * wb_stat_error()) - reclaimable = wb_stat_sum(wb, WB_RECLAIMABLE); - else - reclaimable = wb_stat(wb, WB_RECLAIMABLE); - - if (reclaimable > thresh) + wb_bg_dirty_limits(mdtc); + if (mdtc->wb_dirty > mdtc->wb_bg_thresh) return true; } -- cgit v1.2.3 From ba62d5cfe181845ae850f5af45cf258b9b8f703f Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Tue, 14 May 2024 20:52:48 +0800 Subject: writeback: add general function domain_dirty_avail to calculate dirty and avail of domain Add general function domain_dirty_avail to calculate dirty and avail for either dirty limit or background writeback in either global domain or wb domain. Link: https://lkml.kernel.org/r/20240514125254.142203-3-shikemeng@huaweicloud.com Signed-off-by: Kemeng Shi Acked-by: Tejun Heo Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/page-writeback.c | 65 ++++++++++++++++++++++++++++------------------------- 1 file changed, 34 insertions(+), 31 deletions(-) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 4e5d933fd0c8..fe5d5a1856f2 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -837,6 +837,34 @@ static void mdtc_calc_avail(struct dirty_throttle_control *mdtc, mdtc->avail = filepages + min(headroom, other_clean); } +static inline bool dtc_is_global(struct dirty_throttle_control *dtc) +{ + return mdtc_gdtc(dtc) == NULL; +} + +/* + * Dirty background will ignore pages being written as we're trying to + * decide whether to put more under writeback. 
+ */ +static void domain_dirty_avail(struct dirty_throttle_control *dtc, + bool include_writeback) +{ + if (dtc_is_global(dtc)) { + dtc->avail = global_dirtyable_memory(); + dtc->dirty = global_node_page_state(NR_FILE_DIRTY); + if (include_writeback) + dtc->dirty += global_node_page_state(NR_WRITEBACK); + } else { + unsigned long filepages = 0, headroom = 0, writeback = 0; + + mem_cgroup_wb_stats(dtc->wb, &filepages, &headroom, &dtc->dirty, + &writeback); + if (include_writeback) + dtc->dirty += writeback; + mdtc_calc_avail(dtc, filepages, headroom); + } +} + /** * __wb_calc_thresh - @wb's share of dirty threshold * @dtc: dirty_throttle_context of interest @@ -899,16 +927,9 @@ unsigned long cgwb_calc_thresh(struct bdi_writeback *wb) { struct dirty_throttle_control gdtc = { GDTC_INIT_NO_WB }; struct dirty_throttle_control mdtc = { MDTC_INIT(wb, &gdtc) }; - unsigned long filepages = 0, headroom = 0, writeback = 0; - gdtc.avail = global_dirtyable_memory(); - gdtc.dirty = global_node_page_state(NR_FILE_DIRTY) + - global_node_page_state(NR_WRITEBACK); - - mem_cgroup_wb_stats(wb, &filepages, &headroom, - &mdtc.dirty, &writeback); - mdtc.dirty += writeback; - mdtc_calc_avail(&mdtc, filepages, headroom); + domain_dirty_avail(&gdtc, true); + domain_dirty_avail(&mdtc, true); domain_dirty_limits(&mdtc); return __wb_calc_thresh(&mdtc, mdtc.thresh); @@ -1719,9 +1740,8 @@ static int balance_dirty_pages(struct bdi_writeback *wb, unsigned long m_bg_thresh = 0; nr_dirty = global_node_page_state(NR_FILE_DIRTY); - gdtc->avail = global_dirtyable_memory(); - gdtc->dirty = nr_dirty + global_node_page_state(NR_WRITEBACK); + domain_dirty_avail(gdtc, true); domain_dirty_limits(gdtc); if (unlikely(strictlimit)) { @@ -1737,17 +1757,11 @@ static int balance_dirty_pages(struct bdi_writeback *wb, } if (mdtc) { - unsigned long filepages, headroom, writeback; - /* * If @wb belongs to !root memcg, repeat the same * basic calculations for the memcg domain. */ - mem_cgroup_wb_stats(wb, &filepages, &headroom, - &mdtc->dirty, &writeback); - mdtc->dirty += writeback; - mdtc_calc_avail(mdtc, filepages, headroom); - + domain_dirty_avail(mdtc, true); domain_dirty_limits(mdtc); if (unlikely(strictlimit)) { @@ -2119,14 +2133,8 @@ bool wb_over_bg_thresh(struct bdi_writeback *wb) struct dirty_throttle_control * const mdtc = mdtc_valid(&mdtc_stor) ? &mdtc_stor : NULL; - /* - * Similar to balance_dirty_pages() but ignores pages being written - * as we're trying to decide whether to put more under writeback. - */ - gdtc->avail = global_dirtyable_memory(); - gdtc->dirty = global_node_page_state(NR_FILE_DIRTY); + domain_dirty_avail(gdtc, false); domain_dirty_limits(gdtc); - if (gdtc->dirty > gdtc->bg_thresh) return true; @@ -2135,13 +2143,8 @@ bool wb_over_bg_thresh(struct bdi_writeback *wb) return true; if (mdtc) { - unsigned long filepages, headroom, writeback; - - mem_cgroup_wb_stats(wb, &filepages, &headroom, &mdtc->dirty, - &writeback); - mdtc_calc_avail(mdtc, filepages, headroom); + domain_dirty_avail(mdtc, false); domain_dirty_limits(mdtc); /* ditto, ignore writeback */ - if (mdtc->dirty > mdtc->bg_thresh) return true; -- cgit v1.2.3 From 6e208329687e8a6a9db4f63ef351c0d0c43a39e0 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Tue, 14 May 2024 20:52:49 +0800 Subject: writeback: factor out domain_over_bg_thresh to remove repeated code Factor out domain_over_bg_thresh from wb_over_bg_thresh to remove repeated code. 
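For context, wb_over_bg_thresh() is the check the flusher thread uses to decide whether background writeback should run at all; the consumer in fs/fs-writeback.c looks roughly like this (reproduced from memory, not part of this patch):

	static long wb_check_background_flush(struct bdi_writeback *wb)
	{
		if (wb_over_bg_thresh(wb)) {
			struct wb_writeback_work work = {
				.nr_pages	= LONG_MAX,
				.sync_mode	= WB_SYNC_NONE,
				.for_background	= 1,
				.range_cyclic	= 1,
				.reason		= WB_REASON_BACKGROUND,
			};

			return wb_writeback(wb, &work);
		}

		return 0;
	}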
Link: https://lkml.kernel.org/r/20240514125254.142203-4-shikemeng@huaweicloud.com Signed-off-by: Kemeng Shi Acked-by: Tejun Heo Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/page-writeback.c | 41 +++++++++++++++++++---------------------- 1 file changed, 19 insertions(+), 22 deletions(-) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index fe5d5a1856f2..daba24fbfc84 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2116,6 +2116,20 @@ static void wb_bg_dirty_limits(struct dirty_throttle_control *dtc) dtc->wb_dirty = wb_stat(wb, WB_RECLAIMABLE); } +static bool domain_over_bg_thresh(struct dirty_throttle_control *dtc) +{ + domain_dirty_avail(dtc, false); + domain_dirty_limits(dtc); + if (dtc->dirty > dtc->bg_thresh) + return true; + + wb_bg_dirty_limits(dtc); + if (dtc->wb_dirty > dtc->wb_bg_thresh) + return true; + + return false; +} + /** * wb_over_bg_thresh - does @wb need to be written back? * @wb: bdi_writeback of interest @@ -2127,31 +2141,14 @@ static void wb_bg_dirty_limits(struct dirty_throttle_control *dtc) */ bool wb_over_bg_thresh(struct bdi_writeback *wb) { - struct dirty_throttle_control gdtc_stor = { GDTC_INIT(wb) }; - struct dirty_throttle_control mdtc_stor = { MDTC_INIT(wb, &gdtc_stor) }; - struct dirty_throttle_control * const gdtc = &gdtc_stor; - struct dirty_throttle_control * const mdtc = mdtc_valid(&mdtc_stor) ? - &mdtc_stor : NULL; - - domain_dirty_avail(gdtc, false); - domain_dirty_limits(gdtc); - if (gdtc->dirty > gdtc->bg_thresh) - return true; + struct dirty_throttle_control gdtc = { GDTC_INIT(wb) }; + struct dirty_throttle_control mdtc = { MDTC_INIT(wb, &gdtc) }; - wb_bg_dirty_limits(gdtc); - if (gdtc->wb_dirty > gdtc->wb_bg_thresh) + if (domain_over_bg_thresh(&gdtc)) return true; - if (mdtc) { - domain_dirty_avail(mdtc, false); - domain_dirty_limits(mdtc); /* ditto, ignore writeback */ - if (mdtc->dirty > mdtc->bg_thresh) - return true; - - wb_bg_dirty_limits(mdtc); - if (mdtc->wb_dirty > mdtc->wb_bg_thresh) - return true; - } + if (mdtc_valid(&mdtc)) + return domain_over_bg_thresh(&mdtc); return false; } -- cgit v1.2.3 From 9bb48a70386326acf22e4523d1f72c912f5854fe Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Tue, 14 May 2024 20:52:50 +0800 Subject: writeback: factor out code of freerun to remove repeated code Factor out code of freerun into new helper functions domain_poll_intv and domain_dirty_freerun to remove repeated code. Link: https://lkml.kernel.org/r/20240514125254.142203-5-shikemeng@huaweicloud.com Signed-off-by: Kemeng Shi Acked-by: Tejun Heo Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/page-writeback.c | 89 +++++++++++++++++++++++++++++------------------------ 1 file changed, 49 insertions(+), 40 deletions(-) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index daba24fbfc84..72d55c2fe432 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -139,6 +139,7 @@ struct dirty_throttle_control { unsigned long wb_bg_thresh; unsigned long pos_ratio; + bool freerun; }; /* @@ -1702,6 +1703,49 @@ static inline void wb_dirty_limits(struct dirty_throttle_control *dtc) } } +static unsigned long domain_poll_intv(struct dirty_throttle_control *dtc, + bool strictlimit) +{ + unsigned long dirty, thresh; + + if (strictlimit) { + dirty = dtc->wb_dirty; + thresh = dtc->wb_thresh; + } else { + dirty = dtc->dirty; + thresh = dtc->thresh; + } + + return dirty_poll_interval(dirty, thresh); +} + +/* + * Throttle it only when the background writeback cannot catch-up. 
This avoids + * (excessively) small writeouts when the wb limits are ramping up in case of + * !strictlimit. + * + * In strictlimit case make decision based on the wb counters and limits. Small + * writeouts when the wb limits are ramping up are the price we consciously pay + * for strictlimit-ing. + */ +static void domain_dirty_freerun(struct dirty_throttle_control *dtc, + bool strictlimit) +{ + unsigned long dirty, thresh, bg_thresh; + + if (unlikely(strictlimit)) { + wb_dirty_limits(dtc); + dirty = dtc->wb_dirty; + thresh = dtc->wb_thresh; + bg_thresh = dtc->wb_bg_thresh; + } else { + dirty = dtc->dirty; + thresh = dtc->thresh; + bg_thresh = dtc->bg_thresh; + } + dtc->freerun = dirty <= dirty_freerun_ceiling(thresh, bg_thresh); +} + /* * balance_dirty_pages() must be called by processes which are generating dirty * data. It looks at the number of dirty pages in the machine and will force @@ -1734,27 +1778,12 @@ static int balance_dirty_pages(struct bdi_writeback *wb, for (;;) { unsigned long now = jiffies; - unsigned long dirty, thresh, bg_thresh; - unsigned long m_dirty = 0; /* stop bogus uninit warnings */ - unsigned long m_thresh = 0; - unsigned long m_bg_thresh = 0; nr_dirty = global_node_page_state(NR_FILE_DIRTY); domain_dirty_avail(gdtc, true); domain_dirty_limits(gdtc); - - if (unlikely(strictlimit)) { - wb_dirty_limits(gdtc); - - dirty = gdtc->wb_dirty; - thresh = gdtc->wb_thresh; - bg_thresh = gdtc->wb_bg_thresh; - } else { - dirty = gdtc->dirty; - thresh = gdtc->thresh; - bg_thresh = gdtc->bg_thresh; - } + domain_dirty_freerun(gdtc, strictlimit); if (mdtc) { /* @@ -1763,17 +1792,7 @@ static int balance_dirty_pages(struct bdi_writeback *wb, */ domain_dirty_avail(mdtc, true); domain_dirty_limits(mdtc); - - if (unlikely(strictlimit)) { - wb_dirty_limits(mdtc); - m_dirty = mdtc->wb_dirty; - m_thresh = mdtc->wb_thresh; - m_bg_thresh = mdtc->wb_bg_thresh; - } else { - m_dirty = mdtc->dirty; - m_thresh = mdtc->thresh; - m_bg_thresh = mdtc->bg_thresh; - } + domain_dirty_freerun(mdtc, strictlimit); } /* @@ -1790,31 +1809,21 @@ static int balance_dirty_pages(struct bdi_writeback *wb, wb_start_background_writeback(wb); /* - * Throttle it only when the background writeback cannot - * catch-up. This avoids (excessively) small writeouts - * when the wb limits are ramping up in case of !strictlimit. - * - * In strictlimit case make decision based on the wb counters - * and limits. Small writeouts when the wb limits are ramping - * up are the price we consciously pay for strictlimit-ing. - * * If memcg domain is in effect, @dirty should be under * both global and memcg freerun ceilings. */ - if (dirty <= dirty_freerun_ceiling(thresh, bg_thresh) && - (!mdtc || - m_dirty <= dirty_freerun_ceiling(m_thresh, m_bg_thresh))) { + if (gdtc->freerun && (!mdtc || mdtc->freerun)) { unsigned long intv; unsigned long m_intv; free_running: - intv = dirty_poll_interval(dirty, thresh); + intv = domain_poll_intv(gdtc, strictlimit); m_intv = ULONG_MAX; current->dirty_paused_when = now; current->nr_dirtied = 0; if (mdtc) - m_intv = dirty_poll_interval(m_dirty, m_thresh); + m_intv = domain_poll_intv(mdtc, strictlimit); current->nr_dirtied_pause = min(intv, m_intv); break; } -- cgit v1.2.3 From 2530e2399b84b91b1dbb200e0c3b4eb76d3870ce Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Tue, 14 May 2024 20:52:51 +0800 Subject: writeback: factor out wb_dirty_freerun to remove more repeated freerun code Factor out wb_dirty_freerun to remove more repeated freerun code. 
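The "freerun" region these helpers test for is bounded by the midpoint between the background and hard dirty thresholds; for context, the ceiling helper in mm/page-writeback.c (reproduced here from memory) is simply:

	static unsigned long dirty_freerun_ceiling(unsigned long thresh,
						   unsigned long bg_thresh)
	{
		return (thresh + bg_thresh) / 2;
	}

Below this ceiling a task may keep dirtying pages without being throttled at all.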
Link: https://lkml.kernel.org/r/20240514125254.142203-6-shikemeng@huaweicloud.com Signed-off-by: Kemeng Shi Acked-by: Tejun Heo Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/page-writeback.c | 55 +++++++++++++++++++++++++++-------------------------- 1 file changed, 28 insertions(+), 27 deletions(-) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 72d55c2fe432..a93505cadd4e 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1746,6 +1746,27 @@ static void domain_dirty_freerun(struct dirty_throttle_control *dtc, dtc->freerun = dirty <= dirty_freerun_ceiling(thresh, bg_thresh); } +static void wb_dirty_freerun(struct dirty_throttle_control *dtc, + bool strictlimit) +{ + dtc->freerun = false; + + /* was already handled in domain_dirty_freerun */ + if (strictlimit) + return; + + wb_dirty_limits(dtc); + /* + * LOCAL_THROTTLE tasks must not be throttled when below the per-wb + * freerun ceiling. + */ + if (!(current->flags & PF_LOCAL_THROTTLE)) + return; + + dtc->freerun = dtc->wb_dirty < + dirty_freerun_ceiling(dtc->wb_thresh, dtc->wb_bg_thresh); +} + /* * balance_dirty_pages() must be called by processes which are generating dirty * data. It looks at the number of dirty pages in the machine and will force @@ -1838,19 +1859,9 @@ free_running: * Calculate global domain's pos_ratio and select the * global dtc by default. */ - if (!strictlimit) { - wb_dirty_limits(gdtc); - - if ((current->flags & PF_LOCAL_THROTTLE) && - gdtc->wb_dirty < - dirty_freerun_ceiling(gdtc->wb_thresh, - gdtc->wb_bg_thresh)) - /* - * LOCAL_THROTTLE tasks must not be throttled - * when below the per-wb freerun ceiling. - */ - goto free_running; - } + wb_dirty_freerun(gdtc, strictlimit); + if (gdtc->freerun) + goto free_running; dirty_exceeded = (gdtc->wb_dirty > gdtc->wb_thresh) && ((gdtc->dirty > gdtc->thresh) || strictlimit); @@ -1865,20 +1876,10 @@ free_running: * both global and memcg domains. Choose the one * w/ lower pos_ratio. */ - if (!strictlimit) { - wb_dirty_limits(mdtc); - - if ((current->flags & PF_LOCAL_THROTTLE) && - mdtc->wb_dirty < - dirty_freerun_ceiling(mdtc->wb_thresh, - mdtc->wb_bg_thresh)) - /* - * LOCAL_THROTTLE tasks must not be - * throttled when below the per-wb - * freerun ceiling. - */ - goto free_running; - } + wb_dirty_freerun(mdtc, strictlimit); + if (mdtc->freerun) + goto free_running; + dirty_exceeded |= (mdtc->wb_dirty > mdtc->wb_thresh) && ((mdtc->dirty > mdtc->thresh) || strictlimit); -- cgit v1.2.3 From 8c9918dedf78e0ed11b236e528e3f3aed5cad458 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Tue, 14 May 2024 20:52:52 +0800 Subject: writeback: factor out balance_domain_limits to remove repeated code Factor out balance_domain_limits to remove repeated code. 
Link: https://lkml.kernel.org/r/20240514125254.142203-7-shikemeng@huaweicloud.com Signed-off-by: Kemeng Shi Acked-by: Tejun Heo Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/page-writeback.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index a93505cadd4e..e03beb95a62e 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1746,6 +1746,14 @@ static void domain_dirty_freerun(struct dirty_throttle_control *dtc, dtc->freerun = dirty <= dirty_freerun_ceiling(thresh, bg_thresh); } +static void balance_domain_limits(struct dirty_throttle_control *dtc, + bool strictlimit) +{ + domain_dirty_avail(dtc, true); + domain_dirty_limits(dtc); + domain_dirty_freerun(dtc, strictlimit); +} + static void wb_dirty_freerun(struct dirty_throttle_control *dtc, bool strictlimit) { @@ -1802,18 +1810,13 @@ static int balance_dirty_pages(struct bdi_writeback *wb, nr_dirty = global_node_page_state(NR_FILE_DIRTY); - domain_dirty_avail(gdtc, true); - domain_dirty_limits(gdtc); - domain_dirty_freerun(gdtc, strictlimit); - + balance_domain_limits(gdtc, strictlimit); if (mdtc) { /* * If @wb belongs to !root memcg, repeat the same * basic calculations for the memcg domain. */ - domain_dirty_avail(mdtc, true); - domain_dirty_limits(mdtc); - domain_dirty_freerun(mdtc, strictlimit); + balance_domain_limits(mdtc, strictlimit); } /* -- cgit v1.2.3 From 236d0f16eb9d1773ea51ed53c3538c1f589591f1 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Tue, 14 May 2024 20:52:53 +0800 Subject: writeback: factor out wb_dirty_exceeded to remove repeated code Factor out wb_dirty_exceeded to remove repeated code Link: https://lkml.kernel.org/r/20240514125254.142203-8-shikemeng@huaweicloud.com Signed-off-by: Kemeng Shi Acked-by: Tejun Heo Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/page-writeback.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index e03beb95a62e..bf050abd9053 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -140,6 +140,7 @@ struct dirty_throttle_control { unsigned long pos_ratio; bool freerun; + bool dirty_exceeded; }; /* @@ -1775,6 +1776,13 @@ static void wb_dirty_freerun(struct dirty_throttle_control *dtc, dirty_freerun_ceiling(dtc->wb_thresh, dtc->wb_bg_thresh); } +static inline void wb_dirty_exceeded(struct dirty_throttle_control *dtc, + bool strictlimit) +{ + dtc->dirty_exceeded = (dtc->wb_dirty > dtc->wb_thresh) && + ((dtc->dirty > dtc->thresh) || strictlimit); +} + /* * balance_dirty_pages() must be called by processes which are generating dirty * data. 
It looks at the number of dirty pages in the machine and will force @@ -1797,7 +1805,6 @@ static int balance_dirty_pages(struct bdi_writeback *wb, long max_pause; long min_pause; int nr_dirtied_pause; - bool dirty_exceeded = false; unsigned long task_ratelimit; unsigned long dirty_ratelimit; struct backing_dev_info *bdi = wb->bdi; @@ -1866,9 +1873,7 @@ free_running: if (gdtc->freerun) goto free_running; - dirty_exceeded = (gdtc->wb_dirty > gdtc->wb_thresh) && - ((gdtc->dirty > gdtc->thresh) || strictlimit); - + wb_dirty_exceeded(gdtc, strictlimit); wb_position_ratio(gdtc); sdtc = gdtc; @@ -1883,17 +1888,14 @@ free_running: if (mdtc->freerun) goto free_running; - dirty_exceeded |= (mdtc->wb_dirty > mdtc->wb_thresh) && - ((mdtc->dirty > mdtc->thresh) || strictlimit); - + wb_dirty_exceeded(mdtc, strictlimit); wb_position_ratio(mdtc); if (mdtc->pos_ratio < gdtc->pos_ratio) sdtc = mdtc; } - if (dirty_exceeded != wb->dirty_exceeded) - wb->dirty_exceeded = dirty_exceeded; - + wb->dirty_exceeded = gdtc->dirty_exceeded || + (mdtc && mdtc->dirty_exceeded); if (time_is_before_jiffies(READ_ONCE(wb->bw_time_stamp) + BANDWIDTH_INTERVAL)) __wb_update_bandwidth(gdtc, mdtc, true); -- cgit v1.2.3 From 8246291eccdd44cc81684991259680d543bb29b7 Mon Sep 17 00:00:00 2001 From: Kemeng Shi Date: Tue, 14 May 2024 20:52:54 +0800 Subject: writeback: factor out balance_wb_limits to remove repeated code Factor out balance_wb_limits to remove repeated code [shikemeng@huaweicloud.com: add comment] Link: https://lkml.kernel.org/r/20240606033547.344376-1-shikemeng@huaweicloud.com [akpm@linux-foundation.org: s/fileds/fields/ in comment] Link: https://lkml.kernel.org/r/20240514125254.142203-9-shikemeng@huaweicloud.com Signed-off-by: Kemeng Shi Cc: Matthew Wilcox (Oracle) Cc: Tejun Heo Signed-off-by: Andrew Morton --- mm/page-writeback.c | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index bf050abd9053..7168e25f88e5 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1783,6 +1783,21 @@ static inline void wb_dirty_exceeded(struct dirty_throttle_control *dtc, ((dtc->dirty > dtc->thresh) || strictlimit); } +/* + * The limits fields dirty_exceeded and pos_ratio won't be updated if wb is + * in freerun state. Please don't use these invalid fields in freerun case. + */ +static void balance_wb_limits(struct dirty_throttle_control *dtc, + bool strictlimit) +{ + wb_dirty_freerun(dtc, strictlimit); + if (dtc->freerun) + return; + + wb_dirty_exceeded(dtc, strictlimit); + wb_position_ratio(dtc); +} + /* * balance_dirty_pages() must be called by processes which are generating dirty * data. It looks at the number of dirty pages in the machine and will force @@ -1869,12 +1884,9 @@ free_running: * Calculate global domain's pos_ratio and select the * global dtc by default. */ - wb_dirty_freerun(gdtc, strictlimit); + balance_wb_limits(gdtc, strictlimit); if (gdtc->freerun) goto free_running; - - wb_dirty_exceeded(gdtc, strictlimit); - wb_position_ratio(gdtc); sdtc = gdtc; if (mdtc) { @@ -1884,12 +1896,9 @@ free_running: * both global and memcg domains. Choose the one * w/ lower pos_ratio. 
	 */
-			wb_dirty_freerun(mdtc, strictlimit);
+			balance_wb_limits(mdtc, strictlimit);
 			if (mdtc->freerun)
 				goto free_running;
-
-			wb_dirty_exceeded(mdtc, strictlimit);
-			wb_position_ratio(mdtc);
 			if (mdtc->pos_ratio < gdtc->pos_ratio)
 				sdtc = mdtc;
 		}
--
cgit v1.2.3

From 1f49c1476d09168eeaed462e587ed9161784a561 Mon Sep 17 00:00:00 2001
From: Kairui Song
Date: Wed, 22 May 2024 01:58:44 +0800
Subject: nilfs2: drop usage of page_index

Patch series "mm/swap: clean up and optimize swap cache index", v6.

Currently we use one swap_address_space for every 64M chunk to reduce lock
contention; this is like having a set of smaller files inside a swap
device.  But when doing a swap cache lookup or insert, we are still using
the offset within the whole large swap device.  This is OK for
correctness, as the offset (key) is unique.

But Xarray is specially optimized for small indexes; it creates the radix
tree levels lazily, just enough to fit the largest key stored in one
Xarray.  So we are wasting tree nodes unnecessarily.  For a 64M chunk it
should take at most 3 levels to contain everything.  But if we are using
the offset from the whole swap device, the offset (key) value will be way
beyond 64M, and so will the tree level.

Optimize this by reducing the swap cache search space to a 64M scope.

Test with `time memhog 128G` inside an 8G memcg using 128G swap (ramdisk
with SWP_SYNCHRONOUS_IO dropped); tested 3 times, results are stable.  The
test result is similar but the improvement is smaller if
SWP_SYNCHRONOUS_IO is enabled, as the swap-out path can never skip the
swap cache:

Before:
6.07user 250.74system 4:17.26elapsed 99%CPU (0avgtext+0avgdata 8373376maxresident)k
0inputs+0outputs (55major+33555018minor)pagefaults 0swaps

After (+1.8% faster):
6.08user 246.09system 4:12.58elapsed 99%CPU (0avgtext+0avgdata 8373248maxresident)k
0inputs+0outputs (54major+33555027minor)pagefaults 0swaps

Similar result with MySQL and sysbench using swap:
Before:
94055.61 qps
After (+0.8% faster):
94834.91 qps

There is also a very slight drop in radix tree node slab usage:
Before: 303952K
After:  302224K

For this series:

There are multiple places that expect mixed types of pages (page cache or
swap cache), e.g. migration and huge memory split.  There are four helpers
for that:

- page_index
- page_file_offset
- folio_index
- folio_file_pos

To keep the code clean and compatible, this series first cleans up the
usage of them.  page_file_offset and folio_file_pos are historical helpers
that can simply be dropped after the clean up, and page_index can all be
converted to folio_index or folio->index.

Then introduce two new helpers for swap: swap_cache_index and
swap_dev_pos.  Replace swp_offset with swap_cache_index when used to
retrieve a folio from the swap cache, and use swap_dev_pos when needed to
retrieve the device position of a swap entry.  This way, swap_cache_index
can return the optimized value with no compatibility issue.

The result is better performance and reduced LOC.

Ideally, in the future, we may want to reduce SWAP_ADDRESS_SPACE_SHIFT
from 14 to 12: the default Xarray chunk offset is 6, so we have 3-level
trees instead of 2-level trees just for 2 extra bits.  But the swap cache
is based on the address_space struct, and with 4 times more metadata
sparsely distributed in memory it wastes more cachelines; the performance
gain from this series is almost canceled according to my test.  So first,
just have a cleaner separation of offsets and a smaller search space.
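The helper that realizes this, introduced later in the series, reduces to masking the offset down to the 64M address-space scope; roughly (a sketch based on the series description; the kernel spells the mask out as SWAP_ADDRESS_SPACE_MASK):

	/* 2^14 pages * 4K per page = 64M per swap address_space */
	#define SWAP_ADDRESS_SPACE_SHIFT	14
	#define SWAP_ADDRESS_SPACE_PAGES	(1 << SWAP_ADDRESS_SPACE_SHIFT)

	static inline pgoff_t swap_cache_index(swp_entry_t entry)
	{
		return swp_offset(entry) & (SWAP_ADDRESS_SPACE_PAGES - 1);
	}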
This patch (of 10):

page_index is only for mixed usage of the page cache and swap cache; for
pure page cache usage, the caller can just use page->index instead.

It can't be a swap cache page here (it is part of a buffer head), so just
drop it.  And while we are at it, optimize the code by retrieving the
offset of the buffer head within the folio directly using bh_offset(), and
get rid of the loop and the usage of page helpers.

Link: https://lkml.kernel.org/r/20240521175854.96038-1-ryncsn@gmail.com
Link: https://lkml.kernel.org/r/20240521175854.96038-3-ryncsn@gmail.com
Suggested-by: Matthew Wilcox
Signed-off-by: Kairui Song
Acked-by: Ryusuke Konishi
Cc: Ryusuke Konishi
Cc: Anna Schumaker
Cc: Barry Song
Cc: Chao Yu
Cc: Chris Li
Cc: David Hildenbrand
Cc: David Howells
Cc: "Huang, Ying"
Cc: Hugh Dickins
Cc: Ilya Dryomov
Cc: Jaegeuk Kim
Cc: Jeff Layton
Cc: Marc Dionne
Cc: Minchan Kim
Cc: NeilBrown
Cc: Ryan Roberts
Cc: Trond Myklebust
Cc: Xiubo Li
Cc: Yosry Ahmed
Signed-off-by: Andrew Morton
---
 fs/nilfs2/bmap.c | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c
index 383f0afa2cea..cd14ea25968c 100644
--- a/fs/nilfs2/bmap.c
+++ b/fs/nilfs2/bmap.c
@@ -450,15 +450,9 @@ int nilfs_bmap_test_and_clear_dirty(struct nilfs_bmap *bmap)
 __u64 nilfs_bmap_data_get_key(const struct nilfs_bmap *bmap,
 			      const struct buffer_head *bh)
 {
-	struct buffer_head *pbh;
-	__u64 key;
+	loff_t pos = folio_pos(bh->b_folio) + bh_offset(bh);
 
-	key = page_index(bh->b_page) << (PAGE_SHIFT -
-					 bmap->b_inode->i_blkbits);
-	for (pbh = page_buffers(bh->b_page); pbh != bh; pbh = pbh->b_this_page)
-		key++;
-
-	return key;
+	return pos >> bmap->b_inode->i_blkbits;
 }
 
 __u64 nilfs_bmap_find_target_seq(const struct nilfs_bmap *bmap, __u64 key)
--
cgit v1.2.3

From 5e425300af15f8fcf6d0fec188c8bf5207c1456d Mon Sep 17 00:00:00 2001
From: Kairui Song
Date: Wed, 22 May 2024 01:58:45 +0800
Subject: ceph: drop usage of page_index

page_index is needed for mixed usage of the page cache and swap cache;
for pure page cache usage, the caller can just use page->index instead.

It can't be a swap cache page here, so just drop it.
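For context, the helper being avoided special-cases swap cache pages, which is why page->index is equivalent for pure page cache users; at this point in history it reads roughly (from include/linux/pagemap.h, reproduced from memory):

	static inline pgoff_t page_index(struct page *page)
	{
		if (unlikely(PageSwapCache(page)))
			return __page_file_index(page);	/* swap entry offset */
		return page->index;
	}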
Link: https://lkml.kernel.org/r/20240521175854.96038-4-ryncsn@gmail.com
Signed-off-by: Kairui Song
Cc: Xiubo Li
Cc: Ilya Dryomov
Cc: Jeff Layton
Cc: Anna Schumaker
Cc: Barry Song
Cc: Chao Yu
Cc: Chris Li
Cc: David Hildenbrand
Cc: David Howells
Cc: "Huang, Ying"
Cc: Hugh Dickins
Cc: Jaegeuk Kim
Cc: Marc Dionne
Cc: Matthew Wilcox (Oracle)
Cc: Minchan Kim
Cc: NeilBrown
Cc: Ryan Roberts
Cc: Ryusuke Konishi
Cc: Trond Myklebust
Cc: Yosry Ahmed
Signed-off-by: Andrew Morton
---
 fs/ceph/dir.c   | 2 +-
 fs/ceph/inode.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 82a2e2a06a65..5aadc56e0cc0 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -141,7 +141,7 @@ __dcache_find_get_entry(struct dentry *parent, u64 idx,
 	if (ptr_pos >= i_size_read(dir))
 		return NULL;
 
-	if (!cache_ctl->page || ptr_pgoff != page_index(cache_ctl->page)) {
+	if (!cache_ctl->page || ptr_pgoff != cache_ctl->page->index) {
 		ceph_readdir_cache_release(cache_ctl);
 		cache_ctl->page = find_lock_page(&dir->i_data, ptr_pgoff);
 		if (!cache_ctl->page) {
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 249ddfbb1b03..8f8de8f33abb 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -1863,7 +1863,7 @@ static int fill_readdir_cache(struct inode *dir, struct dentry *dn,
 	unsigned idx = ctl->index % nsize;
 	pgoff_t pgoff = ctl->index / nsize;
 
-	if (!ctl->page || pgoff != page_index(ctl->page)) {
+	if (!ctl->page || pgoff != ctl->page->index) {
 		ceph_readdir_cache_release(ctl);
 		if (idx == 0)
 			ctl->page = grab_cache_page(&dir->i_data, pgoff);
--
cgit v1.2.3

From 8586be3ddf9a6133ec9f5915b35cd4e704d4131b Mon Sep 17 00:00:00 2001
From: Kairui Song
Date: Wed, 22 May 2024 01:58:46 +0800
Subject: NFS: remove nfs_page_length and usage of page_index

This function is no longer used after commit 4fa7a717b432 ("NFS: Fix up
nfs_vm_page_mkwrite() for folios"); all users have been converted to use
folios instead, so just delete it to remove the usage of page_index.
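The folio-based computation that superseded this helper follows the same logic; an illustrative, self-contained version (folio_data_length() is a made-up name for this sketch, not the NFS helper):

	/* bytes of file data contained in a folio (sketch) */
	static inline unsigned int folio_data_length(struct folio *folio)
	{
		loff_t i_size = i_size_read(folio->mapping->host);
		loff_t pos = folio_pos(folio);

		if (pos >= i_size)
			return 0;
		return min_t(loff_t, folio_size(folio), i_size - pos);
	}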
Link: https://lkml.kernel.org/r/20240521175854.96038-5-ryncsn@gmail.com
Signed-off-by: Kairui Song
Cc: Trond Myklebust
Cc: Anna Schumaker
Cc: Barry Song
Cc: Chao Yu
Cc: Chris Li
Cc: David Hildenbrand
Cc: David Howells
Cc: "Huang, Ying"
Cc: Hugh Dickins
Cc: Ilya Dryomov
Cc: Jaegeuk Kim
Cc: Jeff Layton
Cc: Marc Dionne
Cc: Matthew Wilcox (Oracle)
Cc: Minchan Kim
Cc: NeilBrown
Cc: Ryan Roberts
Cc: Ryusuke Konishi
Cc: Xiubo Li
Cc: Yosry Ahmed
Signed-off-by: Andrew Morton
---
 fs/nfs/internal.h | 19 -------------------
 1 file changed, 19 deletions(-)

diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 9f0f4534744b..f3a0930ab4e8 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -797,25 +797,6 @@ static inline void nfs_folio_mark_unstable(struct folio *folio,
 	}
 }
 
-/*
- * Determine the number of bytes of data the page contains
- */
-static inline
-unsigned int nfs_page_length(struct page *page)
-{
-	loff_t i_size = i_size_read(page_file_mapping(page)->host);
-
-	if (i_size > 0) {
-		pgoff_t index = page_index(page);
-		pgoff_t end_index = (i_size - 1) >> PAGE_SHIFT;
-		if (index < end_index)
-			return PAGE_SIZE;
-		if (index == end_index)
-			return ((i_size - 1) & ~PAGE_MASK) + 1;
-	}
-	return 0;
-}
-
 /*
  * Determine the number of bytes of data the page contains
  */
--
cgit v1.2.3

From d4f439865f97492c1931962c8a90fc9d7204d655 Mon Sep 17 00:00:00 2001
From: Kairui Song
Date: Wed, 22 May 2024 01:58:47 +0800
Subject: afs: drop usage of folio_file_pos

folio_file_pos is only needed for mixed usage of the page cache and swap
cache; for pure page cache usage, the caller can just use folio_pos
instead.

It can't be a swap cache page here: swap mappings may only call into the
fs through swap_rw, and that is not supported for afs.  So just drop it
and use folio_pos instead.

Link: https://lkml.kernel.org/r/20240521175854.96038-6-ryncsn@gmail.com
Signed-off-by: Kairui Song
Cc: David Howells
Cc: Marc Dionne
Cc: Anna Schumaker
Cc: Barry Song
Cc: Chao Yu
Cc: Chris Li
Cc: David Hildenbrand
Cc: "Huang, Ying"
Cc: Hugh Dickins
Cc: Ilya Dryomov
Cc: Jaegeuk Kim