From 04a42e72d77a93a166b79c34b7bc862f55a53967 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Wed, 14 Dec 2022 22:17:57 -0800 Subject: mm: move folio_set_compound_order() to mm/internal.h folio_set_compound_order() is moved to an mm-internal location so external folio users cannot misuse this function. Change the name of the function to folio_set_order() and use WARN_ON_ONCE() rather than BUG_ON. Also, handle the case if a non-large folio is passed and add clarifying comments to the function. Link: https://lore.kernel.org/lkml/20221207223731.32784-1-sidhartha.kumar@oracle.com/T/ Link: https://lkml.kernel.org/r/20221215061757.223440-1-sidhartha.kumar@oracle.com Fixes: 9fd330582b2f ("mm: add folio dtor and order setter functions") Signed-off-by: Sidhartha Kumar Suggested-by: Mike Kravetz Suggested-by: Muchun Song Suggested-by: Matthew Wilcox Suggested-by: John Hubbard Reviewed-by: John Hubbard Reviewed-by: Muchun Song Signed-off-by: Andrew Morton --- include/linux/mm.h | 16 ---------------- 1 file changed, 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 8f857163ac89..253b2d7489e6 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1019,22 +1019,6 @@ static inline void set_compound_order(struct page *page, unsigned int order) #endif } -/* - * folio_set_compound_order is generally passed a non-zero order to - * initialize a large folio. However, hugetlb code abuses this by - * passing in zero when 'dissolving' a large folio. - */ -static inline void folio_set_compound_order(struct folio *folio, - unsigned int order) -{ - VM_BUG_ON_FOLIO(!folio_test_large(folio), folio); - - folio->_folio_order = order; -#ifdef CONFIG_64BIT - folio->_folio_nr_pages = order ? 1U << order : 0; -#endif -} - /* Returns the number of pages in this potentially compound page. */ static inline unsigned long compound_nr(struct page *page) { -- cgit v1.2.3 From 105ff5339f498af74e60d7662c8f1c4d21f1342d Mon Sep 17 00:00:00 2001 From: Jeff Xu Date: Thu, 15 Dec 2022 00:12:03 +0000 Subject: mm/memfd: add MFD_NOEXEC_SEAL and MFD_EXEC The new MFD_NOEXEC_SEAL and MFD_EXEC flags allows application to set executable bit at creation time (memfd_create). When MFD_NOEXEC_SEAL is set, memfd is created without executable bit (mode:0666), and sealed with F_SEAL_EXEC, so it can't be chmod to be executable (mode: 0777) after creation. when MFD_EXEC flag is set, memfd is created with executable bit (mode:0777), this is the same as the old behavior of memfd_create. The new pid namespaced sysctl vm.memfd_noexec has 3 values: 0: memfd_create() without MFD_EXEC nor MFD_NOEXEC_SEAL acts like MFD_EXEC was set. 1: memfd_create() without MFD_EXEC nor MFD_NOEXEC_SEAL acts like MFD_NOEXEC_SEAL was set. 2: memfd_create() without MFD_NOEXEC_SEAL will be rejected. The sysctl allows finer control of memfd_create for old-software that doesn't set the executable bit, for example, a container with vm.memfd_noexec=1 means the old-software will create non-executable memfd by default. Also, the value of memfd_noexec is passed to child namespace at creation time. For example, if the init namespace has vm.memfd_noexec=2, all its children namespaces will be created with 2. [akpm@linux-foundation.org: add stub functions to fix build] [akpm@linux-foundation.org: remove unneeded register_pid_ns_ctl_table_vm() stub, per Jeff] [akpm@linux-foundation.org: s/pr_warn_ratelimited/pr_warn_once/, per review] [akpm@linux-foundation.org: fix CONFIG_SYSCTL=n warning] Link: https://lkml.kernel.org/r/20221215001205.51969-4-jeffxu@google.com Signed-off-by: Jeff Xu Co-developed-by: Daniel Verkamp Signed-off-by: Daniel Verkamp Reported-by: kernel test robot Reviewed-by: Kees Cook Cc: David Herrmann Cc: Dmitry Torokhov Cc: Hugh Dickins Cc: Jann Horn Cc: Jorge Lucangeli Obes Cc: Shuah Khan Signed-off-by: Andrew Morton --- include/linux/pid_namespace.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'include/linux') diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h index 07481bb87d4e..c758809d5bcf 100644 --- a/include/linux/pid_namespace.h +++ b/include/linux/pid_namespace.h @@ -16,6 +16,21 @@ struct fs_pin; +#if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE) +/* + * sysctl for vm.memfd_noexec + * 0: memfd_create() without MFD_EXEC nor MFD_NOEXEC_SEAL + * acts like MFD_EXEC was set. + * 1: memfd_create() without MFD_EXEC nor MFD_NOEXEC_SEAL + * acts like MFD_NOEXEC_SEAL was set. + * 2: memfd_create() without MFD_NOEXEC_SEAL will be + * rejected. + */ +#define MEMFD_NOEXEC_SCOPE_EXEC 0 +#define MEMFD_NOEXEC_SCOPE_NOEXEC_SEAL 1 +#define MEMFD_NOEXEC_SCOPE_NOEXEC_ENFORCED 2 +#endif + struct pid_namespace { struct idr idr; struct rcu_head rcu; @@ -31,6 +46,10 @@ struct pid_namespace { struct ucounts *ucounts; int reboot; /* group exit code if this pidns was rebooted */ struct ns_common ns; +#if defined(CONFIG_SYSCTL) && defined(CONFIG_MEMFD_CREATE) + /* sysctl for vm.memfd_noexec */ + int memfd_noexec_scope; +#endif } __randomize_layout; extern struct pid_namespace init_pid_ns; -- cgit v1.2.3 From fe7d4c6d5a42f5bdc63fdfdca2cad32c8a779e23 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 16 Dec 2022 10:50:54 -0500 Subject: mm/hugetlb: document huge_pte_offset usage huge_pte_offset() is potentially a pgtable walker, looking up pte_t* for a hugetlb address. Normally, it's always safe to walk a generic pgtable as long as we're with the mmap lock held for either read or write, because that guarantees the pgtable pages will always be valid during the process. But it's not true for hugetlbfs, especially shared: hugetlbfs can have its pgtable freed by pmd unsharing, it means that even with mmap lock held for current mm, the PMD pgtable page can still go away from under us if pmd unsharing is possible during the walk. So we have two ways to make it safe even for a shared mapping: (1) If we're with the hugetlb vma lock held for either read/write, it's okay because pmd unshare cannot happen at all. (2) If we're with the i_mmap_rwsem lock held for either read/write, it's okay because even if pmd unshare can happen, the pgtable page cannot be freed from under us. Document it. Link: https://lkml.kernel.org/r/20221216155100.2043537-4-peterx@redhat.com Signed-off-by: Peter Xu Reviewed-by: John Hubbard Reviewed-by: David Hildenbrand Cc: Andrea Arcangeli Cc: James Houghton Cc: Jann Horn Cc: Miaohe Lin Cc: Mike Kravetz Cc: Muchun Song Cc: Nadav Amit Cc: Rik van Riel Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 551834cd5299..d755e2a7c0db 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -192,6 +192,38 @@ extern struct list_head huge_boot_pages; pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, unsigned long sz); +/* + * huge_pte_offset(): Walk the hugetlb pgtable until the last level PTE. + * Returns the pte_t* if found, or NULL if the address is not mapped. + * + * Since this function will walk all the pgtable pages (including not only + * high-level pgtable page, but also PUD entry that can be unshared + * concurrently for VM_SHARED), the caller of this function should be + * responsible of its thread safety. One can follow this rule: + * + * (1) For private mappings: pmd unsharing is not possible, so holding the + * mmap_lock for either read or write is sufficient. Most callers + * already hold the mmap_lock, so normally, no special action is + * required. + * + * (2) For shared mappings: pmd unsharing is possible (so the PUD-ranged + * pgtable page can go away from under us! It can be done by a pmd + * unshare with a follow up munmap() on the other process), then we + * need either: + * + * (2.1) hugetlb vma lock read or write held, to make sure pmd unshare + * won't happen upon the range (it also makes sure the pte_t we + * read is the right and stable one), or, + * + * (2.2) hugetlb mapping i_mmap_rwsem lock held read or write, to make + * sure even if unshare happened the racy unmap() will wait until + * i_mmap_rwsem is released. + * + * Option (2.1) is the safest, which guarantees pte stability from pmd + * sharing pov, until the vma lock released. Option (2.2) doesn't protect + * a concurrent pmd unshare, but it makes sure the pgtable page is safe to + * access. + */ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr, unsigned long sz); unsigned long hugetlb_mask_last_page(struct hstate *h); -- cgit v1.2.3 From fcd48540d188876c917a377d81cd24c100332a62 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 16 Dec 2022 10:50:55 -0500 Subject: mm/hugetlb: move swap entry handling into vma lock when faulted In hugetlb_fault(), there used to have a special path to handle swap entry at the entrance using huge_pte_offset(). That's unsafe because huge_pte_offset() for a pmd sharable range can access freed pgtables if without any lock to protect the pgtable from being freed after pmd unshare. Here the simplest solution to make it safe is to move the swap handling to be after the vma lock being held. We may need to take the fault mutex on either migration or hwpoison entries now (also the vma lock, but that's really needed), however neither of them is hot path. Note that the vma lock cannot be released in hugetlb_fault() when the migration entry is detected, because in migration_entry_wait_huge() the pgtable page will be used again (by taking the pgtable lock), so that also need to be protected by the vma lock. Modify migration_entry_wait_huge() so that it must be called with vma read lock held, and properly release the lock in __migration_entry_wait_huge(). Link: https://lkml.kernel.org/r/20221216155100.2043537-5-peterx@redhat.com Signed-off-by: Peter Xu Reviewed-by: Mike Kravetz Reviewed-by: John Hubbard Cc: Andrea Arcangeli Cc: David Hildenbrand Cc: James Houghton Cc: Jann Horn Cc: Miaohe Lin Cc: Muchun Song Cc: Nadav Amit Cc: Rik van Riel Signed-off-by: Andrew Morton --- include/linux/swapops.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/swapops.h b/include/linux/swapops.h index b982dd614572..3a451b7afcb3 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -337,7 +337,8 @@ extern void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep, extern void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, unsigned long address); #ifdef CONFIG_HUGETLB_PAGE -extern void __migration_entry_wait_huge(pte_t *ptep, spinlock_t *ptl); +extern void __migration_entry_wait_huge(struct vm_area_struct *vma, + pte_t *ptep, spinlock_t *ptl); extern void migration_entry_wait_huge(struct vm_area_struct *vma, pte_t *pte); #endif /* CONFIG_HUGETLB_PAGE */ #else /* CONFIG_MIGRATION */ @@ -366,7 +367,8 @@ static inline void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep, static inline void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, unsigned long address) { } #ifdef CONFIG_HUGETLB_PAGE -static inline void __migration_entry_wait_huge(pte_t *ptep, spinlock_t *ptl) { } +static inline void __migration_entry_wait_huge(struct vm_area_struct *vma, + pte_t *ptep, spinlock_t *ptl) { } static inline void migration_entry_wait_huge(struct vm_area_struct *vma, pte_t *pte) { } #endif /* CONFIG_HUGETLB_PAGE */ static inline int is_writable_migration_entry(swp_entry_t entry) -- cgit v1.2.3 From dd361e5033cf36c51acab996ea17748b81cedb38 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 16 Dec 2022 10:52:26 -0500 Subject: mm/hugetlb: make walk_hugetlb_range() safe to pmd unshare Since walk_hugetlb_range() walks the pgtable, it needs the vma lock to make sure the pgtable page will not be freed concurrently. Link: https://lkml.kernel.org/r/20221216155226.2043738-1-peterx@redhat.com Signed-off-by: Peter Xu Reviewed-by: Mike Kravetz Reviewed-by: John Hubbard Cc: Andrea Arcangeli Cc: David Hildenbrand Cc: James Houghton Cc: Jann Horn Cc: Miaohe Lin Cc: Muchun Song Cc: Nadav Amit Cc: Rik van Riel Signed-off-by: Andrew Morton --- include/linux/pagewalk.h | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/pagewalk.h b/include/linux/pagewalk.h index 959f52e5867d..27a6df448ee5 100644 --- a/include/linux/pagewalk.h +++ b/include/linux/pagewalk.h @@ -21,7 +21,16 @@ struct mm_walk; * depth is -1 if not known, 0:PGD, 1:P4D, 2:PUD, 3:PMD. * Any folded depths (where PTRS_PER_P?D is equal to 1) * are skipped. - * @hugetlb_entry: if set, called for each hugetlb entry + * @hugetlb_entry: if set, called for each hugetlb entry. This hook + * function is called with the vma lock held, in order to + * protect against a concurrent freeing of the pte_t* or + * the ptl. In some cases, the hook function needs to drop + * and retake the vma lock in order to avoid deadlocks + * while calling other functions. In such cases the hook + * function must either refrain from accessing the pte or + * ptl after dropping the vma lock, or else revalidate + * those items after re-acquiring the vma lock and before + * accessing them. * @test_walk: caller specific callback function to determine whether * we walk over the current vma or not. Returning 0 means * "do page table walk over the current vma", returning -- cgit v1.2.3 From 9c67a20704e763f9cb8cd262c3e45de7bd2816bc Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 16 Dec 2022 10:52:29 -0500 Subject: mm/hugetlb: introduce hugetlb_walk() huge_pte_offset() is the main walker function for hugetlb pgtables. The name is not really representing what it does, though. Instead of renaming it, introduce a wrapper function called hugetlb_walk() which will use huge_pte_offset() inside. Assert on the locks when walking the pgtable. Note, the vma lock assertion will be a no-op for private mappings. Document the last special case in the page_vma_mapped_walk() path where we don't need any more lock to call hugetlb_walk(). Taking vma lock there is not needed because either: (1) potential callers of hugetlb pvmw holds i_mmap_rwsem already (from one rmap_walk()), or (2) the caller will not walk a hugetlb vma at all so the hugetlb code path not reachable (e.g. in ksm or uprobe paths). It's slightly implicit for future page_vma_mapped_walk() callers on that lock requirement. But anyway, when one day this rule breaks, one will get a straightforward warning in hugetlb_walk() with lockdep, then there'll be a way out. [akpm@linux-foundation.org: coding-style cleanups] Link: https://lkml.kernel.org/r/20221216155229.2043750-1-peterx@redhat.com Signed-off-by: Peter Xu Reviewed-by: Mike Kravetz Reviewed-by: John Hubbard Reviewed-by: David Hildenbrand Cc: Andrea Arcangeli Cc: James Houghton Cc: Jann Horn Cc: Miaohe Lin Cc: Muchun Song Cc: Nadav Amit Cc: Rik van Riel Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) (limited to 'include/linux') diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index d755e2a7c0db..b6b10101bea7 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -2,6 +2,7 @@ #ifndef _LINUX_HUGETLB_H #define _LINUX_HUGETLB_H +#include #include #include #include @@ -196,6 +197,11 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, * huge_pte_offset(): Walk the hugetlb pgtable until the last level PTE. * Returns the pte_t* if found, or NULL if the address is not mapped. * + * IMPORTANT: we should normally not directly call this function, instead + * this is only a common interface to implement arch-specific + * walker. Please use hugetlb_walk() instead, because that will attempt to + * verify the locking for you. + * * Since this function will walk all the pgtable pages (including not only * high-level pgtable page, but also PUD entry that can be unshared * concurrently for VM_SHARED), the caller of this function should be @@ -1229,4 +1235,35 @@ bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr); #define flush_hugetlb_tlb_range(vma, addr, end) flush_tlb_range(vma, addr, end) #endif +static inline bool __vma_shareable_lock(struct vm_area_struct *vma) +{ + return (vma->vm_flags & VM_MAYSHARE) && vma->vm_private_data; +} + +/* + * Safe version of huge_pte_offset() to check the locks. See comments + * above huge_pte_offset(). + */ +static inline pte_t * +hugetlb_walk(struct vm_area_struct *vma, unsigned long addr, unsigned long sz) +{ +#if defined(CONFIG_HUGETLB_PAGE) && \ + defined(CONFIG_ARCH_WANT_HUGE_PMD_SHARE) && defined(CONFIG_LOCKDEP) + struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; + + /* + * If pmd sharing possible, locking needed to safely walk the + * hugetlb pgtables. More information can be found at the comment + * above huge_pte_offset() in the same file. + * + * NOTE: lockdep_is_held() is only defined with CONFIG_LOCKDEP. + */ + if (__vma_shareable_lock(vma)) + WARN_ON_ONCE(!lockdep_is_held(&vma_lock->rw_sema) && + !lockdep_is_held( + &vma->vm_file->f_mapping->i_mmap_rwsem)); +#endif + return huge_pte_offset(vma->vm_mm, addr, sz); +} + #endif /* _LINUX_HUGETLB_H */ -- cgit v1.2.3 From d685c668b0695dff927c85e27ef27d4f404f16a3 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 15 Dec 2022 21:43:51 +0000 Subject: buffer: add b_folio as an alias of b_page Patch series "Start converting buffer_heads to use folios". I was hoping that filesystems would convert from buffer_heads to iomap, but that's not happening particularly quickly. So the buffer_head infrastructure needs to be converted from being page-based to being folio-based. This patch (of 12): Buffer heads point to the allocation (ie the folio), not the page. This is currently the same thing for all filesystems that use buffer heads, so this is a safe transitional step. Link: https://lkml.kernel.org/r/20221215214402.3522366-1-willy@infradead.org Link: https://lkml.kernel.org/r/20221215214402.3522366-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Jan Kara Signed-off-by: Andrew Morton --- include/linux/buffer_head.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 33fa5e94aa80..8f14dca5fed7 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -61,7 +61,10 @@ typedef void (bh_end_io_t)(struct buffer_head *bh, int uptodate); struct buffer_head { unsigned long b_state; /* buffer state bitmap (see above) */ struct buffer_head *b_this_page;/* circular list of page's buffers */ - struct page *b_page; /* the page this bh is mapped to */ + union { + struct page *b_page; /* the page this bh is mapped to */ + struct folio *b_folio; /* the folio this bh is mapped to */ + }; sector_t b_blocknr; /* start block number */ size_t b_size; /* size of mapping */ -- cgit v1.2.3 From 6a6fe9ebd571a4092b7d5c1f11e4e1e15d296fa5 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Fri, 9 Dec 2022 10:06:18 +0800 Subject: mm: swap: convert mark_page_lazyfree() to folio_mark_lazyfree() mark_page_lazyfree() and the callers are converted to use folio, this rename and make it to take in a folio argument instead of calling page_folio(). Link: https://lkml.kernel.org/r/20221209020618.190306-1-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Reviewed-by: Vishal Moola (Oracle) Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/swap.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index 2787b84eaf12..93f1cebd8545 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -402,7 +402,7 @@ extern void lru_add_drain_cpu(int cpu); extern void lru_add_drain_cpu_zone(struct zone *zone); extern void lru_add_drain_all(void); extern void deactivate_page(struct page *page); -extern void mark_page_lazyfree(struct page *page); +void folio_mark_lazyfree(struct folio *folio); extern void swap_setup(void); extern void lru_cache_add_inactive_or_unevictable(struct page *page, -- cgit v1.2.3 From 98def236f63c66629fb6b2d4b69cecffc5b46539 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 5 Dec 2022 23:08:20 +0000 Subject: mm/damon/core: implement damos filter Patch series "implement DAMOS filtering for anon pages and/or specific memory cgroups" DAMOS let users do system operations in a data access pattern oriented way. The data access pattern, which is extracted by DAMON, is somewhat accurate more than what user space could know in many cases. However, in some situation, users could know something more than the kernel about the pattern or some special requirements for some types of memory or processes. For example, some users would have slow swap devices and knows latency-ciritical processes and therefore want to use DAMON-based proactive reclamation (DAMON_RECLAIM) for only non-anonymous pages of non-latency-critical processes. For such restriction, users could exclude the memory regions from the initial monitoring regions and use non-dynamic monitoring regions update monitoring operations set including fvaddr and paddr. They could also adjust the DAMOS target access pattern. For dynamically changing memory layout and access pattern, those would be not enough. To help the case, add an interface, namely DAMOS filters, which can be used to avoid the DAMOS actions be applied to specific types of memory, to DAMON kernel API (damon.h). At the moment, it supports filtering anonymous pages and/or specific memory cgroups in or out for each DAMOS scheme. This patchset adds the support for all DAMOS actions that 'paddr' monitoring operations set supports ('pageout', 'lru_prio', and 'lru_deprio'), and the functionality is exposed via DAMON kernel API (damon.h) the DAMON sysfs interface (/sys/kernel/mm/damon/admins/), and DAMON_RECLAIM module parameters. Patches Sequence ---------------- First patch implements DAMOS filter interface to DAMON kernel API. Second patch makes the physical address space monitoring operations set to support the filters from all supporting DAMOS actions. Third patch adds anonymous pages filter support to DAMON_RECLAIM, and the fourth patch documents the DAMON_RECLAIM's new feature. Fifth to seventh patches implement DAMON sysfs files for support of the filters, and eighth patch connects the file to use DAMOS filters feature. Ninth patch adds simple self test cases for DAMOS filters of the sysfs interface. Finally, following two patches (tenth and eleventh) document the new features and interfaces. This patch (of 11): DAMOS lets users do system operation in a data access pattern oriented way. The data access pattern, which is extracted by DAMON, is somewhat accurate more than what user space could know in many cases. However, in some situation, users could know something more than the kernel about the pattern or some special requirements for some types of memory or processes. For example, some users would have slow swap devices and knows latency-ciritical processes and therefore want to use DAMON-based proactive reclamation (DAMON_RECLAIM) for only non-anonymous pages of non-latency-critical processes. For such restriction, users could exclude the memory regions from the initial monitoring regions and use non-dynamic monitoring regions update monitoring operations set including fvaddr and paddr. They could also adjust the DAMOS target access pattern. For dynamically changing memory layout and access pattern, those would be not enough. To help the case, add an interface, namely DAMOS filters, which can be used to avoid the DAMOS actions be applied to specific types of memory, to DAMON kernel API (damon.h). At the moment, it supports filtering anonymous pages and/or specific memory cgroups in or out for each DAMOS scheme. Note that this commit adds only the interface to the DAMON kernel API. The impelmentation should be made in the monitoring operations sets, and following commits will add that. Link: https://lkml.kernel.org/r/20221205230830.144349-1-sj@kernel.org Link: https://lkml.kernel.org/r/20221205230830.144349-2-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- include/linux/damon.h | 51 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) (limited to 'include/linux') diff --git a/include/linux/damon.h b/include/linux/damon.h index ad15a5b88e3a..7907918ad2e0 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -8,6 +8,7 @@ #ifndef _DAMON_H_ #define _DAMON_H_ +#include #include #include #include @@ -215,6 +216,39 @@ struct damos_stat { unsigned long qt_exceeds; }; +/** + * enum damos_filter_type - Type of memory for &struct damos_filter + * @DAMOS_FILTER_TYPE_ANON: Anonymous pages. + * @DAMOS_FILTER_TYPE_MEMCG: Specific memcg's pages. + * @NR_DAMOS_FILTER_TYPES: Number of filter types. + */ +enum damos_filter_type { + DAMOS_FILTER_TYPE_ANON, + DAMOS_FILTER_TYPE_MEMCG, + NR_DAMOS_FILTER_TYPES, +}; + +/** + * struct damos_filter - DAMOS action target memory filter. + * @type: Type of the page. + * @matching: If the matching page should filtered out or in. + * @memcg_id: Memcg id of the question if @type is DAMOS_FILTER_MEMCG. + * @list: List head for siblings. + * + * Before applying the &damos->action to a memory region, DAMOS checks if each + * page of the region matches to this and avoid applying the action if so. + * Note that the check support is up to &struct damon_operations + * implementation. + */ +struct damos_filter { + enum damos_filter_type type; + bool matching; + union { + unsigned short memcg_id; + }; + struct list_head list; +}; + /** * struct damos_access_pattern - Target access pattern of the given scheme. * @min_sz_region: Minimum size of target regions. @@ -239,6 +273,7 @@ struct damos_access_pattern { * @action: &damo_action to be applied to the target regions. * @quota: Control the aggressiveness of this scheme. * @wmarks: Watermarks for automated (in)activation of this scheme. + * @filters: Additional set of &struct damos_filter for &action. * @stat: Statistics of this scheme. * @list: List head for siblings. * @@ -254,6 +289,10 @@ struct damos_access_pattern { * If all schemes that registered to a &struct damon_ctx are inactive, DAMON * stops monitoring and just repeatedly checks the watermarks. * + * Before applying the &action to a memory region, &struct damon_operations + * implementation could check pages of the region and skip &action to respect + * &filters + * * After applying the &action to each region, &stat_count and &stat_sz is * updated to reflect the number of regions and total size of regions that the * &action is applied. @@ -263,6 +302,7 @@ struct damos { enum damos_action action; struct damos_quota quota; struct damos_watermarks wmarks; + struct list_head filters; struct damos_stat stat; struct list_head list; }; @@ -516,6 +556,12 @@ static inline unsigned long damon_sz_region(struct damon_region *r) #define damon_for_each_scheme_safe(s, next, ctx) \ list_for_each_entry_safe(s, next, &(ctx)->schemes, list) +#define damos_for_each_filter(f, scheme) \ + list_for_each_entry(f, &(scheme)->filters, list) + +#define damos_for_each_filter_safe(f, next, scheme) \ + list_for_each_entry_safe(f, next, &(scheme)->filters, list) + #ifdef CONFIG_DAMON struct damon_region *damon_new_region(unsigned long start, unsigned long end); @@ -536,6 +582,11 @@ void damon_destroy_region(struct damon_region *r, struct damon_target *t); int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges, unsigned int nr_ranges); +struct damos_filter *damos_new_filter(enum damos_filter_type type, + bool matching); +void damos_add_filter(struct damos *s, struct damos_filter *f); +void damos_destroy_filter(struct damos_filter *f); + struct damos *damon_new_scheme(struct damos_access_pattern *pattern, enum damos_action action, struct damos_quota *quota, struct damos_watermarks *wmarks); -- cgit v1.2.3 From 44383cef54c0ce1201f884d83cc2b367bc5aa4f7 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 19 Dec 2022 19:09:18 +0100 Subject: kasan: allow sampling page_alloc allocations for HW_TAGS As Hardware Tag-Based KASAN is intended to be used in production, its performance impact is crucial. As page_alloc allocations tend to be big, tagging and checking all such allocations can introduce a significant slowdown. Add two new boot parameters that allow to alleviate that slowdown: - kasan.page_alloc.sample, which makes Hardware Tag-Based KASAN tag only every Nth page_alloc allocation with the order configured by the second added parameter (default: tag every such allocation). - kasan.page_alloc.sample.order, which makes sampling enabled by the first parameter only affect page_alloc allocations with the order equal or greater than the specified value (default: 3, see below). The exact performance improvement caused by using the new parameters depends on their values and the applied workload. The chosen default value for kasan.page_alloc.sample.order is 3, which matches both PAGE_ALLOC_COSTLY_ORDER and SKB_FRAG_PAGE_ORDER. This is done for two reasons: 1. PAGE_ALLOC_COSTLY_ORDER is "the order at which allocations are deemed costly to service", which corresponds to the idea that only large and thus costly allocations are supposed to sampled. 2. One of the workloads targeted by this patch is a benchmark that sends a large amount of data over a local loopback connection. Most multi-page data allocations in the networking subsystem have the order of SKB_FRAG_PAGE_ORDER (or PAGE_ALLOC_COSTLY_ORDER). When running a local loopback test on a testing MTE-enabled device in sync mode, enabling Hardware Tag-Based KASAN introduces a ~50% slowdown. Applying this patch and setting kasan.page_alloc.sampling to a value higher than 1 allows to lower the slowdown. The performance improvement saturates around the sampling interval value of 10 with the default sampling page order of 3. This lowers the slowdown to ~20%. The slowdown in real scenarios involving the network will likely be better. Enabling page_alloc sampling has a downside: KASAN misses bad accesses to a page_alloc allocation that has not been tagged. This lowers the value of KASAN as a security mitigation. However, based on measuring the number of page_alloc allocations of different orders during boot in a test build, sampling with the default kasan.page_alloc.sample.order value affects only ~7% of allocations. The rest ~93% of allocations are still checked deterministically. Link: https://lkml.kernel.org/r/129da0614123bb85ed4dd61ae30842b2dd7c903f.1671471846.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Jann Horn Cc: Mark Brand Cc: Peter Collingbourne Signed-off-by: Andrew Morton --- include/linux/kasan.h | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 96c9d56e5510..5ebbaf672009 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -120,12 +120,13 @@ static __always_inline void kasan_poison_pages(struct page *page, __kasan_poison_pages(page, order, init); } -void __kasan_unpoison_pages(struct page *page, unsigned int order, bool init); -static __always_inline void kasan_unpoison_pages(struct page *page, +bool __kasan_unpoison_pages(struct page *page, unsigned int order, bool init); +static __always_inline bool kasan_unpoison_pages(struct page *page, unsigned int order, bool init) { if (kasan_enabled()) - __kasan_unpoison_pages(page, order, init); + return __kasan_unpoison_pages(page, order, init); + return false; } void __kasan_cache_create_kmalloc(struct kmem_cache *cache); @@ -249,8 +250,11 @@ static __always_inline bool kasan_check_byte(const void *addr) static inline void kasan_unpoison_range(const void *address, size_t size) {} static inline void kasan_poison_pages(struct page *page, unsigned int order, bool init) {} -static inline void kasan_unpoison_pages(struct page *page, unsigned int order, - bool init) {} +static inline bool kasan_unpoison_pages(struct page *page, unsigned int order, + bool init) +{ + return false; +} static inline void kasan_cache_create_kmalloc(struct kmem_cache *cache) {} static inline void kasan_poison_slab(struct slab *slab) {} static inline void kasan_unpoison_object_data(struct kmem_cache *cache, -- cgit v1.2.3 From 831978e37e93bd3e36612917a4b193278950daff Mon Sep 17 00:00:00 2001 From: Vernon Yang Date: Wed, 21 Dec 2022 14:00:52 +0800 Subject: maple_tree: remove extra space and blank line Patch series "Clean up and refinement for maple tree", v2. This patchset cleans up and refines some maple tree code. A few small changes make the code easier to understand and for better readability. This patch (of 7): These extra space and blank lines are unnecessary, so drop them. Link: https://lkml.kernel.org/r/20221221060058.609003-1-vernon2gm@gmail.com Link: https://lkml.kernel.org/r/20221221060058.609003-2-vernon2gm@gmail.com Signed-off-by: Vernon Yang Reviewed-by: Liam R. Howlett Signed-off-by: Andrew Morton --- include/linux/maple_tree.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h index e594db58a0f1..4ee5a969441c 100644 --- a/include/linux/maple_tree.h +++ b/include/linux/maple_tree.h @@ -517,7 +517,6 @@ static inline void mas_reset(struct ma_state *mas) * entry. * * Note: may return the zero entry. - * */ #define mas_for_each(__mas, __entry, __max) \ while (((__entry) = mas_find((__mas), (__max))) != NULL) @@ -639,7 +638,6 @@ static inline void mt_set_in_rcu(struct maple_tree *mt) } static inline unsigned int mt_height(const struct maple_tree *mt) - { return (mt->ma_flags & MT_FLAGS_HEIGHT_MASK) >> MT_FLAGS_HEIGHT_OFFSET; } -- cgit v1.2.3 From eabb305293835b191ffe60234587ae8bf5e4e9fd Mon Sep 17 00:00:00 2001 From: Vernon Yang Date: Wed, 21 Dec 2022 14:00:56 +0800 Subject: maple_tree: remove the redundant code The macros CONFIG_DEBUG_MAPLE_TREE_VERBOSE no one uses, functions mas_dup_tree() and mas_dup_store() are not implemented, just function declaration, so drop it. Link: https://lkml.kernel.org/r/20221221060058.609003-6-vernon2gm@gmail.com Signed-off-by: Vernon Yang Reviewed-by: Liam R. Howlett Signed-off-by: Andrew Morton --- include/linux/maple_tree.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h index 4ee5a969441c..815a27661517 100644 --- a/include/linux/maple_tree.h +++ b/include/linux/maple_tree.h @@ -12,7 +12,6 @@ #include #include /* #define CONFIG_MAPLE_RCU_DISABLED */ -/* #define CONFIG_DEBUG_MAPLE_TREE_VERBOSE */ /* * Allocated nodes are mutable until they have been inserted into the tree, @@ -483,9 +482,6 @@ static inline bool mas_is_paused(struct ma_state *mas) return mas->node == MAS_PAUSE; } -void mas_dup_tree(struct ma_state *oldmas, struct ma_state *mas); -void mas_dup_store(struct ma_state *mas, void *entry); - /* * This finds an empty area from the highest address to the lowest. * AKA "Topdown" version, -- cgit v1.2.3 From 318e9342fbbb6888d903d86e83865609901a1c65 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Wed, 21 Dec 2022 10:08:45 -0800 Subject: mm/memory: add vm_normal_folio() Patch series "Convert deactivate_page() to folio_deactivate()", v4. Deactivate_page() has already been converted to use folios. This patch series modifies the callers of deactivate_page() to use folios. It also introduces vm_normal_folio() to assist with folio conversions, and converts deactivate_page() to folio_deactivate() which takes in a folio. This patch (of 4): Introduce a wrapper function called vm_normal_folio(). This function calls vm_normal_page() and returns the folio of the page found, or null if no page is found. This function allows callers to get a folio from a pte, which will eventually allow them to completely replace their struct page variables with struct folio instead. Link: https://lkml.kernel.org/r/20221221180848.20774-1-vishal.moola@gmail.com Link: https://lkml.kernel.org/r/20221221180848.20774-2-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Reviewed-by: Matthew Wilcox (Oracle) Cc: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/mm.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 253b2d7489e6..8e14183dfc58 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1968,6 +1968,8 @@ static inline bool can_do_mlock(void) { return false; } extern int user_shm_lock(size_t, struct ucounts *); extern void user_shm_unlock(size_t, struct ucounts *); +struct folio *vm_normal_folio(struct vm_area_struct *vma, unsigned long addr, + pte_t pte); struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_t pte); struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr, -- cgit v1.2.3 From 5a9e34747c9f731bbb6b7fd7521c4fec0d840593 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Wed, 21 Dec 2022 10:08:48 -0800 Subject: mm/swap: convert deactivate_page() to folio_deactivate() Deactivate_page() has already been converted to use folios, this change converts it to take in a folio argument instead of calling page_folio(). It also renames the function folio_deactivate() to be more consistent with other folio functions. [akpm@linux-foundation.org: fix left-over comments, per Yu Zhao] Link: https://lkml.kernel.org/r/20221221180848.20774-5-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Reviewed-by: Matthew Wilcox (Oracle) Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/swap.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/swap.h b/include/linux/swap.h index 93f1cebd8545..87cecb8c0bdc 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -401,7 +401,7 @@ extern void lru_add_drain(void); extern void lru_add_drain_cpu(int cpu); extern void lru_add_drain_cpu_zone(struct zone *zone); extern void lru_add_drain_all(void); -extern void deactivate_page(struct page *page); +void folio_deactivate(struct folio *folio); void folio_mark_lazyfree(struct folio *folio); extern void swap_setup(void); -- cgit v1.2.3 From 0b7b8704ddcee372099a2bc6781db6ab273a85d5 Mon Sep 17 00:00:00 2001 From: Hao Sun Date: Wed, 21 Dec 2022 22:42:45 +0800 Subject: mm: new primitive kvmemdup() Similar to kmemdup(), but support large amount of bytes with kvmalloc() and does *not* guarantee that the result will be physically contiguous. Use only in cases where kvmalloc() is needed and free it with kvfree(). Also adapt policy_unpack.c in case someone bisect into this. Link: https://lkml.kernel.org/r/20221221144245.27164-1-sunhao.th@gmail.com Signed-off-by: Hao Sun Suggested-by: Daniel Borkmann Cc: Nick Terrell Cc: John Johansen Cc: Paul Moore Cc: James Morris Cc: "Serge E. Hallyn" Signed-off-by: Andrew Morton --- include/linux/string.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/string.h b/include/linux/string.h index db28802ab0a6..c062c581a98b 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -177,6 +177,7 @@ extern char *kstrdup(const char *s, gfp_t gfp) __malloc; extern const char *kstrdup_const(const char *s, gfp_t gfp); extern char *kstrndup(const char *s, size_t len, gfp_t gfp); extern void *kmemdup(const void *src, size_t len, gfp_t gfp) __realloc_size(2); +extern void *kvmemdup(const void *src, size_t len, gfp_t gfp) __realloc_size(2); extern char *kmemdup_nul(const char *s, size_t len, gfp_t gfp); extern char **argv_split(gfp_t gfp, const char *str, int *argcp); -- cgit v1.2.3 From b5054174ac7c7d8fae15deee7ddc0e20fd604f30 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 21 Dec 2022 21:24:54 +0000 Subject: mm: move FOLL_* defs to mm_types.h Move FOLL_* definitions to linux/mm_types.h to make them more accessible without having to drag in all of linux/mm.h and everything that drags in too[1]. Link: https://lkml.kernel.org/r/2161258.1671657894@warthog.procyon.org.uk Signed-off-by: David Howells Suggested-by: Matthew Wilcox Reviewed-by: John Hubbard Cc: Christoph Hellwig Cc: Al Viro Signed-off-by: Andrew Morton --- include/linux/mm.h | 75 ------------------------------------------------ include/linux/mm_types.h | 75 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 75 insertions(+), 75 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index 8e14183dfc58..d68579bf8484 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3057,81 +3057,6 @@ static inline vm_fault_t vmf_error(int err) struct page *follow_page(struct vm_area_struct *vma, unsigned long address, unsigned int foll_flags); -#define FOLL_WRITE 0x01 /* check pte is writable */ -#define FOLL_TOUCH 0x02 /* mark page accessed */ -#define FOLL_GET 0x04 /* do get_page on page */ -#define FOLL_DUMP 0x08 /* give error on hole if it would be zero */ -#define FOLL_FORCE 0x10 /* get_user_pages read/write w/o permission */ -#define FOLL_NOWAIT 0x20 /* if a disk transfer is needed, start the IO - * and return without waiting upon it */ -#define FOLL_NOFAULT 0x80 /* do not fault in pages */ -#define FOLL_HWPOISON 0x100 /* check page is hwpoisoned */ -#define FOLL_TRIED 0x800 /* a retry, previous pass started an IO */ -#define FOLL_REMOTE 0x2000 /* we are working on non-current tsk/mm */ -#define FOLL_ANON 0x8000 /* don't do file mappings */ -#define FOLL_LONGTERM 0x10000 /* mapping lifetime is indefinite: see below */ -#define FOLL_SPLIT_PMD 0x20000 /* split huge pmd before returning */ -#define FOLL_PIN 0x40000 /* pages must be released via unpin_user_page */ -#define FOLL_FAST_ONLY 0x80000 /* gup_fast: prevent fall-back to slow gup */ -#define FOLL_PCI_P2PDMA 0x100000 /* allow returning PCI P2PDMA pages */ -#define FOLL_INTERRUPTIBLE 0x200000 /* allow interrupts from generic signals */ - -/* - * FOLL_PIN and FOLL_LONGTERM may be used in various combinations with each - * other. Here is what they mean, and how to use them: - * - * FOLL_LONGTERM indicates that the page will be held for an indefinite time - * period _often_ under userspace control. This is in contrast to - * iov_iter_get_pages(), whose usages are transient. - * - * FIXME: For pages which are part of a filesystem, mappings are subject to the - * lifetime enforced by the filesystem and we need guarantees that longterm - * users like RDMA and V4L2 only establish mappings which coordinate usage with - * the filesystem. Ideas for this coordination include revoking the longterm - * pin, delaying writeback, bounce buffer page writeback, etc. As FS DAX was - * added after the problem with filesystems was found FS DAX VMAs are - * specifically failed. Filesystem pages are still subject to bugs and use of - * FOLL_LONGTERM should be avoided on those pages. - * - * FIXME: Also NOTE that FOLL_LONGTERM is not supported in every GUP call. - * Currently only get_user_pages() and get_user_pages_fast() support this flag - * and calls to get_user_pages_[un]locked are specifically not allowed. This - * is due to an incompatibility with the FS DAX check and - * FAULT_FLAG_ALLOW_RETRY. - * - * In the CMA case: long term pins in a CMA region would unnecessarily fragment - * that region. And so, CMA attempts to migrate the page before pinning, when - * FOLL_LONGTERM is specified. - * - * FOLL_PIN indicates that a special kind of tracking (not just page->_refcount, - * but an additional pin counting system) will be invoked. This is intended for - * anything that gets a page reference and then touches page data (for example, - * Direct IO). This lets the filesystem know that some non-file-system entity is - * potentially changing the pages' data. In contrast to FOLL_GET (whose pages - * are released via put_page()), FOLL_PIN pages must be released, ultimately, by - * a call to unpin_user_page(). - * - * FOLL_PIN is similar to FOLL_GET: both of these pin pages. They use different - * and separate refcounting mechanisms, however, and that means that each has - * its own acquire and release mechanisms: - * - * FOLL_GET: get_user_pages*() to acquire, and put_page() to release. - * - * FOLL_PIN: pin_user_pages*() to acquire, and unpin_user_pages to release. - * - * FOLL_PIN and FOLL_GET are mutually exclusive for a given function call. - * (The underlying pages may experience both FOLL_GET-based and FOLL_PIN-based - * calls applied to them, and that's perfectly OK. This is a constraint on the - * callers, not on the pages.) - * - * FOLL_PIN should be set internally by the pin_user_pages*() APIs, never - * directly by the caller. That's in order to help avoid mismatches when - * releasing pages: get_user_pages*() pages must be released via put_page(), - * while pin_user_pages*() pages must be released via unpin_user_page(). - * - * Please see Documentation/core-api/pin_user_pages.rst for more information. - */ - static inline int vm_fault_to_errno(vm_fault_t vm_fault, int foll_flags) { if (vm_fault & VM_FAULT_OOM) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 9757067c3053..1118e381fcdc 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -1085,4 +1085,79 @@ enum fault_flag { typedef unsigned int __bitwise zap_flags_t; +/* + * FOLL_PIN and FOLL_LONGTERM may be used in various combinations with each + * other. Here is what they mean, and how to use them: + * + * FOLL_LONGTERM indicates that the page will be held for an indefinite time + * period _often_ under userspace control. This is in contrast to + * iov_iter_get_pages(), whose usages are transient. + * + * FIXME: For pages which are part of a filesystem, mappings are subject to the + * lifetime enforced by the filesystem and we need guarantees that longterm + * users like RDMA and V4L2 only establish mappings which coordinate usage with + * the filesystem. Ideas for this coordination include revoking the longterm + * pin, delaying writeback, bounce buffer page writeback, etc. As FS DAX was + * added after the problem with filesystems was found FS DAX VMAs are + * specifically failed. Filesystem pages are still subject to bugs and use of + * FOLL_LONGTERM should be avoided on those pages. + * + * FIXME: Also NOTE that FOLL_LONGTERM is not supported in every GUP call. + * Currently only get_user_pages() and get_user_pages_fast() support this flag + * and calls to get_user_pages_[un]locked are specifically not allowed. This + * is due to an incompatibility with the FS DAX check and + * FAULT_FLAG_ALLOW_RETRY. + * + * In the CMA case: long term pins in a CMA region would unnecessarily fragment + * that region. And so, CMA attempts to migrate the page before pinning, when + * FOLL_LONGTERM is specified. + * + * FOLL_PIN indicates that a special kind of tracking (not just page->_refcount, + * but an additional pin counting system) will be invoked. This is intended for + * anything that gets a page reference and then touches page data (for example, + * Direct IO). This lets the filesystem know that some non-file-system entity is + * potentially changing the pages' data. In contrast to FOLL_GET (whose pages + * are released via put_page()), FOLL_PIN pages must be released, ultimately, by + * a call to unpin_user_page(). + * + * FOLL_PIN is similar to FOLL_GET: both of these pin pages. They use different + * and separate refcounting mechanisms, however, and that means that each has + * its own acquire and release mechanisms: + * + * FOLL_GET: get_user_pages*() to acquire, and put_page() to release. + * + * FOLL_PIN: pin_user_pages*() to acquire, and unpin_user_pages to release. + * + * FOLL_PIN and FOLL_GET are mutually exclusive for a given function call. + * (The underlying pages may experience both FOLL_GET-based and FOLL_PIN-based + * calls applied to them, and that's perfectly OK. This is a constraint on the + * callers, not on the pages.) + * + * FOLL_PIN should be set internally by the pin_user_pages*() APIs, never + * directly by the caller. That's in order to help avoid mismatches when + * releasing pages: get_user_pages*() pages must be released via put_page(), + * while pin_user_pages*() pages must be released via unpin_user_page(). + * + * Please see Documentation/core-api/pin_user_pages.rst for more information. + */ + +#define FOLL_WRITE 0x01 /* check pte is writable */ +#define FOLL_TOUCH 0x02 /* mark page accessed */ +#define FOLL_GET 0x04 /* do get_page on page */ +#define FOLL_DUMP 0x08 /* give error on hole if it would be zero */ +#define FOLL_FORCE 0x10 /* get_user_pages read/write w/o permission */ +#define FOLL_NOWAIT 0x20 /* if a disk transfer is needed, start the IO + * and return without waiting upon it */ +#define FOLL_NOFAULT 0x80 /* do not fault in pages */ +#define FOLL_HWPOISON 0x100 /* check page is hwpoisoned */ +#define FOLL_TRIED 0x800 /* a retry, previous pass started an IO */ +#define FOLL_REMOTE 0x2000 /* we are working on non-current tsk/mm */ +#define FOLL_ANON 0x8000 /* don't do file mappings */ +#define FOLL_LONGTERM 0x10000 /* mapping lifetime is indefinite: see below */ +#define FOLL_SPLIT_PMD 0x20000 /* split huge pmd before returning */ +#define FOLL_PIN 0x40000 /* pages must be released via unpin_user_page */ +#define FOLL_FAST_ONLY 0x80000 /* gup_fast: prevent fall-back to slow gup */ +#define FOLL_PCI_P2PDMA 0x100000 /* allow returning PCI P2PDMA pages */ +#define FOLL_INTERRUPTIBLE 0x200000 /* allow interrupts from generic signals */ + #endif /* _LINUX_MM_TYPES_H */ -- cgit v1.2.3 From 391655fe08d1f942359a11148aa9aaf3f99d6d6f Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Wed, 21 Dec 2022 21:18:59 -0700 Subject: mm: multi-gen LRU: rename lru_gen_struct to lru_gen_folio Patch series "mm: multi-gen LRU: memcg LRU", v3. Overview ======== An memcg LRU is a per-node LRU of memcgs. It is also an LRU of LRUs, since each node and memcg combination has an LRU of folios (see mem_cgroup_lruvec()). Its goal is to improve the scalability of global reclaim, which is critical to system-wide memory overcommit in data centers. Note that memcg reclaim is currently out of scope. Its memory bloat is a pointer to each lruvec and negligible to each pglist_data. In terms of traversing memcgs during global reclaim, it improves the best-case complexity from O(n) to O(1) and does not affect the worst-case complexity O(n). Therefore, on average, it has a sublinear complexity in contrast to the current linear complexity. The basic structure of an memcg LRU can be understood by an analogy to the active/inactive LRU (of folios): 1. It has the young and the old (generations), i.e., the counterparts to the active and the inactive; 2. The increment of max_seq triggers promotion, i.e., the counterpart to activation; 3. Other events trigger similar operations, e.g., offlining an memcg triggers demotion, i.e., the counterpart to deactivation. In terms of global reclaim, it has two distinct features: 1. Sharding, which allows each thread to start at a random memcg (in the old generation) and improves parallelism; 2. Eventual fairness, which allows direct reclaim to bail out at will and reduces latency without affecting fairness over some time. The commit message in patch 6 details the workflow: https://lore.kernel.org/r/20221222041905.2431096-7-yuzhao@google.com/ The following is a simple test to quickly verify its effectiveness. Test design: 1. Create multiple memcgs. 2. Each memcg contains a job (fio). 3. All jobs access the same amount of memory randomly. 4. The system does not experience global memory pressure. 5. Periodically write to the root memory.reclaim. Desired outcome: 1. All memcgs have similar pgsteal counts, i.e., stddev(pgsteal) over mean(pgsteal) is close to 0%. 2. The total pgsteal is close to the total requested through memory.reclaim, i.e., sum(pgsteal) over sum(requested) is close to 100%. Actual outcome [1]: MGLRU off MGLRU on stddev(pgsteal) / mean(pgsteal) 75% 20% sum(pgsteal) / sum(requested) 425% 95% #################################################################### MEMCGS=128 for ((memcg = 0; memcg < $MEMCGS; memcg++)); do mkdir /sys/fs/cgroup/memcg$memcg done start() { echo $BASHPID > /sys/fs/cgroup/memcg$memcg/cgroup.procs fio -name=memcg$memcg --numjobs=1 --ioengine=mmap \ --filename=/dev/zero --size=1920M --rw=randrw \ --rate=64m,64m --random_distribution=random \ --fadvise_hint=0 --time_based --runtime=10h \ --group_reporting --minimal } for ((memcg = 0; memcg < $MEMCGS; memcg++)); do start & done sleep 600 for ((i = 0; i < 600; i++)); do echo 256m >/sys/fs/cgroup/memory.reclaim sleep 6 done for ((memcg = 0; memcg < $MEMCGS; memcg++)); do grep "pgsteal " /sys/fs/cgroup/memcg$memcg/memory.stat done #################################################################### [1]: This was obtained from running the above script (touches less than 256GB memory) on an EPYC 7B13 with 512GB DRAM for over an hour. This patch (of 8): The new name lru_gen_folio will be more distinct from the coming lru_gen_memcg. Link: https://lkml.kernel.org/r/20221222041905.2431096-1-yuzhao@google.com Link: https://lkml.kernel.org/r/20221222041905.2431096-2-yuzhao@google.com Signed-off-by: Yu Zhao Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Michael Larabel Cc: Michal Hocko Cc: Mike Rapoport Cc: Roman Gushchin Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- include/linux/mm_inline.h | 4 ++-- include/linux/mmzone.h | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index ff3f3f23f649..177b8b1dd43c 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -178,7 +178,7 @@ static inline void lru_gen_update_size(struct lruvec *lruvec, struct folio *foli int zone = folio_zonenum(folio); int delta = folio_nr_pages(folio); enum lru_list lru = type * LRU_INACTIVE_FILE; - struct lru_gen_struct *lrugen = &lruvec->lrugen; + struct lru_gen_folio *lrugen = &lruvec->lrugen; VM_WARN_ON_ONCE(old_gen != -1 && old_gen >= MAX_NR_GENS); VM_WARN_ON_ONCE(new_gen != -1 && new_gen >= MAX_NR_GENS); @@ -224,7 +224,7 @@ static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, int gen = folio_lru_gen(folio); int type = folio_is_file_lru(folio); int zone = folio_zonenum(folio); - struct lru_gen_struct *lrugen = &lruvec->lrugen; + struct lru_gen_folio *lrugen = &lruvec->lrugen; VM_WARN_ON_ONCE_FOLIO(gen != -1, folio); diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index cd28a100d9e4..1686fcc4ed01 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -404,7 +404,7 @@ enum { * The number of pages in each generation is eventually consistent and therefore * can be transiently negative when reset_batch_size() is pending. */ -struct lru_gen_struct { +struct lru_gen_folio { /* the aging increments the youngest generation number */ unsigned long max_seq; /* the eviction increments the oldest generation numbers */ @@ -461,7 +461,7 @@ struct lru_gen_mm_state { struct lru_gen_mm_walk { /* the lruvec under reclaim */ struct lruvec *lruvec; - /* unstable max_seq from lru_gen_struct */ + /* unstable max_seq from lru_gen_folio */ unsigned long max_seq; /* the next address within an mm to scan */ unsigned long next_addr; @@ -524,7 +524,7 @@ struct lruvec { unsigned long flags; #ifdef CONFIG_LRU_GEN /* evictable pages divided into generations */ - struct lru_gen_struct lrugen; + struct lru_gen_folio lrugen; /* to concurrently iterate lru_gen_mm_list */ struct lru_gen_mm_state mm_state; #endif -- cgit v1.2.3 From 6df1b2212950aae2b2188c6645ea18e2a9e3fdd5 Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Wed, 21 Dec 2022 21:19:00 -0700 Subject: mm: multi-gen LRU: rename lrugen->lists[] to lrugen->folios[] lru_gen_folio will be chained into per-node lists by the coming lrugen->list. Link: https://lkml.kernel.org/r/20221222041905.2431096-3-yuzhao@google.com Signed-off-by: Yu Zhao Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Michael Larabel Cc: Michal Hocko Cc: Mike Rapoport Cc: Roman Gushchin Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- include/linux/mm_inline.h | 4 ++-- include/linux/mmzone.h | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 177b8b1dd43c..eb8a2435ee80 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -256,9 +256,9 @@ static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, lru_gen_update_size(lruvec, folio, -1, gen); /* for folio_rotate_reclaimable() */ if (reclaiming) - list_add_tail(&folio->lru, &lrugen->lists[gen][type][zone]); + list_add_tail(&folio->lru, &lrugen->folios[gen][type][zone]); else - list_add(&folio->lru, &lrugen->lists[gen][type][zone]); + list_add(&folio->lru, &lrugen->folios[gen][type][zone]); return true; } diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 1686fcc4ed01..6c96ee823dbd 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -312,7 +312,7 @@ enum lruvec_flags { * They form a sliding window of a variable size [MIN_NR_GENS, MAX_NR_GENS]. An * offset within MAX_NR_GENS, i.e., gen, indexes the LRU list of the * corresponding generation. The gen counter in folio->flags stores gen+1 while - * a page is on one of lrugen->lists[]. Otherwise it stores 0. + * a page is on one of lrugen->folios[]. Otherwise it stores 0. * * A page is added to the youngest generation on faulting. The aging needs to * check the accessed bit at least twice before handing this page over to the @@ -324,8 +324,8 @@ enum lruvec_flags { * rest of generations, if they exist, are considered inactive. See * lru_gen_is_active(). * - * PG_active is always cleared while a page is on one of lrugen->lists[] so that - * the aging needs not to worry about it. And it's set again when a page + * PG_active is always cleared while a page is on one of lrugen->folios[] so + * that the aging needs not to worry about it. And it's set again when a page * considered active is isolated for non-reclaiming purposes, e.g., migration. * See lru_gen_add_folio() and lru_gen_del_folio(). * @@ -412,7 +412,7 @@ struct lru_gen_folio { /* the birth time of each generation in jiffies */ unsigned long timestamps[MAX_NR_GENS]; /* the multi-gen LRU lists, lazily sorted on eviction */ - struct list_head lists[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; + struct list_head folios[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; /* the multi-gen LRU sizes, eventually consistent */ long nr_pages[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES]; /* the exponential moving average of refaulted */ -- cgit v1.2.3 From e4dde56cd208674ce899b47589f263499e5b8cdc Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Wed, 21 Dec 2022 21:19:04 -0700 Subject: mm: multi-gen LRU: per-node lru_gen_folio lists For each node, memcgs are divided into two generations: the old and the young. For each generation, memcgs are randomly sharded into multiple bins to improve scalability. For each bin, an RCU hlist_nulls is virtually divided into three segments: the head, the tail and the default. An onlining memcg is added to the tail of a random bin in the old generation. The eviction starts at the head of a random bin in the old generation. The per-node memcg generation counter, whose reminder (mod 2) indexes the old generation, is incremented when all its bins become empty. There are four operations: 1. MEMCG_LRU_HEAD, which moves an memcg to the head of a random bin in its current generation (old or young) and updates its "seg" to "head"; 2. MEMCG_LRU_TAIL, which moves an memcg to the tail of a random bin in its current generation (old or young) and updates its "seg" to "tail"; 3. MEMCG_LRU_OLD, which moves an memcg to the head of a random bin in the old generation, updates its "gen